In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [2]:
np.random.seed(500)

In [3]:
train=pd.read_csv('tweet_train.csv')

In [4]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
train['TWEET'] = train['TWEET'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['TWEET'].head()


0    hitting gym! afterwards looking forward rewiri...
1                              doesnt want go work lol
2    @SherriGarrity Sorry I missed responding yeste...
3                                 wanna come bohol!!!!
4                                          I cold buee
Name: TWEET, dtype: object

In [5]:
freq = pd.Series(' '.join(train['TWEET']).split()).value_counts()[:10]
freq

I       2962
I'm      580
like     482
get      473
-        466
work     396
go       395
good     394
day      344
got      334
dtype: int64

In [6]:
freq = list(freq.index)
train['TWEET'] = train['TWEET'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['TWEET'].head()

0    hitting gym! afterwards looking forward rewiri...
1                                      doesnt want lol
2    @SherriGarrity Sorry missed responding yesterd...
3                                 wanna come bohol!!!!
4                                            cold buee
Name: TWEET, dtype: object

In [7]:
freq = pd.Series(' '.join(train['TWEET']).split()).value_counts()[-10:]
freq

eggs                                                        1
Dizzle                                                      1
http://bit.ly/gKdmC                                         1
@odaraia                                                    1
Melbourne!                                                  1
(and/or                                                     1
period.                                                     1
fw/proxy                                                    1
@viirak                                                     1
http://www.etsy.com/view_listing.php?listing_id=21362047    1
dtype: int64

In [12]:
from textblob import TextBlob
TextBlob(train['TWEET'][1]).words

WordList(['doesnt', 'want', 'lol'])

In [13]:
from nltk.stem import PorterStemmer
st = PorterStemmer()
train['TWEET'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0    hit gym! afterward look forward rewir studio.....
1                                      doesnt want lol
2    @sherrigarr sorri miss respond yesterday roche...
3                                 wanna come bohol!!!!
4                                            cold buee
Name: TWEET, dtype: object

In [14]:
from textblob import Word
train['TWEET'] = train['TWEET'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['TWEET'].head()

0    hitting gym! afterwards looking forward rewiri...
1                                      doesnt want lol
2    @SherriGarrity Sorry missed responding yesterd...
3                                 wanna come bohol!!!!
4                                            cold buee
Name: TWEET, dtype: object

In [15]:
from nltk import pos_tag
text=train['TWEET'].str.split().map(pos_tag)
text.head()
                                

0    [(hitting, VBG), (gym!, NN), (afterwards, NNS)...
1               [(doesnt, NN), (want, VBP), (lol, NN)]
2    [(@SherriGarrity, NN), (Sorry, NNP), (missed, ...
3           [(wanna, NN), (come, VB), (bohol!!!!, NN)]
4                             [(cold, JJ), (buee, NN)]
Name: TWEET, dtype: object

In [16]:
train['sentiment'] = train['TWEET'].apply(lambda x: TextBlob(x).sentiment[0] )
train[['TWEET','sentiment']].head()

Unnamed: 0,TWEET,sentiment
0,hitting gym! afterwards looking forward rewiri...,0.0
1,doesnt want lol,0.8
2,@SherriGarrity Sorry missed responding yesterd...,-0.185938
3,wanna come bohol!!!!,0.0
4,cold buee,-0.6


In [17]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(train['TWEET'],train['POLARITY'],test_size=0.3)

In [18]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [19]:
Tfidf_vect = TfidfVectorizer(max_features=1500)
Tfidf_vect.fit(train['TWEET'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [20]:
print(Tfidf_vect.vocabulary_)

{'gym': 572, 'looking': 773, 'forward': 506, 'studio': 1223, 'yet': 1489, 'doesnt': 349, 'want': 1393, 'lol': 765, 'sorry': 1187, 'missed': 838, 'yesterday': 1488, 'far': 451, 'dark': 311, 'chocolate': 245, 'wanna': 1392, 'come': 266, 'cold': 262, 'ate': 104, 'food': 499, 'cake': 213, 'another': 72, 'driving': 369, 'back': 121, 'la': 722, 'tomorrow': 1311, 'sleep': 1156, 'still': 1213, 'outside': 936, 'least': 737, 'sunny': 1237, 'for': 501, 'running': 1076, 'low': 784, 'http': 643, 'plurk': 986, 'com': 265, 'at': 103, 'lunch': 788, 'break': 183, 'tired': 1304, 'ready': 1038, 'trouble': 1331, 'sleeping': 1157, 'finger': 478, 'hot': 636, 'cup': 298, 'tea': 1260, 'ouch': 933, 'it': 686, 'time': 1300, 'that': 1275, 'thing': 1283, 'stay': 1208, 'right': 1064, 'then': 1279, 'someone': 1174, 'mileycyrus': 829, 'problem': 1009, 'here': 614, 'let': 744, 'see': 1104, 'me': 813, 'way': 1404, 'school': 1095, 'today': 1306, 'im': 662, 'sick': 1140, 'gone': 550, 'forever': 502, 'hope': 631, 'someth

In [21]:
print(Train_X_Tfidf)

  (0, 1189)	0.4122736787865389
  (0, 773)	0.37545298361105656
  (0, 686)	0.2702763173568607
  (0, 561)	0.3282941287818739
  (0, 506)	0.3991012199466304
  (0, 415)	0.40922989206298277
  (0, 214)	0.4260143892405095
  (1, 1204)	0.49255376932058526
  (1, 1140)	0.45622100770296947
  (1, 611)	0.4981341872359417
  (1, 480)	0.5487399274386351
  (2, 1276)	0.5095029166490661
  (2, 636)	0.5204247232296284
  (2, 628)	0.6852480466060882
  (3, 1038)	0.7963404223270638
  (3, 854)	0.6048486850179585
  (4, 1214)	0.703654788589127
  (4, 28)	0.7105420033295646
  (5, 1492)	0.22830476453423423
  (5, 1369)	0.3214707321555486
  (5, 1367)	0.4310340505478191
  (5, 1272)	0.28823278317152395
  (5, 1147)	0.3255169792851955
  (5, 920)	0.41555971103017886
  (5, 33)	0.4035568808058978
  :	:
  (6995, 1140)	0.29004562112685484
  (6995, 1021)	0.3981665150291597
  (6995, 890)	0.24647750325222062
  (6995, 726)	0.26166254131374295
  (6995, 577)	0.3545983971545254
  (6995, 372)	0.4292605299623747
  (6995, 367)	0.4067316120

In [22]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
predictions_NB = Naive.predict(Test_X_Tfidf)
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  72.14261912695768


In [23]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  73.20893035654782


In [24]:
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
actual = Test_Y
predicted = predictions_SVM
results = confusion_matrix(actual, predicted) 
print ('Confusion Matrix :')
print(results) 
print ('Accuracy Score :',accuracy_score(actual, predicted)) 
print ('Report : ')
print (classification_report(actual, predicted)) 

Confusion Matrix :
[[1042  432]
 [ 372 1155]]
Accuracy Score : 0.7320893035654782
Report : 
             precision    recall  f1-score   support

          0       0.74      0.71      0.72      1474
          1       0.73      0.76      0.74      1527

avg / total       0.73      0.73      0.73      3001

