In [65]:
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_selection import SelectPercentile, chi2

In [46]:
data = pd.read_csv('../data/Sentiment Analysis Dataset.csv', 
                   sep=',', 
                   error_bad_lines=False)

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


In [94]:
data.sample(10)

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
1127340,1127356,1,Sentiment140,ohmishka i thought it was such a great idea to...
768481,768497,1,Sentiment140,good morning putting a colour in my hair and l...
152078,152091,1,Sentiment140,erikveld what song are you moving to le musiqu...
579422,579438,0,Sentiment140,alex_levin haha i know right
1445253,1445269,1,Sentiment140,i love all left outs video blogs check em o...
1543644,1543660,0,Sentiment140,that was a miserable rg final poor dinaraendin...
1026358,1026374,1,Sentiment140,my theme for ff is man candy avatars from my f...
359226,359239,1,Sentiment140,mia_r simalves posted them all on anoops fb fa...
272029,272042,1,Sentiment140,jessylou22 yay happy for her lt3
536578,536594,0,Sentiment140,inaperfectworld there wouldnt be racism these...


In [48]:
def clean_text(df, col):
    df[col] = df[col].str.lower()
    df[col] = df[col].str.replace(r'[^\w\s]', '')
    ## to be added
    # stemming, lemmetizing
    # twitter/known entity replacement
    # spelling correction ???

In [49]:
clean_text(data, 'SentimentText')

In [95]:
data.sample(10)

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
1510635,1510651,0,Sentiment140,officially i believe i am perhaps the bordest ...
1304240,1304256,0,Sentiment140,yeah i definately did
747870,747886,1,Sentiment140,getting ready for e3 625pm httpxboxcom micros...
641870,641886,0,Sentiment140,benjicajess i know but u aint in peeenang
1488554,1488570,0,Sentiment140,listening to careless whispers missing my sbfs
222560,222573,0,Sentiment140,jessicaheintz thats not good good luck though
129242,129255,0,Sentiment140,chevale my internet is being a again wtf i th...
976411,976427,1,Sentiment140,lbergz i love la cantera its a nice shopping a...
649850,649866,1,Sentiment140,enegizer rabbit mode keep going and going an...
884322,884338,0,Sentiment140,i just wanted to saysome people really freakin...


In [88]:
data['Sentiment'].value_counts()

1    790177
0    788435
Name: Sentiment, dtype: int64

In [51]:
le = LabelEncoder()
y = le.fit_transform(data['Sentiment'])

In [52]:
x_train, x_test, y_train, y_test = train_test_split(data['SentimentText'], y)

In [53]:
%%time
vectorizer = TfidfVectorizer(max_df=0.5,
                             min_df=1000,
                             analyzer=u'word', 
                             ngram_range=(1, 2),
                             max_features=10000,
                             stop_words='english')
vectorizer.fit(x_train)
x_train_vec = vectorizer.transform(x_train)

CPU times: user 1min 45s, sys: 832 ms, total: 1min 46s
Wall time: 1min 47s


In [86]:
### add senti features and metadata to  feature vectors
### include topic
### select features for unigram and bigram separately

In [54]:
x_train_vec.shape

(1183959, 1152)

In [55]:
%%time
fs = SelectPercentile(chi2, 10)
fs.fit(x_train_vec, y_train)
x_train_vec_sel = fs.transform(x_train_vec)

CPU times: user 364 ms, sys: 0 ns, total: 364 ms
Wall time: 363 ms


In [56]:
x_train_vec_sel.shape

(1183959, 116)

In [89]:
rfc = RandomForestClassifier(n_estimators=100,
                             max_depth=None,
                             min_samples_leaf=20,
                             class_weight='balanced',
                             n_jobs=-1)

In [90]:
%%time
rfc.fit(x_train_vec_sel, y_train)

CPU times: user 17min 9s, sys: 6.71 s, total: 17min 16s
Wall time: 7min 33s


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [91]:
%%time
pred = rfc.predict(fs.transform(vectorizer.transform(x_test)))

CPU times: user 23.3 s, sys: 100 ms, total: 23.4 s
Wall time: 15.3 s


In [92]:
confusion_matrix(y_test, pred)

array([[107301,  89799],
       [ 33606, 163947]])

In [93]:
(accuracy_score(y_test, pred) * 100).round()

69.0