In [1]:
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_selection import SelectPercentile, chi2

In [72]:
data = pd.read_csv('../data/Sentiment Analysis Dataset.csv', 
                   sep=',', 
                   error_bad_lines=False)

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


In [191]:
data.drop(['ItemID', 'SentimentSource'], axis=1).sample(10)

Unnamed: 0,Sentiment,SentimentText,SentimentText_cleaned
138317,1,"@DustinJMcClure Lol, love your new picture! Br...","|USER| lol, love your new picture |EXCLAMATION..."
555895,0,@__lifeSAVER I get off work at 9:30 tomorrow,|USER| i get off work at 9:30 tomorrow
344949,1,@lrigbyphoto So happy to run into you today in...,|USER| so happy to run into you today in centr...
1439778,0,i got the hiccups. ive had them for about 15 m...,i got the hiccups. ive had them for about 15 m...
660055,0,"@Brantanamo damn your lucky, i haven't got a '...","|USER| damn your lucky, i haven't got a 'lil d..."
298458,0,@markbrown83 ouch! Poor you did it happen thi...,|USER| ouch |EXCLAMATION| poor you did it hap...
616127,0,"cris on real,this so sad!!!","cris on real,this so sad |EXCLAMATION|"
545613,1,"@very_speedy You say moral high ground, I say ...","|USER| you say moral high ground, i say surren..."
1035581,0,@MariDuB I couldn't save one of my animated on...,|USER| i couldn't save one of my animated ones...
756332,1,hm.. time for lunch..,hm time for lunch


In [187]:
def clean_text(df, col):
    df[col+'_cleaned'] = df[col].str.lower()
    df[col+'_cleaned'] = df[col+'_cleaned'].str.replace(r"([!]+?)\1+", r"\1")
    df[col+'_cleaned'] = df[col+'_cleaned'].str.replace(r"([a-z]+?)\1+", r"\1\1")
    df[col+'_cleaned'] = df[col+'_cleaned'].str.replace(r'@[^\s]+', '|USER|')
    df[col+'_cleaned'] = df[col+'_cleaned'].str.replace(r'#[^\s]+', '|HASHTAG|')
    df[col+'_cleaned'] = df[col+'_cleaned'].str.replace(r'https?\:\/\/[^\s]+', '|URL|')
    df[col+'_cleaned'] = df[col+'_cleaned'].str.replace(r'wwww.[^\s]+', '|URL|')
    df[col+'_cleaned'] = df[col+'_cleaned'].str.replace(r'?', ' |QUESTIOPUNC|')
    df[col+'_cleaned'] = df[col+'_cleaned'].str.replace(r'!', ' |EXCLAMATION|')
    df[col+'_cleaned'] = df[col+'_cleaned'].str.replace(r'\.{2,}', ' ')
    ## to be added
    # stemming, lemmetizing
    # twitter/known entity replacement
    # spelling correction ???

In [188]:
%%time
clean_text(data, 'SentimentText')

CPU times: user 37.8 s, sys: 8 ms, total: 37.8 s
Wall time: 37.7 s


In [192]:
data['Sentiment'].value_counts()

1    790177
0    788435
Name: Sentiment, dtype: int64

In [193]:
le = LabelEncoder()
y = le.fit_transform(data['Sentiment'])

In [194]:
x_train, x_test, y_train, y_test = train_test_split(data['SentimentText_cleaned'], y)

In [195]:
%%time
vectorizer = TfidfVectorizer(max_df=0.7,
                             min_df=100,
                             analyzer=u'word', 
                             ngram_range=(1, 2),
                             max_features=None,
                             stop_words='english',
                             lowercase=False)
vectorizer.fit(x_train)
x_train_vec = vectorizer.transform(x_train)

CPU times: user 1min 12s, sys: 556 ms, total: 1min 13s
Wall time: 1min 13s


In [196]:
### add senti features and metadata to  feature vectors
### include topic
### select features for unigram and bigram separately

In [197]:
x_train_vec.shape

(1183959, 11736)

In [198]:
%%time
fs = SelectPercentile(chi2, 10)
fs.fit(x_train_vec, y_train)
x_train_vec_sel = fs.transform(x_train_vec)

CPU times: user 392 ms, sys: 0 ns, total: 392 ms
Wall time: 392 ms


In [220]:
terms = vectorizer.vocabulary_
terms = pd.DataFrame.from_dict({v:k for k,v in terms.items()}, orient='index')

In [241]:
for z in sorted(zip(rfc.feature_importances_, 
                    terms[fs.get_support()].iloc[:,0].values), 
                key=lambda t: t[0],
                reverse=True):
    print(z)

(0.069154517772018781, 'hook')
(0.041719753478433548, 'ivy')
(0.034627371908391581, 'author')
(0.031841639577674892, 'tummy')
(0.022805019688061711, 'casting')
(0.022468711927108354, 'USER wake')
(0.017742260738410006, 'man love')
(0.01762295795657632, 'peek')
(0.01758428671832896, 'wtf QUESTIOPUNC')
(0.0169687432203319, 'depends')
(0.015281857973601704, 'haven watched')
(0.014740141348767079, 'USER okay')
(0.01457573034777917, 'ftl')
(0.014241430460836009, 'stop thinking')
(0.011688624881479032, 'shave')
(0.01122328352700027, 'decides')
(0.011208185952539931, 'USER mm')
(0.010570621370236273, 'stavros')
(0.010452981401810118, 'cinnamon')
(0.0093681742805501716, 'love come')
(0.0091783243052438214, 'fucking')
(0.0087039732979966034, 'rec')
(0.0084106223331011035, 'loved')
(0.0082263431539405865, 'proof')
(0.0079231944109560623, 'happy hour')
(0.0077895515481077073, 'queue')
(0.0077390468431830697, 'created')
(0.0075249336941636278, 'holds')
(0.0073504436611406994, 'sats')
(0.0072217218

In [200]:
x_train_vec_sel.shape

(1183959, 1174)

In [201]:
rfc = RandomForestClassifier(n_estimators=100,
                             max_depth=None,
                             min_samples_leaf=20,
                             class_weight='balanced',
                             n_jobs=-1)

In [202]:
%%time
rfc.fit(x_train_vec_sel, y_train)

CPU times: user 36min 20s, sys: 1.75 s, total: 36min 22s
Wall time: 9min 37s


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [203]:
%%time
pred = rfc.predict(fs.transform(vectorizer.transform(x_test)))

CPU times: user 30.8 s, sys: 68 ms, total: 30.9 s
Wall time: 14.6 s


In [204]:
confusion_matrix(y_test, pred)

array([[133465,  63717],
       [ 38198, 159273]])

In [205]:
(accuracy_score(y_test, pred) * 100).round()

74.0