In [1]:
# Run this cell just once! (or restart Kernel before second time)

import os
os.chdir('..')

In [3]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import datetime
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
from wordcloud import WordCloud
from utils.fixed import *
from nltk.stem.snowball import *
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel
import numpy as np
from time import time
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.externals import joblib
import pickle

In [7]:
def get_accuracy(pipeline, x_train, y_train, x_test, y_test):
    t0 = time()
    sentiment_fit = pipeline.fit(x_train, y_train)
    y_pred = sentiment_fit.predict(x_test)
    y_train_pred = sentiment_fit.predict(x_train)
    train_test_time = time() - t0
    accuracy_test = accuracy_score(y_test, y_pred)
    accuracy_train=accuracy_score(y_train, y_train_pred)
    return accuracy_test, accuracy_train, train_test_time


def check_accuracy(X_train, y_train, X_test, y_test, vectorizer=CountVectorizer(), classifier=LogisticRegression(), n_features=10000, stop_words=None, ngram_range=(1, 1)):
    vectorizer.set_params(stop_words=stop_words, max_features=n, ngram_range=ngram_range)
    checker_pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', classifier)
    ])
    test_accuracy, train_accuracy, train_test_time  = get_accuracy(checker_pipeline, X_train, y_train, X_test, y_test)
    return test_accuracy, train_accuracy, train_test_time


def get_classifiers_comparison_dataframe(df, classifiers,vectorizers,n_features,ngram_ranges, slang_dict):
    
    classification_data={'remove_stopwords':[],
                     'remove_shortwords':[],
                     'stemmed':[],
                     'vectorizer':[],
                     'features_number':[], 
                     'ngrams_range':[],
                     'classifier':[],
                     'test_accuracy':[],
                     'train_accuracy':[],
                     'train_test_time':[]}
    
    for remove_stopwords in [True, False]:
        for remove_shortwords in [True,False]:
            for stemmed in [True,False]:
                X, y = get_train_test_tweets(df, slang_dict, remove_stopwords, remove_shortwords, stemmed, labels = True)
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
                for key, vectorizer in vectorizers.items():
                    for n in n_features:
                        for name,classifier in classifiers.items():
                            for ngram_range in ngram_ranges:
                                print('Starting '+key+' - '+str(n)+' features - '+str(ngram_range)+' ngrams - '+name+' (remove_stopwords='+str(remove_stopwords)+', remove_shortwords='+str(remove_shortwords)+', stemmed='+str(stemmed)+')')
                                test_accuracy, train_accuracy, train_test_time = check_accuracy(X_train, y_train, 
                                                                                                X_test, y_test, 
                                                                                                vectorizer=vectorizer,
                                                                                                classifier=classifier,
                                                                                                n_features=n,
                                                                                                ngram_range=ngram_range)
                                classification_data['remove_stopwords'].append(remove_stopwords)
                                classification_data['remove_shortwords'].append(remove_shortwords)
                                classification_data['stemmed'].append(stemmed)
                                classification_data['vectorizer'].append(str(vectorizer))
                                classification_data['features_number'].append(n)
                                classification_data['ngrams_range'].append(str(ngram_range))
                                classification_data['classifier'].append(name)
                                classification_data['test_accuracy'].append("{:.2f}".format(test_accuracy*100))
                                classification_data['train_accuracy'].append("{:.2f}".format(train_accuracy*100))
                                classification_data['train_test_time'].append("{:.2f}".format(train_test_time))
    return pd.DataFrame(classification_data)

def create_save_model(X_train, y_train, vectorizer, classifier, n_features, n_grams, filename):
    vectorizer.set_params(max_features=n_features, ngram_range=n_grams)
    vectors = vectorizer.fit_transform(X_train)
    clf = classifier.fit(vectors, y_train)
    file_path = os.path.join(MAIN_PATH,MODEL_PATH,filename)
    with open(file_path, 'wb') as f:
        pickle.dump((vectorizer, clf), f)

In [5]:
TRAINING_DATA_PATH = os.path.join(MAIN_PATH, DATA_PATH, 'sentiment_m140_.csv')
CLASSIFIERS_COMPARISON_PATH = os.path.join(MAIN_PATH,RESULTS_PATH,'classifiers_ranking.csv')
SLANG_DICT=load_slang(SLANG_PATH)

### LOADING THE SENTIMENT140 DATASET

In [6]:
train_df = pd.read_csv(TRAINING_DATA_PATH,sep=',',encoding = "ISO-8859-1", lineterminator='\n',header=0)
train_df.columns=['sentiment','id','date','query','user','tweetText']
train_df=train_df.drop(['query'],axis=1)

### CHECKING CLASSES' SIZES

In [5]:
train_df['sentiment'].value_counts()

1    30000
0    30000
Name: sentiment, dtype: int64

# ACCURACY COMPARISON FOR DIFFERENT CLASSIFIERS, VECTORIZERS AND PARAMETERS

*WARNING: It takes much time to check all combination - reduce number of classifiers, vectorizers or parameteres if you do not want to wait over 10 hours to see the results!*

In [102]:
n_features = np.arange(10000,100001,10000)
ngram_ranges = [(1,1),(1,2),(1,3)]
vectorizers = {'CountVectorizer':CountVectorizer(),
             'TfidfVectorizer':TfidfVectorizer()}
classifiers = {'Logistic Regression':LogisticRegression(), 
               'LinearSVC':LinearSVC(),
               'LinearSVC with selection':Pipeline([
                 ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
                 ('classification', LinearSVC(penalty="l2"))]),
               'MultinomialNB':MultinomialNB(),
               'BernoulliNB':BernoulliNB(), 
               'RandomForestClassifier':RandomForestClassifier(), 
               'DecisionTreeClassifier':DecisionTreeClassifier(),
               'XGBClassifier':XGBClassifier()}

acc_comparison_df = get_classificators_comparison_dataframe(train_df,classifiers,vectorizers,n_features,ngram_ranges)

### DESCENDING SORT COMPARISON RESULTS BY ACCURACY

In [92]:
df_ranking=acc_comparison_df.sort_values(by=['test_accuracy'],ascending=False).reset_index(drop=True)
df_ranking.to_csv(CLASSIFIERS_COMPARISON_PATH,sep=';',encoding='utf-8')
df_ranking

Unnamed: 0,remove_stopwords,remove_shortwords,stemmed,vectorizer,features_number,ngrams_range,classifier,test_accuracy,train_accuracy,train_test_time
0,False,False,True,"TfidfVectorizer(analyzer='word', binary=False,...",50000,"(1, 3)",Logistic Regression,78.51,86.24,15.67
1,False,False,True,"TfidfVectorizer(analyzer='word', binary=False,...",60000,"(1, 3)",Logistic Regression,78.50,86.53,12.34
2,False,False,True,"TfidfVectorizer(analyzer='word', binary=False,...",100000,"(1, 2)",Logistic Regression,78.49,87.11,6.52
3,False,False,True,"TfidfVectorizer(analyzer='word', binary=False,...",50000,"(1, 2)",Logistic Regression,78.48,86.12,7.55
4,False,False,True,"TfidfVectorizer(analyzer='word', binary=False,...",80000,"(1, 2)",Logistic Regression,78.46,86.69,6.46
5,False,False,True,"TfidfVectorizer(analyzer='word', binary=False,...",70000,"(1, 3)",Logistic Regression,78.46,86.79,11.01
6,False,False,True,"TfidfVectorizer(analyzer='word', binary=False,...",90000,"(1, 2)",Logistic Regression,78.45,86.90,6.08
7,False,False,True,"TfidfVectorizer(analyzer='word', binary=False,...",40000,"(1, 2)",Logistic Regression,78.44,85.76,6.70
8,False,False,True,"TfidfVectorizer(analyzer='word', binary=False,...",100000,"(1, 3)",Logistic Regression,78.43,87.32,9.44
9,False,False,True,"TfidfVectorizer(analyzer='word', binary=False,...",40000,"(1, 3)",Logistic Regression,78.43,85.96,12.47


### Conclusions:

The best accuracy on test set with optimal time has been acheived for *Logistic Regression* classifier with the following parameters:
- remove_stopwords = False, remove_shortwords = False, stemmed = True
- TF-IDF vectorizer with 40K features, ngrams_range = (1,2)
For such parameters the test accuracy is 78.44, train accuracy - 85.76 and the train-test time = 6.70 sec.

It is worth using also:
- *LinearSVC* with TF-IDF vectorizer having 100K features and (1,2) ngrams range. Its test accuracy = 77.41 and train accuracy = 98.29
- *MultinomialNB* with TF-IDF having 80K features, (1,3) ngrams range. Its test accuracy is 77.40 and train accuracy - 88.98.

The highest scores where achieved for tweets containing stopwords and short words (remove_stopwords = False, remove_shortwords = False) but after stemming (stemmed = True). 

How did the other classifiers performed? 

- *LinearSVC* with selection with TF-IDF vectorizet having 100K features, (1,2) ngrams range: test accuracy = 77.30, train accuracy = 92.41
- *BernoulliNB* with TF-IDF vectorizer having 40K features, (1,2) ngrams range, FFT, COuntVectorizer, 76.63, 85.57 LUB TFIDF, 100K, (1,2), 76.62, 90.11
- *RandomForestClassifier* with CountVectorizer having 40K features, (1,2) ngrams range: test accuracy = 74.22, train accuracy = 98.55 (VERY HIGH TRAIN ACCURACY!). Unfortunately, the train-test time was much longer than in previous classifiers - over 25 sec (in this case 26.77 sec)
- *XGBClassifier* with Count Vectorizer having 10K features, (1,2) ngrams range: test accuracy = 71.88, train accuracy = 71.98. But:  train-test time usually over 10 sec, low test accuracy in comparison with other classifiers, test and train accuracy are on similar level, almost the same results for different numbers of features and ngrams ranges, it has higher accuracy with CountVectorizer. 
- *DecisionTreeClassifier* - it has the highest train-test time - usually over 40sec!, train accuracy is almost 100 while test is about 71.... It gives higher accuracy for CountVectorizer, it does not matter if tweet is stemmed or not. 

# TRAINING THE MODEL - BASED ON WHOLE SENTIMENT140 DATASET

In [6]:
remove_stopwords = False
remove_shortwords = False
stemmed = True
X, y =  get_processed_tweets(train_df,SLANG_DICT,remove_stopwords,remove_shortwords,stemmed,labels=True)

create_save_model(X, y, TfidfVectorizer(), LogisticRegression(), n_features = 40000, n_grams = (1,2), filename='logistic-regression.pkl')
create_save_model(X, y, TfidfVectorizer(), LinearSVC(), n_features = 100000, n_grams = (1,2), filename='linearSVC.pkl')
create_save_model(X, y, TfidfVectorizer(), MultinomialNB(), n_features = 80000, n_grams = (1,3), filename='multinomialNB.pkl')
LinearSVC_with_selection = Pipeline([
                 ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
                 ('classification', LinearSVC(penalty="l2"))])
create_save_model(X, y, TfidfVectorizer(), LinearSVC_with_selection, n_features = 100000, n_grams = (1,2), filename='SVC-with-selection.pkl')    
create_save_model(X, y, TfidfVectorizer(), BernoulliNB(), n_features = 40000, n_grams = (1,2), filename='BernoulliNB.pkl')
create_save_model(X, y, CountVectorizer(), XGBClassifier(), n_features = 10000, n_grams = (1,2), filename='XGBClassifier.pkl')



In [31]:
### LOADING MODEL ###
# with open('logistic-regression.pkl', 'rb') as f:
#     vectorizer, clf = pickle.load(f)


## DOC2VEC

In [62]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence

In [63]:
def labelize_tweets_ug(tweets,indexes, prefix):
    
    
    for i, t in zip(indexes, tweets):
        
        yield LabeledSentence(t.split(), [prefix + '_%s' % str(i)])
    
   

In [27]:
d2v_labels=["label"+ '_%s' % str(i) for i in indexes]
it = labelize_tweets_ug(X, indexes, 'label')

In [28]:
len(d2v_labels)
vector_size=100

In [29]:
print("Creating Doc2Vec model...")
model = Doc2Vec(dm=0, size=100, negative=5, min_count=2, workers=2, alpha=0.065, min_alpha=0.065)
model.build_vocab(it)
model.train(it, total_examples=model.corpus_count, epochs=model.epochs)

Creating Doc2Vec model...


  


In [30]:
X_d2v = np.zeros((len(d2v_labels), vector_size))


labels = np.asarray(y)
for i in range(len(d2v_labels)):
    X_d2v[i] = model[d2v_labels[i]]

In [31]:
len(X_d2v)

60000

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X_d2v, labels, test_size=0.33, random_state=42)

In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [34]:
logistic_r = LogisticRegression()
logistic_r.fit(X_train, y_train)
y_pred = logistic_r.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred)
lr_report = classification_report(y_test, y_pred)
lr_matrix = confusion_matrix(y_test, y_pred)

print("\nLogistic Regression accuracy score:", lr_accuracy)
print("Logistic Regression classification report:\n", lr_report)
print("Logistic Regression confusion matrix\n", lr_matrix)



Logistic Regression accuracy score: 0.50175
Logistic Regression classification report:
              precision    recall  f1-score   support

          0       0.50      0.83      0.62      5909
          1       0.53      0.18      0.27      6091

avg / total       0.51      0.50      0.44     12000

Logistic Regression confusion matrix
 [[4925  984]
 [4995 1096]]


## ROZW Z NETA

In [64]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import multiprocessing
from sklearn import utils

In [66]:
from gensim.models.phrases import Phrases, Phraser
phrases = Phrases(list(train_df['text_tokens']))
bigram = Phraser(phrases)

In [67]:
corpus

0        aww that is a bummer you should have got david...
1        is upset that he can not update his facebook b...
2        i dived many times for the ball managed to sav...
3           my whole body feels itchy and like its on fire
4        no it is not behaving at all i am mad why am i...
5                                       not the whole crew
6                                               need a hug
7        hey long time no see yes rains a bit only a bi...
8                                nope they did not have it
9                                             que me muera
10                spring break in plain city it is snowing
11                                i just repierced my ears
12       i could not bear to watch it and i thought the...
13       it it counts I do not know why i did either yo...
14       i would have been the first but i did not have...
15       i wish i got to watch it with you i miss you a...
16       hollis death scene will hurt me severely to wa.

In [68]:
X=list(corpus)
y = list(train_df['sentiment'])  
indexes = list(train_df.index)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test,index_train,index_test = train_test_split(X,y,indexes,test_size=0.33, random_state=42)

In [72]:
def labelize_tweets_bg(tweets,indexes,label):
    result = []
    prefix = label
    for i, t in zip(indexes, tweets):
        result.append(LabeledSentence(bigram[t.split()], [prefix + '_%s' % i]))
    return result
  

all_x_w2v = labelize_tweets_bg(X,indexes, 'all')

from sklearn.linear_model import LogisticRegression


cores = multiprocessing.cpu_count()
model_ug_dbow = Doc2Vec(dm=0, size=300, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_dbow.build_vocab([x for x in tqdm(all_x_w2v)])

for epoch in range(30):
    model_ug_dbow.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_dbow.alpha -= 0.002
    model_ug_dbow.min_alpha = model_ug_dbow.alpha
    
def get_vectors(model, corpus,indexes, size):
    vecs = np.zeros((len(corpus), size))
    n = 0
    for i in indexes:
        prefix = 'all_' + str(i)
        vecs[n] = model.docvecs[prefix]
        n += 1
    return vecs
  
train_vecs_dbow = get_vectors(model_ug_dbow, X_train, index_train, 300)
validation_vecs_dbow = get_vectors(model_ug_dbow, X_test, index_test, 300)

clf = LogisticRegression()
clf.fit(train_vecs_dbow, y_train)
clf.score(validation_vecs_dbow, y_test)


  """
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 60000/60000 [00:00<00:00, 1071337.46it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 60000/60000 [00:00<00:00, 1277556.76it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 60000/60000 [00:00<00:00, 1304652.01it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 60000/60000 [00:00<00:00, 1305017.35it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

0.7341414141414141

In [73]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(train_vecs_dbow, y_train)
clf.score(validation_vecs_dbow, y_test)

0.7343434343434343