# Building a Sentiment Classifier
* This notebook will be used to train and evaluate several sentiment classifiers.
* Models in this notebook will serve as a baseline
* The [SemEval-2017 gold dataset](https://alt.qcri.org/semeval2017/task4/?id=download-the-full-training-data-for-semeval-2017-task-4) was combined with additional positive and negative tweets, dictacted by the presence of :) or :(
 * The decision to add the recent :) and :( tweets to the classifier training dataset is to include tweets the mention Covid-19. The SemEval-2017 dataset existed long before Covid-19 came into our world. 
* Depending on the performance of VADER in classifies the tweets in this novel datset, an LSTM may later be trained and used for classifying s tream of tweets, outside the scope of this project.

In [26]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)

import re
import string
from glob import glob

import nltk
from nltk.tokenize import word_tokenize
stop_words = nltk.corpus.stopwords.words('english')
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import naive_bayes

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import pickle

In [27]:
# Read in all TSVs from SemEval-2017

data_1 = '.csv'
column_names = ['ids', 'target', 'text']

gold_tweets = pd.DataFrame()
names = ['ids', 'target', 'text']

for f in glob('*.txt'):
    tmp = pd.read_csv(f,names=names, sep='\t')

    gold_tweets = pd.concat([gold_tweets,tmp],axis=0,ignore_index=True)
    
gold_tweets['text'] = gold_tweets['text'].apply(str)

In [28]:
# Add collected sad tweets to human labeled tweet dataset
# This cell an be excluded, if desired

sad_tweets_buffer = pd.read_pickle('sad_tweets_buffer.pkl')

sad_tweets_buffer = sad_tweets_buffer[['id','processed_features','target']].copy()
sad_tweets_buffer.rename(columns={"id": "ids", "processed_features": "text"}, inplace=True)
sad_tweets_buffer.reset_index(inplace=True, drop=True)
sad_tweets_buffer['target'] = 'negative'
print('Sad Tweets Shape',sad_tweets_buffer.shape)

gold_tweets = pd.concat([gold_tweets,sad_tweets_buffer],axis=0,ignore_index=True)
gold_tweets.target.value_counts()
print('New Shape',gold_tweets.shape)
print('Training Data:\n', gold_tweets.target.value_counts())

Sad Tweets Shape (49912, 3)
New Shape (99592, 3)
Training Data:
 negative    57655
neutral     22271
positive    19629
Name: target, dtype: int64


In [29]:
# Add collected happy tweets to human labeled tweet dataset
# This cell an be excluded, if desired

happy_tweets_buffer = pd.read_pickle('happy_tweets_buffer.pkl')

happy_tweets_buffer = happy_tweets_buffer[['id','text','target']].copy()
happy_tweets_buffer.rename(columns={"id": "ids"}, inplace=True)
happy_tweets_buffer.reset_index(inplace=True, drop=True)
print('Happy Tweets Shape',happy_tweets_buffer.shape)

gold_tweets = pd.concat([gold_tweets,happy_tweets_buffer],axis=0,ignore_index=True)
gold_tweets.target.value_counts()
print('New Shape',gold_tweets.shape)
gold_tweets.target.value_counts()

Happy Tweets Shape (38026, 3)
New Shape (137618, 3)


negative    57655
positive    57655
neutral     22271
Name: target, dtype: int64

In [30]:
# Doubling down on the exiting neutral tweets to attempt handle the imbalance
# Comment out to skip and everything below will still work

# ** This will have a direct impact on and improve the train test scores of the models below, specifically the F1 score for neutral tweets and weighted avg compound F1 **
# ** This is a heuristic approach that has resulted in the improved generalization of the baseline classification model on new, unseen tweets **
# ** VADER compound scores will be used in the classification of the full twitter dataset and compared to our baseline models **

gold_temp = gold_tweets.copy()
neutral_tweets = gold_temp[gold_temp.values  == "neutral"]
neutral_tweets = neutral_tweets[['ids','text','target']]
gold_tweets = pd.concat([gold_tweets,neutral_tweets],axis=0,ignore_index=True)

# Set Index and display distribution 
gold_tweets.set_index('ids', inplace=True)
gold_tweets.target.value_counts()

negative    57655
positive    57655
neutral     44542
Name: target, dtype: int64

In [31]:
# Label Encode the tweet sentiments

def binarizer(x):
    if x == 'positive':
        return 4
    elif x == 'negative':
        return 0 
    else:
        return 2
    
gold_tweets['labels'] = gold_tweets['target'].apply(lambda x: binarizer(x))
gold_tweets.labels.value_counts()

4    57655
0    57655
2    44579
Name: labels, dtype: int64

In [32]:
# Convert ASCII based emotions to "emoneg" and "emopos".
# This function now lives in functions.py

def emoji_stringer(text):
    # Positive Emoji - Smile, Laugh, Wink,Love
    text = ' '.join(re.sub('(:\s?\)|:-\)|;\)|\(\s?:|\(-:|:\’\))','emopos',text).split()) # add this :-))
    text = ' '.join(re.sub('(:\s?D|:-D|x-?D|X-?D)','emopos',text).split()) 
    text = ' '.join(re.sub('(<3|:\*)','emopos',text).split()) 
    # Negative Emoji - Sad, Cry
    text = ' '.join(re.sub('(:\s?\(|:-\(|:\||\)\s?:|\)-:)','emoneg',text).split())
    text = ' '.join(re.sub('(:,\(|:\’\(|:"\()','emoneg',text).split())
    return text

gold_tweets['text']=gold_tweets['text'].apply(str)
gold_tweets['emo_features'] = gold_tweets['text'].apply(lambda x: emoji_stringer(x))

In [33]:
# Preprocess Tweets - lowercase, URLS, tokens and punctuation
# An improved version of this function now lives in functions.py


def preprocess(text):
    text = text.lower() # Vader considers case
    text = ' '.join(re.sub("((www\.[\S]+)|(https?://[\S]+))","URL",text).split())
    text = ' '.join(re.sub("(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)","USER ",text).split())
    #text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]|(\w+:\/\/\S+))","USER ",text).split())
    text = ' '.join(re.sub("^rt","",text).split())
    #punc = ''.join([char for char in text if char not in string.punctuation])
    #tokens = word_tokenize(text)
    #stops = [word for word in tokens if word not in stop_words]
    #strings = (" ").join(stops)
    return text

gold_tweets['full_text_clean'] = gold_tweets['emo_features'].apply(lambda x: preprocess(x))
gold_tweets.head(10)

Unnamed: 0_level_0,target,text,labels,emo_features,full_text_clean
ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
628949369883000832,negative,"dear @Microsoft the newOoffice for Mac is great and all, but no Lync update? C'mon.",0,"dear @Microsoft the newOoffice for Mac is great and all, but no Lync update? C'mon.","dear USER the newooffice for mac is great and all, but no lync update? c'mon."
628976607420645377,negative,@Microsoft how about you make a system that doesn't eat my friggin discs. This is the 2nd time this has happened and I am so sick of it!,0,@Microsoft how about you make a system that doesn't eat my friggin discs. This is the 2nd time this has happened and I am so sick of it!,USER how about you make a system that doesn't eat my friggin discs. this is the 2nd time this has happened and i am so sick of it!
629023169169518592,negative,I may be ignorant on this issue but... should we celebrate @Microsoft's parental leave changes? Doesn't the gender divide suggest... (1/2),0,I may be ignorant on this issue but... should we celebrate @Microsoft's parental leave changes? Doesn't the gender divide suggest... (1/2),i may be ignorant on this issue but... should we celebrate USER 's parental leave changes? doesn't the gender divide suggest... (1/2)
629179223232479232,negative,"Thanks to @microsoft, I just may be switching over to @apple.",0,"Thanks to @microsoft, I just may be switching over to @apple.","thanks to USER , i just may be switching over to USER ."
629186282179153920,neutral,If I make a game as a #windows10 Universal App. Will #xboxone owners be able to download and play it in November? @majornelson @Microsoft,2,If I make a game as a #windows10 Universal App. Will #xboxone owners be able to download and play it in November? @majornelson @Microsoft,if i make a game as a #windows10 universal app. will #xboxone owners be able to download and play it in november? USER USER
629226490152914944,positive,"Microsoft, I may not prefer your gaming branch of business. But, you do make a damn fine operating system. #Windows10 @Microsoft",4,"Microsoft, I may not prefer your gaming branch of business. But, you do make a damn fine operating system. #Windows10 @Microsoft","microsoft, i may not prefer your gaming branch of business. but, you do make a damn fine operating system. #windows10 USER"
629345637155360768,negative,@MikeWolf1980 @Microsoft I will be downgrading and let #Windows10 be out for almost the 1st yr b4 trying it again. #Windows10fail,0,@MikeWolf1980 @Microsoft I will be downgrading and let #Windows10 be out for almost the 1st yr b4 trying it again. #Windows10fail,USER USER i will be downgrading and let #windows10 be out for almost the 1st yr b4 trying it again. #windows10fail
629394528336637953,negative,@Microsoft 2nd computer with same error!!! #Windows10fail Guess we will shelve this until SP1! http://t.co/QCcHlKuy8Q,0,@Microsoft 2nd computer with same error!!! #Windows10fail Guess we will shelve this until SP1! http://t.co/QCcHlKuy8Q,USER 2nd computer with same error!!! #windows10fail guess we will shelve this until sp1! URL
629650766580609026,positive,"Just ordered my 1st ever tablet; @Microsoft Surface Pro 3, i7/8GB 512GB SSD. Hopefully it works out for dev to replace my laptop =)",4,"Just ordered my 1st ever tablet; @Microsoft Surface Pro 3, i7/8GB 512GB SSD. Hopefully it works out for dev to replace my laptop =)","just ordered my 1st ever tablet; USER surface pro 3, i7/8gb 512gb ssd. hopefully it works out for dev to replace my laptop =)"
629797991826722816,negative,"After attempting a reinstall, it still bricks, says, ""Windows cannot finish installing,"" or somesuch. @Microsoft may have cost me $600.",0,"After attempting a reinstall, it still bricks, says, ""Windows cannot finish installing,"" or somesuch. @Microsoft may have cost me $600.","after attempting a reinstall, it still bricks, says, ""windows cannot finish installing,"" or somesuch. USER may have cost me $600."


In [34]:
gold_tweets.labels.isnull().any()

False

In [35]:
gold_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 159889 entries, 628949369883000832 to 641395811474128896
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   target           159852 non-null  object
 1   text             159889 non-null  object
 2   labels           159889 non-null  int64 
 3   emo_features     159889 non-null  object
 4   full_text_clean  159889 non-null  object
dtypes: int64(1), object(4)
memory usage: 7.3+ MB


In [36]:
# Backing up combined tweets
#gold_tweets.to_pickle('combined_train_tweets157k.pkl')

## Vectorize

In [37]:
# Create vector representations of the tweets to then use in model. 

vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, ngram_range=(1,2), max_features=40000) # 40k may be too high a feature count and cause unecessary computational use when padding future vectorizations

y = gold_tweets.labels
X = vectorizer.fit_transform(gold_tweets.full_text_clean)
print(y.shape)
print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

(159889,)
(159889, 40000)


## Model

In [38]:

logreg = LogisticRegression(n_jobs=-1)
nb = naive_bayes.MultinomialNB()

# Random Forest was also test and demonstrated poor performance value when compared to Logisitic Regression, and of course Naive Bayes Multinomial. (>10x training time of LogReg)

In [39]:
%%time
nb.fit(X_train, y_train)

CPU times: user 41.7 ms, sys: 4.49 ms, total: 46.1 ms
Wall time: 44.7 ms


MultinomialNB()

In [40]:
%%time
# Vanilla LogReg
logreg.fit(X_train, y_train)

CPU times: user 73.9 ms, sys: 138 ms, total: 212 ms
Wall time: 7.56 s


LogisticRegression(n_jobs=-1)

In [45]:
%%time

# Grid Search

pipe = Pipeline([('classifier', LogisticRegression(n_jobs=-1))])

# Create param grid

param_grid = [
    {'classifier' : [LogisticRegression(n_jobs=-1)],
     'classifier__penalty' : ['l1', 'l2'], # lbfgs only supports l2 - Separate solvers into two groups inside grid to resolve warnings
     'classifier__C' : np.logspace(-4, 4, 20),
     'classifier__solver' : ['lbfgs','liblinear']}
]

# Create grid search object

clf = GridSearchCV(pipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

# Fit on data

best_clf = clf.fit(X_train, y_train)


Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 16.3min finished
  " = {}.".format(effective_n_jobs(self.n_jobs)))


CPU times: user 1min 17s, sys: 4.63 s, total: 1min 22s
Wall time: 16min 39s


In [18]:
best_clf.best_params_

{'classifier__C': 11.288378916846883,
 'classifier__penalty': 'l1',
 'classifier__solver': 'liblinear'}

In [41]:
logreg_pred = logreg.predict(X_test)

In [46]:
logreg_grid_pred = best_clf.predict(X_test)

In [43]:
nb_pred = nb.predict(X_test)

## Metrics

### Performance Including Oversampled Neutral Tweets

In [47]:
# With emoneg and emopos
# With added 50k happy and sad face tweets and doubled exisiting neutral

print('Log Reg CLF Report')
print(classification_report(y_test, logreg_pred))
print('-----------------------------------------------------------')
print('NB Multi CLF Report')
print(classification_report(y_test, nb_pred))
print('-----------------------------------------------------------')
print('Log Reg GridCV CLF Report')
print(classification_report(y_test, logreg_grid_pred))

Log Reg CLF Report
              precision    recall  f1-score   support

           0       0.96      0.89      0.92     11541
           2       0.75      0.90      0.82      8812
           4       0.90      0.83      0.86     11625

    accuracy                           0.87     31978
   macro avg       0.87      0.87      0.87     31978
weighted avg       0.88      0.87      0.87     31978

-----------------------------------------------------------
NB Multi CLF Report
              precision    recall  f1-score   support

           0       0.97      0.82      0.89     11541
           2       0.68      0.91      0.78      8812
           4       0.86      0.78      0.81     11625

    accuracy                           0.83     31978
   macro avg       0.84      0.84      0.83     31978
weighted avg       0.85      0.83      0.83     31978

-----------------------------------------------------------
Log Reg GridCV CLF Report
              precision    recall  f1-score   support

In [23]:
# With emoneg and emopos
# With added 50k happy and sad face tweets and doubled exisiting neutral

cm_nb = confusion_matrix(y_test,nb_pred)
print(cm_nb)

[[9527 1305  709]
 [  40 8052  720]
 [ 203 2414 9008]]


In [24]:
# With emoneg and emopos
# With added 50k happy and sad face tweets and doubled exisiting neutral

cm_lr = confusion_matrix(y_test,logreg_pred)
print(cm_lr)

[[10274   972   295]
 [  178  7951   683]
 [  225  1781  9619]]


In [25]:
# With emoneg and emopos
# With added 50k happy and sad face tweets and doubled exisiting neutral

cm_lrg = confusion_matrix(y_test,logreg_grid_pred)
print(cm_lrg)

[[10421   707   413]
 [  173  8155   484]
 [  395  1396  9834]]


## Saving Tuned Logistic Regression Model

In [93]:
#filename = 'tweet_40kfeat_LR_GridCV_3C_89p_model.sav'
#pickle.dump(best_clf, open(filename, 'wb'))

### Performance Not Including Oversampled Neutral Tweets

In [20]:
# With emoneg and emopos
# With added 50k happy and sad face tweets
# WITHOUT oversampled neutral tweets

print('Log Reg CLF Report')
print(classification_report(y_test, logreg_pred))
print('-----------------------------------------------------------')
print('NB Multi CLF Report')
print(classification_report(y_test, nb_pred))
print('-----------------------------------------------------------')
print('Log Reg GridCV CLF Report')
print(classification_report(y_test, logreg_grid_pred))

Log Reg CLF Report
              precision    recall  f1-score   support

           0       0.94      0.90      0.92     11459
           2       0.64      0.69      0.66      4560
           4       0.87      0.88      0.87     11505

    accuracy                           0.86     27524
   macro avg       0.82      0.82      0.82     27524
weighted avg       0.86      0.86      0.86     27524

-----------------------------------------------------------
NB Multi CLF Report
              precision    recall  f1-score   support

           0       0.97      0.83      0.89     11459
           2       0.59      0.65      0.62      4560
           4       0.78      0.87      0.83     11505

    accuracy                           0.82     27524
   macro avg       0.78      0.78      0.78     27524
weighted avg       0.83      0.82      0.82     27524

-----------------------------------------------------------
Log Reg GridCV CLF Report
              precision    recall  f1-score   support

In [21]:
# With emoneg and emopos
# With added 50k happy and sad face tweets 
# WITHOUT oversampled neutral tweets

cm_nb_no_os = confusion_matrix(y_test,nb_pred)
print(cm_nb_no_os)

[[ 9485   778  1196]
 [   50  2948  1562]
 [  206  1263 10036]]


In [22]:
# With emoneg and emopos
# With added 50k happy and sad face tweets
# WITHOUT oversampled neutral tweets

cm_lr_no_os = confusion_matrix(y_test,logreg_pred)
print(cm_lr_no_os)

[[10370   645   444]
 [  332  3128  1100]
 [  276  1121 10108]]


In [23]:
# With emoneg and emopos
# With added 50k happy and sad face tweets
# WITHOUT oversampled neutral tweets

cm_lrg_no_os = confusion_matrix(y_test,logreg_grid_pred)
print(cm_lrg_no_os)

[[10380   657   422]
 [  338  3225   997]
 [  311  1145 10049]]
