### Sentiment Analysis of Twitter Posts

##### Load the required Python packages

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
import string
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours
from imblearn.combine import SMOTETomek, SMOTEENN
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.decomposition import PCA, SparsePCA, TruncatedSVD
from sklearn.linear_model import LogisticRegression,SGDClassifier 
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import accuracy_score, f1_score, classification_report
# import spacy
import timeit, re 
import _pickle as pickle

##### Load the pickled train and test datasets

In [2]:
# function to load pickle file 
def pickleloader(filename):
    # # open the file for writing
    fileObject = open(filename,'rb') 

    # load the object from the file into var univ_processed_train
    return pickle.load(fileObject,  encoding="latin1")  #latin1 here, to bypass 
                                        # python2 to 3 pickle problem

    # here we close the fileObject
    fileObject.close()

# function to create a file and store the data in the file 
def picklemaker(filename, objectname): 
    # open the file for writing
    fileObject = open(filename,'wb') 

    # this writes the object a to the
    # file named 'testfile'
    pickle.dump(objectname,fileObject)   

    # here we close the fileObject
    fileObject.close()

In [3]:
# store the pickle file names as variables
file_Name_train = "tweet_train_complete_textnonull_PICKLE"
file_Name_test = "tweet_test_complete_textnonull_PICKLE"

# run pickleloader and store results to variable
train_complete = pickleloader(file_Name_train)
test_complete = pickleloader(file_Name_test)

#join the train and test sets together 
traintest_complete = pd.concat([train_complete,test_complete])

# extracted only the RELATED rows 
traintest_complete_REL = traintest_complete[traintest_complete.filtering=="RELATED"].copy()

##### Check the dataset for errors or unexpected inputs

In [4]:
# let's check our target column
traintest_complete_REL.polarity.unique()

array(['NEUTRAL', 'POSITIVE', 'NEGATIVE', 'RELATED'], dtype=object)

In [5]:
# let's check our predictors column (i.e. the tweet text)
traintest_complete_REL.text.isnull().sum()

0

The target column is "polarity". Each row is only supposed to have either a "POSITIVE", NEUTRAL" or "NEGATIVE" label. One or more rows has "RELATED" in it. The next steps are to locate them and remove them.

In [6]:
traintest_complete_REL[traintest_complete_REL.polarity == "RELATED"]

Unnamed: 0,tweet_id,author,entity_id,text,filtering,polarity,topic,topic_priority
40361,216287162696605696,katita048,RL2013D04E169,Wisin & Yandel - Follow The Leader ft. Jennife...,RELATED,RELATED,neutral,NEUTRAL
40362,218086720443400192,isamarfernande,RL2013D04E169,Wisin & Yandel - Follow The Leader ft. Jennife...,RELATED,RELATED,neutral,NEUTRAL


In [7]:
traintest_complete_REL.drop([40361, 40362], inplace=True)
traintest_complete_REL[traintest_complete_REL.polarity == "RELATED"]

Unnamed: 0,tweet_id,author,entity_id,text,filtering,polarity,topic,topic_priority


In [8]:
dataset = traintest_complete_REL.reset_index()
dataset.drop(["index"],axis=1,inplace=True)

##### Data cleaning and feature engineering

The dataset contain tweets that are English and Spanish. For the purpose of this project, we only want to use the English tweets. Therefore I have to find a way to easily filter them out. 

In [9]:
# write a function that determines whether a sentence is English or Spanish
def is_it_this_language(textsource, tokeniser, languages):
    # textsource, lDF, variable name of the the dataframe holding the sentence/document to be tokensized. if source is 
    # in a list, pass the list through pd.Dataframe() beforehand
    # tokeniser, function name, this is the tokeniser to be used... this function is written to work with NLTK's word_tokenize, wordpunct_tokenize
    # language, list, the language(s) to be checked for. NLTK's language are lowercase, full words e.g. "english". 
    
    it_is_this_language = {}
    for i in list(textsource.index):
        tokenised = tokeniser(textsource[i])
        tokens_lowercase = set([token.lower() for token in tokenised]) # using sets here so we can use set intersection below
    
        languages_ratios = {}
        for lang in languages: 
            stopwords_set = set(stopwords.words(lang))
            common_elements = tokens_lowercase.intersection(stopwords_set)
            languages_ratios[lang] = len(common_elements) #appends the number of tokens in the text that falls 
                                                              #in each of the NLTK's stop words.   
        it_is_this_language[i] = max(languages_ratios, key=languages_ratios.get)
    return it_is_this_language

# run the function with a timer. languages to be used are English and Spanish. This is due to the manner in which the 
# dataset was collected. 
languages_input = ["english", "spanish"]
language = is_it_this_language(dataset.text, wordpunct_tokenize,languages_input)

dataset["tweet_lang"] = language.values()

In [10]:
# we are only keeping the English tweets for this analysis. 
dataset_eng = dataset[dataset.tweet_lang=="english"].copy()
dataset.shape

(105003, 9)

##### Pre-processing of the English tweets. Using singular tokens to represent sentiment emoticons, capitalisation etc

In [11]:
# write functions that uses regex to preprocess the tweet texts in the same manner as the Stanford GloVe Twitter set
# https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb | neutralface and ALLCAPS are left out as they are 
# ambiguous 
# apply all these function first (such that # and @ can be used to locate these features. 
# followed by a final removal of special characters.  

def replaceurls(tweettext):
    return re.sub(r"http\B\S+|www\.\B\S+|\S+\.\w+", ' <URL> ', tweettext)

def replacementions(tweettext):
    return re.sub(r"(@|@\B)\w+", ' <USER> ', tweettext) 

def replacenumbers(tweettext):
    return re.sub(r'\s(\+|-)*\d+\b', " <NUMBER> ", tweettext) # slight difference from the GloVe treatment which also replaces +- 
                                                      # hypothesis is that +- numbers do not serve as signal for sentiment 
                                                        # only change numbers that start and end with whitespace. we don't
                                                        # want to replace instances such as test123 

def replacehashtags(tweettext):
    return re.sub(r'(#|#\B)\w+', " <HASHTAG> ", tweettext)

def replacerepeatexclaim(tweettext):
    return re.sub(r'(([!.]){2,})', '! <REPEAT> ', tweettext) # replace multiple !!! with ! and <REPEAT>. i.e. new feature

def replacerepeatquestion(tweettext):
    return re.sub(r'(([?.]){2,})', '? <REPEAT> ', tweettext) # replace multiple ??? with ? and <REPEAT>. i.e. new feature

def replacesmiles(tweettext):
    return re.sub(r"(8|:|=|;)('|`|-)?\){1,2}", " <SMILE> ", tweettext)

def replacelolface(tweettext):
    return re.sub(r"(8|:|=|;)('|`|-)?(P|p|D){1,3}", " <LOLFACE> ", tweettext) # slight difference from the GloVe
                                                                                    # treatment... catches :p and :d 
def replacesadface(tweettext):
    return re.sub(r"(8|:|=|;)('|`|-)?\({1,2}", " <SADFACE> ", tweettext)

def replaceheart(tweettext):
    return re.sub(r"<3{1,}", " <HEART> ", tweettext)

def replaceelongword(tweettext):
    return re.sub(r'(\S*?)\B(.)\2{2,}\b', r'\1' + r'\2' + ' <ELONG> ', tweettext)

def replaceallcaps(tweettext):
    return re.sub(r'(([A-Z]){7,}\B)', ' <ALLCAPS> ' + r'\1', tweettext) # regex for tokens composed of a sequence of 
                                                                        # 7 or more capitalised alphabets. 7 because it 
                                                                        # is unlikely to be a acronym (e.g. UNICEF)
        
tweetprocessorfuncs = [replaceurls, replacementions, replacenumbers, replacehashtags, replacerepeatexclaim, \
                       replacerepeatquestion, replacesmiles, replacelolface, replacesadface, replaceheart,\
                       replaceelongword, replaceallcaps]


In [12]:
# Applying the Pre-processing function across all English tweets
for i in tweetprocessorfuncs:
    dataset_eng.text = dataset_eng.text.apply(i)

In [13]:
# write a function that uses regex to remove remaining @ # and other special characters as well as english stop words. 

specialchars = re.compile('[^0-9a-z ?!<>+_]') # the GloVe Twitter set left ! and ? in place. we replicate the same 
                                              # to preserve any associated semantic pattern  
stopwords_list = set(stopwords.words('english')) #using NLTK's english stopwords

entity_names =["bmw", "audi", "volvo", "toyota", "volkswagen", "honda", "nissan", "fiat", "suzuki", "porsche",
              "yamaha", "mazda", "chrysler", "subaru", "ferrari", "bentley", "kia", "ford", "jaguar", "lexus",
              "barclays", "fargo", "wellsfargo", "bankia", "santander", "goldman", "harvard", "standford", 
              "berkeley", "princeton", "columbia", "yale", "hopkins", "johnhopkins", "oxford", "adele", "aliciakeys", 
               "alicia","beatles", "zeppelin", "aerosmith", "bonjovi", "jovi", "acdc", "thewanted", "coldplay", 
               "ladygaga","gaga", "madonna", "jennifer", "lopez", "jeniferlopez", "bieber", "justinbieber", "shakira", "psy",
              "whitneyhouston", "whitney", "houston", "britneyspears", "britney", "spears"]

for i in entity_names: 
    stopwords_list.add(i)

def finaltweetclean(tweettext):
    tweettext = tweettext.lower() # lowercase text
    tweettext = re.sub("  ", " ", tweettext) # delete double spaces
    tweettext = re.sub("  ", " ", tweettext) # delete double spaces
    tweettext = re.sub("  ", " ", tweettext) # delete double spaces
    tweettext = re.sub("  ", " ", tweettext) # delete double spaces
    tweettext = re.sub(specialchars, "", tweettext) # delete symbols which are in specialchars from text
    tweettext = ' '.join([i for i in tweettext.split() if i not in stopwords_list]) # delete stopwords from text
    return tweettext

dataset_eng.text = dataset_eng.text.apply(finaltweetclean)

##### Use gensim to load the GloVe Twitter trained vectors

In [17]:
import gensim
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec # for calling glove2word2vec script

In [18]:
# the GloVe format is not aligned for use with gensim. The following gensim script automatically converts it and 
# loads it. 

glove_file = ('./glove.twitter.27B.100d.txt')
tmp_file = get_tmpfile("glove_word2vec.txt")
# default way (through CLI): python -m gensim.scripts.glove2word2vec --input <glove_file> --output <w2v_file>
glove2word2vec(glove_file, tmp_file)

glove_twitter = KeyedVectors.load_word2vec_format(tmp_file)
len(glove_twitter.vectors)

1193514

In [19]:
# create an empty nested dictionary to store the word vectors based on the following classification: 
# (i) first character being alphabet, (ii) being a special token, (iii) first letter being a punctuation, except 
# special token, (iv) all others. 

glove_twitter_vocab = {"spectok":{},"specchar":{}}
for alphabet in [alpha for alpha in string.ascii_lowercase]:
    glove_twitter_vocab[alphabet] = {}
for punctuation in [punct for punct in string.punctuation]:
    glove_twitter_vocab[punctuation] = {}

In [20]:
# sort the GloVe Twitter model 
for i in glove_twitter.vocab: 
    if i[0] in [alpha for alpha in string.ascii_lowercase]:
        glove_twitter_vocab[i[0]].update({i:glove_twitter.get_vector(i)})
    elif i in ["<user>", "<url>", "<number>", "<hashtag>", "<repeat>", "<smile>", "<lolface>", \
             "<sadface>", "<heart>", "<elong>", "<allcaps>"]:
        glove_twitter_vocab["spectok"].update({i:glove_twitter.get_vector(i)})
    elif i[0] in [punct for punct in string.punctuation]:
        glove_twitter_vocab[i[0]].update({i:glove_twitter.get_vector(i)})
    else: 
        glove_twitter_vocab["specchar"].update({i:glove_twitter.get_vector(i)})

In [21]:
# write a function that (i) checks whether each of the token in a single tweet is in the Twitter-trained 100D GloVe set.
# (ii) if not, input a matrix of the same embedding size - (100,1) in this case -
spacynlp = spacy.load("en_core_web_sm")

def paravectorPOS(text): 
    word_list = []
    place_token = np.zeros_like(glove_twitter.get_vector("the"))+0.01
    if True in [i2 in glove_twitter.vocab for i2 in [i for i in text.split()]]:
        spacy_adjadv_list = [token.text for token in spacynlp(text) if token.pos_ in ["ADJ","ADV"]]

        for i3 in text.split(): 
            if i3[0] in [alpha for alpha in string.ascii_lowercase]: 
                if i3 in glove_twitter_vocab[i3[0]].keys():
                    if i3 in spacy_adjadv_list:
                        word_list.append(glove_twitter.get_vector(i3)**2)  # we square the adjectives and adverbs
                                                                           # to give them a higher/lower weight 
                                                                           # with respective to positive/negative 
                                                                           # sentiments 
                    else: 
                        word_list.append(glove_twitter.get_vector(i3))
            
            
            elif i3 in glove_twitter_vocab["spectok"].keys():
                word_list.append(glove_twitter.get_vector(i3))  
            
            
            elif i3[0] in [punct for punct in string.punctuation]: 
                if i3 in glove_twitter_vocab[i3[0]].keys():
                    if i3 in spacy_adjadv_list:
                        word_list.append(glove_twitter.get_vector(i3)**2) # we square the adjectives and adverbs
                                                                           # to give them a higher/lower weight 
                                                                           # with respective to positive/negative 
                                                                           # sentiments 
                    else: 
                        word_list.append(glove_twitter.get_vector(i3))
            
            else: 
                word_list.append(place_token)
    
    else:
        word_list.append(place_token) 
    return sum(word_list)/len(word_list)

# for tweets with no tokens in the GloVe Twitter embeddings, we impute a similar sized vector of 0.01s. a check of 
# shows that the most similar words to this vectors are infrequent words and do not hold significant meaning for sentiment. 
# glove_twitter.similar_by_vector(np.zeros_like(glove_twitter.get_vector("the")+0.01))
# [('bitmeye', 0.0),
#  ('bispa', 0.0),
#  ('bosanma', 0.0),
#  ('bord√µes', 0.0),
#  ('bookmarklet', 0.0),
#  ('boncabe', 0.0),
#  ('bocs', 0.0),
#  ('bloked', 0.0),
#  ('boyet', 0.0),
#  ('bilmeyi', 0.0)]

In [22]:
# using the paravectorPOS function (i.e. taking into consideration PoS and giving higher weights to adj and adverbs)

allparavectors = [] #empty list to append each tweets DF (containing summed GloVe)
for k in dataset_eng.text:  # 
    allparavectors.append(pd.DataFrame(paravectorPOS(k)).T) 
    
X_SATask_eng_GloVe = pd.concat(allparavectors,axis=0) #concat the GloVe-ed dataframes for each tweet
X_SATask_eng_GloVe.index = dataset_eng.index #restore the SATask_eng index to the new dataframe

In [23]:
# picklemaker("X_SATask_eng_GloVe_PICKLED",X_SATask_eng_GloVe)

In [24]:
X_SATask_eng_GloVe = pickleloader("X_SATask_eng_GloVe_PICKLED")

##### Train Test Split 

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X_SATask_eng_GloVe, 
                                                    dataset_eng.polarity, test_size=0.33, random_state=42)

In [40]:
X_train.index

Int64Index([ 18422,   6442,  19577,   4040,  97325,  19764,  60200,   3979,
             58172,  62327,
            ...
             19526,  74102,  53398,  45212, 102138,   7311,  68181,  94459,
              1019,  19294],
           dtype='int64', length=56980)

##### MinMaxScaler

In [41]:
MMScale = MinMaxScaler()
X_train = MMScale.fit_transform(X_train)
X_test = MMScale.transform(X_test)

In [63]:
# RBScaler = RobustScaler()
# X_train = RBScaler.fit_transform(X_train)
# X_test = RBScaler.transform(X_test)

##### The count for the "NEGATIVE" class is significantly lower than "POSITIVE". Use SMOTE to upsample the "NEGATIVE" class. 

In [42]:
sm = SMOTETomek(ratio="minority", random_state=42, 
                smote=SMOTE(ratio="minority",random_state=42, k_neighbors=50,m_neighbors=50, n_jobs=-1),\
                tomek=TomekLinks(ratio="not minority",n_jobs=-1), n_jobs=-1) 
# , enn=EditedNearestNeighbours(n_neighbors=5, n_jobs=-1)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)
X_train_res.shape



(82985, 100)

In [71]:
# from imblearn.over_sampling import ADASYN
# adasyn = ADASYN(ratio="minority", random_state=42, n_neighbors=5, n_jobs=-1) 

# X_train_res, y_train_res = adasyn.fit_sample(X_train, y_train)
# X_train_res.shape

(83671, 100)

#### Result of best-tuned Random Forest Classifier - with POS

In [49]:
clf = RandomForestClassifier(n_estimators=80,random_state=42, oob_score=True, n_jobs=-1,class_weight="balanced_subsample")
clf_model = clf.fit(X_train_res, y_train_res)

In [50]:
RFC_cvs_train = cross_val_score(clf_model, X_train_res, y=y_train_res)
RFC_cvs_test = cross_val_score(clf_model, X_test, y=y_test)
RFC_cr = classification_report(y_test, clf_model.predict(X_test))

In [51]:
print("RFClass 100 estimators, class_weight=balanced_subsample | using MinMaxScaler | using SMOTETomek k = 50, m = 50", "\n\n", \
           "train cross val score: ", "\n", RFC_cvs_train, "\n\n",
           "test cross val score: ", "\n", RFC_cvs_test, "\n\n",
           "class report - test set: ", "\n", RFC_cr) 

RFClass 100 estimators, class_weight=balanced_subsample | using MinMaxScaler | using SMOTETomek k = 50, m = 50 

 train cross val score:  
 [0.77127467 0.78573494 0.78699252] 

 test cross val score:  
 [0.64724746 0.65141635 0.6525922 ] 

 class report - test set:  
              precision    recall  f1-score   support

   NEGATIVE       0.44      0.54      0.49      3333
    NEUTRAL       0.73      0.20      0.32      7893
   POSITIVE       0.69      0.90      0.78     16839

avg / total       0.67      0.66      0.62     28065



In [48]:
print("RFClass 100 estimators, class_weight=balanced | using MinMaxScaler | using SMOTETomek k = 50, m = 50", "\n\n", \
           "train cross val score: ", "\n", RFC_cvs_train, "\n\n",
           "test cross val score: ", "\n", RFC_cvs_test, "\n\n",
           "class report - test set: ", "\n", RFC_cr) 

RFClass 100 estimators, class_weight=balanced | using MinMaxScaler | using SMOTETomek k = 50, m = 50 

 train cross val score:  
 [0.77105777 0.7862772  0.78648639] 

 test cross val score:  
 [0.6484233  0.65098878 0.65366114] 

 class report - test set:  
              precision    recall  f1-score   support

   NEGATIVE       0.44      0.54      0.49      3333
    NEUTRAL       0.72      0.20      0.32      7893
   POSITIVE       0.69      0.89      0.78     16839

avg / total       0.67      0.66      0.62     28065



In [45]:
print("RFClass 100 estimators | using MinMaxScaler | using SMOTETomek k = 50, m = 50", "\n\n", \
           "train cross val score: ", "\n", RFC_cvs_train, "\n\n",
           "test cross val score: ", "\n", RFC_cvs_test, "\n\n",
           "class report - test set: ", "\n", RFC_cr) 

RFClass 100 estimators | using MinMaxScaler | using SMOTETomek k = 50, m = 50 

 train cross val score:  
 [0.77138312 0.78945846 0.78775171] 

 test cross val score:  
 [0.65408872 0.65195083 0.65611972] 

 class report - test set:  
              precision    recall  f1-score   support

   NEGATIVE       0.45      0.54      0.49      3333
    NEUTRAL       0.72      0.21      0.33      7893
   POSITIVE       0.70      0.90      0.78     16839

avg / total       0.67      0.66      0.62     28065



In [38]:
print("RFClass 100 estimators | using MinMaxScaler | using SMOTETomek k = 10, m = 10", "\n\n", \
           "train cross val score: ", "\n", RFC_cvs_train, "\n\n",
           "test cross val score: ", "\n", RFC_cvs_test, "\n\n",
           "class report - test set: ", "\n", RFC_cr) 

RFClass 100 estimators | using MinMaxScaler | using SMOTETomek k = 10, m = 10 

 train cross val score:  
 [0.77138312 0.78945846 0.78775171] 

 test cross val score:  
 [0.65408872 0.65195083 0.65611972] 

 class report - test set:  
              precision    recall  f1-score   support

   NEGATIVE       0.45      0.54      0.49      3333
    NEUTRAL       0.72      0.21      0.33      7893
   POSITIVE       0.70      0.90      0.78     16839

avg / total       0.67      0.66      0.62     28065



In [31]:
print("RFClass 100 estimators | using MinMaxScaler | using SMOTETomek k = 5, m = 5", "\n\n", \
           "train cross val score: ", "\n", RFC_cvs_train, "\n\n",
           "test cross val score: ", "\n", RFC_cvs_test, "\n\n",
           "class report - test set: ", "\n", RFC_cr) 

RFClass 100 estimators | using MinMaxScaler | using SMOTETomek k = 5, m = 5 

 train cross val score:  
 [0.77138312 0.78945846 0.78775171] 

 test cross val score:  
 [0.65408872 0.65195083 0.65611972] 

 class report - test set:  
              precision    recall  f1-score   support

   NEGATIVE       0.45      0.54      0.49      3333
    NEUTRAL       0.72      0.21      0.33      7893
   POSITIVE       0.70      0.90      0.78     16839

avg / total       0.67      0.66      0.62     28065



In [74]:
print("RFClass 100 estimators | using MinMaxScaler | using ADASYN n = 5", "\n\n", \
           "train cross val score: ", "\n", RFC_cvs_train, "\n\n",
           "test cross val score: ", "\n", RFC_cvs_test, "\n\n",
           "class report - test set: ", "\n", RFC_cr) 

RFClass 100 estimators | using MinMaxScaler | using ADASYN n = 5 

 train cross val score:  
 [0.73686852 0.76103403 0.75983363] 

 test cross val score:  
 [0.65408872 0.65195083 0.65611972] 

 class report - test set:  
              precision    recall  f1-score   support

   NEGATIVE       0.42      0.55      0.48      3333
    NEUTRAL       0.72      0.21      0.32      7893
   POSITIVE       0.70      0.89      0.78     16839

avg / total       0.67      0.66      0.62     28065



In [67]:
print("RFClass 100 estimators | using MinMaxScaler | using SMOTETomek, k=5, m=5", "\n\n", \
           "train cross val score: ", "\n", RFC_cvs_train, "\n\n",
           "test cross val score: ", "\n", RFC_cvs_test, "\n\n",
           "class report - test set: ", "\n", RFC_cr) 

RFClass 100 estimators | using MinMaxScaler | using SMOTETomek, k=5, m=5 

 train cross val score:  
 [0.77138312 0.78945846 0.78775171] 

 test cross val score:  
 [0.65408872 0.65195083 0.65611972] 

 class report - test set:  
              precision    recall  f1-score   support

   NEGATIVE       0.45      0.54      0.49      3333
    NEUTRAL       0.72      0.21      0.33      7893
   POSITIVE       0.70      0.90      0.78     16839

avg / total       0.67      0.66      0.62     28065



In [58]:
print("RFClass 100 estimators | using MinMaxScaler | using ADASYN", "\n\n", \
           "train cross val score: ", "\n", RFC_cvs_train, "\n\n",
           "test cross val score: ", "\n", RFC_cvs_test, "\n\n",
           "class report - test set: ", "\n", RFC_cr) 

RFClass 100 estimators | using MinMaxScaler | using ADASYN 

 train cross val score:  
 [0.73098792 0.76350391 0.76160353] 

 test cross val score:  
 [0.65408872 0.65195083 0.65611972] 

 class report - test set:  
              precision    recall  f1-score   support

   NEGATIVE       0.42      0.55      0.48      3333
    NEUTRAL       0.73      0.21      0.33      7893
   POSITIVE       0.70      0.89      0.78     16839

avg / total       0.68      0.66      0.62     28065



In [43]:
print("RFClass 100 estimators | using MinMaxScaler", "\n\n", \
           "train cross val score: ", "\n", RFC_cvs_train, "\n\n",
           "test cross val score: ", "\n", RFC_cvs_test, "\n\n",
           "class report - test set: ", "\n", RFC_cr) 

RFClass 100 estimators | using MinMaxScaler 

 train cross val score:  
 [0.77138312 0.78945846 0.78775171] 

 test cross val score:  
 [0.65408872 0.65195083 0.65611972] 

 class report - test set:  
              precision    recall  f1-score   support

   NEGATIVE       0.45      0.54      0.49      3333
    NEUTRAL       0.72      0.21      0.33      7893
   POSITIVE       0.70      0.90      0.78     16839

avg / total       0.67      0.66      0.62     28065



In [35]:
print("RFClass 100 estimators | using RobustScaler", "\n\n", \
           "train cross val score: ", "\n", RFC_cvs_train, "\n\n",
           "test cross val score: ", "\n", RFC_cvs_test, "\n\n",
           "class report - test set: ", "\n", RFC_cr) 

RFClass 100 estimators 

 train cross val score:  
 [0.77068729 0.79058254 0.7894908 ] 

 test cross val score:  
 [0.64949225 0.65440941 0.65804383] 

 class report - test set:  
              precision    recall  f1-score   support

   NEGATIVE       0.45      0.54      0.49      3333
    NEUTRAL       0.72      0.21      0.32      7893
   POSITIVE       0.69      0.90      0.78     16839

avg / total       0.67      0.66      0.62     28065



#### Running a pipeline with GridSearch CV to find the best performing classifier

In [None]:
est_names = ["rfClass", "sgdClass", 'decisiontree']

estimators = [RandomForestClassifier(random_state=42), 
              SGDClassifier(random_state=42, n_jobs=-1),DecisionTreeClassifier(random_state=42)] 
              
parameters = [
    {est_names[0]+'__n_estimators': np.arange(70, 101 ,10),
    est_names[0]+'__oob_score': (True, False),
    est_names[0]+'__class_weight':('balanced', 'balanced_subsample')},
    {est_names[1]+'__loss': ('hinge', 'log', 'modified_huber'), 
     est_names[1]+'__penalty': ('l1','l2', 'elasticnet'),
     est_names[1]+'__alpha': ([10 ** x for x in range(-6, -2)]),
     est_names[1]+'__tol' : ([10 ** x for x in range(-4, -2)]),
     est_names[1]+'__max_iter':(np.arange(300, 500, 50))},
    {est_names[2]+'__criterion': ("gini", "entropy"), 
     est_names[2]+'__max_features' : (np.arange(0.5,1.01,0.25))}
             ]

# for RFCLass, we don't limit max depth... 
# for TFIDF normalisation read 2.2.3 of https://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf
# for TFIDF sublinear_tf = True is necessary because it provides smoothing. 
# for SparsePCA read https://gawalt.com/brian/resources/TextEDAviaSparsePCA.pdf

gridsearch_models = []
for est_name, estimator, param in zip(est_names, estimators, parameters):
    pipe =  Pipeline([(est_name, estimator)])
    pipe_GS = GridSearchCV(pipe,param_grid=param, n_jobs=-1, cv=3)
    clf = pipe_GS.fit(X_train_res , y=y_train_res)
    gridsearch_models.append(clf)
    pipe_GS_cvs_train = cross_val_score(clf, X_train_res, y=y_train_res)
    pipe_GS_cvs_test = cross_val_score(clf, X_test, y=y_test)
    pipe_GS_cr = classification_report(y_test, clf.predict(X_test))                 
    print(est_name, "\n\n", \
           "train cross val score: ", "\n", pipe_GS_cvs_train, "\n\n",
           "test cross val score: ", "\n", pipe_GS_cvs_train, "\n\n",
           "class report - test set: ", "\n", pipe_GS_cr)  


rfClass 

 train cross val score:  
 [0.77232304 0.78837394 0.78786016] 

 test cross val score:  
 [0.77232304 0.78837394 0.78786016] 

 class report - test set:  
              precision    recall  f1-score   support

   NEGATIVE       0.45      0.55      0.49      3333
    NEUTRAL       0.74      0.20      0.32      7893
   POSITIVE       0.69      0.90      0.78     16839

avg / total       0.68      0.66      0.62     28065



In [None]:
gridsearch_models

In [52]:
from sklearn.ensemble import AdaBoostClassifier

adaboost = AdaBoostClassifier(
    DecisionTreeClassifier(),
    n_estimators=10,
    learning_rate=1)

adaboost_clf = adaboost.fit(X_train_res, y_train_res)
adaboost_cvs_train = cross_val_score(adaboost_clf, X_train_res, y=y_train_res)
adaboost_cvs_test = cross_val_score(adaboost_clf, X_test, y=y_test)
adaboost_cr = classification_report(y_test, adaboost_clf.predict(X_test))                 

In [53]:
print("Adaboost with DecisionTreeClassifier", "\n\n", \
           "train cross val score: ", "\n", adaboost_cvs_train, "\n\n",
           "test cross val score: ", "\n", adaboost_cvs_test, "\n\n",
           "class report - test set: ", "\n", adaboost_cr) 

Adaboost with DecisionTreeClassifier 

 train cross val score:  
 [0.70200998 0.71545803 0.72307581] 

 test cross val score:  
 [0.58770711 0.59091395 0.5797969 ] 

 class report - test set:  
              precision    recall  f1-score   support

   NEGATIVE       0.33      0.50      0.40      3333
    NEUTRAL       0.47      0.31      0.38      7893
   POSITIVE       0.70      0.74      0.72     16839

avg / total       0.59      0.59      0.59     28065



In [54]:
adaboost = AdaBoostClassifier(
    RandomForestClassifier(n_estimators=80,random_state=42, oob_score=True, n_jobs=-1,\
                           class_weight="balanced_subsample"),
    n_estimators=10,
    learning_rate=1)

adaboost_clf = adaboost.fit(X_train_res, y_train_res)
adaboost_cvs_train = cross_val_score(adaboost_clf, X_train_res, y=y_train_res)
adaboost_cvs_test = cross_val_score(adaboost_clf, X_test, y=y_test)
adaboost_cr = classification_report(y_test, adaboost_clf.predict(X_test))                 

In [55]:
print("Adaboost with RandomForestClassifier", "\n\n", \
           "train cross val score: ", "\n", adaboost_cvs_train, "\n\n",
           "test cross val score: ", "\n", adaboost_cvs_test, "\n\n",
           "class report - test set: ", "\n", adaboost_cr) 

Adaboost with RandomForestClassifier 

 train cross val score:  
 [0.77456438 0.78653026 0.7876071 ] 

 test cross val score:  
 [0.63303046 0.63858899 0.63976483] 

 class report - test set:  
              precision    recall  f1-score   support

   NEGATIVE       0.45      0.56      0.50      3333
    NEUTRAL       0.80      0.15      0.26      7893
   POSITIVE       0.69      0.91      0.78     16839

avg / total       0.69      0.66      0.60     28065

