LOADING REQUIRED DATASETS

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
from sklearn.pipeline import Pipeline
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
word_lemmatizer = WordNetLemmatizer()
nlp = spacy.load("en_core_web_sm")

from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

import pickle

In [3]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.naive_bayes import MultinomialNB

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.neural_network import MLPClassifier

In [4]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,cross_val_score

LOADING DATASET INTO DASK DATAFRAME BY FILE NAME

In [10]:
def data_loading(filename):
    df = pd.read_csv(filename,encoding='latin-1',header=None)
    df = df[[5,0]]
    df.columns = ['statement','analysis']
    df['index_col']=1
    df['index_col'] = df['index_col'].cumsum()
    df.dropna()
    return df

EXTRACTING FEATURES AND LABELS FROM DATAFRAME

In [12]:
def feature_label_split(df):
    X = df['statement']
    y = df['analysis']
    X = X.astype(str)
    y = y.astype(int)
    return X,y

APPLYING PREPROCESSING ACTIVITIES, LEMMATIZIZNG, REMOVING STOP WORDS, TFIDF VECTORIZER 
ON FEATURE MATRIX

In [5]:
def preprocess(sentence):
    emoji_dict = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
                    ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
                    ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', r':\\': 'annoyed', 
                    ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
                    '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
                    '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
                    ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}
    
    sentence = str(sentence)
    sentence = sentence.lower()
    sentence = re.sub('<[^>]*>',' tag ',sentence)
    sentence = re.sub(r'((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)', ' url ',sentence)
    sentence = re.sub('@[^\s]+>',' USER ',sentence)
    sentence = re.sub('[^a-zA-Z0-9]',' ',sentence) 
    for emoji in emoji_dict.keys():
        sentence = sentence.replace(emoji, " EMOJI "+emoji_dict[emoji])
    sentence = re.sub(r"(.)\1\1+",r"\1\1",sentence)
    
    return sentence

def lemmatizer(sentence):
    return ''.join([word_lemmatizer.lemmatize(word) for word in sentence])

def stop_words_remover(sentence):
    sentence = str(sentence)
    sentence = ''.join(sentence)
    stopwords = nlp.Defaults.stop_words
    new_sent = ''
    for word_token in sentence.split():
        if word_token not in stopwords:
            new_sent = new_sent + word_token + ' '
    return new_sent

class DataCleaner(BaseEstimator,TransformerMixin):
    def __init__(self,X=None,y=None):
        self.X = X
        self.y = y
        
    
    def fit(self,X,y=None):
        return self

    def transform(self,X,y=None):
        X_ = X.copy()
        for row in X_.iteritems():
            row = preprocess(row)
            row = stop_words_remover(row)
            row = lemmatizer(row)
        return X_

tfidf = TfidfVectorizer(
                        ngram_range=(1,2),
                        max_features=500000,
                        )

LOADING & EXTRACTING FEATURES AND LABEL ON TRAINING, TEST DATASET

In [6]:
pipe = Pipeline([
                ('data_cleaning',DataCleaner()),
                ('vectorizer',tfidf)
        ])

In [7]:
df_train = data_loading("training140.csv")
X_train, y_train = feature_label_split(df_train)

NameError: name 'data_loading' is not defined

In [41]:
X_train_tr = pipe.fit_transform(X_train)

file = open('picklefiles/X_train.pickle','wb')
pickle.dump(X_train,file)
file.close()

file = open('picklefiles/pipe_fitted.pickle','wb')
pickle.dump(pipe,file)
file.close()

file = open('picklefiles/y_train.pickle','wb')
pickle.dump(y_train,file)
file.close() 

KeyboardInterrupt: 

In [8]:
X_train = pickle.load(open("picklefiles/X_train.pickle",'rb'))
X_train_tr = pickle.load(open("picklefiles/X_train_tr.pickle",'rb'))
y_train = pickle.load(open("picklefiles/y_train.pickle",'rb'))
pipe = pickle.load(open('picklefiles/pipe_fitted.pickle','rb'))

In [13]:
df_test = data_loading("test140.csv")
X_test, y_test = feature_label_split(df_test)
X_test_tr =pipe.transform(X_test)

CREATING CLASSIFIERS, PARAMETER GRID

In [None]:
clf1 = LinearSVC(max_iter=1000,tol=0.0001)

param_grid1 =   {'C':[0.001,0.01,0.1,1.0,10.0],
                }

In [None]:
clf2 = SVC(max_iter=1000,tol=0.001,cache_size=5)
param_grid2 =   {'C':[0.001,0.01,0.1,1.0,10.0],
                'gamma':['auto','scale'],
                'kernel':['poly','rbf','sigmoid']
                }

In [None]:
clf3 = LogisticRegression(max_iter=1000,tol=0.0001,warm_start=True,n_jobs=-1,solver='saga')
param_grid3 = {'C':[0.001,0.01,0.1,1.0,10.0],
                'solver' : ['sag','lbfgs']
            }

In [None]:
clf4 = DecisionTreeClassifier(max_depth=10,min_samples_split=2)
param_grid4 = {'max_depth' : range(5,25),
                'min_samples_split' : range(2,8),
                'min_samples_leaf' : range(1,8)
                }

In [None]:
clf5 = MultinomialNB()
param_grid5 = {'alpha':[0.001,0.1,1,10,100]
                }

In [None]:
linsvcCV = pickle.load(open('picklefiles/linearsvc_best_model.pickle','rb'))
logregCV = pickle.load(open('picklefiles/logreg_best_model.pickle','rb'))
mulinomialnbCV = pickle.load(open('picklefiles/multinomialnb_best_model.pickle','rb'))
desctreeCV = pickle.load(open('picklefiles/desctree_best_model.pickle','rb'))

estimators = [('lsvc',linsvcCV.best_estimator_),
               ('lr',logregCV.best_estimator_),
               ('mnb',mulinomialnbCV.best_estimator_),
               ('dt',desctreeCV.best_estimator_) 
                ]

clf6 = VotingClassifier(estimators,voting='hard',n_jobs=-1)
param_grid6 = {}

In [None]:
clf7 = BaggingClassifier(base_estimator=linsvcCV.best_estimator_,n_jobs=-1)
param_grid7 = {}

In [None]:
clf8 = RandomForestClassifier(n_estimators=20,max_features='sqrt',max_depth=50,min_samples_split=8,min_samples_leaf=4,n_jobs=-1)
param_grid8 = { 'max_depth':np.linspace(start=20,stop=200,num=10).astype(int),
                'min_samples_split':[2,4,6,8,10],
                'min_samples_leaf':[2,4,6,8,10]
                }

In [None]:
clf9 = AdaBoostClassifier(n_estimators=200)

In [None]:
clf10 = XGBClassifier(n_estimators=200,max_depth=25,learning_rate=0.5,booster='gbtree',n_jobs=-1)

In [None]:
clf11 = GradientBoostingClassifier(n_estimators=200,learning_rate=0.5,max_depth=25,max_features='sqrt',warm_start=True,tol=0.0012)

In [None]:
clf12 = MLPClassifier(hidden_layer_sizes=(50,100,10),tol=0.0001,learning_rate_init=0.005,verbose=True)
#clf12 = MLPClassifier(hidden_layer_sizes=(32,128,8,1),tol=0.0001,verbose=True,learning_rate_init=0.005,n_iter_no_change=10)
#clf12 = MLPClassifier(hidden_layer_sizes=(50),tol=0.0001,verbose=True,learning_rate_init=0.005,n_iter_no_change=10,batch_size=1024)

COMPUTING ACCURACY SCORE AS MODEL METRIC

In [1]:
def compute_accuracy(model,X,y):
    y_predictions = model.predict(X)
    ac_score = accuracy_score(y,y_predictions)
    return ac_score

def compute_accuracy1(model,X,y):
    y_predictions = model.predict(X)
    y = y.apply(lambda x:0 if x==0 else 1)
    y.astype(int)
    ac_score = accuracy_score(y,y_predictions)
    return ac_score    

TRAINING LINEARSVC CLASSIFIER ON TRAINING SET

In [None]:
#y_train = y_train.apply(lambda x: 0 if x==0 else 1)
clf12.fit(X_train_tr,y_train)



PRINTING ACCURACY SCORE OF LINEARSVC ESTIMATOR ON TRAINING SET FOLLOWED BY TEST SET

In [None]:
print(compute_accuracy(clf12,X_train_tr,y_train))
print(compute_accuracy(clf12,X_test_tr,y_test))

0.762021875
0.5742971887550201


In [None]:
file = open('picklefiles/mlpclf_model.pickle','wb')
pickle.dump(clf12,file)
file.close()

HYPERPARAMETER TUNING

In [None]:
grid_search_svc = GridSearchCV(estimator = clf9, 
                                        param_grid=param_grid9,
                                        cv=3,
                                        refit= 'acc',
                                        scoring={'acc':'accuracy',
                                                 'mse':'neg_mean_squared_error'
                                                },
                                        n_jobs=-1,
                                        pre_dispatch='2*n_jobs',
                                        return_train_score=True
                                )  

rand_search_svc = RandomizedSearchCV(estimator = clf9, 
                                        param_distributions=param_grid9,
                                        cv=3,
                                        refit= 'acc',
                                        scoring={'acc':'accuracy',
                                                 'mse':'neg_mean_squared_error'
                                                },
                                        n_jobs=-1,
                                        pre_dispatch='2*n_jobs',
                                        return_train_score=True
                                ) 

In [None]:
#grid_search_svc.fit(X_train_tr,y_train)
file = open('picklefiles/adaboost_best_model.pickle','wb')
pickle.dump(clf9,file)
file.close()

In [None]:
grid_search_svc.best_estimator_

In [None]:
print(compute_accuracy(rand_search_svc.best_estimator_,X_train_tr,y_train))
print(compute_accuracy(rand_search_svc.best_estimator_,X_test_tr,y_test))

In [14]:
linearsvcCV = pickle.load(open('picklefiles/linearsvc_best_model.pickle','rb'))
linearsvc = linearsvcCV.best_estimator_

svcCV = pickle.load(open('picklefiles/svc_best_model.pickle','rb'))
svcclf = svcCV.best_estimator_

logregCV = pickle.load(open('picklefiles/logreg_best_model.pickle','rb'))
logreg = logregCV.best_estimator_

multinomialnbCV = pickle.load(open('picklefiles/multinomialnb_best_model.pickle','rb'))
multinomialnb = multinomialnbCV.best_estimator_

mvotingCV = pickle.load(open('picklefiles/voting_best_model.pickle','rb'))
mvoting = mvotingCV.best_estimator_

baggingCV = pickle.load(open('picklefiles/bagging_best_model.pickle','rb'))
bagging = baggingCV

rforestCV = pickle.load(open('picklefiles/randforest_best_model.pickle','rb'))
rforest = rforestCV.best_estimator_

adaboost = pickle.load(open('picklefiles/adaboost_model.pickle','rb'))

xgboost = pickle.load(open('picklefiles/xgboost_model.pickle','rb'))

gradientboost = pickle.load(open('picklefiles/gradientboost_model.pickle','rb'))

mlpclf = pickle.load(open('picklefiles/mlpclf_model.pickle','rb'))

In [15]:
print("\nSVC:",svcclf)
print('Training set accuracy: ',compute_accuracy(svcclf,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(svcclf,X_test_tr,y_test))


SVC: SVC(C=10.0, cache_size=5, kernel='sigmoid', max_iter=1000)
Training set accuracy:  0.5995675
Test set accuracy:  0.43373493975903615


In [16]:
print("\nLinearSVC:",linearsvc)
print('Training set accuracy: ',compute_accuracy(linearsvc,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(linearsvc,X_test_tr,y_test))
print("\nLogisticRegressor:",logreg)
print('Training set accuracy: ',compute_accuracy(logreg,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(logreg,X_test_tr,y_test))
print("\nMultinomialNB:",multinomialnb)
print('Training set accuracy: ',compute_accuracy(multinomialnb,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(multinomialnb,X_test_tr,y_test))
print("\nVotingClassifier:",mvoting)
print('Training set accuracy: ',compute_accuracy(mvoting,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(mvoting,X_test_tr,y_test))
print("\nBaggingClassifier:",bagging)
print('Training set accuracy: ',compute_accuracy(bagging,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(bagging,X_test_tr,y_test))
print("\nRandomForestClassifier:",rforest)
print('Training set accuracy: ',compute_accuracy(rforest,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(rforest,X_test_tr,y_test))
print("\nAdaBoostClassifier:",adaboost)
print('Training set accuracy: ',compute_accuracy(adaboost,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(adaboost,X_test_tr,y_test))
print("\nXGBClassifier:",xgboost)
print('Training set accuracy: ',compute_accuracy1(xgboost,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy1(xgboost,X_test_tr,y_test))
print("\nGradientBoostingClassifier:",gradientboost)
print('Training set accuracy: ',compute_accuracy1(gradientboost,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy1(gradientboost,X_test_tr,y_test))
print("\nMLPClassifier:",mlpclf)
print('Training set accuracy: ',compute_accuracy(mlpclf,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(mlpclf,X_test_tr,y_test))


LinearSVC: LinearSVC(C=0.1)
Training set accuracy:  0.858434375
Test set accuracy:  0.5963855421686747

LogisticRegressor: LogisticRegression(max_iter=1000, n_jobs=-1, warm_start=True)
Training set accuracy:  0.8566475
Test set accuracy:  0.5983935742971888

MultinomialNB: MultinomialNB(alpha=1)
Training set accuracy:  0.8386875
Test set accuracy:  0.608433734939759

VotingClassifier: VotingClassifier(estimators=[('lsvc', LinearSVC(C=0.1)),
                             ('lr',
                              LogisticRegression(max_iter=1000, n_jobs=-1,
                                                 warm_start=True)),
                             ('mnb', MultinomialNB(alpha=1)),
                             ('dt',
                              DecisionTreeClassifier(max_depth=23,
                                                     min_samples_leaf=5,
                                                     min_samples_split=4))],
                 n_jobs=-1)
Training set accuracy:  0.855020