In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
from sklearn.pipeline import Pipeline
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
word_lemmatizer = WordNetLemmatizer()
nlp = spacy.load("en_core_web_sm")

from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

import pickle

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [5]:
def data_loading(filename):
    df = pd.read_csv(filename,encoding='latin-1',header=None)
    df = df[[5,0]]
    df.columns = ['statement','analysis']
    df['index_col']=1
    df['index_col'] = df['index_col'].cumsum()
    df.dropna()
    return df

def feature_label_split(df):
    X = df['statement']
    y = df['analysis']
    X = X.astype(str)
    y = y.astype(int)
    return X,y

In [6]:
def preprocess(sentence):
    emoji_dict = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
                    ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
                    ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', r':\\': 'annoyed', 
                    ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
                    '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
                    '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
                    ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}
    
    sentence = str(sentence)
    sentence = sentence.lower()
    sentence = re.sub('<[^>]*>',' tag ',sentence)
    sentence = re.sub(r'((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)', ' url ',sentence)
    sentence = re.sub('@[^\s]+>',' USER ',sentence)
    sentence = re.sub('[^a-zA-Z0-9]',' ',sentence) 
    for emoji in emoji_dict.keys():
        sentence = sentence.replace(emoji, " EMOJI "+emoji_dict[emoji])
    sentence = re.sub(r"(.)\1\1+",r"\1\1",sentence)
    
    return sentence

def lemmatizer(sentence):
    return ''.join([word_lemmatizer.lemmatize(word) for word in sentence])

def stop_words_remover(sentence):
    sentence = str(sentence)
    sentence = ''.join(sentence)
    stopwords = nlp.Defaults.stop_words
    new_sent = ''
    for word_token in sentence.split():
        if word_token not in stopwords:
            new_sent = new_sent + word_token + ' '
    return new_sent

class DataCleaner(BaseEstimator,TransformerMixin):
    def __init__(self,X=None,y=None):
        self.X = X
        self.y = y
        
    
    def fit(self,X,y=None):
        return self

    def transform(self,X,y=None):
        X_ = X.copy()
        for row in X_.iteritems():
            row = preprocess(row)
            row = stop_words_remover(row)
            row = lemmatizer(row)
        return X_

tfidf = TfidfVectorizer(
                        ngram_range=(1,2),
                        max_features=500000,
                        )

In [7]:
X_train = pickle.load(open('picklefiles/X_train.pickle','rb'))
X_train_tr = pickle.load(open("picklefiles/X_train_tr.pickle",'rb'))
y_train = pickle.load(open("picklefiles/y_train.pickle",'rb'))
pipe = pickle.load(open('picklefiles/pipe_fitted.pickle','rb'))

In [8]:
df_test = data_loading("test140.csv")
df_test = df_test.loc[df_test['analysis']!=2]
X_test, y_test = feature_label_split(df_test)
X_test_tr =pipe.transform(X_test)

In [9]:
def compute_accuracy(model,X,y):
    y_predictions = model.predict(X)
    ac_score = accuracy_score(y,y_predictions)
    return ac_score

def compute_accuracy1(model,X,y):
    y_predictions = model.predict(X)
    y = y.apply(lambda x:0 if x==0 else 1)
    y.astype(int)
    ac_score = accuracy_score(y,y_predictions)
    return ac_score  

In [10]:
linearsvcCV = pickle.load(open('picklefiles/linearsvc_best_model.pickle','rb'))
linearsvc = linearsvcCV.best_estimator_

svcCV = pickle.load(open('picklefiles/svc_best_model.pickle','rb'))
svcclf = svcCV.best_estimator_

logregCV = pickle.load(open('picklefiles/logreg_best_model.pickle','rb'))
logreg = logregCV.best_estimator_

multinomialnbCV = pickle.load(open('picklefiles/multinomialnb_best_model.pickle','rb'))
multinomialnb = multinomialnbCV.best_estimator_

mvotingCV = pickle.load(open('picklefiles/voting_best_model.pickle','rb'))
mvoting = mvotingCV.best_estimator_

baggingCV = pickle.load(open('picklefiles/bagging_best_model.pickle','rb'))
bagging = baggingCV

rforestCV = pickle.load(open('picklefiles/randforest_best_model.pickle','rb'))
rforest = rforestCV.best_estimator_

adaboost = pickle.load(open('picklefiles/adaboost_model.pickle','rb'))

xgboost = pickle.load(open('picklefiles/xgboost_model.pickle','rb'))

gradientboost = pickle.load(open('picklefiles/gradientboost_model.pickle','rb'))

mlpclf = pickle.load(open('picklefiles/mlpclf_model.pickle','rb'))

In [11]:
print("\nSVC:",svcclf)
print('Training set accuracy: ',compute_accuracy(svcclf,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(svcclf,X_test_tr,y_test))
print("\nLinearSVC:",linearsvc)
print('Training set accuracy: ',compute_accuracy(linearsvc,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(linearsvc,X_test_tr,y_test))
print("\nLogisticRegressor:",logreg)
print('Training set accuracy: ',compute_accuracy(logreg,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(logreg,X_test_tr,y_test))
print("\nMultinomialNB:",multinomialnb)
print('Training set accuracy: ',compute_accuracy(multinomialnb,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(multinomialnb,X_test_tr,y_test))
print("\nVotingClassifier:",mvoting)
print('Training set accuracy: ',compute_accuracy(mvoting,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(mvoting,X_test_tr,y_test))
print("\nBaggingClassifier:",bagging)
print('Training set accuracy: ',compute_accuracy(bagging,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(bagging,X_test_tr,y_test))
print("\nRandomForestClassifier:",rforest)
print('Training set accuracy: ',compute_accuracy(rforest,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(rforest,X_test_tr,y_test))
print("\nAdaBoostClassifier:",adaboost)
print('Training set accuracy: ',compute_accuracy(adaboost,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(adaboost,X_test_tr,y_test))
print("\nXGBClassifier:",xgboost)
print('Training set accuracy: ',compute_accuracy1(xgboost,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy1(xgboost,X_test_tr,y_test))
print("\nGradientBoostingClassifier:",gradientboost)
print('Training set accuracy: ',compute_accuracy1(gradientboost,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy1(gradientboost,X_test_tr,y_test))
print("\nMLPClassifier:",mlpclf)
print('Training set accuracy: ',compute_accuracy(mlpclf,X_train_tr,y_train))
print('Test set accuracy: ',compute_accuracy(mlpclf,X_test_tr,y_test))


SVC: SVC(C=10.0, cache_size=5, kernel='sigmoid', max_iter=1000)
Training set accuracy:  0.5995675
Test set accuracy:  0.6016713091922006

LinearSVC: LinearSVC(C=0.1)
Training set accuracy:  0.858434375
Test set accuracy:  0.8272980501392758

LogisticRegressor: LogisticRegression(max_iter=1000, n_jobs=-1, warm_start=True)
Training set accuracy:  0.8566475
Test set accuracy:  0.83008356545961

MultinomialNB: MultinomialNB(alpha=1)
Training set accuracy:  0.8386875
Test set accuracy:  0.8440111420612814

VotingClassifier: VotingClassifier(estimators=[('lsvc', LinearSVC(C=0.1)),
                             ('lr',
                              LogisticRegression(max_iter=1000, n_jobs=-1,
                                                 warm_start=True)),
                             ('mnb', MultinomialNB(alpha=1)),
                             ('dt',
                              DecisionTreeClassifier(max_depth=23,
                                                     min_samples_leaf=5,


In [12]:
def prediction_on_input(input):
    statements = pd.Series(input)
    statements_tr = pipe.transform(statements)
    predictions = xgboost.predict(statements_tr)
    temp_dict = {'statement':statements,
                 'sentiment prediction': predictions   
                }
    result = pd.DataFrame(temp_dict)
    result['sentiment prediction'] = result['sentiment prediction'].apply(lambda x: 'Negative' if x==0 else 'Positive')
    return result

In [13]:
#a = str(input("Enter the sentecne"))
a = ["Can't connect front end and backend","infant the legend","he is an idiot","To all our teachers and mentors, your contribution to our society is priceless."]
result_table = prediction_on_input(a)
result_table

Unnamed: 0,statement,sentiment prediction
0,Can't connect front end and backend,Negative
1,infant the legend,Positive
2,he is an idiot,Negative
3,"To all our teachers and mentors, your contribu...",Positive
