In [4]:
import os
import matplotlib.pyplot as plt#visualization
from PIL import  Image
%matplotlib inline
import pandas as pd
import seaborn as sns#visualization
import itertools
import warnings
warnings.filterwarnings("ignore")
import io
import plotly.offline as py#visualization
py.init_notebook_mode(connected=True)#visualization
import plotly.graph_objs as go#visualization
import plotly.tools as tls#visualization
import plotly.figure_factory as ff#visualization
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,scorer
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score,recall_score
from imblearn.over_sampling import SMOTE
from IPython.display import SVG,display
import numpy as np
import preprocessor as p
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser, Phrases#from sklearn import model_selection, naive_bayes, svm
from sklearn.naive_bayes import GaussianNB
from yellowbrick.classifier import DiscriminationThreshold
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [5]:
%run ./Utils.ipynb

In [6]:
np.random.seed(500)

In [7]:
data = pd.read_csv('data/output.csv', sep='\t', index_col=False)

In [8]:
data = data.drop_duplicates(keep='first')
data = data.dropna()
data['Sentiment'] = data['Sentiment'].astype(int)

In [9]:
data = data[(data['Sentiment'] != 0)]

In [10]:
pos_words = []
pos_lable = []
with open('data/SinPos1.txt','r',encoding = 'UTF-8') as f:
    for line in f:
        line = line.strip()
        if line != '':
            pos_words.append(line)
            pos_lable.append(1)

In [11]:
neg_words = []
neg_lable = []
with open('data/SinNeg1.txt','r',encoding = 'UTF-8') as f:
    for line in f:
        line = line.strip()
        #print(line)
        #neg_words.append(line)
        if line != '':
            neg_words.append(line)
            neg_lable.append(-1)

In [12]:
Tweets = pos_words + neg_words
Sentiment = pos_lable + neg_lable

In [13]:
data1 = {'Tweet':Tweets, 'Sentiment':Sentiment}

In [14]:
data_new = pd.DataFrame(data1)

In [15]:
final_data = pd.concat([data_new, data],ignore_index=True) 

In [16]:
# Cleaning data - remove punctuation from every newsgroup text
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY, p.OPT.HASHTAG)

cleaned_tweets = []
for line in final_data['Tweet']:
    line = line.strip()
    line = p.clean(line)
    line = re.sub(r'[\!"#$%&\*+,-./:;<=>?@^_`()|~=]','',line).strip().split(' ')
    cleaned_tweets.append(line)

In [17]:
word2vec_file = 'word2vec_new_301119' + str(200) + '.model'
model = Word2Vec.load(word2vec_file)

In [18]:
def get_mean_vector(word2vec_model, words):
    # remove out-of-vocabulary words
    words = [word for word in words if word in word2vec_model.wv.vocab]
    if len(words) >= 1:
        return np.mean(word2vec_model[words], axis=0)
    else:
        return []

In [19]:
sentence_vectors = []
for doc in cleaned_tweets:
    vec = get_mean_vector(model, doc)
    if len(vec) > 0:
        sentence_vectors.append(vec)
    else:
        sentence_vectors.append(0)

In [20]:
final_df = pd.DataFrame(sentence_vectors) 

In [21]:
final_df.columns = final_df.columns.astype(str)

In [22]:
sentiment = final_data["Sentiment"]

In [23]:
train_x = final_df[:21690]
test_x = final_df[21690:]
train_y = sentiment[:21690]
test_y = sentiment[21690:]

In [24]:
len(train_x),len(test_x),len(train_y),len(test_y)

(21690, 2000, 21690, 2000)

In [25]:
def prediction_alg(algorithm,training_x,testing_x,
                                 training_y,testing_y,threshold_plot = True) :
    
    #model
    algorithm.fit(training_x,training_y)
    predictions   = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)
    
    print (algorithm)
    print ("\n Classification report : \n",classification_report(testing_y,predictions))
    print ("Accuracy Score   : ",accuracy_score(testing_y,predictions))
    #confusion matrix
    conf_matrix = confusion_matrix(testing_y,predictions)
    #roc_auc_score
    model_roc_auc = roc_auc_score(testing_y,predictions) 
    print ("Area under curve : ",model_roc_auc)
    fpr,tpr,thresholds = roc_curve(testing_y,probabilities[:,1])
     
    #plot roc curve
    trace1 = go.Scatter(x = fpr,y = tpr,
                        name = "Roc : " + str(model_roc_auc),
                        line = dict(color = ('rgb(22, 96, 167)'),width = 2),
                       )
    trace2 = go.Scatter(x = [0,1],y=[0,1],
                        line = dict(color = ('rgb(205, 12, 24)'),width = 2,
                        dash = 'dot'))
    
    #plot confusion matrix
    trace3 = go.Heatmap(z = conf_matrix ,x = ["Not churn","Churn"],
                        y = ["Not churn","Churn"],
                        showscale  = False,colorscale = "Blues",name = "matrix",
                        xaxis = "x2",yaxis = "y2"
                       )
    
    layout = go.Layout(dict(title="Model performance" ,
                            autosize = False,height = 500,width = 800,
                            showlegend = False,
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            xaxis = dict(title = "false positive rate",
                                         gridcolor = 'rgb(255, 255, 255)',
                                         domain=[0, 0.6],
                                         ticklen=5,gridwidth=2),
                            yaxis = dict(title = "true positive rate",
                                         gridcolor = 'rgb(255, 255, 255)',
                                         zerolinewidth=1,
                                         ticklen=5,gridwidth=2),
                            margin = dict(b=200),
                            xaxis2=dict(domain=[0.7, 1],tickangle = 90,
                                        gridcolor = 'rgb(255, 255, 255)'),
                            yaxis2=dict(anchor='x2',gridcolor = 'rgb(255, 255, 255)')
                           )
                  )
    data = [trace1,trace2,trace3]
    fig = go.Figure(data=data,layout=layout)
    
    py.iplot(fig)
    
#     if threshold_plot == True : 
#         visualizer = DiscriminationThreshold(algorithm)
#         visualizer.fit(training_x,training_y)
#         visualizer.poof()

### GaussianNB

In [26]:
gnb = GaussianNB(priors=None)
prediction_alg(gnb,train_x,test_x,train_y,test_y)

GaussianNB(priors=None, var_smoothing=1e-09)

 Classification report : 
               precision    recall  f1-score   support

          -1       0.60      0.82      0.70      1193
           1       0.43      0.20      0.27       807

    accuracy                           0.57      2000
   macro avg       0.52      0.51      0.48      2000
weighted avg       0.53      0.57      0.52      2000

Accuracy Score   :  0.5705
Area under curve :  0.5102809553041232


### SVM 

In [27]:
svc_lin  = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
               decision_function_shape='ovr', degree=3, gamma=1.0, kernel='linear',
               max_iter=-1, probability=True, random_state=None, shrinking=True,
               tol=0.001, verbose=False)
prediction_alg(svc_lin,train_x,test_x,train_y,test_y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1.0, kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

 Classification report : 
               precision    recall  f1-score   support

          -1       0.64      0.95      0.76      1193
           1       0.71      0.20      0.31       807

    accuracy                           0.64      2000
   macro avg       0.67      0.57      0.54      2000
weighted avg       0.67      0.64      0.58      2000

Accuracy Score   :  0.6445
Area under curve :  0.5725099220878502


### SVM non linear hyper plane

In [28]:
svc_rbf  = SVC(C=1.0, kernel='rbf', 
               degree= 3, gamma=1.0, 
               coef0=0.0, shrinking=True,
               probability=True,tol=0.001,
               cache_size=200, class_weight=None,
               verbose=False,max_iter= -1,
               random_state=None)
prediction_alg(svc_rbf,train_x,test_x,train_y,test_y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1.0, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

 Classification report : 
               precision    recall  f1-score   support

          -1       0.60      0.99      0.75      1193
           1       0.44      0.01      0.02       807

    accuracy                           0.60      2000
   macro avg       0.52      0.50      0.38      2000
weighted avg       0.54      0.60      0.45      2000

Accuracy Score   :  0.5955
Area under curve :  0.500765514655399


### LightGBM

In [29]:
lgbm_c = LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                        learning_rate=0.5, max_depth=7, min_child_samples=20,
                        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
                        n_jobs=-1, num_leaves=500, objective='binary', random_state=None,
                        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
                        subsample_for_bin=200000, subsample_freq=0)

prediction_alg(lgbm_c,train_x,test_x,train_y,test_y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.5, max_depth=7,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=500, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

 Classification report : 
               precision    recall  f1-score   support

          -1       0.67      0.80      0.73      1193
           1       0.58      0.40      0.48       807

    accuracy                           0.64      2000
   macro avg       0.62      0.60      0.60      2000
weighted avg       0.63      0.64      0.63      2000

Accuracy Score   :  0.6425
Area under curve :  0.6039105646215895


### XGboost

In [30]:
xgc = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                    colsample_bytree=1, gamma=0, learning_rate=0.9, max_delta_step=0,
                    max_depth = 7, min_child_weight=1, missing=None, n_estimators=100,
                    n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
                    reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
                    silent=True, subsample=1)

prediction_alg(xgc,train_x,test_x,train_y,test_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.9, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=1, nthread=1, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
              subsample=1, tree_method=None, validate_parameters=False,
              verbosity=None)

 Classification report : 
               precision    recall  f1-score   support

          -1       0.66      0.78      0.71      1193
           1       0.55      0.41      0.47       807

    accuracy                           0.63      2000
   macro avg       0.61      0.59      0.59      2000
weighted avg       0.62      0.63 

### AdaBoost

In [31]:
adb = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200
)

prediction_alg(adb,train_x,test_x,train_y,test_y)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                          