In [1]:
import nltk
import re
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import gensim
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline

In [2]:
import gensim.downloader as api
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [3]:
wv = api.load('word2vec-google-news-300')


In [4]:
wv['review']

array([-5.19531250e-01, -3.36914062e-02, -2.07519531e-02,  5.24902344e-02,
        5.93261719e-02, -1.28906250e-01,  1.28784180e-02, -8.44726562e-02,
        2.27539062e-01, -1.21093750e-01, -1.64062500e-01,  6.44531250e-02,
        4.10156250e-02, -8.54492188e-03, -8.83789062e-02,  8.69140625e-02,
        5.68847656e-02, -1.12304688e-01, -2.03125000e-01, -2.15820312e-01,
        2.94921875e-01, -2.66113281e-02, -8.10546875e-02,  1.19628906e-02,
       -9.70458984e-03,  3.22265625e-02,  5.93261719e-02,  1.26953125e-01,
        1.24023438e-01,  1.15966797e-02, -1.03027344e-01, -1.53320312e-01,
       -2.22656250e-01, -7.61718750e-02, -1.62353516e-02, -7.22656250e-02,
        4.02832031e-02, -2.24609375e-01,  2.27539062e-01, -8.39843750e-02,
       -3.02734375e-02,  2.75878906e-02, -1.25000000e-01,  1.27929688e-01,
       -9.27734375e-02, -4.85839844e-02,  6.93359375e-02, -1.35742188e-01,
       -1.88476562e-01,  5.39550781e-02,  2.46093750e-01,  7.93457031e-03,
        1.53320312e-01,  

In [5]:
#taking average of word_embeddings of each word of review. So finally we would have a 300-dim vector of each sentence to feed in our ML model
def sent_vec(sent):
    vector_size = wv.vector_size
    wv_res = np.zeros(vector_size)
    # print(wv_res)
    ctr = 1
    for w in sent:
        if w in wv:
            ctr += 1
            wv_res += wv[w]
    wv_res = wv_res/ctr
    return wv_res

In [6]:
df = pd.read_csv(r"C:\Users\karan\Downloads\review_data.csv")
df = df.iloc[:, [1, 2, 3]]
df.head()

Unnamed: 0,sentiment,review_body,review_tidy
0,1,Great love it,Great love
1,0,Lots of ads<br />Slow processing speed<br />Oc...,Lots Slow processing speed Occasionally shuts ...
2,1,Excellent unit. The versatility of this table...,Excellent unit versatility tablet besides comp...
3,1,I bought this on Amazon Prime so I ended up bu...,bought Amazon Prime ended buying camera okay l...
4,1,All Amazon products continue to meet my expect...,Amazon products continue meet expectations


In [8]:
df['review_tidy'] = df['review_tidy'].astype('str')

In [10]:
import spacy


In [11]:
nlp = spacy.load('en_core_web_sm')

In [12]:
#tokenizing and lemmitizing
def tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)



    # print(doc)
    # print(type(doc))

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() for word in doc]

   

    # return preprocessed list of tokens
    return mytokens

In [13]:
df['tokens'] = df['review_tidy'].apply(tokenizer)
df.head()

Unnamed: 0,sentiment,review_body,review_tidy,tokens
0,1,Great love it,Great love,"[great, love]"
1,0,Lots of ads<br />Slow processing speed<br />Oc...,Lots Slow processing speed Occasionally shuts ...,"[lot, slow, processing, speed, occasionally, s..."
2,1,Excellent unit. The versatility of this table...,Excellent unit versatility tablet besides comp...,"[excellent, unit, versatility, tablet, besides..."
3,1,I bought this on Amazon Prime so I ended up bu...,bought Amazon Prime ended buying camera okay l...,"[buy, amazon, prime, end, buy, camera, okay, l..."
4,1,All Amazon products continue to meet my expect...,Amazon products continue meet expectations,"[amazon, product, continue, meet, expectation]"


In [14]:
df['vec'] = df['tokens'].apply(sent_vec)
df.head()

Unnamed: 0,sentiment,review_body,review_tidy,tokens,vec
0,1,Great love it,Great love,"[great, love]","[0.058268229166666664, 0.0185546875, -0.000854..."
1,0,Lots of ads<br />Slow processing speed<br />Oc...,Lots Slow processing speed Occasionally shuts ...,"[lot, slow, processing, speed, occasionally, s...","[0.0572662353515625, -0.023372395833333334, -0..."
2,1,Excellent unit. The versatility of this table...,Excellent unit versatility tablet besides comp...,"[excellent, unit, versatility, tablet, besides...","[0.04113743222992996, 0.013691112912934402, -0..."
3,1,I bought this on Amazon Prime so I ended up bu...,bought Amazon Prime ended buying camera okay l...,"[buy, amazon, prime, end, buy, camera, okay, l...","[0.052636564447638694, -0.014119008953651686, ..."
4,1,All Amazon products continue to meet my expect...,Amazon products continue meet expectations,"[amazon, product, continue, meet, expectation]","[-0.10062662760416667, 0.09965006510416667, -0..."


In [15]:
X = df['vec'].to_list()
Y = df['sentiment'].to_list()


In [16]:
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size = .3, random_state = 42)

In [19]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from xgboost import plot_tree, plot_importance
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve, f1_score
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report
from sklearn.svm import SVC

In [20]:
def evaluation(model, title = ""):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_valid)
    print("F1-score of {} is {}".format(title, f1_score(y_valid, y_pred)))
    

In [22]:
import optuna
from optuna.samplers import TPESampler
from sklearn.metrics import f1_score

In [23]:
log_reg = LogisticRegression(random_state=42)
evaluation(log_reg, "LogisticRegression")

F1-score of LogisticRegression is 0.9374960659658841


In [None]:
def objective_svm(trial):
    kernel = trial.suggest_categorical('kernel', ['poly', 'rbf'])
    C = trial.suggest_float('C', 0.01, 100, log = True)
    gamma = trial.suggest_float('gamma', 0.01, 100, log = True)
    model = SVC(kernel = kernel, C = C, gamma = gamma, random_state = 42)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_valid)
    return f1_score(y_valid, y_pred)

In [None]:
sampler = TPESampler(seed = 1)
study = optuna.create_study(direction = 'maximize', sampler = sampler)
study.optimize(objective_svm, n_trials = 40)

In [None]:
def objective_xg(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 100, 1000)
    learning_rate = trail.suggest_float('learning_rate', 0.001, 0.1, log = True)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    
    model = XGBClassifier(n_estimators = n_estimators, learning_rate = learning_rate, max_depth = max_depth, random_state = 42)
    model.fit(x_train_tf, y_train_tf)
    y_pred_tf = model.predict(x_valid_tf)
    return f1_score(y_valid_tf, y_pred_tf)

In [None]:
sampler = TPESampler(seed = 1)
study = optuna.create_study(direction = 'maximize', sampler = sampler)
study.optimize(objective_xg, n_trials = 40)