In [1]:
import os
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
stp_word = stopwords.words('english')
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


In [2]:
data = pd.read_csv('../input/preprocessed-dataset-sentiment-analysis/EcoPreprocessed.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,review,polarity,division
0,3870,able play youtube alexa,0.5,positive
1,62,able recognize indian accent really well drop ...,0.2794,positive
2,487,absolute smart device amazon connect external ...,0.1827,positive
3,3204,absolutely amaze new member family control hom...,0.3682,positive
4,1265,absolutely amaze previously sceptical invest m...,0.2333,positive


In [3]:
data = data[['review','division']]
data

Unnamed: 0,review,division
0,able play youtube alexa,positive
1,able recognize indian accent really well drop ...,positive
2,absolute smart device amazon connect external ...,positive
3,absolutely amaze new member family control hom...,positive
4,absolutely amaze previously sceptical invest m...,positive
...,...,...
4079,yo yo yo love go if want one smart speaker val...,positive
4080,youtube music,neutral
4081,youtube support nahi kartasong recognise achha...,neutral
4082,yup proscontrols wipro light amazinglysony bra...,neutral


In [4]:
# drop duplicates
data = data.drop_duplicates()
data

Unnamed: 0,review,division
0,able play youtube alexa,positive
1,able recognize indian accent really well drop ...,positive
2,absolute smart device amazon connect external ...,positive
3,absolutely amaze new member family control hom...,positive
4,absolutely amaze previously sceptical invest m...,positive
...,...,...
4079,yo yo yo love go if want one smart speaker val...,positive
4080,youtube music,neutral
4081,youtube support nahi kartasong recognise achha...,neutral
4082,yup proscontrols wipro light amazinglysony bra...,neutral


In [5]:
def lemmatize_word(text):
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, pos= 'v') for word in word_tokens]
    lemmas = ' '.join(lemmas)
    return lemmas
data['review'] = data['review'].apply(lemmatize_word)

In [6]:
data.division = data.division.map({'positive':2,'neutral':1,'negative':0})

In [7]:
data['review'] = data['review'].apply(lambda x : x.lower())

TFIDF Vectorization

In [8]:
def tokenize(text):
    word_list = word_tokenize(text)
    text_token = []
    for word in word_list:
        if not word in stp_word:
            text_token.append(word)
    text_token = ' '.join(text_token)
    return text_token 

In [9]:
X_pre = data.review.apply(tokenize)
target = data.division

In [10]:
# Split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X_pre,target,test_size=0.2,random_state=0)

In [11]:
# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer='word',max_features=4000,lowercase=False)
vectorizer.fit(x_train)

TfidfVectorizer(lowercase=False, max_features=4000)

In [12]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vectorizer.transform(x_train),y_train)
classifier.score(vectorizer.transform(x_test),y_test)

0.7262247838616714

In [13]:
from xgboost import XGBClassifier
xgb = XGBClassifier(learning_rate = 0.01,n_estimators = 1000,gamma = 1.0,
                    random_state = 0,objective='multi:softmax')
xgb.fit(vectorizer.transform(x_train),y_train)
xgb.score(vectorizer.transform(x_test),y_test)

0.8573487031700289

In [14]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=26, metric='minkowski', p=2)
knn.fit(vectorizer.transform(x_train),y_train)
knn.score(vectorizer.transform(x_test),y_test)

0.7680115273775217

In [15]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=1000, criterion='gini',random_state=0)
forest.fit(vectorizer.transform(x_train),y_train)
forest.score(vectorizer.transform(x_test),y_test)

0.8285302593659942

Word2vec

In [16]:
data = pd.read_csv('../input/preprocessed-dataset-sentiment-analysis/EcoPreprocessed.csv')
data = data[['review','division']]
data = data.drop_duplicates()
data['review'] = data['review'].apply(lemmatize_word)
data.division = data.division.map({'positive':2,'neutral':1,'negative':0})

In [17]:
X_train,X_test,y_train,y_test = train_test_split(data.review,data.division,
                                                 test_size=0.2,random_state=0)

In [18]:
def clean_text(text,remove_stopwords = False):
    text = re.sub(r'[^a-zA-Z]',' ',text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in stp_word]
    return words
train_sentences_list = []
test_sentences_list = []
for line in X_train:
    train_sentences_list.append(clean_text(line,remove_stopwords=True))
for line in X_test:
    test_sentences_list.append(clean_text(line,remove_stopwords=True))
# set the parameters
num_features = 150
min_word_count = 20
num_workers = 4
context = 4
model_name = 'Word2vec_.model'

from gensim.models.word2vec import Word2Vec
model = Word2Vec(train_sentences_list,workers=num_workers,vector_size = num_features,
                 min_count=min_word_count,window=context)
model.init_sims(replace = True)
model.save(os.path.join('.',model_name))

In [19]:
def to_review_vector(review):
    global word_vec
    review = clean_text(review,remove_stopwords=True)
    word_vec = np.zeros((1,150))
    for word in review:
        if word in model.wv:
            word_vec += np.array([model.wv[word]])
    return pd.Series(word_vec.mean(axis = 0))
X_train = X_train.apply(to_review_vector)
X_test = X_test.apply(to_review_vector)
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
2621,-0.012209,-0.020146,0.032614,-0.031338,0.012491,-0.015247,-0.005827,0.142751,-0.072489,0.049458,...,0.081165,0.008116,-0.007941,0.021384,0.187847,0.07024,-0.091968,-0.092549,0.098067,-0.108503
3987,-0.346596,-0.528311,0.841766,-0.736205,0.405037,-0.338978,-0.147472,3.705772,-1.850994,1.268297,...,2.054024,0.237842,-0.235404,0.551236,4.851963,1.812554,-2.343545,-2.459024,2.524158,-2.785991
690,-0.095938,-0.129727,0.223476,-0.209926,0.105157,-0.092115,-0.028948,1.000796,-0.497657,0.333881,...,0.546478,0.061011,-0.052898,0.146791,1.303013,0.480256,-0.631449,-0.667027,0.682519,-0.750771
3959,-0.10711,-0.193148,0.304228,-0.273685,0.1425,-0.125174,-0.050154,1.28276,-0.633417,0.435583,...,0.720127,0.090034,-0.094073,0.185982,1.697307,0.624522,-0.810357,-0.861578,0.89737,-0.975332
1637,-0.093367,-0.179814,0.283662,-0.232319,0.113026,-0.115102,-0.029364,1.144796,-0.568175,0.394631,...,0.629646,0.061632,-0.076413,0.160705,1.493856,0.556825,-0.71305,-0.758825,0.794645,-0.849805


In [20]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
classifier = MultinomialNB()
classifier.fit(min_max_scaler.fit_transform(X_train),y_train)
classifier.score(min_max_scaler.fit_transform(X_test),y_test)

0.6397694524495677

In [21]:
xgb = XGBClassifier(learning_rate = 0.01,n_estimators = 1000,gamma = 1.0,random_state = 0)
xgb.fit(X_train,y_train)
xgb.score(X_test,y_test)

0.7564841498559077

In [22]:
knn = KNeighborsClassifier(n_neighbors=11, metric='minkowski', p=2)
knn.fit(X_train,y_train)
knn.score(X_test,y_test)

0.7579250720461095

In [23]:
forest = RandomForestClassifier(n_estimators=1000, criterion='gini',random_state=0)
forest.fit(X_train,y_train)
forest.score(X_test,y_test)

0.7651296829971181