### Importing python utilities for task execution

In [1]:
import nltk
import pandas as pd
import numpy as np
import random
import string
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer        
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, classification_report

In [2]:
import warnings
warnings.filterwarnings("ignore")

### Reading the dataset in dataframe

In [3]:
reviews = pd.read_csv('K8 Reviews v0.2.csv')

In [4]:
## Check top 5 rows
reviews.head()

Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...


In [5]:
## Size of the dataset - it has 14675 reviews and 2 fields as seen above
reviews.shape

(14675, 2)

In [6]:
reviews.sentiment.value_counts()

0    7712
1    6963
Name: sentiment, dtype: int64

## Using the following NLTK modules for text preprocessing
 - NLTK's Porter Stemmer for word stemming and WordNet Lemmatizer for lemmatization
 - NLTK's word tokenizer for splitting sentences into words
 - NLTK's stop word module for removing unwanted words

In [7]:
# http://www.nltk.org/howto/stem.html
# https://www.nltk.org/api/nltk.tokenize.html
# https://pythonspot.com/nltk-stop-words/
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [8]:
from spell import correction

In [34]:
correction("superbbb")

'superb'

In [29]:
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        corrected = correction(item)
        stemmed.append(stemmer.stem(corrected))
    return stemmed

lemm = WordNetLemmatizer()
def lemmatize_tokens(tokens, lemm):
    lemmatized = []
    for item in tokens:
        corrected = correction(item)
        lemmatized.append(lemm.lemmatize(corrected))
    return lemmatized

def tokenize(text):
    text = re.sub("[^a-zA-Z]", " ", text) # remove non letters
#     text = re.sub(r'\b\w{1,3}\b', '', text) # remove letters with less than 3 chars
    # tokenize (split into words)
    tokens = nltk.word_tokenize(text)    
    # stem
    stems = lemmatize_tokens(tokens, lemm)
#     stems = stem_tokens(tokens, stemmer)
    return stems


- Using Scikit Learn for vectorizing the text into numbers as Machine Learning algorithms only understands numbers
- We use Count Vectorizer/Tf-Idf Vectorizer for this task. It takes various parameters in one go for text preprocessing

In [48]:
vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True, #We want only lower cased words during vectorization
    stop_words = 'english', # Remove unwanted words from the sentences. Words like - is,the,if,it,etc are not required
    max_features = 100 # we are only considering first 100 features for sentiment classification
)

In [10]:
vectorizer = TfidfVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True, #We want only lower cased words during vectorization
#     stop_words = 'english', # Remove unwanted words from the sentences. Words like - is,the,if,it,etc are not required
    max_features = 200 # we are only considering first 100 features for sentiment classification
)

- Applying the vectorizer to the list of all reviews

In [11]:
corpus_data_features = vectorizer.fit_transform(reviews.review.tolist())
corpus_data_features_nd = corpus_data_features.toarray()

- Here we have transformed the text into numbers and extracted only top N features given by parameter 'max features' in vectorizer

In [12]:
corpus_data_features_nd

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.10707407,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [13]:
corpus_data_features_nd.shape

(14675, 200)

- You can see some of the words considered in sentiment classification

In [14]:
vocab = vectorizer.get_feature_names()
vocab[:10]

['a',
 'about',
 'adroit',
 'after',
 'all',
 'also',
 'am',
 'amazing',
 'amazon',
 'an']

- Splitting the reviews into train and test sets
- train set will be used for training the model to understand the data
- test set will be used to check the performance of our model

In [15]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(corpus_data_features_nd[0:len(reviews)]
                                                 ,reviews.sentiment
                                                 ,random_state = 42
                                                 ,train_size = 0.75 #75:25 ratio
                                                )

## Algorithm - Logistic Regression
 - Training the model with different regularization parameter values

In [16]:
from sklearn.linear_model import LogisticRegression

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c,class_weight='balanced')
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_test, lr.predict(X_test))))

Accuracy for C=0.01: 0.8367402562005997
Accuracy for C=0.05: 0.8555464704279095
Accuracy for C=0.25: 0.8629054238212047
Accuracy for C=0.5: 0.8661760697737804
Accuracy for C=1: 0.8661760697737804


In [17]:
print(classification_report(y_test,lr.predict(X_test)))

              precision    recall  f1-score   support

           0       0.88      0.87      0.87      1921
           1       0.85      0.87      0.86      1748

    accuracy                           0.87      3669
   macro avg       0.87      0.87      0.87      3669
weighted avg       0.87      0.87      0.87      3669



- Looking at 5 most discriminating words each for Positive and Negative

In [18]:
feature_to_coef = {
    word: coef for word, coef in zip(
        vectorizer.get_feature_names(), lr.coef_[0]
    )
}

for best_positive in sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(best_positive)

('awesome', 5.097339923909264)
('best', 4.50820283429397)
('excellent', 4.406626472360816)
('great', 3.8414357212126307)
('superb', 3.373699344815222)


In [19]:
for best_negative in sorted(feature_to_coef.items(), key=lambda x: x[1])[:5]:
    print(best_negative)

('worst', -6.446337865274848)
('not', -5.7554926264707555)
('poor', -4.693562310440202)
('waste', -3.4916882775863507)
('bad', -3.3918666462054032)


- Checking prediction on random reviews

In [25]:
random_reviews = reviews.sample(50) # choosing random 15 reviews

## Applying same transformation
features = vectorizer.fit_transform(random_reviews.review.tolist())
features = features.toarray()

# Prediction for random reviews
y_pred_random = lr.predict(features)

random_reviews['Predicted'] = y_pred_random

random_reviews.head()

Unnamed: 0,sentiment,review,Predicted
2093,1,Battery problem,0
6106,0,"phone is good but it,s pries is to high 10000 ...",0
4608,0,Stock android not user friendly like earlier v...,0
12730,0,Don't buy this....,0
3117,0,yupprocessing is good....speed is good ...came...,0


- Training with different classifier algorithms

In [22]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [23]:
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM",
         "Decision Tree", "Random Forest", "Neural Net",
         "Naive Bayes"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    GaussianNB()]

In [24]:
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    print("Accuracy for %s: %s" 
           % (name, clf.score(X_test, y_test)))

Accuracy for Nearest Neighbors: 0.7748705369310439
Accuracy for Linear SVM: 0.8416462251294631
Accuracy for RBF SVM: 0.8642681929681112
Accuracy for Decision Tree: 0.7675115835377487
Accuracy for Random Forest: 0.7274461706186972
Accuracy for Neural Net: 0.8626328699918234
Accuracy for Naive Bayes: 0.8048514581629872


## Using LSTM

In [93]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [118]:
tokenizer_object = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')
tokenizer_object.fit_on_texts(reviews.review.tolist())

In [119]:
#checking max length of review which is required during pad sequences

max_length = max([len(s.split()) for s in reviews.review.tolist()])
print(max_length)

722


In [120]:
#vocabulary size

vocab_size = len(tokenizer_object.word_index) + 1
print(vocab_size)

13007


In [121]:
#Split train and test
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(reviews.review
                                                 ,reviews.sentiment
                                                 ,random_state = 42
                                                 ,train_size = 0.70
                                                )

In [122]:
#Text to list of words/tokens
X_train_tokens = tokenizer_object.texts_to_sequences(x_train)
X_test_tokens = tokenizer_object.texts_to_sequences(x_test)

In [123]:
#Adds zero padding to make each sentence of same length (here max_length)
X_train_pad = pad_sequences(X_train_tokens,maxlen = max_length, padding='post')
X_test_pad = pad_sequences(X_test_tokens,maxlen = max_length, padding='post')

In [124]:
#importing keras utilities
from keras.models import Sequential
from keras.layers import Dense,LSTM,Dropout
from keras.layers.embeddings import Embedding

In [125]:
#Constructing LSTM model

EMBEDDING_DIM = 200

model = Sequential()

model.add(Embedding(vocab_size,EMBEDDING_DIM, input_length = max_length)) #Embedding layer
model.add(LSTM(units=256, dropout = 0.3,recurrent_dropout=0.3)) #LSTM layer with dropout
# model.add(Dense(128,activation='relu'))
# model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid')) #Fully connected output layer

model.compile(loss = 'binary_crossentropy',optimizer = 'adam',metrics=['accuracy'])

In [126]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 722, 200)          2601400   
_________________________________________________________________
lstm_10 (LSTM)               (None, 256)               467968    
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 257       
Total params: 3,069,625
Trainable params: 3,069,625
Non-trainable params: 0
_________________________________________________________________


In [127]:
#Training
model.fit(X_train_pad,y_train,batch_size=32,epochs=5,validation_data=(X_test_pad,y_test),verbose=2)

Train on 10272 samples, validate on 4403 samples
Epoch 1/5
 - 235s - loss: 0.6930 - acc: 0.5227 - val_loss: 0.6919 - val_acc: 0.5249
Epoch 2/5
 - 233s - loss: 0.6924 - acc: 0.5183 - val_loss: 0.6920 - val_acc: 0.5249
Epoch 3/5
 - 232s - loss: 0.6924 - acc: 0.5258 - val_loss: 0.6922 - val_acc: 0.5249
Epoch 4/5
 - 233s - loss: 0.6922 - acc: 0.5251 - val_loss: 0.6919 - val_acc: 0.5249
Epoch 5/5
 - 232s - loss: 0.6921 - acc: 0.5258 - val_loss: 0.6951 - val_acc: 0.5249


<keras.callbacks.History at 0x7fa5299707b8>

In [70]:
#Evaluating on test set
score,acc = model.evaluate(X_test, y_test, verbose = 2, batch_size = 32)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.69
acc: 0.52


## Word2Vec + LSTM

In [None]:
from gensim import models

In [None]:
#Training the word2vec model on the reviews

wv_model = models.Word2Vec([s.split() for s in reviews.review.to_list()]
                           ,min_count = 3 # Ignores all words with total frequency lower than this
                           ,size = 200 #Dimensionality of the word vectors.
                           ,window = 3 #Maximum distance between the current and predicted word within a sentence.
                          )

In [115]:
#Create a dictionary of all the word : vector pairs
embeddings_index = {}
for w in wv_model.wv.vocab.keys():
    embeddings_index[w] = wv_model.wv[w]

#embeddings_index['Good']

In [128]:
#tokenizer_object.word_index.items()

In [117]:
#Creating a embedding matrix which is required as weights in the embedding layer on LSTM model
num_words = vocab_size
embedding_matrix = np.zeros((num_words, 200))

for word, i in tokenizer_object.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
#Constructing LSTM model

EMBEDDING_DIM = 200

model = Sequential()

model.add(
        Embedding(vocab_size
                ,EMBEDDING_DIM
                ,weights = [embedding_matrix] #Supplied embedding matrix created from word vectors
                ,input_length = max_length
                ,trainable=False)
         )

model.add(LSTM(units=256, dropout = 0.3,recurrent_dropout=0.3))

# model.add(Dense(128,activation='relu'))
# model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid'))

model.compile(loss = 'binary_crossentropy',optimizer = 'adam',metrics=['accuracy'])

In [None]:
model.fit(X_train_pad,y_train,batch_size=32,epochs=5,validation_data=(X_test_pad,y_test),verbose=2)

In [None]:
score,acc = model.evaluate(X_test, y_test, verbose = 2, batch_size = 32)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

## Ways of improving the accuracy
 - Try different tokenizer (eg. spacy tokenizer)
 - use a different spell correction method
 - use word segmentation technique for separating the erroneous joined words
 - convert emojis into texts
 - remove contractions like "wasn't" to "was not", "isn't" to "is not",etc
 - convert smileys into text
 - use Fackbook AI's fasttext based word vectors representation as it operates on character instead of words