
**Sentiment Analysis:** the process of computationally identifying and categorizing opinions expressed in a piece of text, especially in order to determine whether the writer's attitude towards a particular topic, product, etc. is positive, negative, or neutral. 


In [0]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.layers import Dense ,LSTM,concatenate,Input,Flatten
import re

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

Only keeping the necessary columns.

In [14]:
data = pd.read_csv('https://play.minio.io:9000/rao/data_1_train.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=Q3AM3UQ867SPQQA43P2F%2F20180422%2F%2Fs3%2Faws4_request&X-Amz-Date=20180422T014411Z&X-Amz-Expires=432000&X-Amz-SignedHeaders=host&X-Amz-Signature=30f103b98d02bf9a8271aa3347d2872d313cfb882a068a1f205ea0e6d1f50a69')
# Keeping only the neccessary columns
print (data.columns.tolist())

data = data[[' text',' class']]

['example_id', ' text', ' aspect_term', ' term_location', ' class']


In [15]:
#data = data[data.sentiment != "Neutral"]
data[' text'] = data[' text'].apply(lambda x: x.lower())
data[' text'] = data[' text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

print(data[ data[' class'] == 1].size)
print(data[ data[' class'] == 0].size)
print(data[ data[' class'] == -1].size)


max_fatures = 1000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data[' text'].values)
X = tokenizer.texts_to_sequences(data[' text'].values)
X = pad_sequences(X)
print(X.shape)

1878
872
1656


Next, I compose the LSTM Network. Note that **embed_dim**, **lstm_out**, **batch_size**, **droupout_x** variables are hyperparameters, their values are somehow intuitive, can be and must be played with in order to achieve good results. Please also note that I am using softmax as activation function. The reason is that our Network is using categorical crossentropy, and softmax is just the right activation method for that.

In [19]:
embed_dim = 128
lstm_out = 196


input_1 = Input(shape=(1785, ))
word_embedding = Embedding(max_fatures, embed_dim,input_length = X.shape[1])(input_1)
drop_out_1 = SpatialDropout1D(0.4)(word_embedding)
lstm_1 = LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)(drop_out_1)

output = Dense(3,activation='softmax')(lstm_1)

model = Model(inputs=[input_1], outputs=output)


model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])


    
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 1785)              0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 1785, 128)         128000    
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 1785, 128)         0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 591       
Total params: 383,391
Trainable params: 383,391
Non-trainable params: 0
_________________________________________________________________
None


Hereby I declare the train and test dataset.

In [20]:
Y = pd.get_dummies(data[' class']).values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

((1476, 1785), (1476, 3))
((727, 1785), (727, 3))


In [21]:
batch_size = 16
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 2)

Epoch 1/7
 - 1605s - loss: 1.0307 - acc: 0.4898
Epoch 2/7
 - 1171s - loss: 0.8084 - acc: 0.6477
Epoch 3/7
 - 1246s - loss: 0.6186 - acc: 0.7473
Epoch 4/7
 - 788s - loss: 0.5187 - acc: 0.8089
Epoch 5/7
 - 785s - loss: 0.4769 - acc: 0.8184
Epoch 6/7
 - 790s - loss: 0.4049 - acc: 0.8462
Epoch 7/7
 - 789s - loss: 0.3742 - acc: 0.8591


<keras.callbacks.History at 0x7f06c7038d90>

Here we train the Network. We should run much more than 7 epoch, But in practice its has to be trained for more iterations.

Extracting a validation set, and measuring score and accuracy.

In [22]:
validation_size = 300

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.95
acc: 0.67


Finally measuring the number of correct guesses. 

In [23]:
pos_cnt, neg_cnt, pos_correct, neg_correct, neutral_correct,neutral_cnt  = 0, 0, 0, 0, 0, 0
for x in range(len(X_validate)):
    
    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
   
    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            neg_correct += 1
        if np.argmax(Y_validate[x]) == 1:
            neutral_correct += 1
        else:
            pos_correct += 1
       
    if np.argmax(Y_validate[x]) == 0:
        neg_cnt += 1
    elif np.argmax(Y_validate[x]) == 1:
        neutral_cnt += 1
    else:
        pos_cnt += 1



print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

print("neutral_accuracy", neutral_correct/neutral_cnt*100, "%")


('pos_acc', 100, '%')
('neg_acc', 0, '%')
('neutral_accuracy', 0, '%')


Finally, an example on predicting an arbitrary review sentiment:

In [24]:
twt = 'Meetings: Because none of us is as dumb as all of us.'
#vectorizing the review by the pre-fitted tokenizer instance
twt = tokenizer.texts_to_sequences(twt)
#padding the review to have exactly the same shape as `embedding_2` input
twt = pad_sequences(twt, maxlen=28, dtype='int32', padding='post', truncating='post', value=0)
sentiment = model.predict(twt,batch_size=1,verbose = 2)[0]
if(np.argmax(sentiment) == 0):
    print("negative")

elif (np.argmax(sentiment) == 2):
    print("positive")
    
else (np.argmax(sentiment) == 1):
    print("neutral")

SyntaxError: ignored