In [1]:
import pandas as pd #Basic packages for creating dataframes and loading dataset
import numpy as np

import matplotlib.pyplot as plt #Package for visualization

import re #importing package for Regular expression operations

from sklearn.model_selection import train_test_split #Package for splitting the data

from sklearn.preprocessing import LabelEncoder #Package for conversion of categorical to Numerical

from keras.preprocessing.text import Tokenizer #Tokenization
from keras.preprocessing.sequence import pad_sequences #Add zeros or crop based on the length
from keras.models import Sequential #Sequential Neural Network
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D #For layers in Neural Network
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
data = pd.read_csv('Sentiment.csv') #Looading the dataset

data = data[['text','sentiment']] # Keeping only the neccessary columns

In [3]:
data['text'] = data['text'].apply(lambda x: x.lower()) #converting to lower case
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-Z0-9\s]', '', x))) #only a-z,A-Z,0-9 would be remaining in the data, else special characters are removed

In [4]:
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ') #Removing Retweets

In [5]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)

In [6]:
X = pad_sequences(X)

embed_dim = 128
lstm_out = 196

In [7]:
def createmodel():
    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(3,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
    return model
# print(model.summary())

In [8]:
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['sentiment'])
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)

In [9]:
batch_size = 32
model = createmodel() #Function call to Sequential Neural Network
model.fit(X_train, Y_train, epochs = 1, batch_size=batch_size, verbose = 2) #verbose the higher, the more messages
score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size)
print(score)
print(acc)


Epoch 1/1
 - 24s - loss: 0.8318 - accuracy: 0.6418
0.7780609636808806
0.6743119359016418


In [10]:
print(model.metrics_names)

['loss', 'accuracy']


## Save the model and use the saved model to predict on new text data (ex, “A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump”)

In [11]:
model.save('sentimentAnalysis.h5')

In [12]:
from keras.models import load_model
model= load_model('sentimentAnalysis.h5')

In [13]:
print(integer_encoded)
print(data['sentiment'])

[1 2 1 ... 2 0 2]
0         Neutral
1        Positive
2         Neutral
3        Positive
4        Positive
           ...   
13866    Negative
13867    Positive
13868    Positive
13869    Negative
13870    Positive
Name: sentiment, Length: 13871, dtype: object


In [14]:
sentence = ['A lot of good things are happening. We are respected again throughout the world, and that is a great thing.@realDonaldTrump']
sentence = tokenizer.texts_to_sequences(sentence)
sentence = pad_sequences(sentence, maxlen=28, dtype='int32', value=0)
sentiment = model.predict_classes(sentence,batch_size=1,verbose = 2)[0]
print(sentiment)
if sentiment == 0:
  print("Neutral")
elif sentiment < 0:
  print("Negative")
elif sentiment > 0:
  print("Positive")
else:
  print("Can not be determined")

0
Neutral


## Apply GridSearchCV on the source code provided in the class

In [15]:
from keras.wrappers.scikit_learn import KerasClassifier

model = KerasClassifier(build_fn=createmodel,verbose=2)
batch_size= [10, 20, 40]
epochs = [1, 2]
param_grid= {'batch_size':batch_size, 'epochs':epochs}
from sklearn.model_selection import GridSearchCV
grid  = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result= grid.fit(X_train,Y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Epoch 1/1
 - 42s - loss: 0.8346 - accuracy: 0.6404
Epoch 1/1
 - 42s - loss: 0.8259 - accuracy: 0.6477
Epoch 1/1
 - 43s - loss: 0.8278 - accuracy: 0.6466
Epoch 1/1
 - 42s - loss: 0.8312 - accuracy: 0.6467
Epoch 1/1
 - 46s - loss: 0.8254 - accuracy: 0.6403
Epoch 1/2
 - 44s - loss: 0.8341 - accuracy: 0.6472
Epoch 2/2
 - 41s - loss: 0.6907 - accuracy: 0.7084
Epoch 1/2
 - 46s - loss: 0.8320 - accuracy: 0.6462
Epoch 2/2
 - 45s - loss: 0.6910 - accuracy: 0.7117
Epoch 1/2
 - 45s - loss: 0.8267 - accuracy: 0.6423
Epoch 2/2
 - 44s - loss: 0.6844 - accuracy: 0.7117
Epoch 1/2
 - 43s - loss: 0.8357 - accuracy: 0.6421
Epoch 2/2
 - 42s - loss: 0.6894 - accuracy: 0.7067
Epoch 1/2
 - 48s - loss: 0.8255 - accuracy: 0.6468
Epoch 2/2
 - 48s - loss: 0.6764 - accuracy: 0.7158
Epoch 1/1
 - 30s - loss: 0.8379 - accuracy: 0.6427
Epoch 1/1
 - 29s - loss: 0.8304 - accuracy: 0.6450
Epoch 1/1
 - 30s - loss: 0.8338 - accuracy: 0.6403
Epoch 1/1
 - 46s - loss: 0.8346 - accuracy: 0.6389
Epoch 1/1
 - 44s - loss: 0.8347