### Twitter Feeling Analysis by LSTM

Based on twitter comments, I decided to determine the sentiments.
The accuracy in the the test set is 65%. 

In [27]:
import numpy as np 
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

In [28]:
data = pd.read_csv('Sentiment.csv')
data = data[['text','sentiment']]

In [29]:
data.head()

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive


In [30]:
# lowercase all the strings
data['text'] = data['text'].apply(lambda x: x.lower())

# remove non-ascci characters
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

print(data[ data['sentiment'] == 'Positive'].size)
print(data[ data['sentiment'] == 'Negative'].size)
print(data[ data['sentiment'] == 'Neutral'].size)

4472
16986
6284


In [31]:
# replace 'rt' with ' '
for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')

In [32]:
data.head()

Unnamed: 0,text,sentiment
0,nancyleegrahn how did everyone feel about th...,Neutral
1,scottwalker didnt catch the full gopdebate l...,Positive
2,tjmshow no mention of tamir rice and the gop...,Neutral
3,robgeorge that carly fiorina is trending ho...,Positive
4,danscavino gopdebate w realdonaldtrump deliv...,Positive


In [33]:
# set maximum words in dictionary
max_fatures = 5000
tokenizer = Tokenizer(nb_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)

In [34]:
# add zero to match lengths
X = pad_sequences(X)

In [35]:
model = Sequential()
model.add(Embedding(max_fatures, 150, input_length = X.shape[1], dropout=0.2))
model.add(LSTM(80, dropout_U=0.1, dropout_W=0.1))
model.add(Dense(56,activation = 'relu'))
model.add(Dropout(0.15))
model.add(Dense(36,activation = 'relu'))
model.add(Dropout(0.15))
model.add(Dense(26,activation = 'relu'))
model.add(Dropout(0.15))
model.add(Dense(16,activation = 'relu'))
model.add(Dropout(0.15))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

  
  This is separate from the ipykernel package so we can avoid doing imports until


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 29, 150)           750000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 80)                73920     
_________________________________________________________________
dense_9 (Dense)              (None, 56)                4536      
_________________________________________________________________
dropout_7 (Dropout)          (None, 56)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 36)                2052      
_________________________________________________________________
dropout_8 (Dropout)          (None, 36)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 26)                962       
__________

In [36]:
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

((9293, 29), (9293, 3))
((4578, 29), (4578, 3))


In [37]:
from keras.callbacks import EarlyStopping

In [38]:
batch_size = 32
early_stopping_monitor = EarlyStopping(patience=5)
model.fit(X_train, Y_train, nb_epoch = 20, validation_split = 0.2, batch_size=batch_size, verbose = 1, callbacks=[early_stopping_monitor])

Train on 7434 samples, validate on 1859 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


<keras.callbacks.History at 0x7f0ba96d8bd0>

In [40]:
_ ,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("accuracy: " + str(acc))


accuracy: 0.643294014854
