In [31]:
import numpy as np # linear algebra
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical


In [32]:
import re # string searching and manipulating

In [33]:
data = pd.read_csv('./Datasets/Sentiment.csv')

In [34]:
data.columns

Index(['id', 'candidate', 'candidate_confidence', 'relevant_yn',
       'relevant_yn_confidence', 'sentiment', 'sentiment_confidence',
       'subject_matter', 'subject_matter_confidence', 'candidate_gold', 'name',
       'relevant_yn_gold', 'retweet_count', 'sentiment_gold',
       'subject_matter_gold', 'text', 'tweet_coord', 'tweet_created',
       'tweet_id', 'tweet_location', 'user_timezone'],
      dtype='object')

In [35]:
data = data[['text', 'sentiment']]

In [36]:
data = data[data.sentiment != "Neutral"] # delete the neutral sentiments
data['text'] = data['text'].apply(lambda x: x.lower()) # all text to lower case

In [37]:
data['text']

1        rt @scottwalker: didn't catch the full #gopdeb...
3        rt @robgeorge: that carly fiorina is trending ...
4        rt @danscavino: #gopdebate w/ @realdonaldtrump...
5        rt @gregabbott_tx: @tedcruz: "on my first day ...
6        rt @warriorwoman91: i liked her and was happy ...
                               ...                        
13866    rt @cappy_yarbrough: love to see men who will ...
13867    rt @georgehenryw: who thought huckabee exceede...
13868    rt @lrihendry: #tedcruz as president, i will a...
13869    rt @jrehling: #gopdebate donald trump says tha...
13870    rt @lrihendry: #tedcruz headed into the presid...
Name: text, Length: 10729, dtype: object

In [38]:
print(data[ data['sentiment'] == 'Positive'].size)
print(data[ data['sentiment'] == 'Negative'].size)

4472
16986


In [39]:
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ')
    
tokenizer = Tokenizer(num_words=2000, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

In [46]:
embed_dim = 128
lstm_out = 196

model = Sequential()

model.add(Embedding(2000, embed_dim, input_length = X.shape[1])) #used to create word vectors for incoming words

model.add(SpatialDropout1D(0.4))

model.add(LSTM(196, dropout= 0.2))

model.add(Dense(2, activation='softmax')) # output layer

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [47]:
data['sentiment']

1        Positive
3        Positive
4        Positive
5        Positive
6        Negative
           ...   
13866    Negative
13867    Positive
13868    Positive
13869    Negative
13870    Positive
Name: sentiment, Length: 10729, dtype: object

In [48]:
Y

array([[0, 1],
       [0, 1],
       [0, 1],
       ...,
       [0, 1],
       [1, 0],
       [0, 1]], dtype=uint8)

In [49]:
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)


(7188, 29) (7188, 2)
(3541, 29) (3541, 2)


In [50]:
batch_size = 32
model.fit(X_train, Y_train, epochs=7, batch_size=batch_size, verbose=2)

Epoch 1/7
 - 7s - loss: 0.4317 - acc: 0.8168
Epoch 2/7
 - 5s - loss: 0.3210 - acc: 0.8646
Epoch 3/7
 - 5s - loss: 0.2806 - acc: 0.8819
Epoch 4/7
 - 5s - loss: 0.2525 - acc: 0.8939
Epoch 5/7
 - 5s - loss: 0.2257 - acc: 0.9068
Epoch 6/7
 - 5s - loss: 0.2035 - acc: 0.9167
Epoch 7/7
 - 5s - loss: 0.1827 - acc: 0.9226


<keras.callbacks.History at 0x1b8a2dab9b0>

In [51]:
validation_size = 1500

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose=2, batch_size= batch_size)
print("Score: %.2f" % (score))
print("acc: %.2f" % (acc))

Score: 0.48
acc: 0.85


In [55]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0,0,0,0
for x in range(len(X_validate)):
    
    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]), batch_size=1, verbose=2)[0]
    
    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            neg_correct += 1
        else:
            pos_correct += 1
            
    if np.argmax(Y_validate[x]) == 0:
        neg_cnt += 1 #actual negative cnt
    else:
        pos_cnt += 1 #actual pos cnt
        
        
print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

pos_acc 62.7831715210356 %
neg_acc 92.19143576826197 %
