### Sentiment with LSTM

In [1]:
# Dependencies
import pandas as pd
import os

In [2]:
# Read preprocessed data into dataframe
file = os.path.join('..', 'Output', 'tweets1.csv' )
tweet_df = pd.read_csv(file)
print(tweet_df.count())

# Drop Null values from data frame
tweet_df = tweet_df.dropna(subset=['Tokenized'])
print(tweet_df.count())

tweet_df.head()

ItemID       99998
Sentiment    99998
Tokenized    97036
dtype: int64
ItemID       97036
Sentiment    97036
Tokenized    97036
dtype: int64


Unnamed: 0,ItemID,Sentiment,Tokenized
0,1,0,sad friend
1,2,0,miss new moon trailer
2,3,1,already
3,4,0,cry dentist since get crown put
4,5,0,think mi cheating


In [3]:
# Add sequence vectors 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_fatures = 2000

print(f'Sentiment 1 count: ' + str(tweet_df[ tweet_df['Sentiment'] == 1].size))
print(f'Sentiment 0 count: ' + str(tweet_df[ tweet_df['Sentiment'] == 0].size))

docs = tweet_df.Tokenized
tokenizer = Tokenizer(num_words=max_fatures, split=" ")
tokenizer.fit_on_texts(docs)

#making sequences:
X = tokenizer.texts_to_sequences(docs.values)
X = pad_sequences(X)
print(X.shape)
tweet_df.head()

Using TensorFlow backend.


Sentiment 1 count: 163857
Sentiment 0 count: 127251
(97036, 35)


Unnamed: 0,ItemID,Sentiment,Tokenized
0,1,0,sad friend
1,2,0,miss new moon trailer
2,3,1,already
3,4,0,cry dentist since get crown put
4,5,0,think mi cheating


In [4]:
# Build model
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 35, 128)           256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 35, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [5]:
# Split data into test, train sets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

y = tweet_df.Sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

# Scaling
#X_minmax = MinMaxScaler().fit(X_train)
#X_train_minmax = X_minmax.transform(X_train)
#X_test_minmax = X_minmax.transform(X_test)

#print(X_train_minmax)

print("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(X_train),
(len(X_train[y_train == 0]) / (len(X_train)*1.))*100,
(len(X_train[y_train == 1]) / (len(X_train)*1.))*100))
print("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(X_test),
(len(X_test[y_test == 0]) / (len(X_test)*1.))*100,
(len(X_test[y_test == 1]) / (len(X_test)*1.))*100))

print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

Train set has total 72777 entries with 43.73% negative, 56.27% positive
Test set has total 24259 entries with 43.67% negative, 56.33% positive
(72777, 35) (72777,)
(24259, 35) (24259,)


In [6]:
# Train model
batch_size = 32
# model.fit(X_train, y_train, epochs = 5, batch_size=batch_size, verbose = 2)

# Save model
# model_file = os.path.join('..', 'Output', 'tweets1_model.h5')
# model.save_weights(model_file)

# Load trained model from file
model_file = os.path.join('..', 'Output', 'tweets1_model.h5')
model.load_weights(model_file)

In [7]:
# Validation
validation_size = 5000

aX_validate = X_test[-validation_size:]
aY_validate = y_test[-validation_size:]
aX_test = X_test[:-validation_size]
aY_test = y_test[:-validation_size]
score,acc = model.evaluate(aX_test, aY_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.53
acc: 0.73


In [8]:
# Prediction for new tweets
import numpy as np

xw = [
    'president trump said president right thing quid pro quo charge cannot convict sitting president basis conflicting ambiguous evidence destabilize government thank ken',
    'see list leader organ young people help end sexual violence work embody foundation passing baton next',
    'mike love wife love brad next',
    'kicking big way hope see follow new wellness get latest update',
    'location getting new tag renewal kiosk state bridge road another store formerly murder location ponce de ave',
    'hate athlete appear tear live television',
    'need even say well go anyways cornel tonight'
]

xtokenizer = Tokenizer(num_words=max_fatures, split=" ")
xtokenizer.fit_on_texts(xw)

# Make sequences:
xX = xtokenizer.texts_to_sequences(xw)
xX = pad_sequences(xX, maxlen=35)
print(xX.shape)

for y in range(len(xw)):
    result = model.predict(xX[y].reshape(1,xX[y].shape[0]),batch_size=1,verbose = 2)[0]
    print(result, np.argmax(result))


(7, 35)
[0.831918   0.16808194] 0
[0.38022038 0.6197796 ] 1
[0.55233836 0.44766167] 0
[0.48144904 0.51855093] 1
[0.32893258 0.6710674 ] 1
[0.26495877 0.7350412 ] 1
[0.35548532 0.6445146 ] 1


In [9]:
# Results summary
# https://www.kaggle.com/ngyptr/lstm-sentiment-analysis-keras
import numpy as np

y_prediction = []

fck = list(y_test)

pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for i in range(len(X_test)):
        
    result = model.predict(X_test[i].reshape(1, X_test[i].shape[0]),batch_size=1,verbose = 2)[0]

    y_prediction.append(np.argmax(result))
    
    if np.argmax(result) == fck[i]:
        if fck[i] == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if fck[i] == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1


print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

pos_acc 77.95258305283184 %
neg_acc 66.80826961200793 %


In [10]:
# Results
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test, y_prediction))
print(classification_report(y_test, y_prediction))
print(accuracy_score(y_test, y_prediction))

[[ 7077  3516]
 [ 3013 10653]]
              precision    recall  f1-score   support

           0       0.70      0.67      0.68     10593
           1       0.75      0.78      0.77     13666

   micro avg       0.73      0.73      0.73     24259
   macro avg       0.73      0.72      0.72     24259
weighted avg       0.73      0.73      0.73     24259

0.7308627725792489


In [11]:
# Hand entered values
print(X_test[24256])


[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0   26  516  870
   15  752   15  916  279  188 1500]


In [12]:
# Results = model.predict(X_test[i].reshape(1,X_test[i].shape[0]),batch_size=1,verbose = 2)[0]
print(X_test.shape)

(24259, 35)


In [13]:
result = model.predict(X_test[i].reshape(1,X_test[i].shape[0]),batch_size=1,verbose = 2)[0]
print(result)

[0.06407241 0.9359276 ]
