In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf

from sklearn.feature_extraction.text import CountVectorizer
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import re

In [3]:
data = pd.read_csv('twitter_training.csv')
# Keeping only the neccessary columns
data = data[['text','sentiment']]

print(data)

                                                    text sentiment
0      I am coming to the borders and I will kill you...  Positive
1      im getting on borderlands and i will kill you ...  Positive
2      im coming on borderlands and i will murder you...  Positive
3      im getting on borderlands 2 and i will murder ...  Positive
4      im getting into borderlands and i can murder y...  Positive
...                                                  ...       ...
74676  Just realized that the Windows partition of my...  Positive
74677  Just realized that my Mac window partition is ...  Positive
74678  Just realized the windows partition of my Mac ...  Positive
74679  Just realized between the windows partition of...  Positive
74680  Just like the windows partition of my Mac is l...  Positive

[74681 rows x 2 columns]


In [4]:
data = data[data.sentiment != "Neutral"]
data = data[data.sentiment != "Irrelevant"]
print(data)
#data = data[data.sentiment != "Neutral"]
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',str(x))))
data['text'] = data['text'].apply(lambda x: x.lower())

print(data[ data['sentiment'] == 'Positive'].size)
print(data[ data['sentiment'] == 'Negative'].size)

for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')
    
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

                                                    text sentiment
0      I am coming to the borders and I will kill you...  Positive
1      im getting on borderlands and i will kill you ...  Positive
2      im coming on borderlands and i will murder you...  Positive
3      im getting on borderlands 2 and i will murder ...  Positive
4      im getting into borderlands and i can murder y...  Positive
...                                                  ...       ...
74676  Just realized that the Windows partition of my...  Positive
74677  Just realized that my Mac window partition is ...  Positive
74678  Just realized the windows partition of my Mac ...  Positive
74679  Just realized between the windows partition of...  Positive
74680  Just like the windows partition of my Mac is l...  Positive

[43373 rows x 2 columns]
41662
45084


  data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',str(x))))
  row[0] = row[0].replace('rt',' ')
  row[0] = row[0].replace('rt',' ')


In [5]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

None


In [6]:

Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(34698, 166) (34698, 2)
(8675, 166) (8675, 2)


In [7]:
print((Y_train))

[[ True False]
 [False  True]
 [ True False]
 ...
 [ True False]
 [False  True]
 [False  True]]


In [8]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 10, batch_size=batch_size, verbose = 2)

Epoch 1/10
1085/1085 - 151s - 139ms/step - accuracy: 0.7956 - loss: 0.4378
Epoch 2/10
1085/1085 - 164s - 151ms/step - accuracy: 0.8562 - loss: 0.3308
Epoch 3/10
1085/1085 - 217s - 200ms/step - accuracy: 0.8738 - loss: 0.2845
Epoch 4/10
1085/1085 - 245s - 226ms/step - accuracy: 0.8902 - loss: 0.2517
Epoch 5/10
1085/1085 - 324s - 299ms/step - accuracy: 0.8995 - loss: 0.2255
Epoch 6/10
1085/1085 - 359s - 331ms/step - accuracy: 0.9105 - loss: 0.2048
Epoch 7/10
1085/1085 - 363s - 335ms/step - accuracy: 0.9166 - loss: 0.1841
Epoch 8/10
1085/1085 - 339s - 313ms/step - accuracy: 0.9243 - loss: 0.1687
Epoch 9/10
1085/1085 - 380s - 351ms/step - accuracy: 0.9285 - loss: 0.1586
Epoch 10/10
1085/1085 - 389s - 358ms/step - accuracy: 0.9338 - loss: 0.1433


<keras.src.callbacks.history.History at 0x177f17bf0>

In [9]:
validation_size = 1500

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

225/225 - 29s - 130ms/step - accuracy: 0.9049 - loss: 0.2298
score: 0.23
acc: 0.90


In [10]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for x in range(len(X_validate)):
    
    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
   
    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if np.argmax(Y_validate[x]) == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1



print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

1/1 - 0s - 225ms/step
1/1 - 0s - 54ms/step


1/1 - 0s - 46ms/step
1/1 - 0s - 48ms/step
1/1 - 0s - 52ms/step
1/1 - 0s - 53ms/step
1/1 - 0s - 48ms/step
1/1 - 0s - 45ms/step
1/1 - 0s - 50ms/step
1/1 - 0s - 55ms/step
1/1 - 0s - 51ms/step
1/1 - 0s - 42ms/step
1/1 - 0s - 48ms/step
1/1 - 0s - 56ms/step
1/1 - 0s - 55ms/step
1/1 - 0s - 46ms/step
1/1 - 0s - 41ms/step
1/1 - 0s - 48ms/step
1/1 - 0s - 38ms/step
1/1 - 0s - 43ms/step
1/1 - 0s - 51ms/step
1/1 - 0s - 54ms/step
1/1 - 0s - 56ms/step
1/1 - 0s - 43ms/step
1/1 - 0s - 42ms/step
1/1 - 0s - 34ms/step
1/1 - 0s - 44ms/step
1/1 - 0s - 47ms/step
1/1 - 0s - 46ms/step
1/1 - 0s - 41ms/step
1/1 - 0s - 37ms/step
1/1 - 0s - 36ms/step
1/1 - 0s - 40ms/step
1/1 - 0s - 48ms/step
1/1 - 0s - 43ms/step
1/1 - 0s - 45ms/step
1/1 - 0s - 45ms/step
1/1 - 0s - 58ms/step
1/1 - 0s - 53ms/step
1/1 - 0s - 41ms/step
1/1 - 0s - 54ms/step
1/1 - 0s - 47ms/step
1/1 - 0s - 47ms/step
1/1 - 0s - 41ms/step
1/1 - 0s - 45ms/step
1/1 - 0s - 47ms/step
1/1 - 0s - 44ms/step
1/1 - 0s - 45ms/step
1/1 - 0s - 55ms/step
1/1 - 0s - 47

In [19]:
model.save('sentModel.h5')



In [None]:
twt = ['ew']
#vectorizing the tweet by the pre-fitted tokenizer instance
twt = tokenizer.texts_to_sequences(twt)
#padding the tweet to have exactly the same shape as `embedding_2` input
twt = pad_sequences(twt, maxlen=28, dtype='int32', value=0)
sentiment = model.predict(twt,batch_size=1,verbose = 2)[0]
if(np.argmax(sentiment) == 0):
    print("negative")
elif (np.argmax(sentiment) == 1):
    print("positive")

1/1 - 0s - 11ms/step
negative
