In [55]:
import numpy as np
import pandas as pd

In [56]:
from sklearn.model_selection import train_test_split
import re

In [57]:
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [58]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

In [59]:
# Importing dataset
data = pd.read_csv("sentiment.csv")
data.head()

Unnamed: 0,unit_id,golden,unit_state,trusted_judgments,last_judgment_at,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,681448150,False,finalized,3,2015-02-25T05:24:00,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24T11:35:00,570306000000000000,,Eastern Time (US & Canada)
1,681448153,False,finalized,3,2015-02-25T01:53:00,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24T11:15:00,570301000000000000,,Pacific Time (US & Canada)
2,681448156,False,finalized,3,2015-02-25T10:01:00,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24T11:15:00,570301000000000000,Lets Play,Central Time (US & Canada)
3,681448158,False,finalized,3,2015-02-25T03:05:00,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24T11:15:00,570301000000000000,,Pacific Time (US & Canada)
4,681448159,False,finalized,3,2015-02-25T05:50:00,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24T11:14:00,570301000000000000,,Pacific Time (US & Canada)


In [60]:
# keepling only necessary columns
sentiment_data = data[["text", "airline_sentiment"]]
sentiment_data.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [61]:
sentiment_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   text               14640 non-null  object
 1   airline_sentiment  14640 non-null  object
dtypes: object(2)
memory usage: 228.9+ KB


In [62]:
sentiment_data["airline_sentiment"].value_counts()

airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64

The 'Neutral' feelings are now being dropped, as my intention was to simply distinguish between positive and negative tweets. 

In [63]:
sentiment_data.isnull().sum()

text                 0
airline_sentiment    0
dtype: int64

In [64]:
# Filter the DataFrame to include only rows with 'positive' or 'negative' sentiment
data = sentiment_data[sentiment_data['airline_sentiment'].isin(['positive', 'negative'])]

In [65]:
data.head()

Unnamed: 0,text,airline_sentiment
1,@VirginAmerica plus you've added commercials t...,positive
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative
5,@VirginAmerica seriously would pay $30 a fligh...,negative
6,"@VirginAmerica yes, nearly every time I fly VX...",positive


In [66]:
data["airline_sentiment"].value_counts()

airline_sentiment
negative    9178
positive    2363
Name: count, dtype: int64

In [67]:
data.duplicated().sum()

156

In [68]:
# Removing duplicate values
data = data.drop_duplicates()

In [69]:
data.duplicated().sum()

0

In [70]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11385 entries, 1 to 14638
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   text               11385 non-null  object
 1   airline_sentiment  11385 non-null  object
dtypes: object(2)
memory usage: 266.8+ KB


After that, I set the maximum feature count at 2000 and vectorize the text into sequences using Tokenizer so that the network can handle it as input.

In [71]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

In [72]:
X.shape

(11385, 50)

In [73]:
# spliting train and test data
Y = pd.get_dummies(data['airline_sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.30, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(7969, 50) (7969, 2)
(3416, 50) (3416, 2)


Implement the LSTM Network. It should be noted that the variables embed_dim, lstm_out, batch_size, and droupout_x are hyperparameters; to obtain desirable results, one must experiment with their settings, which are somewhat intuitive. Also, note that activation function is softmax. The rationale is that softmax is the ideal activation technique for the network, which uses categorical crossentropy.

In [74]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

In [75]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 128)           256000    
                                                                 
 spatial_dropout1d_1 (Spati  (None, 50, 128)           0         
 alDropout1D)                                                    
                                                                 
 lstm_1 (LSTM)               (None, 196)               254800    
                                                                 
 dense_1 (Dense)             (None, 2)                 394       
                                                                 
Total params: 511194 (1.95 MB)
Trainable params: 511194 (1.95 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [76]:
# training the model

In [77]:
model.fit(X_train, Y_train, epochs = 7, batch_size=32, verbose = 2)

Epoch 1/7
250/250 - 37s - loss: 0.3263 - accuracy: 0.8684 - 37s/epoch - 147ms/step
Epoch 2/7
250/250 - 37s - loss: 0.1919 - accuracy: 0.9282 - 37s/epoch - 146ms/step
Epoch 3/7
250/250 - 37s - loss: 0.1421 - accuracy: 0.9464 - 37s/epoch - 147ms/step
Epoch 4/7
250/250 - 37s - loss: 0.1261 - accuracy: 0.9538 - 37s/epoch - 147ms/step
Epoch 5/7
250/250 - 37s - loss: 0.1014 - accuracy: 0.9649 - 37s/epoch - 147ms/step
Epoch 6/7
250/250 - 36s - loss: 0.0816 - accuracy: 0.9698 - 36s/epoch - 146ms/step
Epoch 7/7
250/250 - 37s - loss: 0.0696 - accuracy: 0.9752 - 37s/epoch - 148ms/step


<keras.src.callbacks.History at 0x2e93c7d90>

To evaluate the performance of the LSTM classifier, we have various metrics including confusion matrix, recall, precision, and F-score.

In [84]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [86]:
y_pred = model.predict(X_test)



In [87]:
#converting prediction to classses
y_pred_classes = np.argmax(y_pred, axis = 1)

In [88]:
# convert true labels to classes
Y_test_classes = np.argmax(Y_test, axis = 1)

In [91]:
# Confusion matrix
conf_matrix = confusion_matrix(Y_test_classes, y_pred_classes)
print("Confusion Matrix:")
print(conf_matrix)
# Accuracy
accuracy = accuracy_score(Y_test_classes, y_pred_classes)
print("Accuracy:", accuracy)
# Precision
precision = precision_score(Y_test_classes, y_pred_classes)
print("Precision:", precision)
# Recall
recall = recall_score(Y_test_classes, y_pred_classes)
print("Recall:", recall)
# F1-score
f1 = f1_score(Y_test_classes, y_pred_classes)
print("F1 Score:", f1)

Confusion Matrix:
[[2557  160]
 [ 151  548]]
Accuracy: 0.9089578454332553
Precision: 0.7740112994350282
Recall: 0.7839771101573677
F1 Score: 0.7789623312011372
