In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import keras.utils
import numpy as np

In [2]:
Tweets = pd.read_csv("Tweets.csv")
Tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [10]:
Tweets.tail()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
14631,569588464896876545,negative,1.0,Bad Flight,1.0,American,,MDDavis7,,0,@AmericanAir thx for nothing on getting us out...,,2015-02-22 12:04:07 -0800,US,Eastern Time (US & Canada)
14633,569587705937600512,negative,1.0,Cancelled Flight,1.0,American,,RussellsWriting,,0,@AmericanAir my flight was Cancelled Flightled...,,2015-02-22 12:01:06 -0800,Los Angeles,Arizona
14636,569587371693355008,negative,1.0,Customer Service Issue,1.0,American,,itsropes,,0,@AmericanAir leaving over 20 minutes Late Flig...,,2015-02-22 11:59:46 -0800,Texas,
14637,569587242672398336,neutral,1.0,,,American,,sanyabun,,0,@AmericanAir Please bring American Airlines to...,,2015-02-22 11:59:15 -0800,"Nigeria,lagos",
14638,569587188687634433,negative,1.0,Customer Service Issue,0.6659,American,,SraJackson,,0,"@AmericanAir you have my money, you change my ...",,2015-02-22 11:59:02 -0800,New Jersey,Eastern Time (US & Canada)


In [3]:
Tweets.groupby(['airline_sentiment']).size()

airline_sentiment
negative    9178
neutral     3099
positive    2363
dtype: int64

In [4]:
Tweets = Tweets[Tweets['airline_sentiment_confidence']>0.8]

In [5]:
Tweets.groupby(['airline_sentiment']).size()

airline_sentiment
negative    7392
neutral     1550
positive    1517
dtype: int64

In [6]:
token = Tokenizer(num_words=100)
token.fit_on_texts(Tweets['text'].values)

In [9]:
X = token.texts_to_sequences(Tweets['text'].values)
X = pad_sequences(X, padding='post', maxlen=100)
X

array([[97, 62,  0, ...,  0,  0,  0],
       [97, 99,  1, ...,  0,  0,  0],
       [97,  9, 99, ...,  0,  0,  0],
       ...,
       [13, 98, 93, ...,  0,  0,  0],
       [13, 89,  1, ...,  0,  0,  0],
       [13,  6, 23, ...,  0,  0,  0]])

In [11]:

labelEnc = LabelEncoder()
y = labelEnc.fit_transform(Tweets['airline_sentiment'])
print(y)

[1 0 0 ... 0 1 0]


In [12]:
y = keras.utils.to_categorical(y)
print(y)

[[0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_test

array([[18, 29, 28, ...,  0,  0,  0],
       [ 8, 80,  2, ...,  0,  0,  0],
       [16, 46, 56, ...,  0,  0,  0],
       ...,
       [ 8, 28, 11, ...,  0,  0,  0],
       [97, 46,  5, ...,  0,  0,  0],
       [13, 22,  0, ...,  0,  0,  0]])

In [16]:
model = Sequential()

model.add(Embedding(input_dim=len(token.word_index), output_dim=128, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(units=196, dropout=0.2, recurrent_dropout=0, activation='tanh', recurrent_activation='sigmoid', unroll=False, use_bias=True))
model.add(Dense(units=3, activation='softmax'))

In [17]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

None


In [18]:
model.fit(X_train, y_train, epochs=15, batch_size=30, verbose=True, validation_data=(X_test, y_test))

Epoch 1/15
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 76ms/step - accuracy: 0.6942 - loss: 0.8281 - val_accuracy: 0.7046 - val_loss: 0.8152
Epoch 2/15
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 76ms/step - accuracy: 0.6990 - loss: 0.8291 - val_accuracy: 0.7046 - val_loss: 0.8149
Epoch 3/15
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 73ms/step - accuracy: 0.7025 - loss: 0.8202 - val_accuracy: 0.7046 - val_loss: 0.8124
Epoch 4/15
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 70ms/step - accuracy: 0.7130 - loss: 0.8003 - val_accuracy: 0.7046 - val_loss: 0.8126
Epoch 5/15
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 69ms/step - accuracy: 0.7103 - loss: 0.8051 - val_accuracy: 0.7046 - val_loss: 0.8128
Epoch 6/15
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 66ms/step - accuracy: 0.6986 - loss: 0.8239 - val_accuracy: 0.7046 - val_loss: 0.8150
Epoch 7/15
[1m2

<keras.src.callbacks.history.History at 0x21b3f34b710>

In [23]:
loss, accuracy = model.evaluate(X_test, y_test)
print("Loss:", loss)
print("Accuracy:", accuracy)

[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.6931 - loss: 0.8344
Loss: 0.814531147480011
Accuracy: 0.7045888900756836


In [24]:
prev = model.predict(X_test)

[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step


In [25]:
print(prev)

[[0.7375754  0.13401946 0.12840514]
 [0.7375754  0.13401946 0.12840514]
 [0.7375754  0.13401946 0.12840514]
 ...
 [0.7375754  0.13401946 0.12840512]
 [0.7375754  0.13401946 0.12840512]
 [0.7375754  0.13401946 0.12840512]]
