In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer




In [2]:
tf.config.run_functions_eagerly(True)


In [3]:
# Reading the datasets
df_train = pd.read_csv('train_Tweet.csv')
df_test = pd.read_csv('test_Tweet.csv')

display(df_train)
display(df_test)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive
...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative
3530,416863ce47,All alone in this old house again. Thanks for...,positive
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive


In [4]:
len(df_train)


27481

In [5]:
# Check properties information
df_train.describe().T


Unnamed: 0,count,unique,top,freq
textID,27481,27481,cb774db0d1,1
text,27480,27480,"I`d have responded, if I were going",1
selected_text,27480,22463,good,199
sentiment,27481,3,neutral,11118


In [6]:
df_train.isna().sum()


textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [7]:
df_test.isna().sum()


textID       0
text         0
sentiment    0
dtype: int64

In [8]:
df_train.dropna(inplace=True)


In [9]:
# Print the first row in the training dataset
print("First Row in the Training Dataset:")
print(df_train.iloc[0])

print('-' * 30)

# Count the number of rows where 'text' is equal to 'selected_text'
matching_rows = len(df_train[df_train['text'] == df_train['selected_text']])
print(f"Number of Matching Rows: {matching_rows}")

First Row in the Training Dataset:
textID                                     cb774db0d1
text              I`d have responded, if I were going
selected_text     I`d have responded, if I were going
sentiment                                     neutral
Name: 0, dtype: object
------------------------------
Number of Matching Rows: 7283


In [10]:
# Check for empty strings - train set
blanks_train = []

for i, tid, t, tst, snt in df_train.itertuples():
    if type(t) == str and t.isspace():
        blanks_train.append(i)
        
if len(blanks_train) > 0:
    print(len(blanks_train))
    df_train.drop(blanks_train, inplace=True)

In [11]:
# Check for empty strings - test set
blanks_test = []

for i, tid, t, snt in df_test.itertuples():
    if type(t) == str and t.isspace():
        blanks_test.append(i)
        
if len(blanks_test) > 0:
    print(len(blanks_test))
    df_train.drop(blanks_test, inplace=True)

In [12]:
df_train['sentiment'].value_counts()


neutral     11117
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [13]:
# Performing T-Test
from scipy import stats

df_train['text_length'] = df_train['text'].apply(lambda t: len(str(t)))

positive_text_lengths = df_train[df_train['sentiment'] == 'positive']['text_length']
negative_text_lengths = df_train[df_train['sentiment'] == 'negative']['text_length']
neutral_text_lengths = df_train[df_train['sentiment'] == 'neutral']['text_length']

# Perform ANOVA test
f_statistic, p_value = stats.f_oneway(positive_text_lengths, negative_text_lengths, neutral_text_lengths)

# Print the results
print("ANOVA Test Results:")
print(f"F-statistic: {f_statistic}")
print(f"P-value: {p_value}")

# Interpret the results
alpha = 0.05  # Set your significance level
if p_value < alpha:
    print("The means of at least two groups are significantly different.")
else:
    print("There is no significant difference in the means of the groups.")

ANOVA Test Results:
F-statistic: 72.2127709711816
P-value: 5.254438748898152e-32
The means of at least two groups are significantly different.


In [14]:
# Preparing the data
max_len = 32
num_words = 500

In [15]:
# Tokenizing
tok = Tokenizer(num_words=num_words)
tok.fit_on_texts(df_train['text'])


In [16]:
# Defining X and y
X_train = df_train['text']
y_train = df_train['sentiment']

X_test = df_test['text']
y_test = df_test['sentiment']


In [17]:
X_train_mat = tok.texts_to_sequences(X_train)
X_test_mat = tok.texts_to_sequences(X_test)

In [18]:
# Pad sequences to the same length
X_train_padded = pad_sequences(X_train_mat, maxlen=max_len)
X_test_padded = pad_sequences(X_test_mat, maxlen=max_len)

In [19]:
y_train = pd.get_dummies(y_train, drop_first=True, dtype=int).to_numpy()
y_test = pd.get_dummies(y_test, drop_first=True, dtype=int).to_numpy()

In [28]:
y_train

array([[1, 0],
       [0, 0],
       [0, 0],
       ...,
       [0, 1],
       [0, 1],
       [1, 0]])

In [29]:
y_train.shape

(27480, 2)

In [30]:
y_test.shape

(3534, 2)

In [31]:
y_test

array([[1, 0],
       [0, 1],
       [0, 0],
       ...,
       [0, 0],
       [0, 1],
       [0, 1]])

In [20]:
# Create NN Architectures

# RNN
def create_rnn_model(units, dropout_rate):
    model = Sequential()
    model.add(Embedding(num_words, 128, input_length=max_len))
    model.add(SimpleRNN(units))
    model.add(Dropout(dropout_rate))
    model.add(Dense(2, activation='sigmoid'))
    return model

# LSTM
def create_lstm_model(units, dropout_rate):
    model = Sequential()
    model.add(Embedding(num_words, 32, input_length=max_len))
    model.add(LSTM(units))
    model.add(Dropout(dropout_rate))
    model.add(Dense(2, activation='sigmoid'))
    return model

# GRU
def create_gru_model(units, dropout_rate):
    model = Sequential()
    model.add(Embedding(num_words, 32, input_length=max_len))
    model.add(GRU(units))
    model.add(Dropout(dropout_rate))
    model.add(Dense(2, activation='sigmoid'))
    return model

In [21]:
# Define hyperparameters
units = 128
dropout_rate = 0.3

In [22]:
# Create and compile the models

# RNN
rnn_model = create_rnn_model(units, dropout_rate)
rnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# LSTM
lstm_model = create_lstm_model(units, dropout_rate)
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# GRU
gru_model = create_gru_model(units, dropout_rate)
gru_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])





In [23]:
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)


In [24]:
# Train the models

# RNN
rnn_history = rnn_model.fit(X_train_padded, y_train, epochs=20, batch_size=128, 
                            validation_split=0.2, callbacks=[early_stopping])

# LSTM
lstm_history = lstm_model.fit(X_train_padded, y_train, epochs=20, batch_size=128, 
                              validation_split=0.2, callbacks=[early_stopping])

# GRU
gru_history = gru_model.fit(X_train_padded, y_train, epochs=20, batch_size=128, 
                            validation_split=0.2, callbacks=[early_stopping])

Epoch 1/20





Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


In [25]:
rnn_val_acc = max(rnn_history.history['val_accuracy'])
lstm_val_acc = max(lstm_history.history['val_accuracy'])
gru_val_acc = max(gru_history.history['val_accuracy'])
max_val_acc = max(rnn_val_acc, lstm_val_acc, gru_val_acc)


print("RNN Validation Accuracy:", rnn_val_acc)
print("LSTM Validation Accuracy:", lstm_val_acc)
print("GRU Validation Accuracy:", gru_val_acc)
print('-' * 50)
print("Max Validation Accuracy:", max_val_acc)

RNN Validation Accuracy: 0.7012372612953186
LSTM Validation Accuracy: 0.7012372612953186
GRU Validation Accuracy: 0.7305312752723694
--------------------------------------------------
Max Validation Accuracy: 0.7305312752723694


In [26]:
# Define a function to save the models
from datetime import datetime

def save_model(model, prefix=''):
    current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"{prefix}model_{current_datetime}.h5"

    model.save(f'{filename}')
    print(f"Model saved to {filename}")

In [27]:
# Saving the models
save_model(rnn_model, prefix='rnn')
save_model(lstm_model, prefix='lstm')
save_model(gru_model, prefix='gru')

Model saved to rnnmodel_2024-02-08_21-56-29.h5
Model saved to lstmmodel_2024-02-08_21-56-29.h5
Model saved to grumodel_2024-02-08_21-56-29.h5


  saving_api.save_model(
