## End to end Deep Learning Project Using Simple RNN

In [13]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense,Dropout, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
import pickle
from sklearn.preprocessing import LabelEncoder

In [2]:
def load_stopwords(file_path):
    with open(file_path, 'r') as file:
        stopwords = file.read().splitlines()
    return set(stopwords)
stopwords = load_stopwords('english_stopwords.txt')


In [15]:
df = pd.read_csv("dataset.csv")
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])

In [16]:
label_encoder = LabelEncoder()

df['labels'] = label_encoder.fit_transform(df['labels'])

In [3]:
def preprocess_text(text):
    # Load stopwords
    stop_words = set(stopwords.words('english'))

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]  
    return ' '.join(tokens)

In [18]:
df['tweets'] = df['tweets'].apply(preprocess_text)

In [19]:
max_length = df['tweets'].apply(lambda x: len(x.split())).max()
print("Maximum tweet length:", max_length)

Maximum tweet length: 46


In [20]:
vocab_size = 10000
dimensions = 50
max_length = 46

In [21]:
X_train, X_test, y_train, y_test = train_test_split(df['tweets'], df['labels'], test_size=0.2, random_state=42)

In [22]:
# Initialize the Tokenizer
tokenizer = Tokenizer(num_words=vocab_size)

# Convert sentences to sequences of integers
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences( X_train)
X_test_sequences = tokenizer.texts_to_sequences( X_test)

In [23]:
with open('tokenizer.pkl','wb') as file:
    pickle.dump(tokenizer,file)

In [24]:
#padding
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='pre')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='pre')

In [25]:
num_classes = len(label_encoder.classes_)
y_train_encoded = to_categorical(y_train, num_classes)
y_test_encoded = to_categorical(y_test, num_classes)

In [26]:
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

In [27]:
def create_embedding_matrix(embeddings_index, tokenizer):
    embedding_matrix = np.zeros((vocab_size, dimensions))
    for word, i in tokenizer.word_index.items():
        if i < vocab_size:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [28]:
glove_file_path = 'glove.6B.50d.txt'
embeddings_index = load_glove_embeddings(glove_file_path)
embedding_matrix = create_embedding_matrix(embeddings_index, tokenizer)

In [29]:
model = Sequential()
model.add(Input(shape=(max_length,)))
model.add(Embedding(vocab_size, 50, input_length=max_length))
model.add(GRU(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(GRU(100))
model.add(Dense(3, activation="softmax"))
model.compile(loss="categorical_crossentropy",optimizer='adam',metrics=['accuracy'])
model.summary()



In [30]:
## Create an instance of EarlyStoppping Callback
from tensorflow.keras.callbacks import EarlyStopping
earlystopping=EarlyStopping(monitor='val_loss',patience=3,restore_best_weights=True)
earlystopping

<keras.src.callbacks.early_stopping.EarlyStopping at 0x796eefeb9de0>

In [31]:
model.fit(
    X_train_padded,
    y_train_encoded,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_padded, y_test_encoded),
    callbacks=[earlystopping])

Epoch 1/10
[1m5483/5483[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 9ms/step - accuracy: 0.7880 - loss: 0.5282 - val_accuracy: 0.8896 - val_loss: 0.3324
Epoch 2/10
[1m5483/5483[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 9ms/step - accuracy: 0.9007 - loss: 0.3000 - val_accuracy: 0.8982 - val_loss: 0.3096
Epoch 3/10
[1m5483/5483[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 9ms/step - accuracy: 0.9155 - loss: 0.2554 - val_accuracy: 0.9004 - val_loss: 0.3065
Epoch 4/10
[1m5483/5483[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 9ms/step - accuracy: 0.9249 - loss: 0.2277 - val_accuracy: 0.8997 - val_loss: 0.3170
Epoch 5/10
[1m5483/5483[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 9ms/step - accuracy: 0.9351 - loss: 0.1973 - val_accuracy: 0.8980 - val_loss: 0.3340
Epoch 6/10
[1m5483/5483[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 9ms/step - accuracy: 0.9467 - loss: 0.1646 - val_accuracy: 0.8893 - val_loss: 0.3588


<keras.src.callbacks.history.History at 0x796edd6966e0>

In [32]:
## Save model file
model.save('gru_rnn.keras')

In [34]:
with open('label_mapping.pkl', 'wb') as mapping_file:
    pickle.dump(label_encoder, mapping_file)