In [7]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping


In [8]:
df = pd.read_csv("labeled_data.csv")
df = df[['tweet', 'class']].dropna()
df['label'] = df['class'].apply(lambda x: 1 if x == 0 else 0)

In [9]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|@\S+|#\S+|RT", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df['tweet'] = df['tweet'].apply(clean_text)

In [10]:
vocab_size = 10000
max_len = 100

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(df['tweet'])
sequences = tokenizer.texts_to_sequences(df['tweet'])
X = pad_sequences(sequences, maxlen=max_len, padding='post')
y = df['label'].values


In [11]:
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [13]:
model_lstm = Sequential()
model_lstm.add(Embedding(vocab_size, 100, input_length=max_len))
model_lstm.add(LSTM(64))
model_lstm.add(Dense(64, activation='relu'))
model_lstm.add(Dropout(0.5))
model_lstm.add(Dense(1, activation='sigmoid'))

model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stop], batch_size=32, verbose=0)



<keras.src.callbacks.history.History at 0x1c96081b620>

In [14]:
loss_lstm, acc_lstm = model_lstm.evaluate(X_test, y_test, verbose=0)
print(f"LSTM Accuracy: {acc_lstm * 100:.2f}%")

LSTM Accuracy: 94.15%


In [15]:
model_cnn = Sequential()
model_cnn.add(Embedding(vocab_size, 100, input_length=max_len))
model_cnn.add(Conv1D(128, 5, activation='relu'))
model_cnn.add(MaxPooling1D(pool_size=2))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dropout(0.5))
model_cnn.add(Dense(1, activation='sigmoid'))

model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stop], batch_size=32, verbose=0)

<keras.src.callbacks.history.History at 0x1c9676bd130>

In [16]:
loss_cnn, acc_cnn = model_cnn.evaluate(X_test, y_test, verbose=0)
print(f"CNN Accuracy: {acc_cnn * 100:.2f}")

CNN Accuracy: 94.25


In [17]:
model_lstm.save('model_lstm.h5')



In [21]:
model_cnn.save('model_cnn.h5')



In [20]:
import joblib

joblib.dump(tokenizer, 'tokenizer.pkl')

['tokenizer.pkl']

In [18]:
model_lstm.save("model_lstm.keras", include_optimizer=False)
model_cnn.save("model_cnn.keras", include_optimizer=False)
