In [7]:
import functools
import os
import re

import joblib
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import LSTM, Dense, Embedding, Conv1D, MaxPooling1D, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [8]:
@functools.lru_cache(maxsize=1)
def _preprocess_resources():
    """Return ``(stop_words, lemmatizer)``, built on first use and then reused.

    Building these inside ``preprocess`` repeated the work for every single
    text, which adds up when the function is applied row by row to a DataFrame.
    """
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess(text):
    """Normalise raw text into a space-joined string of lemmatised tokens.

    Steps, in order: lowercase; remove URLs (``http`` followed by any run of
    non-whitespace); remove every character that is not ``a-z`` or whitespace;
    split on whitespace; drop English stopwords; lemmatise what is left.

    Args:
        text: Raw input text. Non-string values (for example ``None`` or the
            float ``NaN`` pandas uses for missing cells) are treated as empty.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``""`` when
        nothing survives cleaning.
    """
    if not isinstance(text, str):
        # Missing values have nothing to clean; returning "" avoids the
        # AttributeError that ``.lower()`` would raise on None / NaN.
        return ""

    stop_words, lemmatizer = _preprocess_resources()
    text = re.sub(r"http\S+", "", text.lower())
    # Punctuation and digits are deleted, not replaced by a space, so
    # "don't" becomes the single token "dont".
    text = re.sub(r"[^a-z\s]", "", text)
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token not in stop_words
    )

In [9]:
def create_cnn_lstm_model(input_dim=5000, embedding_dim=128, input_length=100):
    """Build and compile a CNN + LSTM binary classifier over token-id sequences.

    Layer stack: Embedding -> Conv1D(64 filters, kernel 5, ReLU)
    -> MaxPooling1D(2) -> LSTM(64, dropout and recurrent_dropout 0.2)
    -> Dense(1, sigmoid).

    Args:
        input_dim: Vocabulary size passed to the embedding layer.
        embedding_dim: Size of each embedding vector.
        input_length: Length of every input sequence.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss, Adam
        optimiser, accuracy metric). The layer summary is printed as a side
        effect.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input rather than through
        # Embedding's ``input_length`` argument, which is deprecated in
        # Keras 3. With the shape known up front the model is built on
        # construction, so no separate ``model.build`` call is needed
        # before ``summary``.
        tf.keras.Input(shape=(input_length,)),
        Embedding(input_dim, embedding_dim),
        Conv1D(64, 5, activation='relu'),
        MaxPooling1D(2),
        LSTM(64, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [10]:
def train_cnn_lstm(X_train, y_train, X_test, y_test,
                   input_dim=5000, embedding_dim=128, epochs=5, batch_size=64):
    """Train a CNN + LSTM binary classifier and report its last-epoch metrics.

    Layer stack: Embedding -> Conv1D(128 filters, kernel 5, ReLU)
    -> MaxPooling1D(5) -> Dropout(0.5) -> LSTM(128) -> Dense(1, sigmoid),
    compiled with Adam, binary cross-entropy and the accuracy metric.

    Args:
        X_train: Training inputs passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_test: Inputs used as validation data during fitting.
        y_test: Labels used as validation data during fitting.
        input_dim: Vocabulary size of the embedding layer.
        embedding_dim: Size of each embedding vector.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        tuple: ``(model, metrics)`` where ``metrics`` maps ``train_accuracy``,
        ``val_accuracy``, ``train_loss`` and ``val_loss`` to the values
        recorded for the final epoch.
    """
    model = Sequential([
        # No ``input_length`` here: the argument is deprecated in Keras 3, and
        # the sequence length is picked up from the data when ``fit`` runs.
        Embedding(input_dim=input_dim, output_dim=embedding_dim),
        Conv1D(128, 5, activation='relu'),
        MaxPooling1D(5),
        Dropout(0.5),
        LSTM(128),
        Dense(1, activation='sigmoid'),
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    history = model.fit(X_train, y_train,
                        validation_data=(X_test, y_test),
                        epochs=epochs,
                        batch_size=batch_size)

    # Each history entry holds one value per epoch; keep only the last one.
    recorded = history.history
    metrics = {
        'train_accuracy': recorded['accuracy'][-1],
        'val_accuracy': recorded['val_accuracy'][-1],
        'train_loss': recorded['loss'][-1],
        'val_loss': recorded['val_loss'][-1],
    }

    return model, metrics

In [11]:
# Continuous Learning
def incremental_training(new_data_path,
                         model_path='models/cnn_lstm.keras',
                         tokenizer_path='models/tokenizer.pkl',
                         max_len=100, epochs=2, batch_size=32):
    """Fine-tune the saved model on newly labelled data and save it back.

    The default ``model_path`` is the ``.keras`` file written by the training
    cell of this notebook. New text goes through ``preprocess`` before
    tokenisation, matching how the tokenizer's training text was prepared.

    Args:
        new_data_path: CSV file with a ``text`` column and a ``label`` column.
        model_path: File the model is loaded from and saved back to.
        tokenizer_path: joblib file holding the fitted tokenizer.
        max_len: Length sequences are padded/truncated to.
        epochs: Number of fine-tuning epochs.
        batch_size: Mini-batch size.

    Returns:
        None. Prints a status line; when the CSV has no rows the model is
        left untouched.

    Raises:
        KeyError: If the CSV lacks a ``text`` or ``label`` column.
    """
    model = tf.keras.models.load_model(model_path)
    # SECURITY: joblib.load unpickles the file and can therefore execute
    # arbitrary code. Only point this at tokenizer files this project wrote.
    tokenizer = joblib.load(tokenizer_path)

    df_new = pd.read_csv(new_data_path)
    if df_new.empty:
        print("No new rows found; model left unchanged.")
        return

    # Missing text cells arrive as NaN; turn them into empty strings so the
    # cleaning and tokenising steps receive real strings.
    X_new = df_new['text'].fillna('').astype(str).apply(preprocess)
    y_new = df_new['label']

    X_new_seq = tokenizer.texts_to_sequences(X_new)
    X_new_pad = pad_sequences(X_new_seq, maxlen=max_len)

    model.fit(X_new_pad, y_new, epochs=epochs, batch_size=batch_size, verbose=1)

    model.save(model_path)
    print("Model updated and saved.")


In [12]:
# End-to-end training run: load data, clean text, tokenise, split, train,
# report metrics, then persist the tokenizer and the model.
MAX_WORDS = 5000  # tokenizer vocabulary cap; train_cnn_lstm embeds a 5000-word vocabulary by default
MAX_LEN = 100     # every sequence is padded/truncated to this length
SEED = 42

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(SEED)

# Create the output directory before the expensive training step so a missing
# folder cannot cost us the trained model at save time.
os.makedirs('models', exist_ok=True)

df = pd.read_csv('data/processed_data.csv')
df['processed_text'] = df['text'].apply(preprocess)

# NOTE(review): the tokenizer vocabulary is fitted on every row, including the
# rows later held out for validation. Consider fitting on the training split
# only if the validation score must be strictly leakage-free.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(df['processed_text'])

sequences = tokenizer.texts_to_sequences(df['processed_text'])
X = pad_sequences(sequences, maxlen=MAX_LEN)
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

cnn_lstm, metrics = train_cnn_lstm(X_train, y_train, X_test, y_test)

print("\n=== Model Performance ===")
print(pd.DataFrame([metrics]))

print("\nSaving model...")
joblib.dump(tokenizer, 'models/tokenizer.pkl')
cnn_lstm.save('models/cnn_lstm.keras')

Epoch 1/5
538/538 ━━━━━━━━━━━━━━━━━━━━ 16s 26ms/step - accuracy: 0.7420 - loss: 0.4758 - val_accuracy: 0.8660 - val_loss: 0.3018
Epoch 2/5
538/538 ━━━━━━━━━━━━━━━━━━━━ 14s 26ms/step - accuracy: 0.8963 - loss: 0.2388 - val_accuracy: 0.8724 - val_loss: 0.2936
Epoch 3/5
538/538 ━━━━━━━━━━━━━━━━━━━━ 13s 24ms/step - accuracy: 0.9251 - loss: 0.1786 - val_accuracy: 0.8734 - val_loss: 0.3096
Epoch 4/5
538/538 ━━━━━━━━━━━━━━━━━━━━ 14s 25ms/step - accuracy: 0.9460 - loss: 0.1280 - val_accuracy: 0.8741 - val_loss: 0.3509
Epoch 5/5
538/538 ━━━━━━━━━━━━━━━━━━━━ 13s 25ms/step - accuracy: 0.9577 - loss: 0.0977 - val_accuracy: 0.8715 - val_loss: 0.3803

=== Model Performance ===
   train_accuracy  val_accuracy  train_loss  val_loss
0        0.953871        0.8715    0.107672  0.380323

Saving model...
