In [None]:
# Data manipulation
import pandas as pd
import numpy as np
import string
import re

# Scikit-learn utilities
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Imbalanced data handling
from imblearn.over_sampling import SMOTE

# TensorFlow / Keras - Model building and preprocessing
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Embedding, LSTM, GRU, Dense, Dropout, Bidirectional, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Flatten)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Visualization
import plotly.graph_objs as go
import plotly.express as px




In [2]:
# Load your dataset
# A forward slash is used on purpose: '\c' in the old path was an invalid
# escape sequence and the backslash separator only resolves on Windows.
data = pd.read_csv('datasets/cleaned_train.csv')

# Check the column names
print(data.columns)





Index(['Id', 'pub_title', 'dataset_title', 'dataset_label', 'cleaned_label'], dtype='object')


In [None]:


# Combine title fields into a single text column used as model input
data['text'] = data['pub_title'].astype(str) + " " + data['dataset_title'].astype(str)

# Tokenization (words unseen at fit time map to the "<OOV>" token)
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(data['text'])

vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding

# Convert text to sequences and pad/truncate every sequence to 200 tokens
X = tokenizer.texts_to_sequences(data['text'])
X = pad_sequences(X, padding='pre', maxlen=200)

# Target column.
# NOTE(review): `labels` was previously referenced without ever being defined,
# which raises NameError on a fresh kernel. 'cleaned_label' is assumed to be
# the intended target -- confirm (the other candidate is 'dataset_label').
labels = data['cleaned_label']

# Encode string labels as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Get the number of unique classes
num_classes = len(np.unique(y))

# One-hot encode labels
y_cat = to_categorical(y, num_classes=num_classes)



In [None]:
# Data splitting: hold out 20% as test, then 20% of the remainder as validation
X_train, X_test, y_train, y_test = train_test_split(X, y_cat, stratify=y_cat, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2, random_state=42)

# Apply SMOTE (it works on integer class ids, so undo the one-hot encoding first)
# NOTE(review): SMOTE interpolates between padded token-id vectors, so the
# synthetic rows may not correspond to meaningful token sequences -- confirm
# this is intended.
y_train_int = np.argmax(y_train, axis=1)
# `n_jobs` is omitted: it is deprecated in recent imbalanced-learn releases
# and the default is equivalent to the previous n_jobs=1.
smote = SMOTE(random_state=42, k_neighbors=1)
X_train_res, y_train_res_int = smote.fit_resample(X_train, y_train_int)
y_train_res = to_categorical(y_train_res_int, num_classes=num_classes)

# Draw the reduced training set at random instead of taking the first rows:
# SMOTE returns the original rows first with the synthetic rows appended
# after them, so a head slice would mostly (or entirely) drop the oversampled
# data and leave the subset as imbalanced as before.
subset_size = min(10000, len(X_train_res))
subset_idx = np.random.default_rng(42).choice(len(X_train_res), size=subset_size, replace=False)
X_train_res_small = X_train_res[subset_idx]
y_train_res_small = y_train_res[subset_idx]

# Callbacks
callbacks = [
    # Stop when val_loss has not improved for 5 epochs and roll back to the best weights
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    # Halve the learning rate when val_loss has not improved for 2 epochs
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
]

# Model builders
def build_bilstm_model():
    """Build, compile and summarise the bidirectional-LSTM classifier.

    Uses the notebook-level ``vocab_size`` and ``num_classes``. The network is
    Embedding(100-d, 200 tokens) -> BiLSTM(64, full sequence) -> global max
    pooling -> Dropout(0.5) -> softmax over ``num_classes``.

    Returns:
        The compiled Keras model (its summary is printed as a side effect).
    """
    net = Sequential()
    net.add(Embedding(vocab_size, 100, input_length=200))
    net.add(Bidirectional(LSTM(64, return_sequences=True)))
    net.add(GlobalMaxPooling1D())
    net.add(Dropout(0.5))
    net.add(Dense(num_classes, activation='softmax'))

    net.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    net.summary()
    return net

def build_gru_model():
    """Build, compile and summarise the GRU classifier.

    Uses the notebook-level ``vocab_size`` and ``num_classes``. The network is
    Embedding(100-d, 200 tokens) -> GRU(64, full sequence) -> global max
    pooling -> Dropout(0.5) -> softmax over ``num_classes``.

    Returns:
        The compiled Keras model (its summary is printed as a side effect).
    """
    layer_stack = [
        Embedding(vocab_size, 100, input_length=200),
        GRU(64, return_sequences=True),
        GlobalMaxPooling1D(),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ]
    gru_net = Sequential(layer_stack)
    gru_net.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    gru_net.summary()
    return gru_net

def build_cnn_model():
    """Build, compile and summarise the 1-D convolutional classifier.

    Uses the notebook-level ``vocab_size`` and ``num_classes``. The network is
    Embedding(100-d, 200 tokens) -> Conv1D(128 filters, width 5, ReLU) ->
    MaxPooling1D(2) -> Flatten -> Dropout(0.5) -> Dense(64, ReLU) -> softmax
    over ``num_classes``.

    Returns:
        The compiled Keras model (its summary is printed as a side effect).
    """
    cnn = Sequential()

    # Feature extraction
    cnn.add(Embedding(vocab_size, 100, input_length=200))
    cnn.add(Conv1D(128, kernel_size=5, activation='relu'))
    cnn.add(MaxPooling1D(pool_size=2))
    cnn.add(Flatten())

    # Classification head
    cnn.add(Dropout(0.5))
    cnn.add(Dense(64, activation='relu'))
    cnn.add(Dense(num_classes, activation='softmax'))

    cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    cnn.summary()
    return cnn






In [None]:
# Train and evaluate with return
def train_and_evaluate(model_fn, name):
    """Train a freshly built model and report its validation performance.

    The model is fitted on the notebook-level ``X_train_res_small`` /
    ``y_train_res_small`` for up to 30 epochs (batch size 64) with the shared
    ``callbacks``, using ``X_val`` / ``y_val`` as validation data. A
    classification report restricted to the classes that occur in the
    validation targets or predictions is printed.

    Args:
        model_fn: Zero-argument callable returning a compiled Keras model.
        name: Display name used in the printed messages.

    Returns:
        Tuple ``(history, y_true_labels, y_pred_labels, name)`` where the label
        arrays hold integer class ids for the validation set.
    """
    print(f"\nTraining {name}...")
    model = model_fn()
    history = model.fit(
        X_train_res_small,
        y_train_res_small,
        validation_data=(X_val, y_val),
        epochs=30,
        batch_size=64,
        callbacks=callbacks,
        verbose=2,
    )

    # Collapse probability / one-hot rows to integer class ids
    y_pred = model.predict(X_val)
    y_pred_labels = np.argmax(y_pred, axis=1)
    y_true_labels = np.argmax(y_val, axis=1)

    # Restrict the report to classes that actually appear (true or predicted)
    seen_ids = np.concatenate((y_true_labels, y_pred_labels))
    used_labels = sorted(np.unique(seen_ids))
    used_class_names = [label_encoder.classes_[class_id] for class_id in used_labels]

    report = classification_report(
        y_true_labels,
        y_pred_labels,
        labels=used_labels,
        target_names=used_class_names,
        zero_division=0,
    )
    print(f"\n{name} Classification Report:")
    print(report)

    return history, y_true_labels, y_pred_labels, name




In [None]:
# Visualization functions
def plot_history_plotly(history, model_name):
    """Show training vs. validation accuracy per epoch as a Plotly line chart.

    Args:
        history: Object whose ``history`` mapping holds the ``'accuracy'`` and
            ``'val_accuracy'`` series (as returned by ``model.fit``).
        model_name: Name used in the figure title.
    """
    curves = (
        ('accuracy', 'Train Accuracy'),
        ('val_accuracy', 'Val Accuracy'),
    )
    fig = go.Figure()
    for metric_key, trace_label in curves:
        fig.add_trace(go.Scatter(y=history.history[metric_key], mode='lines', name=trace_label))
    fig.update_layout(
        title=f'{model_name} Training History',
        xaxis_title='Epochs',
        yaxis_title='Accuracy',
    )
    fig.show()

def plot_confusion_matrix_plotly(y_true, y_pred, labels, model_name):
    """Show the confusion matrix as an annotated Plotly heatmap.

    The matrix is computed over class ids ``0 .. len(labels) - 1`` so that
    every entry of ``labels`` gets a row and a column, even if unused.

    Args:
        y_true: Integer class ids of the true labels.
        y_pred: Integer class ids of the predicted labels.
        labels: Class names, used as tick labels on both axes.
        model_name: Name used in the figure title.
    """
    class_ids = range(len(labels))
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)

    heatmap_options = dict(
        text_auto=True,
        x=labels,
        y=labels,
        color_continuous_scale='Blues',
        title=f'{model_name} Confusion Matrix',
    )
    fig = px.imshow(cm, **heatmap_options)
    fig.update_layout(xaxis_title='Predicted Label', yaxis_title='True Label')
    fig.show()



In [None]:
# Train, Evaluate, Visualize
def _train_and_visualize(model_fn, name):
    """Train one model, plot its accuracy history and confusion matrix.

    Returns the same ``(history, y_true, y_pred, name)`` tuple that
    ``train_and_evaluate`` produces.
    """
    history, y_true, y_pred, model_name = train_and_evaluate(model_fn, name)
    plot_history_plotly(history, model_name)
    plot_confusion_matrix_plotly(y_true, y_pred, label_encoder.classes_, model_name)
    return history, y_true, y_pred, model_name


bilstm_history, bilstm_true, bilstm_pred, bilstm_name = _train_and_visualize(build_bilstm_model, "BiLSTM")

gru_history, gru_true, gru_pred, gru_name = _train_and_visualize(build_gru_model, "GRU")

cnn_history, cnn_true, cnn_pred, cnn_name = _train_and_visualize(build_cnn_model, "CNN")


Training BiLSTM...
Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 200, 100)          1497800   
                                                                 
 bidirectional_8 (Bidirectio  (None, 200, 128)         84480     
 nal)                                                            
                                                                 
 global_max_pooling1d_9 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dropout_10 (Dropout)        (None, 128)               0         
                                                                 
 dense_11 (Dense)            (None, 111)               14319     
                                                                 
Total params: 1,596,599
Trainable


Training GRU...
Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 200, 100)          1497800   
                                                                 
 gru_1 (GRU)                 (None, 200, 64)           31872     
                                                                 
 global_max_pooling1d_10 (Gl  (None, 64)               0         
 obalMaxPooling1D)                                               
                                                                 
 dropout_11 (Dropout)        (None, 64)                0         
                                                                 
 dense_12 (Dense)            (None, 111)               7215      
                                                                 
Total params: 1,536,887
Trainable params: 1,536,887
Non-trainable params: 0
__________________________


Training CNN...
Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 200, 100)          1497800   
                                                                 
 conv1d_1 (Conv1D)           (None, 196, 128)          64128     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 98, 128)          0         
 1D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 12544)             0         
                                                                 
 dropout_12 (Dropout)        (None, 12544)             0         
                                                                 
 dense_13 (Dense)            (None, 64)                802880    
                                    