In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, classification_report
import joblib
import gensim.downloader as api
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import torch.nn.functional as F


# 1. Setup & Data Loading

In [2]:
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), "../.."))
DATA_DIR = os.path.join(BASE_DIR, "data/processed")
RESULTS_DIR = os.path.join(BASE_DIR, "results/tables")
MODEL_DIR = os.path.join(BASE_DIR, "models/deep_learning")
os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
val_df   = pd.read_csv(os.path.join(DATA_DIR, "val.csv"))
test_df  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

# 2. Label Encoding

In [3]:
label_encoder = LabelEncoder()
train_df['label_enc'] = label_encoder.fit_transform(train_df['label'])
val_df['label_enc'] = label_encoder.transform(val_df['label'])
test_df['label_enc'] = label_encoder.transform(test_df['label'])
num_classes = len(label_encoder.classes_)
joblib.dump(label_encoder, os.path.join(MODEL_DIR, "label_encoder.pkl"))
print(f"Encoded labels: {label_encoder.classes_}")

Encoded labels: ['alt.atheism' 'comp.graphics' 'comp.os.ms-windows.misc'
 'comp.sys.ibm.pc.hardware' 'comp.sys.mac.hardware' 'comp.windows.x'
 'misc.forsale' 'rec.autos' 'rec.motorcycles' 'rec.sport.baseball'
 'rec.sport.hockey' 'sci.crypt' 'sci.electronics' 'sci.med' 'sci.space'
 'soc.religion.christian' 'talk.politics.guns' 'talk.politics.mideast'
 'talk.politics.misc' 'talk.religion.misc']


# 3. GloVe Embeddings

In [4]:
print("Loading GloVe embeddings...")
glove_model = api.load("glove-wiki-gigaword-100")
embedding_dim = 100

def text_to_embedding_sequence(text, glove_model, embedding_dim=100):
    tokens = text.split()
    vectors = [glove_model[token] for token in tokens if token in glove_model]
    return vectors

Loading GloVe embeddings...


# 4. Padding sequences

In [5]:
def pad_sequences(sequences, max_len, embedding_dim=100):
    padded = []
    for seq in sequences:
        if len(seq) < max_len:
            pad_len = max_len - len(seq)
            seq = seq + [np.zeros(embedding_dim)] * pad_len
        else:
            seq = seq[:max_len]
        padded.append(seq)
    return np.array(padded)

max_seq_len = 100

print("Converting train texts to sequences...")
train_sequences = [text_to_embedding_sequence(text, glove_model, embedding_dim) for text in train_df['clean_text']]
X_train_seq = pad_sequences(train_sequences, max_seq_len, embedding_dim)

print("Converting validation texts to sequences...")
val_sequences = [text_to_embedding_sequence(text, glove_model, embedding_dim) for text in val_df['clean_text']]
X_val_seq = pad_sequences(val_sequences, max_seq_len, embedding_dim)

print("Converting test texts to sequences...")
test_sequences = [text_to_embedding_sequence(text, glove_model, embedding_dim) for text in test_df['clean_text']]
X_test_seq = pad_sequences(test_sequences, max_seq_len, embedding_dim)

y_train = train_df['label_enc'].values
y_val = val_df['label_enc'].values
y_test = test_df['label_enc'].values

Converting train texts to sequences...
Converting validation texts to sequences...
Converting test texts to sequences...


# 5. Data to Tensors

In [6]:
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
X_val_tensor   = torch.tensor(X_val_seq, dtype=torch.float32)
X_test_tensor  = torch.tensor(X_test_seq, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_val_tensor   = torch.tensor(y_val, dtype=torch.long)
y_test_tensor  = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset   = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset  = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(f"Train set: {len(train_dataset)} samples")
print(f"Validation set: {len(val_dataset)} samples")
print(f"Test set: {len(test_dataset)} samples")

Train set: 15997 samples
Validation set: 2000 samples
Test set: 2000 samples


# 6. Model Definitions

In [7]:
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim=128, num_layers=1, output_dim=2, dropout_rate=0.5):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(
            input_size=embedding_dim, 
            hidden_size=hidden_dim, 
            num_layers=num_layers, 
            batch_first=True, 
            dropout=dropout_rate if num_layers > 1 else 0
        )
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        lstm_out, (hn, cn) = self.lstm(x)
        hidden = hn[-1]
        hidden = self.dropout(hidden)
        out = self.fc(hidden)
        return out

In [8]:
class CNNClassifier(nn.Module):
    def __init__(self, embedding_dim, max_seq_len, num_classes, num_filters=100, kernel_sizes=[3, 4, 5], dropout_rate=0.5):
        super(CNNClassifier, self).__init__()
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=k) for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)
        self.max_seq_len = max_seq_len

    def forward(self, x):
        x = x.permute(0, 2, 1)  # (batch, seq_len, embedding_dim) -> (batch, embedding_dim, seq_len)
        x = [F.relu(conv(x)).max(dim=2)[0] for conv in self.convs]  # Convolution + Global Max Pooling
        x = torch.cat(x, dim=1)  # Concatenate feature maps
        x = self.dropout(x)
        x = self.fc(x)
        return x

In [14]:
class TransformerClassifier(nn.Module):
    def __init__(self, embedding_dim, num_heads, num_layers, output_dim, dropout_rate=0.5):
        super(TransformerClassifier, self).__init__()
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads, dropout=dropout_rate),
            num_layers=num_layers
        )
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, x):
        # x: [batch_size, seq_len, embedding_dim]
        x = self.transformer_encoder(x.permute(1, 0, 2)) # [seq_len, batch_size, embedding_dim]
        # average the embeddings across the sequence length
        x = torch.mean(x, dim=0) # [batch_size, embedding_dim]
        x = self.dropout(x)
        x = self.fc(x) # [batch_size, output_dim]
        return x

# 7. Training and Evaluation

Training

In [30]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20, patience=5, model_name="Model"):
    model = model.to(device) #Move model to device
    best_val_loss = float('inf')
    best_model_state = None
    best_val_acc = 0
    epochs_no_improve = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct, total = 0, 0

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} - {model_name}", leave=False)

        for X_batch, y_batch in progress_bar:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device) #Move data to device

            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

            progress_bar.set_postfix(loss=loss.item(), acc=100 * correct / total)

        # Validation
        model.eval()
        val_correct, val_total = 0, 0
        val_predictions = []
        val_true_labels = []
        total_val_loss = 0.0  # Initialize total_val_loss
        val_report = {} #initialise to empty dictionary
        with torch.no_grad():
            for X_val_batch, y_val_batch in val_loader:
                X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device) #Move data to device
                val_outputs = model(X_val_batch)
                loss = criterion(val_outputs, y_val_batch)
                total_val_loss += loss.item()
                _, val_predicted = torch.max(val_outputs, 1)
                val_total += y_val_batch.size(0)
                val_correct += (val_predicted == y_val_batch).sum().item()
                val_predictions.extend(val_predicted.cpu().numpy())
                val_true_labels.extend(y_val_batch.cpu().numpy())

        val_acc = 100 * val_correct / val_total
        avg_val_loss = total_val_loss / len(val_loader) if len(val_loader) > 0 else 0.0 #handle case of val_loader being empty
        val_report = classification_report(val_true_labels, val_predictions, output_dict=True, zero_division=0) if len(val_loader) > 0 else {}

        print(f"Epoch {epoch+1}/{epochs} - Loss: {running_loss/len(train_loader):.4f} - Val Loss: {avg_val_loss:.4f} - Val Acc: {val_acc:.2f}% - {model_name}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_val_acc = val_acc
            epochs_no_improve = 0
            best_model_state = model.state_dict()
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print(f"Early stopping triggered at epoch {epoch+1} - {model_name}")
            break

    model.load_state_dict(best_model_state)
    print(f"Best Validation Accuracy: {best_val_acc:.2f}% - {model_name}")
    return model, best_val_acc/100, val_report

evaluation

In [11]:
def evaluate_model(model, test_loader, model_name="Model"):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device) #Move data to device
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(y_batch.cpu().numpy())

    test_accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, output_dict=True, zero_division=0)
    print(f"Test Accuracy: {test_accuracy:.2f}% - {model_name}")
    print(classification_report(true_labels, predictions))
    return test_accuracy, report

In [29]:
def save_results(model_name, test_accuracy, report, val_accuracy, val_report):
    results = {
        "Model": model_name,
        "Test_Accuracy": test_accuracy,
        "Test_Precision": report['weighted avg']['precision'],
        "Test_Recall": report['weighted avg']['recall'],
        "Test_F1-Score": report['weighted avg']['f1-score'],
        "Val_Accuracy": val_accuracy,
        "Val_Precision": val_report['weighted avg']['precision'],
        "Val_Recall": val_report['weighted avg']['recall'],
        "Val_F1-Score": val_report['weighted avg']['f1-score']
    }
    results_df = pd.DataFrame([results])
    results_file = os.path.join(RESULTS_DIR, "deep_models_results.csv")
    if not os.path.exists(results_file):
        results_df.to_csv(results_file, index=False, header=True)
    else:
        results_df.to_csv(results_file, index=False, header=False, mode='a')
    print(f"Deep Learning results saved to: {results_file}")

# 8. Model Training and Evaluation

In [25]:
criterion = nn.CrossEntropyLoss().to(device)

LSTM

In [31]:
lstm_model = LSTMClassifier(embedding_dim, hidden_dim=128, num_layers=1, output_dim=num_classes, dropout_rate=0.5)
lstm_model, val_accuracy_lstm, val_report_lstm = train_model(lstm_model, train_loader, val_loader, criterion, optim.Adam(lstm_model.parameters(), lr=0.001), epochs=20, model_name="LSTM")
test_accuracy_lstm, report_lstm = evaluate_model(lstm_model, test_loader, model_name="LSTM")
save_results("LSTMClassifier", test_accuracy_lstm, report_lstm, val_accuracy_lstm, val_report_lstm)

Epoch 1/20 - LSTM:   0%|          | 0/500 [00:00<?, ?it/s]

                                                                                         

Epoch 1/20 - Loss: 2.3894 - Val Loss: 2.1036 - Val Acc: 28.35% - LSTM


                                                                                         

Epoch 2/20 - Loss: 2.1178 - Val Loss: 1.8180 - Val Acc: 38.90% - LSTM


                                                                                          

Epoch 3/20 - Loss: 1.6990 - Val Loss: 1.2972 - Val Acc: 51.70% - LSTM


                                                                                          

Epoch 4/20 - Loss: 1.5943 - Val Loss: 1.5545 - Val Acc: 45.85% - LSTM


                                                                                          

Epoch 5/20 - Loss: 1.1385 - Val Loss: 0.8818 - Val Acc: 67.50% - LSTM


                                                                                          

Epoch 6/20 - Loss: 0.7065 - Val Loss: 0.5849 - Val Acc: 73.45% - LSTM


                                                                                          

Epoch 7/20 - Loss: 0.6503 - Val Loss: 0.5057 - Val Acc: 80.20% - LSTM


                                                                                          

Epoch 8/20 - Loss: 0.5354 - Val Loss: 0.3622 - Val Acc: 84.80% - LSTM


                                                                                           

Epoch 9/20 - Loss: 0.3213 - Val Loss: 0.2248 - Val Acc: 90.85% - LSTM


                                                                                            

Epoch 10/20 - Loss: 0.3199 - Val Loss: 0.1950 - Val Acc: 91.65% - LSTM


                                                                                           

Epoch 11/20 - Loss: 1.7373 - Val Loss: 1.0801 - Val Acc: 67.80% - LSTM


                                                                                           

Epoch 12/20 - Loss: 0.6226 - Val Loss: 0.3304 - Val Acc: 86.00% - LSTM


                                                                                           

Epoch 13/20 - Loss: 0.3700 - Val Loss: 0.2232 - Val Acc: 89.95% - LSTM


                                                                                            

Epoch 14/20 - Loss: 0.2663 - Val Loss: 0.1689 - Val Acc: 94.45% - LSTM


                                                                                            

Epoch 15/20 - Loss: 0.1973 - Val Loss: 0.3564 - Val Acc: 89.15% - LSTM


                                                                                            

Epoch 16/20 - Loss: 0.1676 - Val Loss: 0.0934 - Val Acc: 96.05% - LSTM


                                                                                            

Epoch 17/20 - Loss: 0.1311 - Val Loss: 0.0914 - Val Acc: 96.95% - LSTM


                                                                                             

Epoch 18/20 - Loss: 0.1462 - Val Loss: 0.0785 - Val Acc: 96.70% - LSTM


                                                                                            

Epoch 19/20 - Loss: 0.1697 - Val Loss: 0.1809 - Val Acc: 93.90% - LSTM


                                                                                            

Epoch 20/20 - Loss: 0.1396 - Val Loss: 0.0848 - Val Acc: 96.20% - LSTM
Best Validation Accuracy: 96.70% - LSTM
Test Accuracy: 0.96% - LSTM
              precision    recall  f1-score   support

           0       0.87      0.84      0.85       100
           1       0.99      0.99      0.99       100
           2       1.00      0.96      0.98       100
           3       0.98      1.00      0.99       100
           4       1.00      1.00      1.00       100
           5       1.00      1.00      1.00       100
           6       0.99      0.98      0.98       100
           7       1.00      0.99      0.99       100
           8       0.99      1.00      1.00       100
           9       1.00      1.00      1.00       100
          10       0.99      1.00      1.00       100
          11       0.98      1.00      0.99       100
          12       0.98      1.00      0.99       100
          13       0.99      1.00      1.00       100
          14       1.00      0.99      0.99       

CNN

In [33]:
cnn_model = CNNClassifier(embedding_dim, max_seq_len, num_classes)
cnn_model, val_accuracy_cnn, val_report_cnn = train_model(cnn_model, train_loader, val_loader, criterion, optim.Adam(cnn_model.parameters(), lr=0.003), epochs=20, model_name="CNN")
test_accuracy_cnn, report_cnn = evaluate_model(cnn_model, test_loader, model_name="CNN")
save_results("CNNClassifier", test_accuracy_cnn, report_cnn, val_accuracy_cnn, val_report_cnn)

                                                                                           

Epoch 1/20 - Loss: 0.4117 - Val Loss: 0.0873 - Val Acc: 97.00% - CNN


                                                                                           

Epoch 2/20 - Loss: 0.1507 - Val Loss: 0.1065 - Val Acc: 96.85% - CNN


                                                                                           

Epoch 3/20 - Loss: 0.1446 - Val Loss: 0.0888 - Val Acc: 96.90% - CNN


                                                                                            

Epoch 4/20 - Loss: 0.1534 - Val Loss: 0.1031 - Val Acc: 96.55% - CNN


                                                                                           

Epoch 5/20 - Loss: 0.1563 - Val Loss: 0.0922 - Val Acc: 96.85% - CNN


                                                                                            

Epoch 6/20 - Loss: 0.1541 - Val Loss: 0.0899 - Val Acc: 96.70% - CNN
Early stopping triggered at epoch 6 - CNN
Best Validation Accuracy: 97.00% - CNN
Test Accuracy: 0.97% - CNN
              precision    recall  f1-score   support

           0       0.93      0.84      0.88       100
           1       0.99      0.99      0.99       100
           2       1.00      1.00      1.00       100
           3       1.00      1.00      1.00       100
           4       1.00      1.00      1.00       100
           5       1.00      0.99      0.99       100
           6       1.00      1.00      1.00       100
           7       1.00      0.98      0.99       100
           8       1.00      1.00      1.00       100
           9       1.00      1.00      1.00       100
          10       1.00      1.00      1.00       100
          11       0.99      1.00      1.00       100
          12       0.97      1.00      0.99       100
          13       1.00      1.00      1.00       100
          14

Transformer Model Training

In [34]:
transformer_model = TransformerClassifier(embedding_dim=embedding_dim, num_heads=10, num_layers=2, output_dim=num_classes, dropout_rate=0.5)
transformer_model, val_accuracy_transformer, val_report_transformer = train_model(transformer_model, train_loader, val_loader, criterion, optim.Adam(transformer_model.parameters(), lr=0.001), epochs=20, model_name="Transformer")
test_accuracy_transformer, report_transformer = evaluate_model(transformer_model, test_loader, model_name="Transformer")
save_results("TransformerClassifier", test_accuracy_transformer, report_transformer, val_accuracy_transformer, val_report_transformer)

                                                                                                 

Epoch 1/20 - Loss: 1.2015 - Val Loss: 0.3059 - Val Acc: 84.85% - Transformer


                                                                                                 

Epoch 2/20 - Loss: 0.3215 - Val Loss: 0.2842 - Val Acc: 87.30% - Transformer


                                                                                                  

Epoch 3/20 - Loss: 0.2389 - Val Loss: 0.1299 - Val Acc: 95.20% - Transformer


                                                                                                  

Epoch 4/20 - Loss: 0.1506 - Val Loss: 0.1746 - Val Acc: 93.05% - Transformer


                                                                                                   

Epoch 5/20 - Loss: 0.1289 - Val Loss: 0.0842 - Val Acc: 96.30% - Transformer


                                                                                                   

Epoch 6/20 - Loss: 0.1146 - Val Loss: 0.1088 - Val Acc: 95.50% - Transformer


                                                                                                   

Epoch 7/20 - Loss: 0.1095 - Val Loss: 0.1632 - Val Acc: 95.50% - Transformer


                                                                                                   

Epoch 8/20 - Loss: 0.1054 - Val Loss: 0.1490 - Val Acc: 95.25% - Transformer


                                                                                                   

Epoch 9/20 - Loss: 0.0965 - Val Loss: 0.1255 - Val Acc: 96.10% - Transformer


                                                                                                    

Epoch 10/20 - Loss: 0.0923 - Val Loss: 0.1108 - Val Acc: 96.35% - Transformer
Early stopping triggered at epoch 10 - Transformer
Best Validation Accuracy: 96.30% - Transformer
Test Accuracy: 0.97% - Transformer
              precision    recall  f1-score   support

           0       0.98      0.83      0.90       100
           1       0.99      0.96      0.97       100
           2       0.98      0.99      0.99       100
           3       1.00      0.99      0.99       100
           4       0.98      1.00      0.99       100
           5       0.98      0.98      0.98       100
           6       1.00      0.99      0.99       100
           7       0.99      1.00      1.00       100
           8       1.00      1.00      1.00       100
           9       1.00      0.99      0.99       100
          10       0.99      1.00      1.00       100
          11       0.99      1.00      1.00       100
          12       1.00      1.00      1.00       100
          13       0.99      1.0

# 9. Save Models

In [None]:
torch.save(lstm_model.state_dict(), os.path.join(MODEL_DIR, "lstm_model.pth"))
torch.save(cnn_model.state_dict(), os.path.join(MODEL_DIR, "cnn_model.pth"))