In [11]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, classification_report
import joblib
import gensim.downloader as api
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import torch.nn.functional as F


# 1. Setup & Data Loading

In [12]:
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), "../.."))
DATA_DIR = os.path.join(BASE_DIR, "data/processed")
RESULTS_DIR = os.path.join(BASE_DIR, "results/tables")
MODEL_DIR = os.path.join(BASE_DIR, "models/deep_learning")
os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
val_df   = pd.read_csv(os.path.join(DATA_DIR, "val.csv"))
test_df  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

# 2. Label Encoding

In [13]:
label_encoder = LabelEncoder()
train_df['label_enc'] = label_encoder.fit_transform(train_df['label'])
val_df['label_enc'] = label_encoder.transform(val_df['label'])
test_df['label_enc'] = label_encoder.transform(test_df['label'])
num_classes = len(label_encoder.classes_)
joblib.dump(label_encoder, os.path.join(MODEL_DIR, "label_encoder.pkl"))
print(f"Encoded labels: {label_encoder.classes_}")

Encoded labels: ['alt.atheism' 'comp.graphics' 'comp.os.ms-windows.misc'
 'comp.sys.ibm.pc.hardware' 'comp.sys.mac.hardware' 'comp.windows.x'
 'misc.forsale' 'rec.autos' 'rec.motorcycles' 'rec.sport.baseball'
 'rec.sport.hockey' 'sci.crypt' 'sci.electronics' 'sci.med' 'sci.space'
 'soc.religion.christian' 'talk.politics.guns' 'talk.politics.mideast'
 'talk.politics.misc' 'talk.religion.misc']


# 3. GloVe Embeddings

In [14]:
print("Loading GloVe embeddings...")
glove_model = api.load("glove-wiki-gigaword-100")
embedding_dim = 100

def text_to_embedding_sequence(text, glove_model, embedding_dim=100):
    tokens = text.split()
    vectors = [glove_model[token] for token in tokens if token in glove_model]
    return vectors

Loading GloVe embeddings...


# 4. Padding sequences

In [15]:
def pad_sequences(sequences, max_len, embedding_dim=100):
    padded = []
    for seq in sequences:
        if len(seq) < max_len:
            pad_len = max_len - len(seq)
            seq = seq + [np.zeros(embedding_dim)] * pad_len
        else:
            seq = seq[:max_len]
        padded.append(seq)
    return np.array(padded)

max_seq_len = 100

print("Converting train texts to sequences...")
train_sequences = [text_to_embedding_sequence(text, glove_model, embedding_dim) for text in train_df['clean_text']]
X_train_seq = pad_sequences(train_sequences, max_seq_len, embedding_dim)

print("Converting validation texts to sequences...")
val_sequences = [text_to_embedding_sequence(text, glove_model, embedding_dim) for text in val_df['clean_text']]
X_val_seq = pad_sequences(val_sequences, max_seq_len, embedding_dim)

print("Converting test texts to sequences...")
test_sequences = [text_to_embedding_sequence(text, glove_model, embedding_dim) for text in test_df['clean_text']]
X_test_seq = pad_sequences(test_sequences, max_seq_len, embedding_dim)

y_train = train_df['label_enc'].values
y_val = val_df['label_enc'].values
y_test = test_df['label_enc'].values

Converting train texts to sequences...
Converting validation texts to sequences...
Converting test texts to sequences...


# 5. Data to Tensors

In [16]:
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
X_val_tensor   = torch.tensor(X_val_seq, dtype=torch.float32)
X_test_tensor  = torch.tensor(X_test_seq, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_val_tensor   = torch.tensor(y_val, dtype=torch.long)
y_test_tensor  = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset   = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset  = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(f"Train set: {len(train_dataset)} samples")
print(f"Validation set: {len(val_dataset)} samples")
print(f"Test set: {len(test_dataset)} samples")

Train set: 15997 samples
Validation set: 2000 samples
Test set: 2000 samples


# 6. Model Definitions

In [17]:
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim=128, num_layers=1, output_dim=2, dropout_rate=0.5):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(
            input_size=embedding_dim, 
            hidden_size=hidden_dim, 
            num_layers=num_layers, 
            batch_first=True, 
            dropout=dropout_rate if num_layers > 1 else 0
        )
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        lstm_out, (hn, cn) = self.lstm(x)
        hidden = hn[-1]
        hidden = self.dropout(hidden)
        out = self.fc(hidden)
        return out

In [18]:
class CNNClassifier(nn.Module):
    def __init__(self, embedding_dim, max_seq_len, num_classes, num_filters=100, kernel_sizes=[3, 4, 5], dropout_rate=0.5):
        super(CNNClassifier, self).__init__()
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=k) for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)
        self.max_seq_len = max_seq_len

    def forward(self, x):
        x = x.permute(0, 2, 1)  # (batch, seq_len, embedding_dim) -> (batch, embedding_dim, seq_len)
        x = [F.relu(conv(x)).max(dim=2)[0] for conv in self.convs]  # Convolution + Global Max Pooling
        x = torch.cat(x, dim=1)  # Concatenate feature maps
        x = self.dropout(x)
        x = self.fc(x)
        return x

# 7. Training and Evaluation

Training

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20, patience=5, model_name="Model"):
    model = model.to(device) #Move model to device
    best_val_loss = float('inf')
    best_model_state = None
    best_val_acc = 0
    epochs_no_improve = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct, total = 0, 0

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} - {model_name}", leave=False)

        for X_batch, y_batch in progress_bar:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device) #Move data to device

            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

            progress_bar.set_postfix(loss=loss.item(), acc=100 * correct / total)

        # Validation
        model.eval()
        val_correct, val_total = 0, 0
        val_predictions = []
        val_true_labels = []
        total_val_loss = 0.0  # Initialize total_val_loss
        val_report = {} #initialise to empty dictionary
        with torch.no_grad():
            for X_val_batch, y_val_batch in val_loader:
                X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device) #Move data to device
                val_outputs = model(X_val_batch)
                loss = criterion(val_outputs, y_val_batch)
                total_val_loss += loss.item()
                _, val_predicted = torch.max(val_outputs, 1)
                val_total += y_val_batch.size(0)
                val_correct += (val_predicted == y_val_batch).sum().item()
                val_predictions.extend(val_predicted.cpu().numpy())
                val_true_labels.extend(y_val_batch.cpu().numpy())

        val_acc = 100 * val_correct / val_total
        avg_val_loss = total_val_loss / len(val_loader) if len(val_loader) > 0 else 0.0 #handle case of val_loader being empty
        val_report = classification_report(val_true_labels, val_predictions, output_dict=True, zero_division=0) if len(val_loader) > 0 else {}

        print(f"Epoch {epoch+1}/{epochs} - Loss: {running_loss/len(train_loader):.4f} - Val Loss: {avg_val_loss:.4f} - Val Acc: {val_acc:.2f}% - {model_name}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_val_acc = val_acc
            epochs_no_improve = 0
            best_model_state = model.state_dict()
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print(f"Early stopping triggered at epoch {epoch+1} - {model_name}")
            break

    model.load_state_dict(best_model_state)
    print(f"Best Validation Accuracy: {best_val_acc:.2f}% - {model_name}")
    return model, best_val_acc, val_report

evaluation

In [20]:
def evaluate_model(model, test_loader, model_name="Model"):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device) #Move data to device
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(y_batch.cpu().numpy())

    test_accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, output_dict=True, zero_division=0)
    print(f"Test Accuracy: {test_accuracy:.2f}% - {model_name}")
    print(classification_report(true_labels, predictions))
    return test_accuracy, report

In [21]:
def save_results(model_name, test_accuracy, report, val_accuracy, val_report):
    results = {
        "Model": model_name,
        "Test_Accuracy": test_accuracy,
        "Test_Precision": report['weighted avg']['precision'],
        "Test_Recall": report['weighted avg']['recall'],
        "Test_F1-Score": report['weighted avg']['f1-score'],
        "Val_Accuracy": val_accuracy,
        "Val_Precision": val_report['weighted avg']['precision'],
        "Val_Recall": val_report['weighted avg']['recall'],
        "Val_F1-Score": val_report['weighted avg']['f1-score']
    }
    results_df = pd.DataFrame([results])
    results_file = os.path.join(RESULTS_DIR, "deep_models_results.csv")
    if not os.path.exists(results_file):
        results_df.to_csv(results_file, index=False, header=True)
    else:
        results_df.to_csv(results_file, index=False, header=False, mode='a')
    print(f"Deep Learning results saved to: {results_file}")

# 8. Model Training and Evaluation

LSTM

In [23]:
lstm_model = LSTMClassifier(embedding_dim, hidden_dim=128, num_layers=1, output_dim=num_classes, dropout_rate=0.5)
lstm_model, val_accuracy_lstm, val_report_lstm = train_model(lstm_model, train_loader, val_loader, criterion, optim.Adam(lstm_model.parameters(), lr=0.001), epochs=20, model_name="LSTM")
test_accuracy_lstm, report_lstm = evaluate_model(lstm_model, test_loader, model_name="LSTM")
save_results("LSTMClassifier", test_accuracy_lstm, report_lstm, val_accuracy_lstm, val_report_lstm)

                                                                                         

Epoch 1/20 - Loss: 2.3465 - Val Loss: 1.9043 - Val Acc: 33.35% - LSTM


                                                                                         

Epoch 2/20 - Loss: 1.8623 - Val Loss: 1.9134 - Val Acc: 33.25% - LSTM


                                                                                          

Epoch 3/20 - Loss: 1.4056 - Val Loss: 1.0972 - Val Acc: 53.65% - LSTM


                                                                                          

Epoch 4/20 - Loss: 0.9962 - Val Loss: 0.6110 - Val Acc: 71.30% - LSTM


                                                                                          

Epoch 5/20 - Loss: 0.7996 - Val Loss: 1.4692 - Val Acc: 49.90% - LSTM


                                                                                          

Epoch 6/20 - Loss: 1.7354 - Val Loss: 1.3237 - Val Acc: 53.85% - LSTM


                                                                                          

Epoch 7/20 - Loss: 0.9106 - Val Loss: 0.5301 - Val Acc: 77.40% - LSTM


                                                                                          

Epoch 8/20 - Loss: 0.5274 - Val Loss: 0.4424 - Val Acc: 81.05% - LSTM


                                                                                          

Epoch 9/20 - Loss: 0.4037 - Val Loss: 0.2711 - Val Acc: 90.20% - LSTM


                                                                                            

Epoch 10/20 - Loss: 0.3194 - Val Loss: 0.1988 - Val Acc: 92.45% - LSTM


                                                                                            

Epoch 11/20 - Loss: 0.2196 - Val Loss: 0.1826 - Val Acc: 93.90% - LSTM


                                                                                            

Epoch 12/20 - Loss: 0.1962 - Val Loss: 0.0975 - Val Acc: 96.30% - LSTM


                                                                                            

Epoch 13/20 - Loss: 0.1402 - Val Loss: 0.1040 - Val Acc: 96.30% - LSTM


                                                                                            

Epoch 14/20 - Loss: 0.1326 - Val Loss: 0.1194 - Val Acc: 96.30% - LSTM


                                                                                             

Epoch 15/20 - Loss: 0.1542 - Val Loss: 0.1073 - Val Acc: 96.35% - LSTM


                                                                                             

Epoch 16/20 - Loss: 0.1237 - Val Loss: 0.0669 - Val Acc: 97.00% - LSTM


                                                                                             

Epoch 17/20 - Loss: 0.1255 - Val Loss: 0.0744 - Val Acc: 96.65% - LSTM


                                                                                             

Epoch 18/20 - Loss: 0.1091 - Val Loss: 0.1026 - Val Acc: 96.30% - LSTM


                                                                                             

Epoch 19/20 - Loss: 0.1121 - Val Loss: 0.0899 - Val Acc: 96.65% - LSTM


                                                                                             

Epoch 20/20 - Loss: 0.0956 - Val Loss: 0.0696 - Val Acc: 96.45% - LSTM
Best Validation Accuracy: 97.00% - LSTM
Test Accuracy: 0.96% - LSTM
              precision    recall  f1-score   support

           0       0.85      0.87      0.86       100
           1       1.00      0.99      0.99       100
           2       1.00      0.95      0.97       100
           3       0.97      1.00      0.99       100
           4       1.00      1.00      1.00       100
           5       0.99      1.00      1.00       100
           6       1.00      0.99      0.99       100
           7       1.00      0.99      0.99       100
           8       1.00      1.00      1.00       100
           9       1.00      1.00      1.00       100
          10       1.00      1.00      1.00       100
          11       0.99      1.00      1.00       100
          12       0.98      1.00      0.99       100
          13       0.99      1.00      1.00       100
          14       1.00      1.00      1.00       

CNN

In [24]:
cnn_model = CNNClassifier(embedding_dim, max_seq_len, num_classes)
cnn_model, val_accuracy_cnn, val_report_cnn = train_model(cnn_model, train_loader, val_loader, criterion, optim.Adam(cnn_model.parameters(), lr=0.001), epochs=20, model_name="CNN")
test_accuracy_cnn, report_cnn = evaluate_model(cnn_model, test_loader, model_name="CNN")
save_results("CNNClassifier", test_accuracy_cnn, report_cnn, val_accuracy_cnn, val_report_cnn)

                                                                                          

Epoch 1/20 - Loss: 0.6924 - Val Loss: 0.0734 - Val Acc: 96.60% - CNN


                                                                                           

Epoch 2/20 - Loss: 0.1145 - Val Loss: 0.0583 - Val Acc: 97.30% - CNN


                                                                                           

Epoch 3/20 - Loss: 0.1068 - Val Loss: 0.0761 - Val Acc: 96.80% - CNN


                                                                                           

Epoch 4/20 - Loss: 0.0957 - Val Loss: 0.0923 - Val Acc: 96.45% - CNN


                                                                                           

Epoch 5/20 - Loss: 0.0884 - Val Loss: 0.0815 - Val Acc: 96.90% - CNN


                                                                                           

Epoch 6/20 - Loss: 0.0926 - Val Loss: 0.0672 - Val Acc: 96.80% - CNN


                                                                                            

Epoch 7/20 - Loss: 0.0875 - Val Loss: 0.0677 - Val Acc: 96.65% - CNN
Early stopping triggered at epoch 7 - CNN
Best Validation Accuracy: 97.30% - CNN
Test Accuracy: 0.97% - CNN
              precision    recall  f1-score   support

           0       0.89      0.86      0.87       100
           1       0.99      0.99      0.99       100
           2       1.00      0.99      0.99       100
           3       1.00      1.00      1.00       100
           4       1.00      1.00      1.00       100
           5       0.99      0.99      0.99       100
           6       1.00      1.00      1.00       100
           7       1.00      0.98      0.99       100
           8       1.00      1.00      1.00       100
           9       1.00      1.00      1.00       100
          10       1.00      1.00      1.00       100
          11       0.99      1.00      1.00       100
          12       0.97      1.00      0.99       100
          13       1.00      1.00      1.00       100
          14

# 9. Save Models

In [None]:
torch.save(lstm_model.state_dict(), os.path.join(MODEL_DIR, "lstm_model.pth"))
torch.save(cnn_model.state_dict(), os.path.join(MODEL_DIR, "cnn_model.pth"))