## A notebook for parameter tuning

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import numpy as np
import pandas as pd
import torch.nn.functional as F
import gensim
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)

In [21]:
# Define the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    print("Using GPU:")
    print(torch.cuda.get_device_name(0))  # 0 is the index of the GPU
else:
    print("Using CPU")

Using GPU:
Quadro T1000 with Max-Q Design


Read Inputs (not sure why shape mismatch TODO check)

In [22]:
from sklearn.model_selection import train_test_split
df_news = pd.read_pickle('..\input\df_news.pkl')

# Use train_test_split for a more consistent and reproducible split
df_training, df_test = train_test_split(df_news, test_size=0.2, random_state=42)

# check that the df is balanced
print('df_news:', df_news['Real'].value_counts(normalize=True))
print('df_training:', df_training['Real'].value_counts(normalize=True))
print('df_test:', df_test['Real'].value_counts(normalize=True))

df_news: Real
0    0.522985
1    0.477015
Name: proportion, dtype: float64
df_training: Real
0    0.524278
1    0.475722
Name: proportion, dtype: float64
df_test: Real
0    0.517817
1    0.482183
Name: proportion, dtype: float64


Data Preparation

In [23]:
from torch.utils.data import Dataset, DataLoader # this is for data loading

df_training = df_training[['vector', 'subject', 'Real']]
df_test = df_test[['vector', 'subject', 'Real']]

# apply encoding consistently
label_encoder = LabelEncoder()

# fit LabelEncoder on the training set and transform both sets
df_training['subject'] = label_encoder.fit_transform(df_training['subject'])
df_test['subject'] = label_encoder.transform(df_test['subject'])  # Use transform, not fit_transform

# we make a set of features for vector+subject and then a set of feature for vector only
class VectorizedTextDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        feature = torch.tensor(self.features[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return feature, label
    
def create_data(df, mode='subj', batch_size=2, shuffle=False):
    # Get the vectors (Word2Vec or equivalent)
    X_vectors = np.array(df['vector'].tolist())

    # If mode is 'subj', include subject features
    if mode == 'subj':
        df_features = df.drop(columns=['vector', 'Real'])
        X_features = df_features.values
        X = np.hstack([X_vectors, X_features])  # Concatenate vectors with subject features
    else:
        X = X_vectors  # Only use vectors
    
    y = np.array(df['Real'].tolist())  # Labels (1 or 0)
    
    # Create Dataset and DataLoader
    input_dataset = VectorizedTextDataset(X, y)
    input_loader = DataLoader(input_dataset, batch_size=batch_size, shuffle=shuffle)
    
    return input_loader, X, y

# Create DataLoaders for training and test sets
train_loader_subj, X_train_subj, y_train_subj = create_data(df_training, mode='subj')
test_loader_subj, X_test_subj, y_test_subj = create_data(df_test, mode='subj')

train_loader, X_train, Y_train = create_data(df_training, mode='')
test_loader, x_test, y_test = create_data(df_test, mode='')

## Simple feed-forward neural network for binary classification.

In [13]:
class TextClassifier(nn.Module): # create a class with the neural networ module
    def __init__(self, input_dim): # input dimensions
        super(TextClassifier, self).__init__()
        self.fc = nn.Linear(input_dim, 1) # nn.Linear (input, output) - dimensions of input (features) and output (labels)
        self.sigmoid = nn.Sigmoid() # maps the output onto a 0 to 1 line, representing the probability of it belonging to a 1 
    
    def forward(self, x):  # forward nn 
        x = self.fc(x) # first the input is passed through the fully connected linear layer 
        x = self.sigmoid(x) # then it goes through the sigmoid 
        return x
    
def model_training(input_loader, X_train, learning_rate, epochs):
    # Model, Loss, Optimizer
    input_dim = X_train.shape[1]  # Number of features - 100 since this is the # components of the vector
    model = TextClassifier(input_dim)
    criterion = nn.BCELoss()  # loss function minimise 
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # optimizer - lr learning rate, a critical hyperparameter that controls how large or small the updates to the weights will be at each step
    # Training loop
    for epoch in range(epochs):  # choose epochs - one complete pass through the entire dataset
        model.train()
        for features, labels in input_loader:  # input_loader is train_loader
            optimizer.zero_grad() #  reset the gradients for each new epoch 
            outputs = model(features) 
            loss = criterion(outputs.squeeze(), labels.float())  # compute the loss
            loss.backward()
            optimizer.step() # update the weights 
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')
    return model

def evaluate_model(model, loader, threshold):
    model.eval() # model is in evaluate mode
    correct = 0
    total = 0
    with torch.no_grad():
        for features, labels in loader: # the loader is the dataset
            outputs = model(features) # get the outputs
            predicted = (outputs.squeeze() > threshold).long()  # predicted label - it is assigned to whichever has a probability >0.5 (threshold can be changed)
            correct += (predicted == labels).sum().item() 
            total += labels.size(0)
    accuracy = correct / total
    print(f'Accuracy {accuracy}')
    return accuracy

In [14]:
# train the model with the subject
model = model_training(train_loader, X_train, learning_rate=0.1, epochs=5)
model_subj = model_training(train_loader_subj, X_train_subj, learning_rate=0.1, epochs=5)

Epoch 1, Loss: 0.00018206570530310273
Epoch 2, Loss: 4.6553395804949105e-05
Epoch 3, Loss: 5.472005796036683e-05
Epoch 4, Loss: 6.729816959705204e-05
Epoch 5, Loss: 6.0085090808570385e-05
Epoch 1, Loss: 3.713506885105744e-05
Epoch 2, Loss: 5.006815172237111e-06
Epoch 3, Loss: 4.768373855768004e-07
Epoch 4, Loss: 4.768373855768004e-07
Epoch 5, Loss: 2.0861668872385053e-06


In [15]:
# evaluate model performances 
model_eval_subj = evaluate_model(model_subj, train_loader_subj, threshold=0.5)
model_eval = evaluate_model(model, train_loader, threshold=0.5)

Accuracy 0.9333760231638732
Accuracy 0.9400579096831672


Changing the learning rate

In [19]:
def new_lr(learningrate):
    # train the model with the subject
    model = model_training(train_loader, X_train, learning_rate=learningrate, epochs=5)
    model_subj = model_training(train_loader_subj, X_train_subj, learning_rate=learningrate, epochs=5)
    model_eval_subj = evaluate_model(model_subj, train_loader_subj, threshold=0.5)
    model_eval = evaluate_model(model, train_loader, threshold=0.5)
    return
new_lr(0.001)

Epoch 1, Loss: 0.05407719314098358
Epoch 2, Loss: 0.031646665185689926
Epoch 3, Loss: 0.0234074704349041
Epoch 4, Loss: 0.019186798483133316
Epoch 5, Loss: 0.016677243635058403
Epoch 1, Loss: 0.05170110613107681
Epoch 2, Loss: 0.029679466038942337
Epoch 3, Loss: 0.021408934146165848
Epoch 4, Loss: 0.017055576667189598
Epoch 5, Loss: 0.014419246464967728
Accuracy 0.9556489782281864
Accuracy 0.9542012361490061


Seeing how the code converges, changing the number of epochs would not make it improve. 

Would adding a hidden layer improve the accuracy?

In [21]:
class TextClassifier_hiddenlayer(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(TextClassifier_hiddenlayer, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)  # First layer (input to hidden)
        self.relu = nn.ReLU()  # Activation function for the hidden layer
        self.fc2 = nn.Linear(hidden_dim, 1)  # Second layer (hidden to output)
        self.sigmoid = nn.Sigmoid()  # Sigmoid for binary classification
    
    def forward(self, x):
        x = self.fc1(x)  # Pass input through the first linear layer (input to hidden)
        x = self.relu(x)  # Apply ReLU activation
        x = self.fc2(x)  # Pass through the second linear layer (hidden to output)
        x = self.sigmoid(x)  # Apply sigmoid activation for output
        return x
    
input_dim = X_train.shape[1]
hidden_dim = 100  # Example value, you can adjust it
model_2 = TextClassifier_hiddenlayer(input_dim, hidden_dim)

criterion = nn.BCELoss()  # loss function minimise 
optimizer = optim.Adam(model_2.parameters(), lr=0.001)  # optimizer - lr learning rate, a critical hyperparameter that controls how large or small the updates to the weights will be at each step
# Adam optimizer

# Training loop
for epoch in range(5):  # choose epochs - one complete pass through the entire dataset
    model_2.train()
    for features, labels in train_loader:
        optimizer.zero_grad() #  reset the gradients for each new epoch 
        outputs = model_2(features) 
        loss = criterion(outputs.squeeze(), labels.float())  # compute the loss
        loss.backward()
        optimizer.step() # update the weights 
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

print(f'Train Accuracy: {evaluate_model(train_loader)}')
print(f'Test Accuracy: {evaluate_model(test_loader)}')

Epoch 1, Loss: 0.10727664083242416
Epoch 2, Loss: 0.00014839951472822577
Epoch 3, Loss: 0.009782358072698116
Epoch 4, Loss: 0.00034248080919496715
Epoch 5, Loss: 0.00020904683333355933
Train Accuracy: 0.9564563728492678
Test Accuracy: 0.9555679287305122


No. The data is probably too simple to benefit from a hidden layer (hidden layers are more useful in image recognition tasks)

## How does XGBoost perform instead?

Hyperparameters that can be tuned: 
- estimators
- learning rate
- tree depth
- scale_pos_weight (imbalanced classes)

Alternatives: 
use scikit optimizer like grisearchCV to tune these parameters. 

In [25]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score 

model_xgb = xgb.XGBClassifier(scale_pos_weight=2, random_state=42, n_estimators=500)
model_xgb.fit(X_train, Y_train)

# make predictions on the test set
y_pred = model_xgb.predict(x_test)
y_actual = y_test
# xgboost comparison
accuracy = accuracy_score(y_actual, y_pred)
f1 = f1_score(y_actual, y_pred)
precision = precision_score(y_actual, y_pred)
recall = recall_score(y_actual, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Accuracy: 0.9793
F1 Score: 0.9785
Precision: 0.9781
Recall: 0.9790
