## A notebook for parameter tuning

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import numpy as np
import pandas as pd
import torch.nn.functional as F
import gensim
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)

In [2]:
# Define the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    print("Using GPU:")
    print(torch.cuda.get_device_name(0))  # 0 is the index of the GPU
else:
    print("Using CPU")

Using GPU:
Quadro T1000 with Max-Q Design


Read Inputs

In [3]:
df_news = pd.read_pickle('..\input\df_news.pkl')

## Simple feed-forward neural network for binary classification.

Parameters to hypertune:
 - learning rate
 - 

In [6]:
from torch.utils.data import Dataset, DataLoader # this is for data loading
from sklearn.model_selection import train_test_split

class VectorizedTextDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        feature = torch.tensor(self.features[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return feature, label
    
df_training = df_news[['vector', 'subject', 'Real']]

# Convert features to numpy arrays
# X = np.array(df_training['vector'].tolist()) # features - array of the vectors 
X_vectors = np.array(df_training['vector'].tolist())
df_features = df_training.drop(columns=['vector', 'Real'])

# Convert features to numpy array
X_features = df_features.values

# Combine word2vec vectors with features
X = np.hstack([X_vectors, X_features])
y = np.array(df_training['Real'].tolist()) # label - array of the 1 or 0s for each vector

# Split data into test and train - here we do an 80/20 split. Optional: additional validation set 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create Dataset instances
train_dataset = VectorizedTextDataset(X_train, y_train) # get the data in a structured dataset (abstract class that wraps around). 
test_dataset = VectorizedTextDataset(X_test, y_test)

# Create DataLoader instances
# this is for loading the data onto the GPU
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

class TextClassifier(nn.Module): # create a class with the neural networ module
    def __init__(self, input_dim): # input dimensions
        super(TextClassifier, self).__init__()
        self.fc = nn.Linear(input_dim, 1) # nn.Linear (input, output) - dimensions of input (features) and output (labels)
        self.sigmoid = nn.Sigmoid() # maps the output onto a 0 to 1 line, representing the probability of it belonging to a 1 
    
    def forward(self, x):  # forward nn 
        x = self.fc(x) # first the input is passed through the fully connected linear layer 
        x = self.sigmoid(x) # then it goes through the sigmoid 
        return x
    
# Model, Loss, Optimizer
input_dim = X_train.shape[1]  # Number of features - 100 since this is the # components of the vector
model = TextClassifier(input_dim)
criterion = nn.BCELoss()  # loss function minimise 
optimizer = optim.Adam(model.parameters(), lr=0.001)  # optimizer - lr learning rate, a critical hyperparameter that controls how large or small the updates to the weights will be at each step
# Adam optimizer

# Training loop
for epoch in range(5):  # choose epochs - one complete pass through the entire dataset
    model.train()
    for features, labels in train_loader:
        optimizer.zero_grad() #  reset the gradients for each new epoch 
        outputs = model(features) 
        loss = criterion(outputs.squeeze(), labels.float())  # compute the loss
        loss.backward()
        optimizer.step() # update the weights 
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

def evaluate_model(loader):
    model.eval() # model is in evaluate mode
    correct = 0
    total = 0
    with torch.no_grad():
        for features, labels in loader: # the loader is the dataset
            outputs = model(features) # get the outputs
            predicted = (outputs.squeeze() > 0.5).long()  # predicted label - it is assigned to whichever has a probability >0.5 (threshold can be changed)
            correct += (predicted == labels).sum().item() 
            total += labels.size(0)
    accuracy = correct / total
    return accuracy

print(f'Train Accuracy: {evaluate_model(train_loader)}')
print(f'Test Accuracy: {evaluate_model(test_loader)}')

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

Changing the parameters 

In [19]:
def evaluate_model_2(loader):
    model.eval() # model is in evaluate mode
    correct = 0
    total = 0
    with torch.no_grad():
        for features, labels in loader: # the loader is the dataset
            outputs = model(features) # get the outputs
            predicted = (outputs.squeeze() > 0.5).long()  # we only want to predict more than 0.8 - funny how you lose accuracy
            correct += (predicted == labels).sum().item() 
            total += labels.size(0)
    accuracy = correct / total
    return accuracy

print(f'Train Accuracy: {evaluate_model_2(train_loader)}')
print(f'Test Accuracy: {evaluate_model_2(test_loader)}')

Train Accuracy: 0.9564563728492678
Test Accuracy: 0.9555679287305122


What if we use a hidden layer instead, does that improve the accuracy?

In [21]:
class TextClassifier_hiddenlayer(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(TextClassifier_hiddenlayer, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)  # First layer (input to hidden)
        self.relu = nn.ReLU()  # Activation function for the hidden layer
        self.fc2 = nn.Linear(hidden_dim, 1)  # Second layer (hidden to output)
        self.sigmoid = nn.Sigmoid()  # Sigmoid for binary classification
    
    def forward(self, x):
        x = self.fc1(x)  # Pass input through the first linear layer (input to hidden)
        x = self.relu(x)  # Apply ReLU activation
        x = self.fc2(x)  # Pass through the second linear layer (hidden to output)
        x = self.sigmoid(x)  # Apply sigmoid activation for output
        return x
    
input_dim = X_train.shape[1]
hidden_dim = 100  # Example value, you can adjust it
model_2 = TextClassifier_hiddenlayer(input_dim, hidden_dim)

criterion = nn.BCELoss()  # loss function minimise 
optimizer = optim.Adam(model_2.parameters(), lr=0.001)  # optimizer - lr learning rate, a critical hyperparameter that controls how large or small the updates to the weights will be at each step
# Adam optimizer

# Training loop
for epoch in range(5):  # choose epochs - one complete pass through the entire dataset
    model_2.train()
    for features, labels in train_loader:
        optimizer.zero_grad() #  reset the gradients for each new epoch 
        outputs = model_2(features) 
        loss = criterion(outputs.squeeze(), labels.float())  # compute the loss
        loss.backward()
        optimizer.step() # update the weights 
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

print(f'Train Accuracy: {evaluate_model_2(train_loader)}')
print(f'Test Accuracy: {evaluate_model_2(test_loader)}')

Epoch 1, Loss: 0.10727664083242416
Epoch 2, Loss: 0.00014839951472822577
Epoch 3, Loss: 0.009782358072698116
Epoch 4, Loss: 0.00034248080919496715
Epoch 5, Loss: 0.00020904683333355933
Train Accuracy: 0.9564563728492678
Test Accuracy: 0.9555679287305122


## Comparison with tree classifier - Extreme Gradient Boosting

Pros: faster to implement


In [None]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

def run_xgboost(X_train, y_train, X_test, y_test):
        
        # initialize XGBoost classifier with class weights
        model_xgb = xgb.XGBClassifier(scale_pos_weight=2, random_state=42)
        model_xgb.fit(X_train,y_train)

        # make predictions on the test set
        y_pred = model_xgb.predict(X_test)

        # print model evaluations
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)

        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        return 