In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import ast
import itertools

# Read the training and validation splits from disk.
train_df = pd.read_csv("examples/train_hc_dataset_small_20240915_044230.csv")
val_df = pd.read_csv("examples/val_hc_dataset_small_20240915_044230.csv")

# 'prob_diff_values' is stored as text; literal_eval turns each cell back into
# the Python literal it spells out.
train_df['prob_diff_values'] = train_df['prob_diff_values'].map(ast.literal_eval)
val_df['prob_diff_values'] = val_df['prob_diff_values'].map(ast.literal_eval)

# Fit the label encoder on the training verdicts only, then reuse the same
# mapping for both splits so class ids agree between them.
le = LabelEncoder().fit(train_df['verdict'])
train_df['verdict_label'] = le.transform(train_df['verdict'])
val_df['verdict_label'] = le.transform(val_df['verdict'])

# Longest sequence found in either split; used as the common padded length.
max_seq_length = max(
    split['prob_diff_values'].map(len).max() for split in (train_df, val_df)
)

# Function to pad sequences to the same length
def pad_sequences(sequences, maxlen, padding='post', value=0.0):
    """Pad or truncate every sequence to exactly ``maxlen`` elements.

    Sequences shorter than ``maxlen`` are filled with ``value``; longer ones
    keep only their first ``maxlen`` elements (truncation is always from the
    end, regardless of ``padding``).

    Args:
        sequences: Iterable of sequences of numbers. Each item may be a list,
            tuple, NumPy array or any other iterable of scalars.
        maxlen: Target length of every output row.
        padding: ``'post'`` appends the fill values after the data,
            ``'pre'`` inserts them before it.
        value: Fill value used for padding.

    Returns:
        A ``float32`` NumPy array of shape ``(len(sequences), maxlen)``.

    Raises:
        ValueError: If ``padding`` is neither ``'pre'`` nor ``'post'``.
    """
    if padding not in ('pre', 'post'):
        raise ValueError(f"padding must be 'pre' or 'post', got {padding!r}")

    # Materialise each item as a list so tuples and arrays behave like lists;
    # `array + list` would otherwise add element-wise instead of concatenating.
    rows = [list(seq)[:maxlen] for seq in sequences]

    # Start from an array full of the pad value and copy the data into place;
    # this also yields a well-formed (0, maxlen) result for empty input.
    padded = np.full((len(rows), maxlen), value, dtype=np.float32)
    for row_idx, row in enumerate(rows):
        if not row:
            continue
        if padding == 'post':
            padded[row_idx, :len(row)] = row
        else:
            padded[row_idx, maxlen - len(row):] = row
    return padded

# Bring every sequence in both splits to the common length, training split first.
X_train, X_val = (
    pad_sequences(split['prob_diff_values'], maxlen=max_seq_length)
    for split in (train_df, val_df)
)

# Encoded verdicts as plain arrays, in the same row order as the sequences.
Y_train, Y_val = (split['verdict_label'].values for split in (train_df, val_df))

# Create a PyTorch Dataset class
class SequenceDataset(Dataset):
    """Map-style dataset pairing float sequences with integer class labels.

    Args:
        sequences: Array-like convertible to a ``float32`` tensor; the first
            axis indexes samples.
        labels: Array-like of integer class ids, one per sample.

    Raises:
        ValueError: If the number of sequences and labels differ.
    """

    def __init__(self, sequences, labels):
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)
        # Fail fast: a mismatch would otherwise only surface as an IndexError
        # deep inside a DataLoader, or as silently ignored trailing labels.
        if len(self.sequences) != len(self.labels):
            raise ValueError(
                f"sequences and labels must have the same length, "
                f"got {len(self.sequences)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the ``(sequence, label)`` pair stored at ``idx``."""
        return self.sequences[idx], self.labels[idx]

# Recurrent layer constructors selectable through params['model_type'].
_RNN_LAYERS = {'LSTM': nn.LSTM, 'GRU': nn.GRU}


class _SequenceClassifier(nn.Module):
    """Recurrent classifier: RNN over the sequence, then dropout and a linear head."""

    def __init__(self, rnn_cls, input_size, hidden_size, num_layers, num_classes, dropout):
        super().__init__()
        # The recurrent layers only apply dropout *between* stacked layers, so
        # with a single layer a non-zero value does nothing but raise a warning.
        rnn_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = rnn_cls(input_size, hidden_size, num_layers,
                           batch_first=True, dropout=rnn_dropout)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x arrives as (batch, seq_len); add a trailing feature axis of size 1.
        # The initial hidden (and cell) state is left to the layer's default of
        # zeros, which also keeps it on the same device/dtype as the input.
        out, _ = self.rnn(x.unsqueeze(-1))
        # NOTE(review): X_train/X_val are post-padded with 0.0 at module level,
        # so for sequences shorter than the maximum the last timestep read here
        # is padding rather than real data. Consider pre-padding or packed
        # sequences; behavior is kept unchanged here.
        return self.fc(self.dropout(out[:, -1, :]))


# Define a function to train and evaluate the model with given hyperparameters
def train_and_evaluate_model(params):
    """Train a recurrent classifier and return its validation accuracy.

    Reads the module-level ``X_train``, ``Y_train``, ``X_val``, ``Y_val`` arrays
    and the fitted label encoder ``le``. Prints the mean training loss after
    every epoch.

    Args:
        params: Mapping with the keys ``'hidden_size'``, ``'num_layers'``,
            ``'dropout'``, ``'learning_rate'``, ``'batch_size'``,
            ``'num_epochs'`` and ``'model_type'`` (``'LSTM'`` or ``'GRU'``).

    Returns:
        Validation accuracy as a percentage in the range 0-100.

    Raises:
        KeyError: If a required hyperparameter is missing from ``params``.
        ValueError: If ``model_type`` is not supported, or if the training or
            validation set is empty.
    """
    # Unpack hyperparameters
    hidden_size = params['hidden_size']
    num_layers = params['num_layers']
    dropout = params['dropout']
    learning_rate = params['learning_rate']
    batch_size = params['batch_size']
    num_epochs = params['num_epochs']
    model_type = params['model_type']

    rnn_cls = _RNN_LAYERS.get(model_type)
    if rnn_cls is None:
        raise ValueError("Invalid model type. Choose 'LSTM' or 'GRU'.")

    # Create datasets and dataloaders
    train_dataset = SequenceDataset(X_train, Y_train)
    val_dataset = SequenceDataset(X_val, Y_val)
    if len(train_dataset) == 0 or len(val_dataset) == 0:
        # Without this the averages below would divide by zero.
        raise ValueError("Training and validation sets must both be non-empty.")

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Instantiate the model
    input_size = 1  # Each element in the sequence is a single float
    num_classes = len(le.classes_)
    model = _SequenceClassifier(rnn_cls, input_size, hidden_size,
                                num_layers, num_classes, dropout)

    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for sequences, labels in train_loader:
            # Forward pass
            outputs = model(sequences)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

    # Evaluation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for sequences, labels in val_loader:
            predicted = model(sequences).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return 100 * correct / total

# Candidate values for each hyperparameter explored by the grid search.
hidden_sizes = [64, 128, 256]
num_layers_list = [1, 2]
dropouts = [0.0, 0.2, 0.5]
learning_rates = [0.001, 0.0001]
batch_sizes = [16, 32, 64]
num_epochs_list = [10]  # You can adjust this
model_types = ['LSTM', 'GRU']

# Map each hyperparameter name (as read by train_and_evaluate_model) to its candidates.
param_grid = dict(
    hidden_size=hidden_sizes,
    num_layers=num_layers_list,
    dropout=dropouts,
    learning_rate=learning_rates,
    batch_size=batch_sizes,
    num_epochs=num_epochs_list,
    model_type=model_types,
)

# Expand the grid into one dict per point of the Cartesian product.
keys = tuple(param_grid.keys())
values = tuple(param_grid.values())
hyperparameter_combinations = [
    dict(zip(keys, combo)) for combo in itertools.product(*values)
]

# Exhaustive search: train every configuration and remember the one with the
# highest validation accuracy (ties keep the earliest configuration).
best_accuracy = 0.0
best_params = None

for params in hyperparameter_combinations:
    print(f"Testing parameters: {params}")
    val_accuracy = train_and_evaluate_model(params)
    print(f"Validation Accuracy: {val_accuracy:.2f}%\n")

    if val_accuracy > best_accuracy:
        best_accuracy, best_params = val_accuracy, params

print(f"Best Validation Accuracy: {best_accuracy:.2f}%")
print(f"Best Hyperparameters: {best_params}")


Testing parameters: {'hidden_size': 64, 'num_layers': 1, 'dropout': 0.0, 'learning_rate': 0.001, 'batch_size': 16, 'num_epochs': 10, 'model_type': 'LSTM'}
