In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:
# Experiment configuration: which dataset directory to read, which fold is
# held out for testing, and which feature columns are fed to the model.
dataset = "detailed"
test_fold = 1
chosen_feature = [
    "loglog_count",
    "log_variance",
    "log_range_value",
    "loglog_sum_diff",
]

In [None]:
# Read the three CSV tables of the selected dataset. The feature table is
# narrowed to the sequence identifier plus the chosen feature columns.
selected_columns = ['sequenceID', *chosen_feature]
folds_df = pd.read_csv(f'../../training_data/{dataset}/folds.csv')
target_df = pd.read_csv(f'../../training_data/{dataset}/target.csv')
features_df = pd.read_csv(f'../../training_data/{dataset}/features.csv')[selected_columns]

In [None]:
# Sequence IDs of the held-out fold form the test set; every other fold
# contributes to the training set.
test_ids = folds_df.loc[folds_df['fold'] == test_fold, 'sequenceID']
train_ids = folds_df.loc[folds_df['fold'] != test_fold, 'sequenceID']

# Select rows of each table by membership of their sequenceID in the ID sets.
features_df_test = features_df.loc[features_df['sequenceID'].isin(test_ids)]
features_df_train = features_df.loc[features_df['sequenceID'].isin(train_ids)]
target_df_train = target_df.loc[target_df['sequenceID'].isin(train_ids)]

In [None]:
# Display the feature rows whose sequenceID belongs to the training folds.
features_df_train

In [None]:
# Set random seed for reproducibility
torch.manual_seed(12345)
np.random.seed(12345)

# Sample data (replace with your actual data)
# NOTE(review): these random arrays are placeholders; the feature/target
# frames loaded from CSV in the earlier cells are not consumed here.
X_train = np.random.rand(100, 10)
y_train = np.random.rand(100, 1)
X_test = np.random.rand(20, 10)
y_test = np.random.rand(20, 1)

# Split train set into subtrain and validation (8:2 ratio).
# The split happens BEFORE scaling so that the validation rows, which drive
# early stopping, do not contribute to the scaler's mean/variance.
X_subtrain, X_val, y_subtrain, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=12345)

# Preprocess by standardizing the data: fit on subtrain only, then apply the
# same fitted transform to every other split.
scaler = StandardScaler()
X_subtrain = scaler.fit_transform(X_subtrain)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
# Keep `X_train` referring to standardized data, as it did before this change.
X_train = scaler.transform(X_train)

# Convert data to PyTorch tensors (float32 is the default dtype of nn.Linear weights)
X_subtrain_tensor = torch.tensor(X_subtrain, dtype=torch.float32)
y_subtrain_tensor = torch.tensor(y_subtrain, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Define the MLP model
class MLPRegressor(nn.Module):
    """Feed-forward regressor with two 64-unit ReLU hidden layers.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of regression outputs per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Layers are created in the same order they are applied in forward().
        self.hidden1 = nn.Linear(input_dim, 64)
        self.hidden2 = nn.Linear(64, 64)
        self.output = nn.Linear(64, output_dim)

    def forward(self, x):
        """Apply both hidden layers with ReLU, then the linear output layer."""
        first_activation = torch.relu(self.hidden1(x))
        second_activation = torch.relu(self.hidden2(first_activation))
        return self.output(second_activation)

# Size the network from the data: one input unit per feature column and one
# output unit per target column.
input_dim, output_dim = X_subtrain.shape[1], y_subtrain.shape[1]

# Mean-squared-error objective, optimised with Adam at a fixed learning rate.
criterion = nn.MSELoss()
model = MLPRegressor(input_dim, output_dim)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training with early stopping: full-batch gradient steps on the subtrain
# split, stopping once the validation loss has not improved for `patience`
# consecutive epochs, then restoring the weights of the best epoch.
best_val_loss = float('inf')
patience = 20
patience_counter = 0

# Start from a snapshot of the initial weights so `best_model` is always
# defined, even if the validation loss never improves (e.g. it is NaN).
best_model = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

for epoch in range(1000):  # Large number to allow early stopping
    model.train()
    optimizer.zero_grad()

    # Forward pass
    outputs = model(X_subtrain_tensor)
    loss = criterion(outputs, y_subtrain_tensor)

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    # Validation step (no gradients needed, so skip building the autograd graph)
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        val_loss = criterion(val_outputs, y_val_tensor).item()

    # Early stopping check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # state_dict() hands back references to the live parameter tensors,
        # which later optimizer steps update in place. Clone them so the
        # snapshot really holds the weights of this epoch.
        best_model = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print(f"Early stopping at epoch {epoch + 1}")
        break

# Load the best model
model.load_state_dict(best_model)

# Run the model on the test tensor in evaluation mode; gradients are not
# tracked, so the output converts straight to a NumPy array.
model.eval()
with torch.no_grad():
    test_predictions = model(X_test_tensor).numpy()

# Report the mean squared error between the test targets and the predictions.
test_mse = mean_squared_error(y_test, test_predictions)
print(f"Test MSE: {test_mse}")
