In [None]:
import sys
sys.path.append('../functions')

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import time

from data_processing import load_and_process_data
from models import NeuralNetwork, train_ensemble, predict_ensemble
from metrics import calculate_metrics, calculate_observed_confidence
from post_processing import create_errorbar_plot, plot_abs_error_vs_std, plot_std_histogram, plot_calibration_curve

# Device configuration
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Hyperparameters
BATCH_SIZE = 4            # samples per batch for every DataLoader
EPOCHS = 250              # epoch count handed to train_ensemble
N_ENSEMBLES = 20          # number of independently initialised networks
NEURONS = (256, 512, 256) # unpacked into NeuralNetwork after the input width (presumably hidden-layer sizes; confirm in models.py)
LEARNING_RATE = 0.0003    # Adam learning rate
num_zero_threshold = 3600 # forwarded to load_and_process_data; exact meaning lives in data_processing.py

# Reproducibility: weight initialisation and batch shuffling draw from the
# global torch / NumPy generators, so seed them once before any stochastic
# work. Ensemble members are created one after another from the same stream,
# so they still receive different initial weights.
# NOTE: some GPU kernels are non-deterministic even with a fixed seed.
SEED = 11
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load and process data
file_path = "../data/other_property/Tm.csv"
X_count, Y = load_and_process_data(file_path, num_zero_threshold)

In [None]:
# Split the data
# Stage 1: hold out 20% of the samples; the remaining 80% is the training set.
xtrain, xtemp, ytrain, ytemp = train_test_split(
    X_count,
    Y,
    test_size=0.2,
    random_state=11,
)
# Stage 2: cut the held-out part in half -> validation and test sets.
xval, xtest, yval, ytest = train_test_split(
    xtemp,
    ytemp,
    test_size=0.5,
    random_state=42,
)

# Create DataLoaders
def create_dataloader(x, y, batch_size, shuffle=True):
    """Wrap features and targets in a DataLoader whose tensors live on `device`.

    Args:
        x: Feature table exposing `.values` (e.g. a pandas DataFrame).
        y: Targets accepted by `torch.FloatTensor`; a trailing axis of size 1
            is appended with `unsqueeze(1)`.
        batch_size: Number of samples per batch.
        shuffle: Reshuffle the samples every time the loader is iterated.
            Defaults to True (the previously hard-coded behaviour); pass
            False for loaders used for evaluation / prediction.

    Returns:
        A DataLoader over a TensorDataset of (features, targets).
    """
    tensor_x = torch.FloatTensor(x.values).to(device)
    tensor_y = torch.FloatTensor(y).unsqueeze(1).to(device)
    dataset = TensorDataset(tensor_x, tensor_y)
    return DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle)

# Evaluation loaders keep a fixed sample order. A shuffling loader yields a
# different order on every pass, so if predict_ensemble walks the loader once
# per model (TODO confirm in models.py) the per-sample mean/std across models
# would be computed over mismatched samples.
train_loader = create_dataloader(xtrain, ytrain, BATCH_SIZE, shuffle=False)
val_loader = create_dataloader(xval, yval, BATCH_SIZE, shuffle=False)
test_loader = create_dataloader(xtest, ytest, BATCH_SIZE, shuffle=False)

# For training, create a separate shuffled loader
# Tensors are placed on `device` like every other loader so batches and model
# parameters share a device.
# NOTE(review): targets are NOT unsqueezed here, unlike create_dataloader;
# left unchanged - confirm the shape train_ensemble expects.
train_loader_shuffled = DataLoader(
    TensorDataset(
        torch.tensor(xtrain.values).float().to(device),
        torch.tensor(ytrain).float().to(device),
    ),
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Train ensemble
# The timer spans both model construction and the training of every member.
start_time = time.time()

# Build all ensemble members up front; each is moved to `device` on creation.
n_input_features = xtrain.shape[1]
models = [
    NeuralNetwork(n_input_features, *NEURONS).to(device)
    for _ in range(N_ENSEMBLES)
]

# Train the members one by one, each with its own fresh Adam optimizer.
for i in range(N_ENSEMBLES):
    model = models[i]
    print(f"Training Model {i+1}/{N_ENSEMBLES}")
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    # Both the plain and the shuffled training loader are handed over; how
    # each is used is defined by train_ensemble in models.py.
    train_ensemble(model, optimizer, train_loader, train_loader_shuffled, EPOCHS)

end_time = time.time()
total_time = end_time - start_time
print(f"Total training time: {total_time:.2f} seconds")

In [None]:
# Make predictions
# predict_ensemble hands back (mean, std, actual); each part is converted to a
# flat 1-D NumPy array as it is unpacked.
mean_train, std_train, actual_train = (
    np.array(part).flatten() for part in predict_ensemble(models, train_loader)
)
mean_test, std_test, actual_test = (
    np.array(part).flatten() for part in predict_ensemble(models, test_loader)
)

# Calculate metrics
# Confidence levels passed to calculate_metrics: 0 to 1 in steps of 0.05.
confidence_levels = np.arange(0, 1.05, 0.05)
train_metrics = calculate_metrics(actual_train, mean_train, std_train, confidence_levels)
test_metrics = calculate_metrics(actual_test, mean_test, std_test, confidence_levels)

# Create a DataFrame to store the metrics
# One row per split, one column per metric key read from the result dicts.
metric_names = ['MAE', 'RMSE', 'R2', 'Spearman', 'Calibration Area']
metrics_df = pd.DataFrame({
    'Dataset': ['Training', 'Test'],
    **{name: [train_metrics[name], test_metrics[name]] for name in metric_names},
})

# Print the DataFrame
print(metrics_df)

In [None]:
# Plot results
# One error-bar figure per split; arguments are (actual, mean, std, colour, label).
train_errorbar_args = (actual_train, mean_train, std_train, 'blue', 'Training')
test_errorbar_args = (actual_test, mean_test, std_test, 'green', 'Test')
create_errorbar_plot(*train_errorbar_args)
create_errorbar_plot(*test_errorbar_args)

In [None]:
# Plot additional figures
# Absolute prediction error |actual - ensemble mean| for each split.
abs_error_train, abs_error_test = (
    np.abs(actual - predicted)
    for actual, predicted in (
        (actual_train, mean_train),
        (actual_test, mean_test),
    )
)

# Absolute error against the ensemble standard deviation, one figure per split.
plot_abs_error_vs_std(abs_error_train, std_train, 'Training', 'blue')
plot_abs_error_vs_std(abs_error_test, std_test, 'Test', 'green')

In [None]:
# Sanity check: show the shape of the flattened training targets.
np.shape(actual_train)

In [None]:
# Histogram of the ensemble standard deviation, one figure per split.
train_hist_args = (std_train, 'Training', 'blue')
test_hist_args = (std_test, 'Test', 'green')
plot_std_histogram(*train_hist_args)
plot_std_histogram(*test_hist_args)

In [None]:
# Calculate observed confidence
# Same helper and same confidence levels for both splits, training first.
observed_confidence_train, observed_confidence_test = (
    calculate_observed_confidence(actual, mean, std, confidence_levels)
    for actual, mean, std in (
        (actual_train, mean_train, std_train),
        (actual_test, mean_test, std_test),
    )
)

# Plot calibration curves
# One figure per split, fed the confidence levels and the observed values.
plot_calibration_curve(confidence_levels, observed_confidence_train, 'Training')
plot_calibration_curve(confidence_levels, observed_confidence_test, 'Test')