In [1]:
import sys
sys.path.append('../functions')

# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import time
from ngboost import NGBRegressor
from ngboost.distns import Normal
# Import custom modules
from data_processing import load_and_process_data
from models import train_gpr, predict_gpr
from metrics import calculate_metrics, calculate_observed_confidence
from post_processing import create_errorbar_plot, plot_abs_error_vs_std, plot_std_histogram, plot_calibration_curve

In [None]:
# Threshold forwarded to load_and_process_data.
# NOTE(review): presumably a cutoff on how many zero entries a feature may have
# before it is dropped — confirm against data_processing.load_and_process_data.
num_zero_threshold = 3600

# Load and process data
file_path = "../data/other_property/Tm.csv"
X_count, Y = load_and_process_data(file_path, num_zero_threshold)

# Split the data: 80% for training, then the held-out 20% is halved into
# validation and test parts (10% each of the full data set).
# NOTE(review): xval / yval are not referenced by any later cell visible here.
xtrain, xtemp, ytrain, ytemp = train_test_split(X_count, Y, test_size=0.2, random_state=11)
xval, xtest, yval, ytest = train_test_split(xtemp, ytemp, test_size=0.5, random_state=42)

# NumPy versions of the feature matrices: ndarrays pass through untouched,
# anything else is expected to expose `.values` (e.g. a pandas DataFrame).
xtrain_np = xtrain if isinstance(xtrain, np.ndarray) else xtrain.values
xtest_np = xtest if isinstance(xtest, np.ndarray) else xtest.values

# Column-vector NumPy versions of the targets. Converting through np.asarray
# first means a pandas Series (which has no `.reshape`) is handled as well as a
# plain ndarray; targets that are already 2-D keep their shape.
ytrain_np = np.asarray(ytrain)
if ytrain_np.ndim == 1:
    ytrain_np = ytrain_np.reshape(-1, 1)

ytest_np = np.asarray(ytest)
if ytest_np.ndim == 1:
    ytest_np = ytest_np.reshape(-1, 1)

In [None]:
# Time the fit with a monotonic, high-resolution clock: unlike time.time(),
# time.perf_counter() is not affected by system clock adjustments, so the
# reported duration cannot be skewed or go negative.
start_time = time.perf_counter()

# NGBoost regressor whose predictive distribution is a Normal.
# NOTE(review): minibatch_frac / col_sample look like per-iteration row and
# column subsampling fractions — confirm against the NGBoost documentation.
model = NGBRegressor(
    Dist=Normal,
    n_estimators=4000,
    learning_rate=0.1,
    minibatch_frac=0.5,
    verbose=1,
    col_sample=0.5,
    random_state=42,
    natural_gradient=True,
)

# Train model
model.fit(xtrain, ytrain)

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Training took {elapsed_time:.2f} seconds.")

In [None]:
# Make predictions with uncertainty on training and test data
pred_train = model.pred_dist(xtrain)
pred_test = model.pred_dist(xtest)

# Extract mean (loc) and std (scale) of the predicted distributions as 1-D arrays
mean_train = pred_train.loc.flatten()
std_train = pred_train.scale.flatten()

mean_test = pred_test.loc.flatten()
std_test = pred_test.scale.flatten()

# 1-D targets; np.asarray lets this work whether or not the target object
# itself provides a `.flatten()` method.
ytrain_np = np.asarray(ytrain).flatten()
ytest_np = np.asarray(ytest).flatten()

# Calibration curve: 21 evenly spaced confidence levels 0.00, 0.05, ..., 1.00.
# np.linspace fixes the number of points and the end point explicitly, which a
# floating-point step passed to np.arange does not guarantee.
confidence_levels = np.linspace(0.0, 1.0, 21)

# Calculate metrics
train_metrics = calculate_metrics(ytrain_np, mean_train, std_train, confidence_levels)
test_metrics = calculate_metrics(ytest_np, mean_test, std_test, confidence_levels)

# Create a DataFrame to store the metrics: one row per data split, one column
# per metric (column order follows metric_names).
metric_names = ['MAE', 'RMSE', 'R2', 'Spearman', 'Calibration Area']
metrics_df = pd.DataFrame({
    'Dataset': ['Training', 'Test'],
    **{name: [train_metrics[name], test_metrics[name]] for name in metric_names},
})

# Print the DataFrame
print(metrics_df)

In [None]:
# Error-bar plots of predicted mean (with predicted std) against the true
# targets, first for the training split and then for the test split.
for y_true, y_mean, y_std, color, label in (
    (ytrain, mean_train, std_train, 'blue', 'Training'),
    (ytest, mean_test, std_test, 'green', 'Test'),
):
    create_errorbar_plot(y_true.flatten(), y_mean.flatten(), y_std.flatten(), color, label)

In [None]:
# Absolute difference between the true target and the predicted mean, per split
abs_error_train = np.abs(
    ytrain.flatten() - mean_train.flatten()
)
abs_error_test = np.abs(
    ytest.flatten() - mean_test.flatten()
)

# Absolute error against predicted std: training first, then test
for abs_error, std_values, label, color in (
    (abs_error_train, std_train, 'Training', 'blue'),
    (abs_error_test, std_test, 'Test', 'green'),
):
    plot_abs_error_vs_std(abs_error, std_values.flatten(), label, color)

In [None]:
# Histogram of the predicted standard deviations for each split
for std_values, label, color in (
    (std_train, 'Training', 'blue'),
    (std_test, 'Test', 'green'),
):
    plot_std_histogram(std_values.flatten(), label, color)

In [None]:
# Observed confidence at each level in confidence_levels, for both splits
observed_confidence_train = calculate_observed_confidence(
    ytrain.flatten(),
    mean_train.flatten(),
    std_train.flatten(),
    confidence_levels,
)
observed_confidence_test = calculate_observed_confidence(
    ytest.flatten(),
    mean_test.flatten(),
    std_test.flatten(),
    confidence_levels,
)

# Calibration curves: training first, then test
for label, observed_confidence in (
    ('Training', observed_confidence_train),
    ('Test', observed_confidence_test),
):
    plot_calibration_curve(confidence_levels, observed_confidence, label)