In [None]:
import pandas as pd
import pyrenn as pr
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from scikeras.wrappers import KerasRegressor
import numpy as np

In [None]:
# Read the Excel file
file_path = 'data/Data2502.xlsx'

# Open the workbook once and close it as soon as every sheet has been parsed,
# so the file handle is not kept open for the rest of the kernel session.
with pd.ExcelFile(file_path) as xls:
    # Training sheets (positions 0-2): the first spreadsheet row is skipped and
    # the following row supplies the column names.
    training_data_1 = pd.read_excel(xls, header=0, skiprows=1, sheet_name=0)
    training_data_2 = pd.read_excel(xls, header=0, skiprows=1, sheet_name=1)
    training_data_3 = pd.read_excel(xls, header=0, skiprows=1, sheet_name=2)

    # Generated sheets (positions 4-6; position 3 is not loaded here) are read
    # raw, without a header row; process_generated_data() extracts the header.
    generated_data_1 = pd.read_excel(xls, header=None, sheet_name=4)
    generated_data_2 = pd.read_excel(xls, header=None, sheet_name=5)
    generated_data_3 = pd.read_excel(xls, header=None, sheet_name=6)

In [None]:
def process_data(data):
    """Split a training sheet into model inputs and the target column.

    The first row of ``data`` is discarded. From the remaining rows, the
    columns at positions 1 through 13 form the inputs and the column at
    position 16 is the output.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least 17 columns.

    Returns
    -------
    tuple
        ``(input_data, output_data)`` -- a 13-column DataFrame and a Series,
        both keeping the row labels of ``data`` (minus the first row).
    """
    body = data.iloc[1:]
    return body.iloc[:, 1:14], body.iloc[:, 16]

def process_generated_data(data):
    """Extract the model inputs from a raw "generated data" sheet.

    The row at position 1 supplies the column labels, the rows labelled 0, 1
    and 2 are dropped, the remaining rows are renumbered from 0, and the
    columns at positions 1 through 13 are returned.

    The caller's frame is left untouched: the labels are applied to the
    trimmed copy rather than assigned to ``data.columns``, so calling this
    function does not relabel the module-level sheet as a side effect.

    Parameters
    ----------
    data : pandas.DataFrame
        Sheet read without a header row; needs at least 14 columns and rows
        labelled 0, 1 and 2.

    Returns
    -------
    pandas.DataFrame
        The 13 input columns of the data rows.
    """
    header = data.iloc[1]
    # drop() returns a new frame, so relabelling it cannot affect `data`.
    df = data.drop([0, 1, 2]).reset_index(drop=True)
    df.columns = header
    input_data = df.iloc[:, 1:14]

    return input_data

# Train Random Forest model
def train_rf_models(X_train, y_train):
    """Grid-search a random forest regressor and return the winning estimator.

    Every combination of the hyper-parameters below is evaluated with 5-fold
    cross-validation and ranked by R2; the search object's ``best_estimator_``
    is returned.

    Parameters
    ----------
    X_train : array-like
        Training inputs, passed straight to ``GridSearchCV.fit``.
    y_train : array-like
        Training targets.

    Returns
    -------
    RandomForestRegressor
        ``best_estimator_`` of the fitted search.
    """
    search_space = dict(
        n_estimators=[100, 200, 500, 1000],
        max_depth=[5, 10, 20, 30, None],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        max_features=[2, 4, 6, 8],
        # Single-valued entries: the same setting is applied to every candidate.
        bootstrap=[True],
        n_jobs=[-1],
        random_state=[42],
    )
    search = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid=search_space,
        cv=5,
        scoring='r2',
    )
    search.fit(X_train, y_train)
    return search.best_estimator_

# Train ANN model
def train_ann_model(X_train, y_train):
    """Create a pyrenn network and train it with ``pr.train_LM``.

    Both arrays are transposed before use, so the network's input size is the
    number of columns of ``X_train``. The layout is ``[n_inputs, 1, 1]``.

    Parameters
    ----------
    X_train : array with a ``.T`` attribute
        Training inputs, one sample per row.
    y_train : array with a ``.T`` attribute
        Training targets, one sample per row.

    Returns
    -------
    tuple
        ``(pr, net)`` -- the pyrenn module itself and the network object.
    """
    inputs = X_train.T
    targets = y_train.T
    n_inputs = inputs.shape[0]

    net = pr.CreateNN([n_inputs, 1, 1])
    # The return value is deliberately not captured here, so the trained state
    # is expected to live in `net` itself -- NOTE(review): confirm that
    # train_LM updates the passed network in place.
    pr.train_LM(inputs, targets, net, k_max=50, E_stop=1e-5)

    return pr, net

In [None]:
# Quick look at the inputs parsed from the first generated sheet
data = process_generated_data(generated_data_1)
print(data.head())

# Parse the first training sheet; `data` is rebound to the (inputs, target) pair
data = process_data(training_data_1)
X_train, y_train = data[0], data[1]

In [None]:
# Train and evaluate rf models
#
# For each training sheet: hold out 90 rows as an "extra validation" set, split
# the remainder 70/15/15 into train/test/validation, grid-search a random
# forest on the training part, then record R2, MSE, RMSD and MAE per split.
rf_models = []
# (record-key suffix, label printed after "R2", label printed after "RMSD"/"MAE")
rf_split_labels = [
    ('train', 'Train', 'Train'),
    ('test', 'Test', 'Test'),
    ('val', 'Validation', 'Val'),
    ('extra_val', 'Extra Validation', 'ExtraVal'),
]
for i, training_data in enumerate([training_data_1, training_data_2, training_data_3]):
    X, y = process_data(training_data)
    # An integer test_size is an absolute row count: 90 rows are set aside
    X_train, X_extra_val, y_train, y_extra_val = train_test_split(X, y, test_size=90, random_state=42)
    # Keep 70% of the remainder for training ...
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3, random_state=42)
    # ... and halve the other 30% into a test set and a validation set
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

    # Random Forest
    rf_model = train_rf_models(X_train, y_train)

    rf_splits = {
        'train': (X_train, y_train),
        'test': (X_test, y_test),
        'val': (X_val, y_val),
        'extra_val': (X_extra_val, y_extra_val),
    }

    # Predict once per split and derive every metric from that prediction
    r2_by_split, mse_by_split, rmsd_by_split, mae_by_split = {}, {}, {}, {}
    for split, (X_split, y_split) in rf_splits.items():
        y_split_pred = rf_model.predict(X_split)
        r2_by_split[split] = r2_score(y_split, y_split_pred)
        mse_by_split[split] = mean_squared_error(y_split, y_split_pred)
        rmsd_by_split[split] = np.sqrt(mse_by_split[split])
        mae_by_split[split] = mean_absolute_error(y_split, y_split_pred)

    # print rf results
    print(f"Random Forest Model {i + 1}:")
    for split, r2_label, err_label in rf_split_labels:
        print(f"R2 {r2_label}: {r2_by_split[split]:.4f} | RMSD {err_label}: {rmsd_by_split[split]:.4f} | MAE {err_label}: {mae_by_split[split]:.4f}")

    # Record layout: 'model', then r2_*, mse_*, rmsd_*, mae_* for
    # train / test / val / extra_val (e.g. 'rmsd_extra_val').
    rf_record = {'model': rf_model}
    for metric_name, metric_values in (
        ('r2', r2_by_split),
        ('mse', mse_by_split),
        ('rmsd', rmsd_by_split),
        ('mae', mae_by_split),
    ):
        for split in rf_splits:
            rf_record[f'{metric_name}_{split}'] = metric_values[split]
    rf_models.append(rf_record)

In [None]:
# Train and evaluate ann models
#
# Same protocol as the random-forest cell: 90 rows held out as "extra
# validation", the remainder split 70/15/15 into train/test/validation, and
# R2, MSE, RMSD and MAE recorded per split.
ann_models = []
# (record-key suffix, label printed after "R2", label printed after "RMSD"/"MAE")
ann_split_labels = [
    ('train', 'Train', 'Train'),
    ('test', 'Test', 'Test'),
    ('val', 'Validation', 'Val'),
    ('extra_val', 'Extra Validation', 'ExtraVal'),
]
for i, training_data in enumerate([training_data_1, training_data_2, training_data_3]):
    X, y = process_data(training_data)
    # ANN
    # NOTE(review): the scaler is fitted on every row *before* the split below,
    # so statistics of the held-out rows influence the training inputs. It is
    # left as is because the export cell refits a scaler on the same full
    # sheet and would otherwise feed the network differently scaled inputs.
    scaler_X = StandardScaler()
    X_scaled = scaler_X.fit_transform(X)
    # Targets as a float32 column vector
    y = y.values.reshape(-1, 1)
    y = np.array(y, dtype=np.float32)

    # An integer test_size is an absolute row count: 90 rows are set aside
    X_train, X_extra_val, y_train, y_extra_val = train_test_split(X_scaled, y, test_size=90, random_state=42)
    # Keep 70% of the remainder for training ...
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3, random_state=42)
    # ... and halve the other 30% into a test set and a validation set
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

    print(f"Training ANN Model {i + 1}:")
    # train_ann_model hands back the pyrenn module together with the network
    pr, net = train_ann_model(X_train, y_train)
    print("Done")

    ann_splits = {
        'train': (X_train, y_train),
        'test': (X_test, y_test),
        'val': (X_val, y_val),
        'extra_val': (X_extra_val, y_extra_val),
    }

    # Predict once per split; inputs are transposed for pyrenn, matching training
    r2_by_split, mse_by_split, rmsd_by_split, mae_by_split = {}, {}, {}, {}
    for split, (X_split, y_split) in ann_splits.items():
        y_split_pred = pr.NNOut(X_split.T, net)
        r2_by_split[split] = r2_score(y_split, y_split_pred)
        # Flatten both sides for MSE / MAE to avoid shape issues
        y_true_flat = np.ravel(y_split)
        y_pred_flat = np.ravel(y_split_pred)
        mse_by_split[split] = mean_squared_error(y_true_flat, y_pred_flat)
        rmsd_by_split[split] = np.sqrt(mse_by_split[split])
        mae_by_split[split] = mean_absolute_error(y_true_flat, y_pred_flat)

    # Print ANN results with added metrics
    print(f"ANN Model {i + 1}:")
    for split, r2_label, err_label in ann_split_labels:
        print(f"R2 {r2_label}: {r2_by_split[split]:.4f} | RMSD {err_label}: {rmsd_by_split[split]:.4f} | MAE {err_label}: {mae_by_split[split]:.4f}")

    # Record layout: 'model', then r2_*, mse_*, rmsd_*, mae_* for
    # train / test / val / extra_val (e.g. 'rmsd_extra_val').
    ann_record = {'model': [pr, net]}
    for metric_name, metric_values in (
        ('r2', r2_by_split),
        ('mse', mse_by_split),
        ('rmsd', rmsd_by_split),
        ('mae', mae_by_split),
    ):
        for split in ann_splits:
            ann_record[f'{metric_name}_{split}'] = metric_values[split]
    ann_models.append(ann_record)
    

In [None]:
# write rf predictions to excel
#
# Columns are collected as 0-based Series and assembled in one step, so sheets
# with different row counts are padded with NaN instead of raising (or being
# silently truncated by index alignment) when assigned into a shared frame.
rf_columns = {}
for i, data in enumerate([training_data_1, training_data_2, training_data_3]):
    X, y = process_data(data)
    rf_model = rf_models[i]['model']
    # Predictions cover every row of the sheet, including rows used for training
    y_pred = rf_model.predict(X)

    # targets next to the model's predictions
    rf_columns[f'y{i+1}'] = pd.Series(y.to_numpy())
    rf_columns[f'y_pred_rf{i+1}'] = pd.Series(y_pred)

rf_results = pd.DataFrame(rf_columns)
rf_results.to_excel('data/rf_results.xlsx', sheet_name='Sheet1', index=False)

# write generated data predictions to excel
generated_columns = {}
for i, data in enumerate([generated_data_1, generated_data_2, generated_data_3]):
    X = process_generated_data(data)
    rf_model = rf_models[i]['model']
    y_pred = rf_model.predict(X)

    generated_columns[f'y_pred_rf{i+1}'] = pd.Series(y_pred)

generated_results = pd.DataFrame(generated_columns)

# write generated data predictions to same excel file
# (append mode adds a second sheet to the workbook created just above)
with pd.ExcelWriter('data/rf_results.xlsx', engine='openpyxl', mode='a') as writer:
    generated_results.to_excel(writer, sheet_name='generated_data', index=False)

In [None]:
# write ann predictions to excel
#
# Columns are collected as 0-based Series and assembled in one step, so sheets
# with different row counts are padded with NaN instead of raising.
ann_columns = {}
for i, data in enumerate([training_data_1, training_data_2, training_data_3]):
    X, y = process_data(data)
    # Refit on the full sheet, exactly as the training cell did before splitting,
    # so the network sees inputs on the scale it was trained with
    scaler_X = StandardScaler()
    X_scaled = scaler_X.fit_transform(X)
    y = y.values.reshape(-1, 1)
    y = np.array(y, dtype=np.float32)

    pr, net = ann_models[i]['model']
    y_pred = pr.NNOut(X_scaled.T, net)

    # targets next to the model's predictions
    ann_columns[f'y{i+1}'] = pd.Series(y.ravel())
    ann_columns[f'y_pred_ann{i+1}'] = pd.Series(np.ravel(y_pred))

ann_results = pd.DataFrame(ann_columns)
ann_results.to_excel('data/ann_results.xlsx', sheet_name='Sheet1', index=False)

# write generated data predictions to excel
generated_columns = {}
for i, (training_data, data) in enumerate(zip(
    [training_data_1, training_data_2, training_data_3],
    [generated_data_1, generated_data_2, generated_data_3],
)):
    X = process_generated_data(data)

    # The network was trained on inputs standardised with the statistics of its
    # own training sheet. Fitting a fresh scaler on the generated rows would
    # map them onto a different scale, so the scaler is fitted on the matching
    # training sheet and only *applied* to the generated inputs.
    # Plain float arrays are used so differing column labels between the two
    # sheets cannot trip scikit-learn's feature-name consistency check.
    X_fit = process_data(training_data)[0]
    scaler_X = StandardScaler()
    scaler_X.fit(X_fit.to_numpy(dtype=float))
    X_scaled = scaler_X.transform(X.to_numpy(dtype=float))

    pr, net = ann_models[i]['model']
    y_pred = pr.NNOut(X_scaled.T, net)

    generated_columns[f'y_pred_ann{i+1}'] = pd.Series(np.ravel(y_pred))

generated_results = pd.DataFrame(generated_columns)

# write generated data predictions to same excel file
# (append mode adds a second sheet to the workbook created just above)
with pd.ExcelWriter('data/ann_results.xlsx', engine='openpyxl', mode='a') as writer:
    generated_results.to_excel(writer, sheet_name='generated_data', index=False)
