In [None]:
%reset

In [None]:
import optuna
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import  mean_squared_error, mean_absolute_percentage_error, r2_score
from scipy import stats

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Gate for the final test-set cells further down: while this is False, the guard cell
# raises and stops execution before the test set is loaded.
# Set to True only once hyperparameter tuning is complete and the test set should be
# loaded and predicted on.
OUTPUT_TEST = False

In [None]:
# Read the cleaned training and validation splits (features, then labels) from disk.
X_train, y_train, X_val, y_val = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/cleaned/training.csv",
        "../data/cleaned/training_labels.csv",
        "../data/cleaned/validation.csv",
        "../data/cleaned/validation_labels.csv",
    )
)

In [None]:
# Some column headers contain '[' or ']', which are not accepted as feature names by the
# model fitted below, so they are changed to '(' and ')' respectively.
# NOTE(review): the restriction most likely comes from XGBoost's feature-name validation
# rather than scikit-learn itself - confirm.
def _replace_brackets(name):
    """Return ``name`` with every '[' replaced by '(' and every ']' by ')'."""
    return name.replace('[', '(').replace(']', ')')

# Each frame is cleaned from its own headers in a single rename pass, so a bracketed
# column that only appears in the validation frame is fixed as well.
X_train = X_train.rename(columns=_replace_brackets)
X_val = X_val.rename(columns=_replace_brackets)

In [None]:
# Split the training set 90/10 into a (smaller) training set and a verification set.
# The verification set is the one scored inside the Optuna objective during hyperparameter
# tuning; the separately loaded validation set is only used for reporting.
# Note: this overwrites X_train / y_train, so re-running this cell on its own splits the
# already-reduced training set again.
X_train, X_verif, y_train, y_verif = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [None]:
# After the split the frames keep their original row labels; give every frame a fresh
# 0..n-1 index and drop the old one.
X_train, y_train, X_verif, y_verif, X_val, y_val = (
    frame.reset_index(drop=True)
    for frame in (X_train, y_train, X_verif, y_verif, X_val, y_val)
)

In [None]:
# Baseline: fit an untuned XGBRegressor and score it on the validation set, so the tuned
# model can later be compared against these numbers.
sanity_check = XGBRegressor()
sanity_check.fit(X_train, y_train)
val_preds = sanity_check.predict(X_val)
# Flatten the label frame to a 1-D array so it lines up with the 1-D predictions.
val_true = y_val.to_numpy().squeeze()
# RMSE is taken as sqrt(MSE): the `squared=False` keyword of mean_squared_error is
# deprecated and removed in newer scikit-learn releases.
sanity_val_error = np.sqrt(mean_squared_error(val_true, val_preds))
# r2_score returns a plain float; indexing it with [0] raises.
sanity_val_r = r2_score(val_true, val_preds)
print("SANITY CHECK VALUES:")
print("Validation RMSE:", sanity_val_error)
# The value is the coefficient of determination, so it is labelled R^2 rather than R.
print("Validation R^2:", sanity_val_r)


In [None]:

def objective(trial):
    """Optuna objective: fit an XGBRegressor with sampled hyperparameters and score it.

    The model is trained on the notebook-level ``X_train`` / ``y_train`` and scored on
    the ``X_verif`` / ``y_verif`` hold-out.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The sum of the verification MAPE (expressed as a percentage) and the
        verification RMSE. The two terms are in different units and are simply added.
    """
    # `suggest_float(..., log=True)` samples on a log scale, like the deprecated
    # `suggest_loguniform` it replaces; names and ranges are unchanged.
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9, log=True),
    }

    # Fixed (not tuned) setting; it is therefore not part of trial.params.
    params["tree_method"] = "hist"

    # Fit the model
    optuna_model = XGBRegressor(**params)
    optuna_model.fit(X_train, y_train)

    # Make predictions
    verif_pred = optuna_model.predict(X_verif)
    verif_loss = mean_absolute_percentage_error(y_verif, verif_pred) * 100
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and removed in
    # newer scikit-learn releases.
    verif_error = np.sqrt(mean_squared_error(y_verif, verif_pred))

    # Evaluate predictions
    error = verif_loss + verif_error

    return error


In [None]:
# Run the hyperparameter search with a CMA-ES sampler.
# The sampler is seeded (same value as the train/verification split) so that the sequence
# of suggested hyperparameters is repeatable from one run of the notebook to the next.
sampler = optuna.samplers.CmaEsSampler(seed=42)
study = optuna.create_study(sampler=sampler)
study.optimize(objective, n_trials=50)

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
# Refit on the training split with the best hyperparameters found by the study.
params = dict(trial.params)
# NOTE(review): the commented line below looks like a previously found configuration kept
# so the model can be rebuilt without re-running the study - confirm before relying on it.
#params = {'max_depth': 8, 'learning_rate': 0.32451199000475434, 'n_estimators': 525, 'min_child_weight': 6, 'gamma': 6.149852814458083e-05, 'subsample': 0.8924095316799702, 'colsample_bytree': 0.7277371421020629}
# trial.params only holds the values suggested inside objective(); the fixed tree_method
# used during tuning is added back so the final model is trained with the same
# configuration that was scored.
params["tree_method"] = "hist"
model = XGBRegressor(**params)
print(params)
model.fit(X_train, y_train)

In [None]:
# Alias kept for backward compatibility; the same function is already imported at the top
# of the notebook under its full name.
from sklearn.metrics import mean_absolute_percentage_error as mape

# Score the tuned model on the validation and training sets and compare with the baseline.
val_preds = model.predict(X_val)
train_preds = model.predict(X_train)
# Flatten the label frames to 1-D arrays so they line up with the 1-D predictions.
val_true = y_val.to_numpy().squeeze()
train_true = y_train.to_numpy().squeeze()

# RMSE is taken as sqrt(MSE): the `squared=False` keyword of mean_squared_error is
# deprecated and removed in newer scikit-learn releases.
error = np.sqrt(mean_squared_error(val_true, val_preds))
# r2_score returns a plain float; indexing it with [0] raises.
r_error = r2_score(val_true, val_preds)
# Previously this was the raw MSE although it is printed as "Training RMSE".
train_error = np.sqrt(mean_squared_error(train_true, train_preds))
train_r_error = r2_score(train_true, train_preds)

print("Validation RMSE:", error)
print("Difference from sanity check:", error - sanity_val_error)
print("Validation R^2:", r_error)
print("Difference from sanity check:", r_error - sanity_val_r)
# Scaled by 100 so the value is a percentage, like the other percent-error outputs.
print("Validation PE", mean_absolute_percentage_error(val_true, val_preds)*100)

print("Training RMSE:", train_error)
print("Training R^2:", train_r_error)

In [None]:
# Deliberate stop: while OUTPUT_TEST is False, raising here halts a "Run All" before any
# of the test-set cells below are executed.
if not OUTPUT_TEST:
    stop_reason = "OUTPUT_TEST set to False. If you would like to output final test values set to True and continue running from here"
    raise ValueError(stop_reason)

In [None]:
# Read the cleaned test features and test labels from disk.
X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/cleaned/test.csv",
        "../data/cleaned/test_labels.csv",
    )
)

In [None]:
# Give the test headers the same treatment as the training headers:
# '[' becomes '(' and ']' becomes ')', so the feature names match the fitted model.
bracket_renames = {
    header: header.replace('[', '(').replace(']', ')')
    for header in X_test.columns
    if '[' in header or ']' in header
}
X_test = X_test.rename(columns=bracket_renames)

In [None]:
# Predict with the fitted model on the test features and on the training features.
test_preds, train_preds = (
    model.predict(features) for features in (X_test, X_train)
)

In [None]:
# Save the predictions and true values for the test set, then for the training set.
# Every file is written without an index column and without a header row.
prediction_exports = (
    (test_preds, '../data/predictions/XG/test_pred_xg.csv'),
    (y_test, '../data/predictions/XG/test_true_xg.csv'),
    (train_preds, '../data/predictions/XG/train_pred_xg.csv'),
    (y_train, '../data/predictions/XG/train_true_xg.csv'),
)
for values, pred_filepath in prediction_exports:
    pred_data = pd.DataFrame(values)
    pred_data.to_csv(pred_filepath, index=False, header=False)

In [None]:
# Save the model inputs (training and test features) next to the predictions.
# As with the prediction files, no index column and no header row are written.
input_exports = (
    (X_train, '../data/predictions/XG/train_input_xg.csv'),
    (X_test, '../data/predictions/XG/test_input_xg.csv'),
)
for features, export_path in input_exports:
    pd.DataFrame(features).to_csv(export_path, index=False, header=False)

In [None]:
# Read the saved predictions / true values back from CSV and compute RMSE, R^2 and Pearson r.

test_pred_data = np.genfromtxt('../data/predictions/XG/test_pred_xg.csv', delimiter=',', filling_values=np.nan)
test_true_data = np.genfromtxt('../data/predictions/XG/test_true_xg.csv', delimiter=',', filling_values=np.nan)
train_pred_data = np.genfromtxt('../data/predictions/XG/train_pred_xg.csv', delimiter=',', filling_values=np.nan)
train_true_data = np.genfromtxt('../data/predictions/XG/train_true_xg.csv', delimiter=',', filling_values=np.nan)

# RMSE is taken as sqrt(MSE): the `squared=False` keyword of mean_squared_error is
# deprecated and removed in newer scikit-learn releases.
test_rmse = np.sqrt(mean_squared_error(test_true_data, test_pred_data))
test_r = r2_score(test_true_data,test_pred_data)        # coefficient of determination (R^2)
pearson_r = stats.pearsonr(test_true_data,test_pred_data)  # full pearsonr result, not just the statistic

train_rmse = np.sqrt(mean_squared_error(train_true_data, train_pred_data))
train_r = stats.pearsonr(train_true_data,train_pred_data)  # computed but not printed below


print("Train:")
print(train_rmse)   # training RMSE
print('Test:')
print(test_rmse)    # test RMSE
print(test_r)       # test R^2
print(pearson_r)    # test Pearson correlation result

In [None]:
# Mean absolute percentage error on the test set, scaled from a fraction to a percentage.
test_percent_error = mean_absolute_percentage_error(test_true_data, test_pred_data) * 100
print("percent Error:", test_percent_error)

In [None]:
# Percent error restricted to the upper part of the target range: rows are sorted by true
# value and everything from the row closest to the mid-range point upwards is kept.
split_df = (
    pd.DataFrame({'true': test_true_data, 'pred': test_pred_data})
    .sort_values(by='true')
    .reset_index(drop=True)
)
# Mid-point of the observed target range (not the median).
mid = (test_true_data.max() + test_true_data.min()) / 2

# Distance of every sorted row from the mid-point. The cut is the LAST row that attains the
# minimum distance, which is what the previous `<=` scan selected. Searching for the true
# minimum removes the hard-coded starting distance of 1000, which left idx at -1 (only the
# final row kept) whenever every target was further than 1000 from the mid-point.
distance_to_mid = (split_df['true'] - mid).abs().to_numpy()
idx = len(distance_to_mid) - 1 - int(np.argmin(distance_to_mid[::-1]))
diff = distance_to_mid[idx]

top_half = split_df.iloc[idx:]
# Fraction of the test rows that fall in the retained upper part.
print(len(top_half['true'])/len(split_df))
top_half_true = top_half['true'].to_numpy().squeeze()
top_half_pred = top_half['pred'].to_numpy().squeeze()
print("adjusted percent Error:", mean_absolute_percentage_error(top_half_true, top_half_pred)*100)