<a href="https://colab.research.google.com/github/imeldp96/qsar_study/blob/main/XGBoost_Regressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install kennard_stone #to install kennard-stone splitting algorithm

In [None]:
# Mount Google Drive at /content/drive/ so the CSV stored there can be read from this Colab runtime.
from google.colab import drive
# force_remount=True requests a fresh mount even if the drive is already mounted.
drive.mount('/content/drive/', force_remount=True)

In [None]:
# to import dependencies
import pandas as pd
import numpy as np
from pandas import DataFrame as df
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, r2_score, mean_absolute_percentage_error
#import kennard_stone as ks #uncomment (after installing kennard_stone) to use the Kennard-Stone split; leave commented if it is not needed

# Seed passed as `random_state` to the final XGBRegressor so reruns give the same results; any integer works.
# NOTE(review): the name `random` would shadow the standard-library module if that module were imported later.
random = 42

In [None]:
# Path of the input CSV on the mounted Drive; change it to the location of your own file.
csv_path = '/content/drive/MyDrive/data/all.csv'
# Column 0 is used as the row index and row 0 as the column header.
data = pd.read_csv(csv_path, index_col=[0], header=[0])

In [None]:
# Descriptor matrix: every column of `data` except the target column 'pIC50'.
# An active assignment is required here; with all variants commented out, `X` is
# undefined and this cell (and every later one) fails with NameError.
X = data.drop('pIC50', axis=1)

# Alternative descriptor subsets; uncomment one of them to override the full set above.
#X=data[['X3', 'X21','X11']] #vi-qc
#X=data[['X11', 'X15', 'X16']] #ga-qc

y = data['pIC50']  # Setting y as the target variable.
X.head()

In [None]:
#Kennard-Stone algorithm for the training/test split (requires `import kennard_stone as ks`)
#X_train, X_test, y_train, y_test = ks.train_test_split(X, y, test_size=0.2) #test_size is the fraction of samples assigned to the test set; change as needed

In [None]:
# Manual split by row position. Do not run this cell if the Kennard-Stone split was used.
# Only use it when the rows of the CSV are already arranged as training rows followed by test rows.
train_rows = slice(None, 19)  # positions 0-18 -> training set
test_rows = slice(19, 28)     # positions 19-27 -> test set

X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
X_test, y_test = X.iloc[test_rows], y.iloc[test_rows]

In [None]:
# Print the target values of the rows selected as the test set.
print(y_test)

# HYPERPARAMETER TUNING

In [None]:
# Importing core libraries
import numpy as np
import pandas as pd
from time import time
from pprint import pprint
import joblib
from functools import partial

# Silence every Python warning from this point on.
# NOTE(review): the original note attributed this to skopt verbosity, but skopt is not
# imported in the visible cells — confirm the blanket filter is still wanted.
import warnings
warnings.filterwarnings("ignore")

# Classifier/Regressor
import xgboost
from xgboost import XGBRegressor

# Model selection
from sklearn.model_selection import KFold, StratifiedKFold

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid that is searched exhaustively (you can modify the ranges).
# The eight lists multiply to 6*8*4*3*6*4*4*4 = 221,184 candidates, i.e.
# 1,769,472 model fits with cv=8 below — trim the lists if the search is too slow.
param_grid = {
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'learning_rate': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'max_depth': [6, 8, 10, 12],
    'min_child_weight': [0, 1, 10],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'gamma': [0, 0.1, 0.5, 1.0],
    'reg_alpha': [0, 1, 10, 20],
    'reg_lambda': [0, 1, 10, 20],
}

# Base model cloned for every candidate. random_state is fixed with the notebook's
# seed because the grid samples rows/columns (subsample, colsample_bytree < 1);
# without it the cross-validation scores and the selected parameters can change between runs.
# NOTE(review): both this estimator and GridSearchCV request n_jobs=-1; consider
# n_jobs=1 here if the nested parallelism oversubscribes the CPU — confirm on your runtime.
xgb = XGBRegressor(booster='gbtree', device='cpu', objective='reg:squarederror',
                   verbosity=2, tree_method='auto', n_jobs=-1, random_state=random)

# Grid search with 8-fold cross-validation, scored on both R^2 and negative MSE;
# the final refit uses the candidate with the best negative MSE.
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid,
                           cv=8, n_jobs=-1, verbose=2,
                           scoring=['r2', 'neg_mean_squared_error'],
                           refit='neg_mean_squared_error')

In [None]:
# Run the grid search on the training split only (the test rows are not used here).
grid_search.fit(X_train, y_train)
# Display the hyperparameter combination selected by the search
# (the one with the best 'neg_mean_squared_error', the metric given as `refit`).
grid_search.best_params_

In [None]:
# Build the final model from the hyperparameters selected by the grid search.
# The tuned values are read from grid_search.best_params_; no separate `best_params`
# variable exists in this notebook, so referencing that name raises NameError.
optimized_xgb = XGBRegressor(**grid_search.best_params_, random_state=random, booster='gbtree',
                             objective='reg:squarederror', verbosity=2, tree_method='auto')
# Fit the tuned model on the training split.
optimized_xgb.fit(X_train, y_train)

# XGBRegressor

In [None]:
import xgboost
from xgboost import XGBRegressor

In [None]:
# Bind the fitted model to `xgb_fit`, the name used by the prediction and evaluation cells below.
# NOTE(review): optimized_xgb was already fitted on the same data in the tuning section, so this call fits it a second time.
xgb_fit = optimized_xgb.fit(X_train, y_train)

In [None]:
# Predict on the training rows and compute the fit-quality metrics first, then report them.
ytrain_pred = xgb_fit.predict(X_train)

train_r2 = xgb_fit.score(X_train, y_train)                          # coefficient of determination
train_rmse = np.sqrt(mean_squared_error(y_train, ytrain_pred))      # root mean squared error
train_mape = mean_absolute_percentage_error(y_train, ytrain_pred)   # reported by scikit-learn as a fraction, not x100

print('The training r_sq is: %.3f' % train_r2)
print('The RMSE is: %.3f' % train_rmse)
print('The MAPE is: %.3f' % train_mape)

In [None]:
# Predict on the held-out test rows and compute the same metrics as for the training set.
ytest_pred = xgb_fit.predict(X_test)

test_r2 = r2_score(y_test, ytest_pred)                            # coefficient of determination
test_rmse = np.sqrt(mean_squared_error(y_test, ytest_pred))       # root mean squared error
test_mape = mean_absolute_percentage_error(y_test, ytest_pred)    # reported by scikit-learn as a fraction, not x100

print('The testing r_sq is: %.3f' % test_r2)
print('The RMSE is: %.3f' % test_rmse)
print('The MAPE is: %.3f' % test_mape)

In [None]:
# Wrap experimental and predicted targets in DataFrames.
# NOTE(review): the prediction frames are built from the raw predict() output, so they
# presumably carry a default 0..n-1 index rather than the index of `data` — verify before joining.
df_ytrain, df_ytest = pd.DataFrame(y_train), pd.DataFrame(y_test)
df_ytrainpred, df_ytestpred = pd.DataFrame(ytrain_pred), pd.DataFrame(ytest_pred)

# Print the predictions: training set first, then test set.
for predicted in (df_ytrainpred, df_ytestpred):
    print(predicted)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot_scatter_with_line(y_train, y_train_pred, y_test, y_test_pred):
    """Plot experimental vs. predicted values for both sets with a dashed y = x line.

    Parameters
    ----------
    y_train, y_test : array-like
        Experimental values of the training and test sets (x-axis).
    y_train_pred, y_test_pred : array-like
        Predicted values for the same samples (y-axis).

    Notes
    -----
    The diagonal spans the smallest to the largest *experimental* value; the
    predictions are not considered when choosing its end points. The figure is
    displayed with ``plt.show()`` and nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(8, 8))

    # Training points in blue (larger markers), test points in red.
    ax.scatter(y_train, y_train_pred, color='blue', label='Training Set', alpha=0.5, s=70)
    ax.scatter(y_test, y_test_pred, color='red', label='Test Set', alpha=0.5, s=50)

    # Reference diagonal: compute the experimental range once and reuse it for both axes.
    experimental = np.concatenate([y_train, y_test])
    low, high = np.min(experimental), np.max(experimental)
    ax.plot([low, high], [low, high], color='black', linestyle='--')

    ax.set_title('Experimental vs Predicted', fontsize=16)
    ax.set_xlabel('Experimental', fontsize=14)
    ax.set_ylabel('Predicted', fontsize=14)
    ax.axis('square')  # equal scaling on both axes
    ax.legend(fontsize=12)
    ax.grid(False)

    # 'none' leaves the figure background unfilled (transparent), not white.
    fig.patch.set_facecolor('none')

    plt.show()

# Plot experimental vs. predicted pIC50 using the targets and predictions computed in the cells above.
plot_scatter_with_line(y_train, ytrain_pred, y_test, ytest_pred)