### Imports

<img src="https://mt.mahidol.ac.th/wp-content/uploads/2019/10/OriginalLOGO.png">

### QSAR Modelling of Steroid sulfatase inhibitors

In [1]:
# Optional, Google Colab only: uncomment the two lines below to mount
# Google Drive at /content/gdrive before loading the data files.
# from google.colab import drive
# drive.mount('/content/gdrive')

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from matplotlib import style
import tpot
from tqdm import tqdm
import time
import random
# Set the global Matplotlib style sheet to "ggplot".
style.use("ggplot")



<p>The feature matrix <code>X</code> loaded below has 1,470,464 rows and 13 columns (see the <code>X.shape</code> output).</p>

In [3]:
# Latest arrays exported by the deep-learning kernel:
# X is used below as the feature matrix and y as the regression target.
X = np.load("X.npy")
y = np.load("y.npy")

In [4]:
# Display the dimensions of the loaded feature array.
X.shape

(1470464, 13)

### Machine Learning Model Benchmark on New Data

In [5]:
def machine_learning_train(machine_learning_algorithm, X, y, trials, random_state=None):
    """Benchmark a regressor over repeated random 80/20 train/test splits.

    On every trial the data is split again at random (80% train / 20% test),
    the estimator is refitted on the training part, and the R2 score and the
    mean squared error are computed on both parts. The mean of each metric is
    printed and a 2x2 figure shows every metric as a function of the trial.

    Parameters
    ----------
    machine_learning_algorithm : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The same instance is
        refitted on every trial.
    X : array-like
        Feature matrix.
    y : array-like
        Target values, one per row of ``X``.
    trials : int
        Number of random splits to evaluate.
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or generator) controlling the random splits only. With the
        default ``None`` the splits are unseeded, as before. Randomness inside
        the estimator itself is not affected by this argument.

    Returns
    -------
    tuple of list
        ``(r2_train, r2_test, mse_train, mse_test)``; each list holds one
        value per trial.

    Notes
    -----
    Calling this function switches the global Matplotlib style to "ggplot".
    """
    style.use("ggplot")
    clf = machine_learning_algorithm

    # One generator shared by all trials: every trial gets a different split,
    # yet the whole sequence is reproducible when a seed is supplied.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        split_rng = random_state
    else:
        split_rng = np.random.RandomState(random_state)

    r2_train = []
    r2_test = []
    mse_train = []
    mse_test = []

    for _ in tqdm(range(trials)):
        # Independent 80/20 split for this trial.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=split_rng
        )
        clf.fit(X_train, y_train)

        y_pred_train = clf.predict(X_train)
        y_pred_test = clf.predict(X_test)

        r2_train.append(r2_score(y_train, y_pred_train))
        r2_test.append(r2_score(y_test, y_pred_test))
        mse_train.append(mean_squared_error(y_train, y_pred_train))
        mse_test.append(mean_squared_error(y_test, y_pred_test))

    model_name = str(machine_learning_algorithm)
    print("The mean R2 score for {} is {}--Train".format(model_name, np.mean(r2_train)))
    print("The mean R2 score for {} is {}--Test".format(model_name, np.mean(r2_test)))

    print("The mean MSE score for {} is {}--Train".format(model_name, np.mean(mse_train)))
    print("The mean MSE score for {} is {}--Test".format(model_name, np.mean(mse_test)))

    trial_axis = list(range(trials))
    fig, axs = plt.subplots(2, 2)
    fig.set_size_inches(11, 8)

    axs[0, 0].plot(trial_axis, r2_train, 'tab:red')
    axs[0, 0].set_title('R2--Score--Train--{}--trials'.format(trials))
    axs[0, 0].set_ylabel("R2 Score")

    axs[0, 1].plot(trial_axis, r2_test, 'tab:orange')
    axs[0, 1].set_title('R2--Score--Test--{}--trials'.format(trials))

    axs[1, 0].plot(trial_axis, mse_train, 'tab:blue')
    axs[1, 0].set_title('MSE--Score--Train--{}--trials'.format(trials))
    axs[1, 0].set_ylabel("MSE Score")
    axs[1, 0].set_xlabel("Trials")

    axs[1, 1].plot(trial_axis, mse_test, 'tab:purple')
    axs[1, 1].set_title('MSE--Score--Test--{}--trials'.format(trials))
    axs[1, 1].set_xlabel("Trials")

    # Fixed y-ranges keep the panels comparable between models; values outside
    # [0, 1] (R2) or [0, 2] (MSE) are not visible.
    axs[0, 0].set_ylim([0, 1])
    axs[0, 1].set_ylim([0, 1])
    axs[1, 0].set_ylim([0, 2])
    axs[1, 1].set_ylim([0, 2])

    # plt.grid(False) would only touch the current (last) axes, so the grid is
    # switched off explicitly on every panel.
    for ax in axs.flat:
        ax.grid(False)

    return (r2_train, r2_test, mse_train, mse_test)
   
    
    

In [6]:
def scrambled_pair_prediction(machine_learning_algorithm, X, y, trials,
                              n_shuffles=10, random_state=None):
    """Contrast a model fitted on the real data with one fitted on scrambled data.

    In each of ``n_shuffles`` rounds the rows of a copy of ``X`` and the
    entries of a copy of ``y`` are shuffled independently, which breaks the
    feature/target pairing. The scrambled data and the original data are each
    split 80/20, and for each of ``trials`` repetitions the estimator is
    fitted on the scrambled training part and then on the original training
    part, recording predictions and R2 scores.

    Two scatter plots are drawn:

    * top: predicted vs. experimental values on the test parts, original
      (red) and scrambled (blue);
    * bottom: R2 on scrambled data (x) against R2 on original data (y), for
      the test parts (red) and the training parts (blue).

    Parameters
    ----------
    machine_learning_algorithm : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The same instance is
        refitted repeatedly.
    X : array-like
        Feature matrix.
    y : array-like
        Target values, one per row of ``X``.
    trials : int
        Number of fit/predict repetitions per shuffle round.
    n_shuffles : int, optional
        Number of shuffle rounds (default 10, the previously hard-coded value).
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or generator) for the shuffles and the splits. With the default
        ``None`` the global NumPy random state is used, as before.

    Returns
    -------
    None
        The function only draws the figure.

    Notes
    -----
    Calling this function switches the global Matplotlib style to "classic".
    """
    style.use("classic")

    clf = machine_learning_algorithm

    if random_state is None:
        split_rng = None
        shuffle = np.random.shuffle
    else:
        if isinstance(random_state, np.random.RandomState):
            split_rng = random_state
        else:
            split_rng = np.random.RandomState(random_state)
        shuffle = split_rng.shuffle

    def _flatten(chunks):
        # Merge the per-fit arrays (or scores) into one 1-D array for plotting.
        return np.array(chunks).reshape(-1)

    r2_test_scrambled = []
    r2_test_original = []
    r2_train_original = []
    r2_train_scrambled = []
    y_pred_test_scrambled = []
    y_pred_test = []
    y_test_combined = []
    y_test_scrambled_combined = []

    for _ in tqdm(range(n_shuffles)):
        # Shuffle the function's own X argument (the original code read an
        # undefined global named X_reduced here).
        X_shuffled = np.copy(X)
        y_shuffled = np.copy(y)
        shuffle(X_shuffled)
        shuffle(y_shuffled)

        # 80/20 split of the scrambled data.
        (X_train_scrambled, X_test_scrambled,
         y_train_scrambled, y_test_scrambled) = train_test_split(
            X_shuffled, y_shuffled, test_size=0.2, random_state=split_rng
        )

        # 80/20 split of the original data.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=split_rng
        )

        # Repeat the fit on the same splits `trials` times and collect the
        # results for plotting.
        for _trial in range(trials):
            clf.fit(X_train_scrambled, y_train_scrambled)
            pred_test_scrambled = clf.predict(X_test_scrambled)
            pred_train_scrambled = clf.predict(X_train_scrambled)

            clf.fit(X_train, y_train)
            pred_test = clf.predict(X_test)
            pred_train = clf.predict(X_train)

            # Only the test-side arrays are plotted, so only those are kept.
            y_pred_test_scrambled.append(pred_test_scrambled)
            y_pred_test.append(pred_test)
            y_test_combined.append(y_test)
            y_test_scrambled_combined.append(y_test_scrambled)

            r2_test_scrambled.append(r2_score(y_test_scrambled, pred_test_scrambled))
            r2_test_original.append(r2_score(y_test, pred_test))

            r2_train_scrambled.append(r2_score(y_train_scrambled, pred_train_scrambled))
            r2_train_original.append(r2_score(y_train, pred_train))

    fig = plt.figure()
    fig.set_facecolor('white')
    fig.set_size_inches(14, 14)
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(212)

    ax1.grid(False)
    ax2.grid(False)

    # Top panel: predicted vs. experimental values on the test parts.
    ax1.scatter(_flatten(y_test_combined), _flatten(y_pred_test),
                c='red', label='Orginal')
    ax1.set_title('Predicted vs Orginal pChEMBL--Test Data')
    ax1.scatter(_flatten(y_test_scrambled_combined), _flatten(y_pred_test_scrambled),
                c='blue', label='Scrambled')

    # Bottom panel: R2 on scrambled data (x) against R2 on original data (y).
    ax2.scatter(_flatten(r2_test_scrambled), _flatten(r2_test_original),
                c='red', label="test")
    ax2.set_title('R2 for Scrambled vs Orginal')
    ax2.scatter(_flatten(r2_train_scrambled), _flatten(r2_train_original),
                c='blue', label="train")

    ax1.legend(loc=4)
    ax2.legend(loc=4)
    ax1.set_xlabel("Experimental pChEMBL", fontweight="bold")
    ax2.set_xlabel("R2-Scrambled", fontweight="bold")

    ax1.set_ylabel("Predicted pChEMBL", fontweight="bold")
    ax2.set_ylabel("R2-Orginal", fontweight="bold")
    
    
    
    
    
        
        
        
    

### Gradient Boosting

In [None]:
# Gradient boosting benchmark: 20 random 80/20 splits.
# NOTE(review): the saved progress bar shows about 1312 s per trial on the
# full dataset, so a complete run takes several hours.
(r2_train_gb, r2_test_gb,
 mse_train_gb, mse_test_gb) = machine_learning_train(
    GradientBoostingRegressor(), X, y, trials=20
)

 10%|█         | 2/20 [43:33<6:33:38, 1312.16s/it]

In [None]:
# Scrambled-vs-original check for gradient boosting (one fit per shuffle round).
scrambled_pair_prediction(GradientBoostingRegressor(), X, y, trials=1)

### Support Vector Regression

In [None]:
# Support vector regression benchmark: 20 random 80/20 splits.
# NOTE(review): kernel SVR fit time grows more than quadratically with the
# number of samples; with the ~1.47M rows reported by X.shape this may not
# finish in practical time -- consider subsampling. TODO confirm.
(r2_train_svr, r2_test_svr,
 mse_train_svr, mse_test_svr) = machine_learning_train(
    SVR(), X, y, trials=20
)

In [None]:
# Scrambled-vs-original check for support vector regression (one fit per shuffle round).
scrambled_pair_prediction(SVR(), X, y, trials=1)

### Linear Regression

In [None]:
# Ordinary least squares benchmark: 20 random 80/20 splits.
(r2_train_lr, r2_test_lr,
 mse_train_lr, mse_test_lr) = machine_learning_train(
    LinearRegression(), X, y, trials=20
)

In [None]:
# Scrambled-vs-original check for linear regression (one fit per shuffle round).
scrambled_pair_prediction(LinearRegression(), X, y, trials=1)

### Lasso Regression

In [None]:
# Lasso regression benchmark (default settings): 20 random 80/20 splits.
(r2_train_lasso, r2_test_lasso,
 mse_train_lasso, mse_test_lasso) = machine_learning_train(
    Lasso(), X, y, trials=20
)

In [None]:
# Scrambled-vs-original check for Lasso regression (one fit per shuffle round).
scrambled_pair_prediction(Lasso(), X, y, trials=1)

### Ridge Regression

In [None]:
# Ridge regression benchmark (default settings): 20 random 80/20 splits.
(r2_train_ridge, r2_test_ridge,
 mse_train_ridge, mse_test_ridge) = machine_learning_train(
    Ridge(), X, y, trials=20
)

In [None]:
# Scrambled-vs-original check for Ridge regression (one fit per shuffle round).
scrambled_pair_prediction(Ridge(), X, y, trials=1)

### Random Forest Regression

In [None]:
# Random forest benchmark (default settings): 20 random 80/20 splits.
(r2_train_rf, r2_test_rf,
 mse_train_rf, mse_test_rf) = machine_learning_train(
    RandomForestRegressor(), X, y, trials=20
)

In [None]:
# Scrambled-vs-original check for the random forest (one fit per shuffle round).
scrambled_pair_prediction(RandomForestRegressor(), X, y, trials=1)

### KNN Regression

In [None]:
# k-nearest-neighbours benchmark (default settings): 20 random 80/20 splits.
(r2_train_knn, r2_test_knn,
 mse_train_knn, mse_test_knn) = machine_learning_train(
    KNeighborsRegressor(), X, y, trials=20
)

In [None]:
# Scrambled-vs-original check for k-nearest neighbours (one fit per shuffle round).
scrambled_pair_prediction(KNeighborsRegressor(), X, y, trials=1)