<a href="https://colab.research.google.com/github/jwasswa2023/Physpropnet/blob/main/Shallow_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install rdkit-pypi

In [None]:
!pip install mordred

In [None]:
!pip install mol2vec

In [None]:
#!pip install --pre deepchem
!pip install --pre deepchem[tensorflow]
import deepchem
deepchem.__version__

In [None]:
import numpy as np
import pandas as pd
import os
import deepchem as dc
import numpy as np
from deepchem import metrics
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Load the desalted BioHC table from the Colab working directory.
# (A different input, /content/HL_data.csv, was referenced here before and is disabled.)
df1 = pd.read_csv("/content/desalted_BioHC.csv")

# Fail fast if the columns read by the modelling cell (df1.SMILES and
# df1['LogHalfLife']) are absent, instead of erroring midway through featurization.
required_columns = {"SMILES", "LogHalfLife"}
missing_columns = required_columns - set(df1.columns)
assert not missing_columns, (
    f"desalted_BioHC.csv is missing required columns: {sorted(missing_columns)}"
)

# Preview the first rows as the cell output.
df1.head()

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from rdkit import Chem
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from lightgbm import LGBMRegressor

import time
# Start timer so the reported runtime covers featurization and the whole nested CV.
start_time = time.time()

# ---------------------------------------------------------------------------
# Featurization
# ---------------------------------------------------------------------------
# Parse every SMILES string in df1. RDKit returns None for strings it cannot
# parse; non-string cells (e.g. NaN from an empty CSV field) are treated the same
# way instead of raising inside MolFromSmiles.
mols = [
    Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    for smiles in df1.SMILES
]

# Drop unusable molecules *and* their targets together so that the feature
# matrix and the target vector stay row-aligned.
valid_mask = np.array([mol is not None for mol in mols], dtype=bool)
n_invalid = int(len(mols) - valid_mask.sum())
if n_invalid:
    print(f"Skipping {n_invalid} row(s) whose SMILES could not be parsed")
mols = [mol for mol in mols if mol is not None]
if not mols:
    raise ValueError("No parsable SMILES found in df1; nothing to featurize")

# MACCS keys fingerprints, one row per molecule.
feat = dc.feat.MACCSKeysFingerprint()
arr = feat.featurize(mols)

# Features and targets as numpy arrays (targets restricted to the valid rows).
X = np.asarray(arr)
y = df1['LogHalfLife'].to_numpy()[valid_mask]

# ---------------------------------------------------------------------------
# Hold-out split: 80% for nested cross-validation, 20% kept for the final test
# ---------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Outer loop estimates generalization; the inner loop tunes hyperparameters.
outer_kf = KFold(n_splits=5, shuffle=True, random_state=42)
# An integer random_state makes every .split() call reproducible, so a single
# inner splitter can be shared by all outer folds.
inner_kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Hyperparameter search space (identical for every outer fold, so built once).
param_grid = {
    'verbose': [-1], # to avoid warnings
    'boosting_type': ['gbdt'],
    'num_leaves': [5, 15, 30],
    'max_depth': [50, 100, 300, -1],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample_for_bin': [50, 100, 200],
    'min_split_gain': [0.0],
    'min_child_weight': [0.001],
    'min_child_samples': [20],
    'subsample': [1.0],
}

# Per-fold scores on the outer validation folds.
outer_mae_scores = []
outer_r2_scores = []
outer_rmse_scores = []

# Tuned model from each outer fold; consumed by the hold-out evaluation cell.
best_models = []

for outer_train_idx, outer_test_idx in outer_kf.split(X_train):
    X_outer_train, X_outer_test = X_train[outer_train_idx], X_train[outer_test_idx]
    y_outer_train, y_outer_test = y_train[outer_train_idx], y_train[outer_test_idx]

    # Fresh, unfitted regressor for this fold's search.
    model = LGBMRegressor()

    # Randomized search over the grid, scored by (negated) MAE on the inner folds.
    random_search = RandomizedSearchCV(
        model,
        param_distributions=param_grid,
        n_iter=60,
        scoring='neg_mean_absolute_error',
        cv=inner_kf,
        random_state=42,
    )
    random_search.fit(X_outer_train, y_outer_train)

    # With the default refit=True, best_estimator_ has already been refitted on
    # all of X_outer_train using the winning parameters, so no extra fit is needed.
    best_model = random_search.best_estimator_

    # Print the best set of hyperparameters
    print("Best Hyperparameters:", best_model.get_params())

    best_models.append(best_model)

    # Score the tuned model on the outer fold it has never seen.
    y_pred = best_model.predict(X_outer_test)
    mae = mean_absolute_error(y_outer_test, y_pred)
    r2 = r2_score(y_outer_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_outer_test, y_pred))
    outer_mae_scores.append(mae)
    outer_r2_scores.append(r2)
    outer_rmse_scores.append(rmse)

# Mean and (population) standard deviation of each metric across the outer folds.
avg_outer_mae = np.mean(outer_mae_scores)
std_outer_mae = np.std(outer_mae_scores)
avg_outer_r2 = np.mean(outer_r2_scores)
std_outer_r2 = np.std(outer_r2_scores)
avg_outer_rmse = np.mean(outer_rmse_scores)
std_outer_rmse = np.std(outer_rmse_scores)

print("Average Outer MAE:", avg_outer_mae)
print("Standard Deviation of Outer MAE:", std_outer_mae)
print("Average Outer R2 Score:", avg_outer_r2)
print("Standard Deviation of Outer R2 Score:", std_outer_r2)
print("Average Outer RMSE:", avg_outer_rmse)
print("Standard Deviation of Outer RMSE:", std_outer_rmse)

# End timer and report elapsed time
end_time = time.time()
elapsed_time = (end_time - start_time) / 60  # minutes
print(f"\n⏱️ Total execution time: {elapsed_time:.2f} minutes")




Best Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': 300, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 200, 'n_jobs': None, 'num_leaves': 30, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200, 'subsample_freq': 0, 'verbose': -1}




Best Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': 100, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': None, 'num_leaves': 5, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200, 'subsample_freq': 0, 'verbose': -1}




Best Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.2, 'max_depth': 50, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 200, 'n_jobs': None, 'num_leaves': 5, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 100, 'subsample_freq': 0, 'verbose': -1}




Best Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.2, 'max_depth': 100, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 300, 'n_jobs': None, 'num_leaves': 15, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200, 'subsample_freq': 0, 'verbose': -1}




Best Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.2, 'max_depth': 300, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 300, 'n_jobs': None, 'num_leaves': 5, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 50, 'subsample_freq': 0, 'verbose': -1}
Average Outer MAE: 0.40062596220103786
Standard Deviation of Outer MAE: 0.05282443469295639
Average Outer R2 Score: 0.5381577690303478
Standard Deviation of Outer R2 Score: 0.06166661234953794
Average Outer RMSE: 0.503227910838878
Standard Deviation of Outer RMSE: 0.05782258217192767

⏱️ Total execution time: 0.35 minutes




In [None]:
# Hold-out evaluation: score every per-fold tuned model on the 20% test split
# and summarise how the metrics vary from model to model.
test_mae_scores, test_r2_scores, test_rmse_scores = [], [], []

for best_model in best_models:
    # Each model was fitted on a different outer-training subset of X_train.
    y_test_pred = best_model.predict(X_test)

    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    test_mae_scores.append(test_mae)
    test_r2_scores.append(test_r2)
    test_rmse_scores.append(test_rmse)

# Mean and (population) standard deviation of each metric over the models.
avg_test_mae, std_test_mae = np.mean(test_mae_scores), np.std(test_mae_scores)
avg_test_r2, std_test_r2 = np.mean(test_r2_scores), np.std(test_r2_scores)
avg_test_rmse, std_test_rmse = np.mean(test_rmse_scores), np.std(test_rmse_scores)

print("Average Test MAE:", avg_test_mae)
print("Standard Deviation of Test MAE:", std_test_mae)
print("Average Test R2 Score:", avg_test_r2)
print("Standard Deviation of Test R2 Score:", std_test_r2)
print("Average Test RMSE:", avg_test_rmse)
print("Standard Deviation of Test RMSE:", std_test_rmse)


Average Test MAE: 0.48569526436959115
Standard Deviation of Test MAE: 0.017085829073995962
Average Test R2 Score: 0.2842252995526658
Standard Deviation of Test R2 Score: 0.03723516902614386
Average Test RMSE: 0.630627456256072
Standard Deviation of Test RMSE: 0.016374812673039918


