In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read the prepared dataset into a DataFrame and display it.
# NOTE(review): the absolute '/content/...' path presumably targets a Colab session -- confirm.
data = pd.read_csv('/content/final_prepared_datset.csv')
data

In [None]:
# Feature matrix: every column of `data` except the 'pIC50' column.
X = data.drop(columns='pIC50')
X

In [None]:
# Regression target: the 'pIC50' column of `data`.
Y = data['pIC50']
Y

In [None]:
# (rows, columns) of the feature matrix before the low-variance filter is applied.
X.shape

In [None]:
# Shape of the target series, for comparison with the row count of X.
Y.shape

In [None]:
from sklearn.feature_selection import VarianceThreshold

def remove_low_variance(input_data, threshold=0.1):
    """Return the columns of ``input_data`` retained by a VarianceThreshold selector.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame whose columns are screened; it is not modified.
    threshold : float, default 0.1
        Variance threshold handed to ``VarianceThreshold``.

    Returns
    -------
    pandas.DataFrame
        ``input_data`` restricted to the columns the fitted selector supports,
        in their original order.
    """
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(input_data)
    # Positions of the retained columns, translated back to column labels.
    kept_positions = selector.get_support(indices=True)
    kept_labels = input_data.columns[kept_positions]
    return input_data[kept_labels]

# Rebind X to its low-variance-filtered version (threshold 0.1) and display it.
# Note: X is both input and output here, so re-running this cell filters the
# already-filtered frame rather than the original one.
X = remove_low_variance(X, threshold=0.1)
X

In [None]:
# (rows, columns) of the feature matrix after the low-variance filter.
X.shape

In [None]:
# Write the filtered feature matrix (column headers and values, no index column) to CSV.
X.to_csv('descriptor_list.csv', index = False)

In [None]:
# Hold out 20% of the rows for testing.
# The split is seeded explicitly: the only other seed in this notebook
# (np.random.seed) is set *after* this cell, so without random_state the
# partition -- and every score computed from it -- changed on each run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=100)

In [None]:
# Shapes of the held-out features and targets.
X_test.shape, Y_test.shape

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# Replace missing target values with the mean of the training targets. The
# imputer is fitted on the training split only and then reused, unchanged, on
# the test split.
# NOTE(review): imputing the regression target is unusual -- confirm that
# dropping rows without a measured pIC50 would not be more appropriate.
imputer = SimpleImputer(strategy='mean')
Y_train_imputed = imputer.fit_transform(Y_train.values.reshape(-1, 1)).ravel()
Y_test_imputed = imputer.transform(Y_test.values.reshape(-1, 1)).ravel()

# Baseline forest with 100 trees. No random_state is passed, so NumPy's global
# RNG is seeded right before the estimator is built and fitted.
np.random.seed(100)
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, Y_train_imputed)

# R2 of the baseline on the held-out split.
r2 = model.score(X_test, Y_test_imputed)
print(r2)

In [None]:
# Predictions of the baseline forest (`model`) for the held-out features.
Y_pred = model.predict(X_test)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameter grid for the random forest (3 * 3 * 3 * 3 * 3 = 243 candidates).
# max_features: 1.0 ("use every feature") replaces the former 'auto' entry.
# For RandomForestRegressor 'auto' meant exactly that, but the string was
# deprecated in scikit-learn 1.1 and removed in 1.3, where every candidate
# using it fails to fit.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1.0, 'sqrt', 'log2']
}

# Exhaustive 5-fold cross-validated search, ranked by R2.
rf_model = RandomForestRegressor(random_state=100)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='r2', verbose=1)
grid_search.fit(X_train, Y_train_imputed)
best_params = grid_search.best_params_
best_score = grid_search.best_score_  # mean cross-validated R2 of the winning candidate

# GridSearchCV refits the winning configuration on the full training data by
# default (refit=True), using a clone of rf_model (random_state=100). Reusing
# that estimator avoids training an identical forest a second time.
best_rf_model = grid_search.best_estimator_

# R2 of the tuned model on the held-out split (not the cross-validation score).
r2 = best_rf_model.score(X_test, Y_test_imputed)
print("Best R2 score:", r2)
print("Best parameters:", best_params)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import numpy as np

# Refined hyper-parameter grid: more trees, a finer max_depth range, and only
# the 'sqrt' / 'log2' feature-subsampling options.
param_grid = {
    'n_estimators': [200, 300, 400],
    'max_depth': [None, 15, 20, 25],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

rf_model = RandomForestRegressor(random_state=100)

# Fill missing training targets with the mean of the observed training targets.
Y_train_imputed = np.nan_to_num(Y_train, nan=np.nanmean(Y_train))

# Exhaustive 5-fold cross-validated search, ranked by R2.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='r2', verbose=1)
grid_search.fit(X_train, Y_train_imputed)

best_params = grid_search.best_params_
best_score = grid_search.best_score_  # mean cross-validated R2 of the winning candidate

# GridSearchCV refits the winning configuration on the full training data by
# default (refit=True), using a clone of rf_model (random_state=100). Reusing
# that estimator avoids training an identical forest a second time.
best_rf_model = grid_search.best_estimator_

# R2 of the tuned model on the held-out split; Y_test_imputed comes from the
# baseline-model cell earlier in the notebook.
r2 = best_rf_model.score(X_test, Y_test_imputed)
print("Best R2 score:", r2)
print("Best parameters:", best_params)

In [None]:
# Predict with the tuned forest produced by the grid search rather than the
# untuned baseline `model`, whose predictions were already computed earlier.
Y_pred = best_rf_model.predict(X_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameters of the final model, fixed by hand (this rebinds the
# `best_params` name used by the grid-search cells).
best_params = {
    'n_estimators': 200,
    'max_depth': 20,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'random_state': 100
}

final_rf_model = RandomForestRegressor(**best_params)

# Fill missing training targets with the mean of the observed training targets.
train_target_mean = np.nanmean(Y_train)
Y_train_imputed = np.nan_to_num(Y_train, nan=train_target_mean)

final_rf_model.fit(X_train, Y_train_imputed)

# Fill missing test targets with the *training* mean, as the earlier cells do
# (their imputer is fitted on Y_train). Using the test-set mean here made this
# score incomparable with the earlier ones and derived the fill value from the
# data being evaluated.
# NOTE(review): scoring against imputed targets is itself questionable --
# consider evaluating only on rows with a measured pIC50.
Y_test_imputed = np.nan_to_num(Y_test, nan=train_target_mean)

r2 = final_rf_model.score(X_test, Y_test_imputed)
print("R2 score on testing data:", r2)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(color_codes=True)
sns.set_style("white")

# Plot the predictions of the final model -- the one that is pickled below.
# The Y_pred left over from earlier cells came from the untuned baseline
# `model`, so the figure did not describe the model being saved.
Y_pred = final_rf_model.predict(X_test)

# Explicit figure/axes so labels, limits and the saved file all refer to the same plot.
fig, ax = plt.subplots(figsize=(5, 5))
sns.regplot(x=Y_test, y=Y_pred, scatter_kws={'alpha':0.8}, ax=ax)
ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold')
ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold')
# Identical limits on both axes.
ax.set_xlim(3, 12)
ax.set_ylim(3, 12)

fig.savefig('Regression_Model.png', dpi=600, bbox_inches='tight')

plt.show()

In [None]:
import pickle

# Serialize the fitted final forest to disk in binary pickle format.
# Only unpickle this file from trusted sources: pickle.load can execute arbitrary code.
with open('random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(final_rf_model, model_file)