In [5]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [6]:
import pandas as pd
import numpy as np

# Load the merged feature table and keep only rows without missing values.
# Done in one expression so re-running the cell always starts from the file.
df = pd.read_csv('new_merged_features_IC50_g12c.csv').dropna()

# Feature matrix: everything except the target and the non-feature columns.
X = df.drop(columns=['IC50 (nM)', 'ChEMBL ID', 'Smiles'])

# Parse the target as float, discarding leading censoring qualifiers
# ('<', '>', '<=', '>=') and surrounding spaces.
# Casting to str first keeps this working when pandas has already parsed the
# column as numeric (no qualifiers in the file) or as a mixed-type column;
# a bare `.str` accessor would raise or silently produce NaN in those cases.
ic50_nm = df['IC50 (nM)'].astype(str).str.lstrip('<>= ').astype(float)

# pIC50 is undefined for non-positive concentrations (log10 gives inf/NaN).
# Fail here with a clear message rather than later inside the model fit.
if not (ic50_nm > 0).all():
    raise ValueError(
        "IC50 (nM) must be strictly positive to compute pIC50; "
        f"found {int((~(ic50_nm > 0)).sum())} invalid value(s)."
    )

# Keep the raw concentration next to the transformed target for inspection.
y_df = pd.DataFrame({'IC50 (nM)': ic50_nm})

# Convert IC50 from nM to M (divide by 1e9) and take -log10 to get pIC50.
y_df['pIC50'] = -np.log10(y_df['IC50 (nM)'] / 1e9)

# Regression target used by the modelling cells.
y = y_df['pIC50']

# Last expression of the cell, so the preview is actually rendered.
y_df.head()

In [7]:
# One seed for both the forest and the fold shuffling, so that
# Restart Kernel -> Run All reproduces the reported scores.
RANDOM_STATE = 42

# Random forest wrapped in a pipeline; the scaler is fitted inside each
# training fold only, so no statistics leak from the held-out fold.
model = RandomForestRegressor(
    max_depth=20,
    min_samples_leaf=10,
    random_state=RANDOM_STATE,
)
pipe = Pipeline([('scaler', StandardScaler()), ('model', model)])

# Metrics reported per fold. make_scorer's default is greater_is_better=True,
# so 'mse' is returned as a plain positive MSE: fine for reporting here, but
# it must not be handed to a model-selection routine that maximises the score.
scorers = {
    'r2': make_scorer(r2_score),
    'mse': make_scorer(mean_squared_error)
}

# 15-fold shuffled CV. With a fixed random_state the partition is deterministic,
# and a single cross_validate call evaluates both metrics on the *same* folds
# while fitting each fold only once (two cross_val_score calls with an unseeded
# shuffling splitter would score R2 and MSE on different partitions).
cv = KFold(n_splits=15, shuffle=True, random_state=RANDOM_STATE)
cv_results = cross_validate(pipe, X, y, cv=cv, scoring=scorers)

# Per-fold score arrays, one entry per split.
scores_r2 = cv_results['test_r2']
scores_mse = cv_results['test_mse']

mean_r2 = scores_r2.mean()
mean_mse = scores_mse.mean()

print("Mean R2 Score:", mean_r2)
# print("Mean MSE Score:", mean_mse)

Mean R2 Score: 0.7767330085352526
