In [1]:
import pandas as pd
import numpy as np 
from chembl_webresource_client.new_client import new_client
from sklearn.utils.class_weight import compute_class_weight
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

import deepchem as dc
from deepchem.feat import MolGraphConvFeaturizer


In [2]:
target = new_client.target
target_query = target.search('dengue fever')
targets = pd.DataFrame.from_dict(target_query)


In [3]:
selected_target = targets.target_chembl_id[12]


In [4]:
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")
df = pd.DataFrame.from_dict(res)

In [5]:
df.shape[0]

1284

In [6]:
df2 = df[df.standard_value.notna()]


### **Labeling compounds as either being active, inactive or intermediate**
The bioactivity data is in the IC50 unit. Compounds having values of less than 1000 nM will be considered to be **active** while those greater than 10,000 nM will be considered to be **inactive**. As for those values in between 1,000 and 10,000 nM will be referred to as **intermediate**.

In [7]:
bioactivity_class = []
for i in df2.standard_value:
  if float(i) >= 10000:
    bioactivity_class.append("inactive")
  elif float(i) <= 1000:
    bioactivity_class.append("active")
  else:
    bioactivity_class.append("intermediate")

In [8]:
counter_=0
for values in bioactivity_class:
    if values=='active':
        counter_+=1


In [9]:
mol_cid = []
for i in df2.molecule_chembl_id:
  mol_cid.append(i)
canonical_smiles = []
for i in df2.canonical_smiles:
  canonical_smiles.append(i)
standard_value = []
for i in df2.standard_value:
  standard_value.append(i)
data_tuples = list(zip(mol_cid, canonical_smiles, bioactivity_class, standard_value))
df3 = pd.DataFrame( data_tuples,  columns=['molecule_chembl_id', 'canonical_smiles', 'bioactivity_class', 'standard_value'])

selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df3 = df2[selection]


In [10]:
df3
for index,row in df3.iterrows():
  if float(row['standard_value']) >= 10000:
    df3.loc[index,'Compound Activity']='Inactive'
    # bioactivity_class.append("inactive")
  elif float(i) <= 1000:
    df3.loc[index,'Compound Activity']='Intermediate'
  else:
    df3.loc[index,'Compound Activity']='Active'
        
df3
    # df3.loc[index,'Compound Activity']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.loc[index,'Compound Activity']='Inactive'


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,Compound Activity
0,CHEMBL1401841,COc1ccc2nc3cccc(OC)c3nc2c1,100000.0,Inactive
1,CHEMBL1608853,O=C(O)c1ccc2c(c1)C(=O)/C(=C\c1ccco1)C2=O,12310.0,Inactive
2,CHEMBL1429799,O=C1NN(c2ccccc2)C(=O)/C1=C\c1ccccc1OCC(=O)N1CC...,100000.0,Inactive
3,CHEMBL246446,O=C(O)c1ccc2nc(-c3ccco3)c(-c3ccco3)nc2c1,50970.0,Inactive
4,CHEMBL1383455,CCn1nc([N+](=O)[O-])c(C(C#N)c2nc3ccccc3n2C)c(C...,100000.0,Inactive
...,...,...,...,...
1279,CHEMBL5190612,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4ccc(C(F)(F)F...,25200.0,Inactive
1280,CHEMBL5176952,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4cccc(C(F)(F)...,23900.0,Inactive
1281,CHEMBL5184942,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4ccccc4C(F)(F...,24000.0,Inactive
1282,CHEMBL4526128,O=C(N[C@@H](Cc1ccc(O)cc1)C(=O)O)c1cc(-c2ccccc2...,9610.0,Intermediate


In [11]:
from deepchem.feat import CircularFingerprint
# Initialize the featurizer


def create_features(length_features,data_base):
    featurizer = CircularFingerprint(size=length_features,radius=4)
    feature_names=[]

    for i in range(1,length_features+1):
        feature_names.append('feature'+str(i))
    
    Classification_DB=pd.DataFrame(columns=['molecule_chembl_id']+feature_names+['Compound Activity'])
    Regression_DB=pd.DataFrame(columns=['molecule_chembl_id']+feature_names+['standard_value'])

    for index,row in data_base.iterrows():
        # print(row['molecule_chembl_id'])
        # print(featurizer.featurize(row['canonical_smiles'])[0])
        # print([row['Compound Activity']])
        new_row_list_regression=[[row['molecule_chembl_id']]+ featurizer.featurize(row['canonical_smiles'])[0].tolist()+ [row['standard_value']]]
        new_row_list_classification=[[row['molecule_chembl_id']]+ featurizer.featurize(row['canonical_smiles'])[0].tolist()+ [row['Compound Activity']]]
        # print(new_row_list)

        Classification_DB_new=pd.DataFrame(new_row_list_classification,columns=['molecule_chembl_id']+feature_names+['Compound Activity'],index=[0])
        Regression_DB_new=pd.DataFrame(new_row_list_regression,columns=['molecule_chembl_id']+feature_names+['standard_value'],index=[0])

        Classification_DB = pd.concat([Classification_DB, Classification_DB_new], ignore_index=True)
        Regression_DB = pd.concat([Regression_DB, Regression_DB_new], ignore_index=True)

    return Classification_DB,Regression_DB
        

Classification_,Regression_=create_features(2048,df3)   


# Featurize molecules
# features = featurizer.featurize('COc1ccc2nc3cccc(OC)c3nc2c1')

# Output the features
# for i, feature in enumerate(features):
#     print(f"Molecule {i+1} ECFP Fingerprint:")
#     print(feature)


  Classification_DB = pd.concat([Classification_DB, Classification_DB_new], ignore_index=True)
  Regression_DB = pd.concat([Regression_DB, Regression_DB_new], ignore_index=True)


Classification Problem is below

In [12]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

X_Regression=Classification_.iloc[:,1:-1].values
y_Classification=Classification_.iloc[:,-1].values
le = LabelEncoder()

# Encode target labels ('Active', 'Inactive', 'Intermediate') to integers
y_encoded = le.fit_transform(y_Classification)
X_train, X_test, y_train, y_test = train_test_split(X_Regression, y_encoded, test_size=0.2, random_state=42)
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)

# Initialize the model
clf = RandomForestClassifier(random_state=42)

# Train the model
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))



Accuracy: 0.90
Classification Report:
              precision    recall  f1-score   support

    Inactive       0.90      0.99      0.94       208
Intermediate       0.90      0.45      0.60        40

    accuracy                           0.90       248
   macro avg       0.90      0.72      0.77       248
weighted avg       0.90      0.90      0.89       248



In [13]:


#Regression
Regression_
X_Regression=Regression_.iloc[:,1:-1].values
y_Regression=Regression_.iloc[:,-1].values

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation and get scores
scores = cross_val_score(xg_reg, X_Regression, y_Regression, cv=cv, scoring='r2')

# Convert negative MSE to positive and calculate RMSE

print(f"Cross-validated r2 scores: {scores}")
print(f"Mean r2: {scores.mean()}, Std r2: {scores.std()}")

Cross-validated r2 scores: [0.22467536 0.28365099 0.2552641  0.19531083 0.25174803]
Mean r2: 0.2421298623085022, Std r2: 0.02995230603023846
