In [1]:
import pandas as pd
import numpy as np 
from chembl_webresource_client.new_client import new_client
from sklearn.utils.class_weight import compute_class_weight
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

import deepchem as dc
from deepchem.feat import MolGraphConvFeaturizer


In [2]:
target = new_client.target
target_query = target.search('dengue fever')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Dengue virus,Dengue virus,15.0,False,CHEMBL613757,[],ORGANISM,12637
1,[],Yellow fever virus,Yellow fever virus,13.0,False,CHEMBL613731,[],ORGANISM,11089
2,[],Rift Valley fever virus,Rift Valley fever virus,11.0,False,CHEMBL613130,[],ORGANISM,11588
3,[],African swine fever virus,African swine fever virus,11.0,False,CHEMBL613714,[],ORGANISM,10497
4,[],dengue virus type 4,dengue virus type 4,11.0,False,CHEMBL613728,[],ORGANISM,11070
5,[],Sandfly fever Sicilian virus,Sandfly fever Sicilian virus,11.0,False,CHEMBL612238,[],ORGANISM,28292
6,[],dengue virus type 1,dengue virus type 1,11.0,False,CHEMBL613360,[],ORGANISM,11053
7,[],dengue virus type 2,dengue virus type 2,11.0,False,CHEMBL613966,[],ORGANISM,11060
8,[],dengue virus type 3,dengue virus type 3,11.0,False,CHEMBL612717,[],ORGANISM,11069
9,[],Sandfly fever Naples virus,Sandfly fever Naples virus,11.0,False,CHEMBL613279,[],ORGANISM,206160


In [3]:
selected_target = targets.target_chembl_id[12]
selected_target


'CHEMBL5980'

In [4]:
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")
df = pd.DataFrame.from_dict(res)

In [5]:
df.columns

Index(['action_type', 'activity_comment', 'activity_id', 'activity_properties',
       'assay_chembl_id', 'assay_description', 'assay_type',
       'assay_variant_accession', 'assay_variant_mutation', 'bao_endpoint',
       'bao_format', 'bao_label', 'canonical_smiles', 'data_validity_comment',
       'data_validity_description', 'document_chembl_id', 'document_journal',
       'document_year', 'ligand_efficiency', 'molecule_chembl_id',
       'molecule_pref_name', 'parent_molecule_chembl_id', 'pchembl_value',
       'potential_duplicate', 'qudt_units', 'record_id', 'relation', 'src_id',
       'standard_flag', 'standard_relation', 'standard_text_value',
       'standard_type', 'standard_units', 'standard_upper_value',
       'standard_value', 'target_chembl_id', 'target_organism',
       'target_pref_name', 'target_tax_id', 'text_value', 'toid', 'type',
       'units', 'uo_units', 'upper_value', 'value'],
      dtype='object')

In [6]:
df2 = df[df.standard_value.notna()]


### **Labeling compounds as either being active, inactive or intermediate**
The bioactivity data is in the IC50 unit. Compounds having values of less than 1000 nM will be considered to be **active** while those greater than 10,000 nM will be considered to be **inactive**. As for those values in between 1,000 and 10,000 nM will be referred to as **intermediate**.

In [7]:
bioactivity_class = []
for i in df2.standard_value:
  if float(i) >= 10000:
    bioactivity_class.append("inactive")
  elif float(i) <= 1000:
    bioactivity_class.append("active")
  else:
    bioactivity_class.append("intermediate")

In [8]:
counter_=0
for values in bioactivity_class:
    if values=='active':
        counter_+=1


In [9]:
mol_cid = []
for i in df2.molecule_chembl_id:
  mol_cid.append(i)
canonical_smiles = []
for i in df2.canonical_smiles:
  canonical_smiles.append(i)
standard_value = []
for i in df2.standard_value:
  standard_value.append(i)
data_tuples = list(zip(mol_cid, canonical_smiles, bioactivity_class, standard_value))
df3 = pd.DataFrame( data_tuples,  columns=['molecule_chembl_id', 'canonical_smiles', 'bioactivity_class', 'standard_value'])

selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df3 = df2[selection]


In [10]:
df3
for index,row in df3.iterrows():
  if float(row['standard_value']) >= 10000:
    df3.loc[index,'Compound Activity']='Inactive'
    # bioactivity_class.append("inactive")
  elif float(i) <= 1000:
    df3.loc[index,'Compound Activity']='Intermediate'
  else:
    df3.loc[index,'Compound Activity']='Active'
        
df3
    # df3.loc[index,'Compound Activity']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.loc[index,'Compound Activity']='Inactive'


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,Compound Activity
0,CHEMBL1401841,COc1ccc2nc3cccc(OC)c3nc2c1,100000.0,Inactive
1,CHEMBL1608853,O=C(O)c1ccc2c(c1)C(=O)/C(=C\c1ccco1)C2=O,12310.0,Inactive
2,CHEMBL1429799,O=C1NN(c2ccccc2)C(=O)/C1=C\c1ccccc1OCC(=O)N1CC...,100000.0,Inactive
3,CHEMBL246446,O=C(O)c1ccc2nc(-c3ccco3)c(-c3ccco3)nc2c1,50970.0,Inactive
4,CHEMBL1383455,CCn1nc([N+](=O)[O-])c(C(C#N)c2nc3ccccc3n2C)c(C...,100000.0,Inactive
...,...,...,...,...
1279,CHEMBL5190612,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4ccc(C(F)(F)F...,25200.0,Inactive
1280,CHEMBL5176952,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4cccc(C(F)(F)...,23900.0,Inactive
1281,CHEMBL5184942,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4ccccc4C(F)(F...,24000.0,Inactive
1282,CHEMBL4526128,O=C(N[C@@H](Cc1ccc(O)cc1)C(=O)O)c1cc(-c2ccccc2...,9610.0,Intermediate


In [11]:
from deepchem.feat import CircularFingerprint
# Initialize the featurizer
import pickle

def create_features(length_features,data_base,create_pickle=False):
    featurizer = CircularFingerprint(size=length_features,radius=4)
    if create_pickle:
        with open('featurizer.pickle', 'wb') as file:
            pickle.dump(featurizer, file)

    feature_names=[]

    for i in range(1,length_features+1):
        feature_names.append('feature'+str(i))
    
    Classification_DB=pd.DataFrame(columns=['molecule_chembl_id']+feature_names+['Compound Activity'])
    Regression_DB=pd.DataFrame(columns=['molecule_chembl_id']+feature_names+['standard_value'])

    for index,row in data_base.iterrows():
        # print(row['molecule_chembl_id'])
        # print(featurizer.featurize(row['canonical_smiles'])[0])
        # print([row['Compound Activity']])
        new_row_list_regression=[[row['molecule_chembl_id']]+ featurizer.featurize(row['canonical_smiles'])[0].tolist()+ [row['standard_value']]]
        new_row_list_classification=[[row['molecule_chembl_id']]+ featurizer.featurize(row['canonical_smiles'])[0].tolist()+ [row['Compound Activity']]]
        # print(new_row_list)

        Classification_DB_new=pd.DataFrame(new_row_list_classification,columns=['molecule_chembl_id']+feature_names+['Compound Activity'],index=[0])
        Regression_DB_new=pd.DataFrame(new_row_list_regression,columns=['molecule_chembl_id']+feature_names+['standard_value'],index=[0])

        Classification_DB = pd.concat([Classification_DB, Classification_DB_new], ignore_index=True)
        Regression_DB = pd.concat([Regression_DB, Regression_DB_new], ignore_index=True)

    return Classification_DB,Regression_DB
        

Classification_,Regression_=create_features(2048,df3,True)   


# Featurize molecules
# features = featurizer.featurize('COc1ccc2nc3cccc(OC)c3nc2c1')

# Output the features
# for i, feature in enumerate(features):
#     print(f"Molecule {i+1} ECFP Fingerprint:")
#     print(feature)


  Classification_DB = pd.concat([Classification_DB, Classification_DB_new], ignore_index=True)
  Regression_DB = pd.concat([Regression_DB, Regression_DB_new], ignore_index=True)


In [12]:
Classification_.to_pickle('ECFP_Classification_features.pkl')

Classification Problem is below

In [13]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import re
X_Regression=Classification_.iloc[:,1:-1].values
y_Classification=Classification_.iloc[:,-1].values
le = LabelEncoder()

# Encode target labels ('Active', 'Inactive', 'Intermediate') to integers
y_encoded = le.fit_transform(y_Classification)
X_train, X_test, y_train, y_test = train_test_split(X_Regression, y_encoded, test_size=0.2, random_state=42)
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)

# Initialize the model
clf = RandomForestClassifier(random_state=42)

# Train the model
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
with open('random_forest_class.pickle', 'wb') as file:
    pickle.dump(clf, file)

# Print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:")

class_report=classification_report(y_test, y_pred, target_names=le.classes_)
lines = class_report.strip().split('\n')

# Initialize an empty list to store rows of the DataFrame
data_class_report = []

# Use regex to extract values from each line
for line in lines[2:]:  # Skip the first two lines (header and space)
    line = re.sub(r'\s+', ' ', line.strip())  # Replace multiple spaces with one space
    parts = line.split(' ')
    if len(parts) == 5:  # If it's a class row (e.g., Inactive, Intermediate)
        label, precision, recall, f1_score, support = parts
        data_class_report.append([label, float(precision), float(recall), float(f1_score), int(support)])
    elif len(parts) == 4:  # If it's an accuracy or average row
        label, precision, recall, f1_score = parts
        data_class_report.append([label, float(precision), float(recall), float(f1_score), None])

# Convert to DataFrame
class_report_df = pd.DataFrame(data_class_report, columns=['Label', 'Precision', 'Recall', 'F1-Score', 'Support'])

print(class_report_df)
class_report_df.to_pickle('class_report.pkl')


Accuracy: 0.90
Classification Report:
          Label  Precision  Recall  F1-Score  Support
0      Inactive        0.9    0.99      0.94      208
1  Intermediate        0.9    0.45      0.60       40


In [14]:
unique_classes=[]
for y_c in y_Classification:
    if y_c not in unique_classes:
        unique_classes.append(y_c)
unique_classes

['Inactive', 'Intermediate']

In [15]:


#Regression
Regression_
X_Regression=Regression_.iloc[:,1:-1].values
y_Regression=Regression_.iloc[:,-1].values

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation and get scores
scores = cross_val_score(xg_reg, X_Regression, y_Regression, cv=cv, scoring='r2')

# Convert negative MSE to positive and calculate RMSE

print(f"Cross-validated r2 scores: {scores}")
print(f"Mean r2: {scores.mean()}, Std r2: {scores.std()}")

Cross-validated r2 scores: [0.22467536 0.28365099 0.2552641  0.19531083 0.25174803]
Mean r2: 0.2421298623085022, Std r2: 0.02995230603023846


In [17]:
file_path = "chembl_30_chemreps.txt.gz"
import gzip

# Read a small part of the file to inspect the header
with gzip.open(file_path, "rt") as file:
    # Read the first few lines to check headers and content
    sample_data = [next(file) for _ in range(5)]

# Print the sample data to inspect headers
print("Sample Data:")
for line in sample_data:
    print(line.strip())

with gzip.open(file_path, "rt") as file:
    # Assuming the file is tab-separated and has headers
    df = pd.read_csv(file, sep='\t', usecols=['chembl_id', 'canonical_smiles','standard_inchi','standard_inchi_key'])

# Display the DataFrame
print(df.head())


Sample Data:
chembl_id	canonical_smiles	standard_inchi	standard_inchi_key
CHEMBL153534	Cc1cc(-c2csc(N=C(N)N)n2)cn1C	InChI=1S/C10H13N5S/c1-6-3-7(4-15(6)2)8-5-16-10(13-8)14-9(11)12/h3-5H,1-2H3,(H4,11,12,13,14)	MFRNFCWYPYSFQQ-UHFFFAOYSA-N
CHEMBL440060	CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@@H](N)CCSC)[C@@H](C)O)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)N[C@@H](CC(N)=O)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)NCC(=O)N[C@@H](CO)C(=O)N[C@@H](CCCN=C(N)N)C(N)=O	InChI=1S/C123H212N44O34S/c1-19-63(12)96(164-115(196)81(47-62(10)11)163-119(200)97(68(17)169)165-103(184)70(124)36-42-202-18)118(199)143-52-92(175)147-65(14)100(181)149-67(16)102(183)157-82(48-69-50-136-57-145-69)114(195)162-83(49-90(128)173)106(187)141-51-91

In [18]:
# length_=round(df.shape[0]*0.4)
df_1M=df.iloc[1:1000000,:].copy()
df_1M.to_pickle('ALL_Candidates.pkl')

In [19]:
input_file_path = "chembl_30_chemreps.txt.gz"
output_file_path = "chembl_30_chemreps_half.txt.gz"

# Read and count the total lines in the original gz file
with gzip.open(input_file_path, "rt") as file:
    lines = file.readlines()

# Calculate the number of lines to keep (half of the original)
half_size = len(lines) // 2

# Write the first half of the lines to a new gz file
with gzip.open(output_file_path, "wt") as outfile:
    outfile.writelines(lines[:half_size])

print(f"Successfully created a file with half the size: {output_file_path}")

Successfully created a file with half the size: chembl_30_chemreps_half.txt.gz


In [27]:
lines

['chembl_id\tcanonical_smiles\tstandard_inchi\tstandard_inchi_key\n',
 'CHEMBL153534\tCc1cc(-c2csc(N=C(N)N)n2)cn1C\tInChI=1S/C10H13N5S/c1-6-3-7(4-15(6)2)8-5-16-10(13-8)14-9(11)12/h3-5H,1-2H3,(H4,11,12,13,14)\tMFRNFCWYPYSFQQ-UHFFFAOYSA-N\n',
 'CHEMBL440060\tCC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@@H](N)CCSC)[C@@H](C)O)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)N[C@@H](CC(N)=O)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)NCC(=O)N[C@@H](CO)C(=O)N[C@@H](CCCN=C(N)N)C(N)=O\tInChI=1S/C123H212N44O34S/c1-19-63(12)96(164-115(196)81(47-62(10)11)163-119(200)97(68(17)169)165-103(184)70(124)36-42-202-18)118(199)143-52-92(175)147-65(14)100(181)149-67(16)102(183)157-82(48-69-50-136-57-145-69)114(195)162-83(49-90(128)173)106(187)

https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_30/