<a href="https://colab.research.google.com/github/hhandc/drug_repo/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Requirements

In [1]:
!pip install PyTDC rdkit biopython scikit-learn pandas numpy

Collecting PyTDC
  Downloading pytdc-1.1.14.tar.gz (151 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/151.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m143.4/151.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m151.3/151.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rdkit
  Downloading rdkit-2024.9.5-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting accelerate==0.33.0 (from PyTDC)
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting dataclasses<1.0,>=0.6 (from PyTDC)
  Downloading dataclasses-0.6-py3-none-any.whl.metadata (3.0 kB)
Collecting datasets<2.20.0 (from PyTDC)
  Downloading da

# Predicting Drug–Target Interaction Scores

In [20]:
from tdc.multi_pred import DTI
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from google.colab import files

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Load the BindingDB_Kd drug-target interaction dataset through the TDC loader.
print("Loading Drug-Target Interaction (DTI) dataset...")
data = DTI(name='BindingDB_Kd')

# Materialise the dataset as a DataFrame; the rest of the notebook reads its
# 'Drug' (SMILES) and 'Y' (label) columns.
df = data.get_data()
print(f"Dataset loaded with {len(df)} entries")

# SMILES string used as the "target" molecule; it is featurised with the same
# small-molecule descriptors as the drugs further down.
# NOTE(review): presumably an amyloid-beta peptide, going by the variable
# name — the sequence itself is not verified here.
amyloid_beta_smiles = "CC[C@H](C)[C@@H](C(=O)N[C@@H]([C@@H](C)CC)C(=O)NCC(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](C(C)C)C(=O)NCC(=O)NCC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](C(C)C)C(=O)O)NC(=O)[C@H](C)NC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(=O)N)NC(=O)[C@H](CO)NC(=O)CNC(=O)[C@H](C(C)C)NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](C)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)[C@H](C(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)N)NC(=O)[C@H](CC3=CNC=N3)NC(=O)[C@H](CC4=CNC=N4)NC(=O)[C@H](C(C)C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CC5=CC=C(C=C5)O)NC(=O)CNC(=O)[C@H](CO)NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](CC6=CNC=N6)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CC7=CC=CC=C7)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](C)NC(=O)[C@H](CC(=O)O)N"

def extract_drug_features(smiles):
    """Compute five physicochemical descriptors for a molecule given as SMILES.

    Parameters
    ----------
    smiles : str
        SMILES string describing the molecule.

    Returns
    -------
    list or None
        ``[MolWt, MolLogP, NumHDonors, NumHAcceptors, TPSA]`` in that order,
        or ``None`` when ``smiles`` is not a non-empty string or RDKit cannot
        parse it into a molecule.
    """
    # Missing cells in a pandas column arrive as None/NaN rather than str.
    # Treat them (and blank strings) like any other unparseable input so that
    # callers only ever have to handle the single "None means skip" case.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    return [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol),
        rdMolDescriptors.CalcTPSA(mol)
    ]


# --- Featurise the drugs ----------------------------------------------------
# The same compound appears on many rows, so descriptors are computed once per
# unique SMILES string and then looked up for every row.
feature_cache = {
    smiles: extract_drug_features(smiles)
    for smiles in df['Drug'].dropna().unique()
}
row_features = [feature_cache.get(smiles) for smiles in df['Drug']]

# Remember WHICH rows were featurised successfully. Labels and drug names must
# be selected with this mask; taking "the first len(X) rows" would silently
# shift every label after the first unparseable SMILES.
valid_mask = np.array([features is not None for features in row_features], dtype=bool)
drug_features = np.array(
    [features for features in row_features if features is not None],
    dtype=float,
)

# --- Featurise the target ---------------------------------------------------
# Check the raw return value: once wrapped in np.array(...).reshape(...) the
# result is never None, so a failed parse could not be detected afterwards.
target_descriptor = extract_drug_features(amyloid_beta_smiles)

if len(drug_features) == 0 or target_descriptor is None:
    raise ValueError("No valid drug-target pairs found. Check dataset filtering and feature extraction.")

target_features = np.array(target_descriptor).reshape(1, -1)

# --- Assemble the design matrix --------------------------------------------
# NOTE(review): the target descriptor block is identical on every row, so it
# cannot influence any tree split; no per-row target information from `df` is
# used. As written the model learns drug -> Y across all targets in the data.
valid_rows = df.loc[valid_mask]
X = np.hstack((drug_features, np.tile(target_features, (len(drug_features), 1))))
y = valid_rows['Y']
assert len(X) == len(y), "feature rows and labels are out of sync"

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
if len(X_train) == 0:
    raise ValueError("Training set is empty. Check dataset size and filtering.")

# --- Train and evaluate -----------------------------------------------------
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Model MSE: {mse}")

# --- Score every featurised drug -------------------------------------------
# NOTE(review): X contains the training rows, so most of these predictions are
# in-sample.
predictions = model.predict(X)

results = pd.DataFrame({
    'Drug': valid_rows['Drug'],
    'Predicted_Score': predictions
})

# NOTE(review): Y appears to be a raw Kd value (dataset 'BindingDB_Kd'), where
# a lower value would mean tighter binding — confirm. If so, a descending sort
# lists the weakest predicted binders first, and the MSE above is dominated by
# the largest labels; a log-scale label may be more appropriate.
results_sorted = results.sort_values(by='Predicted_Score', ascending=False)

display(results_sorted)

# Persist the ranking and hand it to the browser (Colab-only helper).
results_sorted.to_csv("drug_repurposing_results.csv", index=False)
files.download("drug_repurposing_results.csv")

Found local copy...
Loading...


Loading Drug-Target Interaction (DTI) dataset...


Done!


Dataset loaded with 52274 entries
Model MSE: 148519654508.2647


Unnamed: 0,Drug,Predicted_Score
1122,NCc1ccccc1,8.708721e+06
1117,NCc1ccccc1,8.708721e+06
1110,NCc1ccccc1,8.708721e+06
7705,c1ccc(-n2cccc2)cc1,8.474115e+06
7135,O=C(O)Cc1cccc(Oc2ccccc2)c1,8.194056e+06
...,...,...
1430,COC(=O)/C=C1/C[C@H]2C[C@]3(O)O[C@@H](C[C@H](OC...,3.048587e-01
1421,COC(=O)/C=C1/C[C@H]2C[C@]3(O)O[C@@H](C[C@H](OC...,3.048587e-01
1418,COC(=O)/C=C1/C[C@H]2C[C@]3(O)O[C@@H](C[C@H](OC...,3.048587e-01
1740,COC(=O)C[C@@H]1N=C(c2ccc(Cl)cc2)c2c(sc(C(=O)NC...,1.378293e-01


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>