In [1]:
from DeepPurpose import utils, dataset
from DeepPurpose import DTI as models
import ast
import numpy as np
import json
import os
import requests
import warnings
import pandas as pd
warnings.filterwarnings("ignore")

In [2]:
kiba_data = dataset.load_process_KIBA()

Beginning Processing...
100% [............................................................................] 338300 / 338300Beginning to extract zip file...
Done!


In [3]:
X_drugs, X_targets, y = dataset.load_process_KIBA() 
# X_drugs ... smiles representation of drugs
# X_targets ... amino acid sequence of protein targets
# y ... binding affinity scores

Beginning Processing...
100% [............................................................................] 338300 / 338300Beginning to extract zip file...
Done!


In [4]:
print("SMILES Array:", X_drugs.shape)
print("Protein Sequences Array:", X_targets.shape)
print("Affinity Scores Array:", y.shape)

SMILES Array: (118254,)
Protein Sequences Array: (118254,)
Affinity Scores Array: (118254,)


### Map Uniprot ID to drug sequence

In [5]:
# load mapping file
file = ast.literal_eval(open('proteins.txt', 'r').read())

In [6]:
# create dataframe for kiba_data
kiba_data = np.stack((X_drugs, X_targets, y), axis=-1) # Shape = (118254, 3)

# map uniprot ids to sequences
result = []
for row in kiba_data:
    value = row[1]
    key = list(filter(lambda x: file[x] == value, file))[0] 
    new_row = np.append(row, key)
    result.append(new_row)
result = np.array(result)

kiba_data_df = pd.DataFrame(result, columns=['Smiles', 'molecules', 'target_affinity', 'uniprot_id'])

In [7]:
kiba_data_df.head()

Unnamed: 0,Smiles,molecules,target_affinity,uniprot_id
0,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,11.1,O00141
1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MSWSPSLTTQTCGAWEMKERLGTGGFGNVIRWHNQETGEQIAIKQC...,11.1,O14920
2,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MERPPGLRPGAGGPWEMRERLGTGGFGNVCLYQHRELDLKIAIKSC...,11.1,O15111
3,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,11.1,P00533
4,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...,11.1,P04626


In [None]:
# save df if needed
kiba_data_df.to_csv('kiba_data_df.csv')

### Map Uniprot ID to Pdb ID

In [8]:
# load mapping file
def load_mappings(file):
    with open(file, 'r') as f:
        data = json.load(f)
    
    mappings = {}
    
    # Iterate over the results and add the first occurrence of each Uniprot ID
    for entry in data['results']:
        uniprot_id = entry['from']
        pdb_id = entry['to']
        
        if uniprot_id not in mappings: # Only add the first occurence
            mappings[uniprot_id] = pdb_id
        
    return mappings

mappings = load_mappings('idmapping_2023_11_12.json')
print(f"Loaded {len(mappings)} Uniprot to PDB mappings.") 
# dict structure: {'results': [{'from': 'O00141', 'to': '2R5T'}, {...}, ...]}

Loaded 195 Uniprot to PDB mappings.


In [9]:
# map uniprot to pdb id
def map_uniprot_to_pdb(df, mappings):
    df['pdb_id'] = df['uniprot_id'].map(mappings)
    return df

# Add pdb ids to Kiba df
kiba_data_df_with_ids = map_uniprot_to_pdb(kiba_data_df, mappings)

In [10]:
kiba_data_df_with_ids.head()

Unnamed: 0,Smiles,molecules,target_affinity,uniprot_id,pdb_id
0,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,11.1,O00141,2R5T
1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MSWSPSLTTQTCGAWEMKERLGTGGFGNVIRWHNQETGEQIAIKQC...,11.1,O14920,3BRT
2,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MERPPGLRPGAGGPWEMRERLGTGGFGNVCLYQHRELDLKIAIKSC...,11.1,O15111,3BRT
3,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,11.1,P00533,1IVO
4,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...,11.1,P04626,1MFG


In [13]:
# Check if there are any NaN values in the 'pdb_id' column
nan_pdb_ids = kiba_data_df_with_ids[kiba_data_df_with_ids['pdb_id'].isna()]

if not nan_pdb_ids.empty:
    print(f"Rows with NaN pdb_id:{len(nan_pdb_ids)}")
else:
    print("No NaN values found in 'pdb_id' column.")

# for simplicity we replace the nan values in the pdb_id col with the correspoding uniprot_ids
kiba_data_df_with_ids['pdb_id'] = kiba_data_df_with_ids['pdb_id'].fillna(kiba_data_df_with_ids['uniprot_id'])
# Now, the rows that had NaN in the 'pdb_id' column will have the corresponding 'uniprot_id' in their place

No NaN values found in 'pdb_id' column.


In [14]:
# save df if needed
kiba_data_df_with_ids.to_csv('kiba_data_df_with_ids.csv')

In [None]:
# ONLY EXECUTE IF YOU WANT TO REMOVE THE ROWS
# Remove rows where the 'pdb_id' column has NaN values
# kiba_data_clean = kiba_data_with_pdb.dropna(subset=['pdb_id'])
# Check how many rows were removed
# print(f"Rows remaining after dropping NaN pdb_ids: {len(kiba_data_clean)}")

### Download the PDB files

In [None]:
# download PDB from PDB or AlphaFold based on availability
def download_pdb(pdb_id, download_dir):
    # Check if the PDB file already exists
    exists = 0
    pdb_file_path = os.path.join(download_dir, f"{pdb_id}.pdb")
    if os.path.exists(pdb_file_path):
        #print(f"File for {pdb_id} already exists.")
        exists += 1
        return pdb_file_path, exists

    # Try downloading from PDB database first
    pdb_url = f'https://files.rcsb.org/download/{pdb_id}.pdb'
    response = requests.get(pdb_url)
    
    if response.status_code == 200:
        # Save PDB file to local directory
        with open(pdb_file_path, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded {pdb_id} from PDB.")
    else:
        # If not found in PDB, try AlphaFold (use alternative base URL)
        alphafold_url = f'https://alphafold.ebi.ac.uk/files/AF-{pdb_id}-F1-model_v2.pdb'
        response = requests.get(alphafold_url)

        if response.status_code == 200:
            # Save PDB file from AlphaFold
            with open(pdb_file_path, 'wb') as file:
                file.write(response.content)
            print(f"Downloaded {pdb_id} from AlphaFold.")
        else:
            print(f"Failed to download {pdb_id}. Neither PDB nor AlphaFold has the file.")
            return None

    return pdb_file_path

# Create a directory to save the downloaded PDB files
download_dir = "PDB_files"
# download_dir = "../PDB_files" # new file path (TODO: check if it works)
if not os.path.exists(download_dir):
    os.makedirs(download_dir)

# Get unique pdb_ids (including missing ones from NaN)
unique_pdb_ids = kiba_data_df_with_ids['pdb_id'].dropna().unique() # 226

# Download each PDB file (only once)
counter = 0
for pdb_id in unique_pdb_ids:
    p, e = download_pdb(pdb_id, download_dir)
    if e == 1:
        counter+=1
print(counter)

226


## get protein features (protein_dict) for p2rank selection

In [None]:
import torch
# Load the dictionary from the file
protein_dict = torch.load("protein_dict.pt")
# protein_dict = torch.load("../data/protein_dict.pt") # new file path (TODO: check if it works)

## create dataset.ds for p2rank

In [None]:
######################################## ONL EXECURE ONCE!!! ###########################################
# Directory containing your PDB files
pdb_directory = 'PDB_files'
# pdb_directory = "../PDB_files" # new file path (TODO: check if it works)

# Output file name for the .ds file
ds_file = "protein_list.ds"

# List all .pdb files in the directory
pdb_files = [file for file in os.listdir(pdb_directory) if file.endswith(".pdb")]

# Write the file paths to the .ds file
with open(ds_file, "w") as f:
    for pdb_file in pdb_files:
        f.write(os.path.join(pdb_directory, pdb_file) + "\n")

print("File paths written to", ds_file)

File paths written to protein_list.ds


# p2rank for protein segmentation

In [7]:
import numpy as np
import json
import os
import sys
import requests
import warnings
import pandas as pd
import torch
from tqdm import tqdm

import rdkit.Chem as Chem
from rdkit.Chem import AllChem
from Bio.PDB import PDBParser
import torchmetrics
from torch_geometric.data import Dataset
from torch_geometric.loader import DataLoader

warnings.filterwarnings("ignore")

In [2]:
# load df
kiba_data_df_with_ids = pd.read_csv('kiba_data_df_with_ids.csv')
kiba_data_df_with_ids.head()

Unnamed: 0.1,Unnamed: 0,Smiles,molecules,target_affinity,uniprot_id,pdb_id
0,0,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,11.1,O00141,2R5T
1,1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MSWSPSLTTQTCGAWEMKERLGTGGFGNVIRWHNQETGEQIAIKQC...,11.1,O14920,3BRT
2,2,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MERPPGLRPGAGGPWEMRERLGTGGFGNVCLYQHRELDLKIAIKSC...,11.1,O15111,3BRT
3,3,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,11.1,P00533,1IVO
4,4,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...,11.1,P04626,1MFG


In [None]:
# Function to process proteins with P2Rank --> do externally via bash console
# run via bash console due to path issues with P2Rank

#def run_p2rank_for_protein(protein_name, ds='protein_list.ds'):
    # P2Rank command
 ##   p2rank = "/c/Users/anja/OneDrive/Dokumente/GitHub/Practical-Work-for-AI/p2rank/prank.sh"
   # output_dir = "p2rank_files"
    
    
    #cmd = f"bash {p2rank} predict {ds} -o {output_dir} -threads 1"
    #os.system(cmd)

# Store the information in a list to be turned into a DataFrame
info = []

# Process each row in the dataframe
for i, line in tqdm(kiba_data_df_with_ids.iterrows(), total=kiba_data_df_with_ids.shape[0]):
    smiles = line['Smiles']
    compound_name = ""
    protein_name = line['pdb_id']
    affinity = line['target_affinity']  # Adjust this if your affinity column has a different name

    # Check for P2Rank prediction file
    p2rank_file = f"p2rank_files/{protein_name}.pdb_predictions.csv"
    # p2rank_file = f"../p2rank_files/{protein_name}.pdb_predictions.csv"  # new file path (TODO: check if it works)
    
    # Skip running P2Rank if prediction already exists only needed if p2rank is run in notebook per protein but since we run it externally, we can skip this
    # if not os.path.exists(p2rank_file):
       # run_p2rank_for_protein(protein_name)
    
    if os.path.exists(p2rank_file):
        pocket = pd.read_csv(p2rank_file, sep=',')
        pocket.columns = pocket.columns.str.strip()  # Clean column names

        if not pocket.empty:
            # Use the best pocket (rank 1, which should be the first row)
            best_pocket = pocket.iloc[0]
            com = ",".join([str(round(best_pocket[c], 3)) for c in ['center_x', 'center_y', 'center_z']])
            info.append([protein_name, compound_name, smiles, "best_p2rank_pocket", com, affinity])
        else:
            # Fallback: use protein center as the pocket center
            com = ",".join([str(a.round(3)) for a in protein_dict[protein_name][0].mean(axis=0).numpy()])
            info.append([protein_name, compound_name, smiles, "protein_center", com, affinity])
    else:
        # Fallback: use protein center as the pocket center
        com = ",".join([str(a.round(3)) for a in protein_dict[protein_name][0].mean(axis=0).numpy()])
        info.append([protein_name, compound_name, smiles, "protein_center", com, affinity])


# Convert the list of information into a DataFrame
info_df = pd.DataFrame(info, columns=[
    'protein_name', 'compound_name', 'smiles', 'pocket_name', 'pocket_com', 'target_affinity'
])

# Check the result
print(info_df.head())


100%|█████████████████████████████████████████████████████████████████████████| 118254/118254 [03:49<00:00, 515.34it/s]

  protein_name compound_name                                         smiles  \
0         2R5T                COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl   
1         3BRT                COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl   
2         3BRT                COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl   
3         1IVO                COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl   
4         1MFG                COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl   

          pocket_name             pocket_com  target_affinity  
0  best_p2rank_pocket    32.53,34.506,67.174             11.1  
1  best_p2rank_pocket   14.396,20.696,11.566             11.1  
2  best_p2rank_pocket   14.396,20.696,11.566             11.1  
3  best_p2rank_pocket  115.598,69.377,45.458             11.1  
4  best_p2rank_pocket     8.068,0.821,17.188             11.1  





In [15]:
info_df.head()

Unnamed: 0,protein_name,compound_name,smiles,pocket_name,pocket_com,target_affinity
0,2R5T,,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,best_p2rank_pocket,"32.53,34.506,67.174",11.1
1,3BRT,,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,best_p2rank_pocket,"14.396,20.696,11.566",11.1
2,3BRT,,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,best_p2rank_pocket,"14.396,20.696,11.566",11.1
3,1IVO,,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,best_p2rank_pocket,"115.598,69.377,45.458",11.1
4,1MFG,,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,best_p2rank_pocket,"8.068,0.821,17.188",11.1


In [17]:
info_df.to_csv('kiba_data_with_p2rank_info.csv')

In [4]:
# save kiba_data with p2rank info as pt file
kiba_data = pd.read_csv('kiba_data_with_p2rank_info.csv')
torch.save(kiba_data, 'kiba_data.pt')

# !!!!Stop here and continue in new notebook for dataset creation!!!!

## Load molecule_dict and protein_dict & kiba_data pt files

In [None]:
# load protein and molecule dictionaries & kiba_data
protein_dict = torch.load("protein_dict.pt")
molecule_dict = torch.load("molecule_dict.pt")
kiba_data = torch.load('kiba_data.pt') # kiba_data is the complete DataFrame with the P2Rank information

# Dataset class + Creation

I also return the target affinities together with the model input since some of the inputs might be discarded during training due to memory size issues. So I return both to keep them correctly assigned/ordered.

In [8]:
tankbind_src_folder_path = "./tankbind/"
sys.path.insert(0, tankbind_src_folder_path)

In [9]:
# imports from tankbind
from feature_utils import get_protein_feature, get_clean_res_list, extract_torchdrug_feature_from_mol, get_canonical_smiles
from utils import construct_data_from_graph_gvp, evaulate_with_affinity, evaulate
from model import get_model
from generation_utils import get_LAS_distance_constraint_mask, get_info_pred_distance, write_with_new_coords
from metrics import print_metrics, myMetric

In [None]:
class MyDataset_VS(Dataset):
    def __init__(self, root, data=None, protein_dict=None, molecule_dict=None, proteinMode=0, compoundMode=1,
                 pocket_radius=20, shake_nodes=None,
                 transform=None, pre_transform=None, pre_filter=None):
        self.data = data
        self.protein_dict = protein_dict
        self.molecule_dict = molecule_dict
        super().__init__(root, transform, pre_transform, pre_filter)
        self.data = torch.load(self.processed_paths[0])
        self.protein_dict = torch.load(self.processed_paths[1])
        self.molecule_dict = torch.load(self.processed_paths[2])
        self.proteinMode = proteinMode
        self.pocket_radius = pocket_radius
        self.compoundMode = compoundMode
        self.shake_nodes = shake_nodes

    @property
    def processed_file_names(self):
        return ['kiba_data.pt', 'protein_dict.pt', 'molecule_dict.pt']

    def process(self):
        # Save data and protein dictionary
        torch.save(self.data, self.processed_paths[0])
        torch.save(self.protein_dict, self.processed_paths[1])
        torch.save(self.molecule_dict, self.processed_paths[2])

    def len(self):
        return len(self.data)
    
    def get(self, idx):
        line = self.data.iloc[idx]
        smiles = line['smiles']
        target_affinity = line['target_affinity']
        pocket_com = line['pocket_com']
        pocket_com = np.array(pocket_com.split(",")).astype(float) if isinstance(pocket_com, str) else pocket_com
        pocket_com = pocket_com.reshape((1, 3))
        use_whole_protein = line.get('use_whole_protein', False)

        protein_name = line['protein_name']
        protein_data = self.protein_dict.get(protein_name)
        
        if protein_data is None:
            raise ValueError(f"Protein {protein_name} not found in pre-calculated protein dictionary")

        protein_node_xyz, protein_seq, protein_node_s, protein_node_v, protein_edge_index, protein_edge_s, protein_edge_v = protein_data

        # Load precomputed molecular features
        molecule_data = self.molecule_dict.get(smiles)
        if molecule_data is None:
            raise ValueError(f"SMILES {smiles} not found in precomputed molecular dictionary")
        
        coords, compound_node_features, input_atom_edge_list, input_atom_edge_attr_list, pair_dis_distribution = self.molecule_dict[smiles]

        data, input_node_list, keepNode = construct_data_from_graph_gvp(
            protein_node_xyz, protein_seq, protein_node_s, protein_node_v, 
            protein_edge_index, protein_edge_s, protein_edge_v,
            coords, compound_node_features, input_atom_edge_list, input_atom_edge_attr_list,
            pocket_radius=self.pocket_radius, use_whole_protein=use_whole_protein, includeDisMap=True,
            use_compound_com_as_pocket=False, chosen_pocket_com=pocket_com, compoundMode=self.compoundMode
        )
        data.compound_pair = pair_dis_distribution.reshape(-1, 16)
        
        return data, target_affinity

### create dataset instance:

In [None]:
dataset_path = '' # Specify the path where the dataset will be stored (TODO: some directory cleanup)
# dataset = MyDataset_VS(root=dataset_path, data=kiba_data, protein_dict=protein_dict, molecule_dict=molecule_dict) # only on first run, otherwise execute line below
# dataset = MyDataset_VS(root=dataset_path)

