In [1]:
# !pip install PyTDC

Collecting PyTDC
  Downloading PyTDC-1.0.6.tar.gz (142 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting biopython<2.0,>=1.78 (from PyTDC)
  Downloading biopython-1.83-cp38-cp38-win_amd64.whl.metadata (13 kB)
Collecting dataclasses<1.0,>=0.6 (from PyTDC)
  Using cached dataclasses-0.6-py3-none-any.whl.metadata (3.0 kB)
Collecting fuzzywuzzy<1.0,>=0.18.0 (from PyTDC)
  Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting huggingface_hub<1.0,>=0.20.3 (from PyTDC)
  Downloading huggingface_hub-0.24.5-py3-none-any.whl.metadata (13 kB)
Collecting mygene<4.0.0,>=3.2.2 (from PyTDC)
  Using cached mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
INFO: pip is looking at multiple versions of pytdc to determine which version is compatible with other requirements. This could take a while.
Collecting PyTDC
  Downloading PyTDC-1.0.5.tar.gz (142 kB)
  Preparing metadata (setup.py): started
  Preparing met

In [1]:
import pandas as pd
from pathlib import Path
from tdc.multi_pred import DDI, DTI, PPI
from tdc.utils import get_label_map

# DDI

In [2]:
# Export each DDI dataset: the full table, a 70/10/20 split (seed 42) and a
# label -> description lookup, all written as CSV under TDC/DDI/<dataset>/.
for dt in ('DrugBank', 'TWOSIDES'):
    folder = Path(f'TDC/DDI/{dt}/')
    folder.mkdir(parents=True, exist_ok=True)

    data = DDI(name=dt)
    full_table = data.get_data()
    full_table.to_csv(folder / 'total.csv', index=False, header=True)

    partitions = data.get_split(frac=[0.7, 0.1, 0.2], seed=42)
    for part_name, part_df in partitions.items():
        part_df.to_csv(folder / f"{part_name}.csv", index=False, header=True)

    # Only the TWOSIDES lookup is requested with an explicit name column;
    # DrugBank uses get_label_map's defaults.
    extra_kwargs = {'name_column': 'Side Effect Name'} if dt == 'TWOSIDES' else {}
    label_map = get_label_map(name=dt, task='DDI', **extra_kwargs)

    label_table = pd.DataFrame(
        {'label': label_map.keys(), 'description': label_map.values()}
    )
    label_table = label_table.sort_values(by='label', ascending=True)
    label_table.to_csv(folder / "label_description.csv", index=False, header=True)
print('finish')

Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!


finish


# DTA

In [3]:
# Export every drug-target affinity dataset as CSV.
#
# Layout:
#   BindingDB_<X>: TDC/DTA/BindingDB/<X>/<split>[/<column>]/<mode>/{train,valid,test}.csv
#   DAVIS / KIBA : TDC/DTA/<name>/<split>[/<column>]/{train,valid,test}.csv
# plus one total.csv (the unsplit table) in each dataset's root directory.
#
# total.csv is written to an explicitly built per-dataset root. It must not be
# written to `folder` before this cell assigns it, because `folder` would then
# still point at a directory left over from earlier code and the file would
# land in (or overwrite a file of) an unrelated dataset.

# (split method, column held out for a cold split; None means a random split)
split_variants = [('random', None), ('cold_split', 'Drug'), ('cold_split', 'Target')]

for dt in ['BindingDB_Kd', 'BindingDB_IC50', 'BindingDB_Ki', 'DAVIS', 'KIBA']:
    is_bindingdb = 'BindingDB' in dt
    if is_bindingdb:
        # 'BindingDB_Kd' -> TDC/DTA/BindingDB/Kd
        dataset_root = Path(f'TDC/DTA/BindingDB/{dt.split("_")[1]}')
        modes = ['raw', 'max_affinity', 'mean']
    else:
        dataset_root = Path(f'TDC/DTA/{dt}')
        modes = [None]  # no affinity harmonization, hence no <mode> sub-folder
    dataset_root.mkdir(exist_ok=True, parents=True)

    for mode_index, md in enumerate(modes):
        # Reload for every mode so each harmonization starts from freshly
        # loaded data (harmonize_affinities reports updating the loaded data).
        data = DTI(name = dt)
        if dt == 'DAVIS':
            data.convert_to_log(form='binding')

        # Dump the unsplit table once per dataset, before any harmonization.
        if mode_index == 0:
            data.get_data().to_csv(dataset_root / 'total.csv', index=False, header=True)

        if is_bindingdb and md != 'raw':
            data.harmonize_affinities(mode=md)

        for sp, c in split_variants:
            folder = dataset_root / sp
            if c is not None:
                folder = folder / c
            if md is not None:
                folder = folder / md
            folder.mkdir(exist_ok=True, parents=True)

            if c is None:
                split = data.get_split(frac=[0.7, 0.1, 0.2], seed=42)
            else:
                split = data.get_split(method=sp, column_name=c, frac=[0.7, 0.1, 0.2], seed=42)
            for k, v in split.items():
                v.to_csv(folder / f"{k}.csv", index=False, header=True)
print('finish')

Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
The scale is in original affinity scale, so we will take the minimum!
The original data has been updated!
Found local copy...
Loading...
Done!
The original data has been updated!
Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
The scale is in original affinity scale, so we will take the minimum!
The original data has been updated!
Found local copy...
Loading...
Done!
The original data has been updated!
Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
The scale is in original affinity scale, so we will take the minimum!
The original data has been updated!
Found local copy...
Loading...
Done!
The original data has been updated!
Found local copy...
Loading...
Done!
To log space...
Found local copy...
Loading...
Done!


finish


# PPI

In [4]:
# Export the HuRI protein-protein interaction dataset (with sampled negatives)
# as CSV under TDC/PPI/HuRI/: total.csv in the root, then one sub-folder per
# split variant holding the 70/10/20 split (seed 42).
#
# NOTE(review): the recorded run of this cell stopped with
# "AttributeError: 'DataFrame' object has no attribute 'append'". pandas 2.0
# removed DataFrame.append, so this presumably comes from inside the installed
# PyTDC (likely neg_sample) - confirm, then pin pandas<2 or upgrade PyTDC.
data = PPI(name = 'HuRI')
data = data.neg_sample(frac = 1)

# Build the output root here instead of reusing `folder` from earlier code,
# which would send total.csv into another dataset's directory.
ppi_root = Path('TDC/PPI/HuRI')
ppi_root.mkdir(exist_ok=True, parents=True)
data.get_data().to_csv(ppi_root / 'total.csv', index=False, header=True)

# (split method, column held out for a cold split; None means a random split)
for sp, c in [('random', None), ('cold_split', 'Protein1'), ('cold_split', 'Protein2')]:
    folder = ppi_root / sp if c is None else ppi_root / sp / c
    folder.mkdir(exist_ok=True, parents=True)

    if c is None:
        split = data.get_split(frac=[0.7, 0.1, 0.2], seed=42)
    else:
        split = data.get_split(method=sp, column_name=c, frac=[0.7, 0.1, 0.2], seed=42)
    for k, v in split.items():
        v.to_csv(folder / f"{k}.csv", index=False, header=True)
print('finish')

Found local copy...
Loading...
Done!


AttributeError: 'DataFrame' object has no attribute 'append'

In [5]:
# Final marker cell: prints unconditionally, so it does not by itself prove
# that the export cells above completed without error.
print('All finish')

All finish
