In [2]:
import numpy as np
from rdkit import Chem
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pandas as pd
from rdkit.Chem import AllChem
import pickle
import pubchempy as pcp

In [3]:
old_data_path="."
new_data_path="."

## 1. Replace drug ID with unique int id (from 0 to num_of_drugs)

In [4]:
df1 = pd.read_csv(old_data_path + '/ploypharmacy_facts_train.txt',
               sep='\t', header=None, names=['from', 'rel', 'to'])
df2 = pd.read_csv(old_data_path + '/ploypharmacy_facts_valid.txt',
               sep='\t', header=None, names=['from', 'rel', 'to'])
df3 = pd.read_csv(old_data_path + '/ploypharmacy_facts_test.txt',
               sep='\t', header=None, names=['from', 'rel', 'to'])
df_trivec = pd.concat([df1, df2, df3])

col = df_trivec.columns
list_of_ent_n = list(set(df_trivec[col[0]]).union(set(df_trivec[col[2]])))
num_of_ent = len(list_of_ent_n)
list_of_ent = pd.Series(list(range(num_of_ent)))
list_of_ent.index = list_of_ent_n
list_of_rel_n = list(set(df_trivec[col[1]]))
num_of_rel = len(list_of_rel_n)
list_of_rel = pd.Series(list(range(num_of_rel)))
list_of_rel.index = list_of_rel_n

df_tr = pd.DataFrame({'from': list(list_of_ent.loc[df1['from']]),
       'rel': list(list_of_rel.loc[df1['rel']]),
       'to': list(list_of_ent.loc[df1['to']])})
df_val = pd.DataFrame({'from': list(list_of_ent.loc[df2['from']]),
       'rel': list(list_of_rel.loc[df2['rel']]),
       'to': list(list_of_ent.loc[df2['to']])})
df_test = pd.DataFrame({'from': list(list_of_ent.loc[df3['from']]),
       'rel': list(list_of_rel.loc[df3['rel']]),
       'to': list(list_of_ent.loc[df3['to']])})

df_tr.to_csv(new_data_path + "/polyphar_train.csv", index=False)
df_val.to_csv(new_data_path + "/polyphar_validate.csv", index=False)
df_test.to_csv(new_data_path + "/polyphar_test.csv", index=False)

df_trivec = pd.concat([df_tr, df_val, df_test])
df_trivec.to_csv(new_data_path + "/polyphar_all.csv", index=False)

## 2. Add map for relations

In [8]:
se = pd.read_csv(old_data_path + '/se_maps.txt', sep='\t', header=None)
se.index = se[0]
se.drop(columns=[0], inplace=True)

rels = [i[3:] for i in list(list_of_rel.index)]
relatives_names = list_of_rel.to_frame()
relatives_names['names'] = [se.loc[r][1] for r in rels]
relatives_names.columns = ['id_in_data', 'names']
relatives_names.index = rels
relatives_names.to_csv(new_data_path + '/rel_maps.csv')

##3. Add map for enteties

In [9]:
drugs = [i[5:] for i in list(list_of_ent.index)]
ent_names = list_of_ent.to_frame()
ent_names.columns = ['id_in_data']
ent_names.index = drugs
ent_names.to_csv(new_data_path + '/ent_maps.csv')

In [None]:
## 4. Process ppi data¶

In [29]:
num_of_dugs = len(list_of_ent)

ppi = pd.read_csv(old_data_path + '/bio-decagon-ppi.csv')
len_ppi = ppi.shape[0]

targets = pd.read_csv(old_data_path + '/bio-decagon-targets-all.csv')
len_tar = targets.shape[0]

genes = list(set(ppi['Gene 1']).union(set(ppi['Gene 2']).union(set(targets['Gene']))))
gene_to_id = pd.Series(list(range(num_of_dugs, num_of_dugs + len(genes))))
gene_to_id.index = genes
gene_to_id.to_csv(new_data_path + '/gene_to_idx.csv', index=True)

ppi['Gene 1'] = gene_to_id.loc[ppi['Gene 1']].reset_index(drop=True)
ppi['Gene 2'] = gene_to_id.loc[ppi['Gene 2']].reset_index(drop=True)
ppi.to_csv(new_data_path + '/ppi_data.csv', index=False)

## 5. Process targets data

In [35]:
## Leave drugs that contains in combo se (in targets there more than 645 drugs)
targets = targets[ent_names.reindex(targets['STITCH'])['id_in_data'].reset_index(drop=True).notna()].dropna().reset_index(drop=True)
targets['STITCH'] = ent_names.loc[targets['STITCH']].reset_index(drop=True)
targets['Gene'] = gene_to_id.loc[targets['Gene']].reset_index(drop=True)
targets.to_csv(new_data_path + '/targets_data.csv', index=False)

# Get new data

## 1. Get morgan fingerprints

In [None]:
chems = [str(int(item[4:])) for item in list(ent_names.index)]
s = ", ".join(chems)

Find CID's in PubChem https://pubchem.ncbi.nlm.nih.gov/pc_fetch/pc_fetch.cgi and download mapping from CID to Smiles.
Save file as cid_smiles.tsv

Make morgan fingerprints

In [36]:
def cid_to_str(cid: int):
    cid = str(cid)
    return 'CID0'+ "".join(['0']*(8-len(cid))) + cid

cid_smiles_df = pd.read_table(old_data_path + "/cid_smiles.txt", sep='\t', header=None)
cid_smiles_df.columns = ['cid', 'smiles']

radius = 3
lst = []
fp = []
for row in cid_smiles_df.smiles:
    m1 = Chem.MolFromSmiles(row)
    fp1 = AllChem.GetMorganFingerprintAsBitVect(m1,radius,nBits=100)
    fp.append(fp1)
    lst.append(list(fp1))
res = pd.DataFrame(np.array(lst), dtype = np.int32)
res.index = [cid_to_str(i) for i in cid_smiles_df.cid]
## res.to_csv(new_data_path + f"/chemical_embed_morgan_fp_{radius}_100.csv")

## Make sure that all fp in right order

res = pd.concat([res, ent_names], axis=1, join="inner")
res.sort_values(["id_in_data"], inplace=True)
res.index = res['id_in_data']
res.drop(columns=['id_in_data'], inplace=True)
res.to_csv(new_data_path + f"/chemical_embed_morgan_fp_{radius}_100.csv")


## Replace smiles ID

In [None]:
cid_smiles = pd.read_table(old_data_path + "/cid_smiles.txt", sep="\t", header=None, index_col=0)
ent_maps = pd.read_csv(new_data_path + '/ent_maps.csv', index_col=0)
ent_maps.index = [int(i[3:]) for i in ent_maps.index]

smiles_df = pd.concat([cid_smiles, ent_maps], join='inner',axis=1).set_index('id_in_data').sort_index()
smiles_df.to_csv(new_data_path + '/descr_smiles.csv', index=False, header=False)

## Get molecular descriptors

In [None]:
scaler = StandardScaler()

molecules = [Chem.MolFromSmiles(mol) for mol in smiles_df[1]]
mol_descriptors = [desc[0] for desc in Chem.Descriptors.descList]
calc = MolecularDescriptorCalculator(mol_descriptors)

descriptors = [np.array(calc.CalcDescriptors(mol)) for mol in molecules]
df_descriptors = pd.DataFrame(np.vstack(descriptors))
df_descriptors = df_descriptors.loc[:, (df_descriptors != 0).any(axis=0)]

#Check NA. Only 4 in one molecule, fill with 0. Can make sure, that 0 seems like appropriate value.
np.where(np.asanyarray(np.isnan(df_descriptors)))

df_descriptors = df_descriptors.fillna(0)
df_descriptors = pd.DataFrame(scaler.fit_transform(df_descriptors))

df_descriptors.to_csv(f"{new_data_path}/mol_descriptors_191.csv")

**Warning! List of molecular descriptors in rdkit can change over time, so use file mol_descriptors_list.pickle to
restore used descriptors**

```python
with open('mol_descriptors_list.pickle', 'rb') as f:
    md_list = pickle.load(f)
```

Save only 100 descriptors that have more non-zero values, than other descriptors

In [None]:
df_descriptors = df_descriptors[(df_descriptors!=0).sum(axis=0).sort_values(ascending=False).index[:100]]
df_descriptors = pd.DataFrame(scaler.fit_transform(df_descriptors))
df_descriptors.to_csv(f"{new_data_path}/mol_descriptors_100.csv")

HVAE embeddings got from this repo: https://github.com/batmanlab/drugEmbedding

# Make splits

## Make split 1

In [31]:
degrees = pd.concat([df_tr['from'], df_tr['to']]).value_counts()
num_of_weak_triples = [degrees[degrees <=i].sum() for i in range(3, 1000)]
weak_nodes = list(degrees[degrees <=1000].index)

In [32]:
df_1 = df_tr[df_tr['from'].isin(weak_nodes)].sort_values("from")

triples_to_add_test = []
triples_to_add_val = []
for drug in list(set(df_1['from'])):
    tmp = df_1[df_1['from'] == drug]
    triples_to_add_test.append(tmp.iloc[:len(tmp)//4])
    triples_to_add_val.append(tmp.iloc[len(tmp)//4:len(tmp)//2])
df_1_test = pd.concat(triples_to_add_test)
df_1_val = pd.concat(triples_to_add_val)

df_2 = df_tr[df_tr['to'].isin(weak_nodes)].sort_values("to")

triples_to_add_test = []
triples_to_add_val = []
for drug in list(set(df_2['to'])):
    tmp = df_2[df_2['to'] == drug]
    triples_to_add_test.append(tmp.iloc[:len(tmp)//4])
    triples_to_add_val.append(tmp.iloc[len(tmp)//4:len(tmp)//2])
df_2_test = pd.concat(triples_to_add_test)
df_2_val = pd.concat(triples_to_add_val)

In [33]:
df_val_weak = pd.concat([df_val[df_val['from'].isin(weak_nodes)],df_val[df_val['to'].isin(weak_nodes)]]).drop_duplicates()
df_test_weak = pd.concat([df_test[df_test['from'].isin(weak_nodes)],df_test[df_test['to'].isin(weak_nodes)]]).drop_duplicates()

In [34]:
df_to_add_test = pd.concat([df_1_test, df_2_test]).drop_duplicates()
df_to_add_val = pd.concat([df_1_val, df_2_val]).drop_duplicates()

# drop intersections between test and val
df_to_add_val = df_to_add_val.drop(set(df_to_add_val.index).intersection(set(df_to_add_test.index)))

df_val_new = pd.concat([df_val_weak, df_to_add_val])
df_test_new = pd.concat([df_test_weak, df_to_add_test])
df_train_new = pd.concat([df_trivec, df_val_new, df_test_new]).drop_duplicates(keep=False)

In [35]:
## check if splits is okay
print((df_train_new.shape[0] + df_val_new.shape[0] + df_test_new.shape[0]) == df_trivec.shape[0])
print(pd.concat([df_train_new, df_val_new, df_test_new]).drop_duplicates().shape[0] == df_trivec.shape[0])

True
True


In [36]:
df_train_new.to_csv(new_data_path + "/polyphar_train_new.csv", index=False)
df_val_new.to_csv(new_data_path + "/polyphar_val_new.csv", index=False)
df_test_new.to_csv(new_data_path + "/polyphar_test_new.csv", index=False)

# Split 2

In [37]:
weak_nodes = list(degrees[degrees <=795].index)
len(degrees[degrees <=795])

df_1 = df_tr[df_tr['from'].isin(weak_nodes)].sort_values("from")

triples_to_add_test = []
triples_to_add_val = []
for drug in list(set(df_1['from'])):
    tmp = df_1[df_1['from'] == drug]
    triples_to_add_test.append(tmp.iloc[:len(tmp)//2])
    triples_to_add_val.append(tmp.iloc[len(tmp)//2:len(tmp)])
df_1_test = pd.concat(triples_to_add_test)
df_1_val = pd.concat(triples_to_add_val)

df_2 = df_tr[df_tr['to'].isin(weak_nodes)].sort_values("to")

triples_to_add_test = []
triples_to_add_val = []
for drug in list(set(df_2['to'])):
    tmp = df_2[df_2['to'] == drug]
    triples_to_add_test.append(tmp.iloc[:len(tmp)//2])
    triples_to_add_val.append(tmp.iloc[len(tmp)//2:len(tmp)])
df_2_test = pd.concat(triples_to_add_test)
df_2_val = pd.concat(triples_to_add_val)

In [38]:
df_val_weak = pd.concat([df_val[df_val['from'].isin(weak_nodes)],df_val[df_val['to'].isin(weak_nodes)]]).drop_duplicates()
df_test_weak = pd.concat([df_test[df_test['from'].isin(weak_nodes)],df_test[df_test['to'].isin(weak_nodes)]]).drop_duplicates()

In [39]:
df_to_add_test = pd.concat([df_1_test, df_2_test]).drop_duplicates()
df_to_add_val = pd.concat([df_1_val, df_2_val]).drop_duplicates()

# drop intersections between test and val
df_to_add_val = df_to_add_val.drop(set(df_to_add_val.index).intersection(set(df_to_add_test.index)))

df_val_new_2 = pd.concat([df_val_weak, df_to_add_val])
df_test_new_2 = pd.concat([df_test_weak, df_to_add_test])
df_train_new_2 = pd.concat([df_trivec, df_val_new, df_test_new]).drop_duplicates(keep=False)

In [40]:
## check if splits is okay
print((df_train_new_2.shape[0] + df_val_new_2.shape[0] + df_test_new_2.shape[0]) == df_trivec.shape[0])
print(pd.concat([df_train_new_2, df_val_new_2, df_test_new_2]).drop_duplicates().shape[0] == df_trivec.shape[0])

False
False


In [41]:
df_train_new_2.to_csv(new_data_path + "/polyphar_train_new_2.csv", index=False)
df_val_new_2.to_csv(new_data_path + "/polyphar_val_new_2.csv", index=False)
df_test_new_2.to_csv(new_data_path + "/polyphar_test_new_2.csv", index=False)