In [2]:
import pyTigerGraph as tg
import pandas as pd


In [203]:
 pd.set_option('display.max_colwidth', None)  # None = never truncate cell text when displaying frames

## Load & Format Data

# Stanford BioSnap Datasets

## Disease-drug relations

In [268]:
# Root folder of the dataset; files are read from `input/` and written to `output/`.
data_dir = '../core/dataset'

# Tab-separated disease-chemical relation table.
disease_drug_file = f'{data_dir}/input/DCh-Miner_miner-disease-chemical.tsv'

# Load the table and give the raw '# Disease(MESH)' header a cleaner column name.
disease_drug_df = pd.read_csv(disease_drug_file, sep='\t').rename(
    columns={'# Disease(MESH)': 'Disease Mesh ID'}
)

### Drug attribute information

In [273]:
# DrugBank vocabulary export; only the ID and the two name columns are loaded.
drug_info_file = f'{data_dir}/input/drugbank vocabulary.csv'
drug_info_df = pd.read_csv(
    drug_info_file,
    usecols=['DrugBank ID', 'Common name', 'Synonyms'],
)
# Missing values are left as NaN here (no fillna is applied to this frame).

### Extract entity-name mappings (Drug)

In [306]:
# Build a long-format (DrugBank ID, Name, isCommon) table: one row per drug name.

# Common names: drop rows without a name, trim whitespace, flag as common.
# Written as a single chain so no column is ever assigned on a filtered slice
# (doing so triggers pandas' SettingWithCopyWarning).
drug_com_df = (
    drug_info_df[['DrugBank ID', 'Common name']]
    .rename(columns={'Common name': 'Name'})
    .dropna(subset=['Name'])
    .assign(Name=lambda df: df['Name'].str.strip(), isCommon=True)
)

# Synonyms are stored as one '|'-delimited string per drug; split them into lists.
# A one-character pattern is treated as a literal by `str.split`, so no escaping is
# needed ("\|" in a normal string literal is an invalid escape sequence).
drug_syn_df = (
    drug_info_df[['DrugBank ID', 'Synonyms']]
    .rename(columns={'Synonyms': 'Name'})
    .assign(Name=lambda df: df['Name'].str.split('|'))
)

# Explode to one row per synonym, trim whitespace, drop drugs without synonyms, and
# drop any synonym that already appears as a common name.
# NOTE: the exclusion is global - a synonym equal to *any* drug's common name is removed.
drug_syn_expl_df = (
    drug_syn_df
    .explode('Name')
    .assign(Name=lambda df: df['Name'].str.strip())
    .dropna(subset=['Name'])
    .loc[lambda df: ~df['Name'].isin(drug_com_df['Name'])]
    .assign(isCommon=False)
)

# Common names first, then synonyms; the original row index of each drug is kept.
drug_all_names_df = pd.concat((drug_com_df, drug_syn_expl_df))

drug_all_names_df

Unnamed: 0,DrugBank ID,Name,isCommon
0,DB00001,Lepirudin,True
1,DB00002,Cetuximab,True
2,DB00003,Dornase alfa,True
3,DB00004,Denileukin diftitox,True
4,DB00005,Etanercept,True
...,...,...,...
14583,DB16736,Allogenic thymocyte-depleted thymus tissue-agdc,False
14584,DB16737,Ac-GQFR-kbt,False
14588,DB16741,Bortezomib D-mannitol ester,False
14588,DB16741,Bortezomib D-mannitol symmetrical ester,False


## Disease Info

In [288]:
# Tab-separated disease attribute table.
disease_info_file = f'{data_dir}/input/D-MeshMiner_miner-disease.tsv'

# Load, normalise the column names, and replace missing descriptions with ''.
# `fillna` is applied at DataFrame level with a per-column mapping: calling
# `df['Description'].fillna('', inplace=True)` operates on a temporary Series and
# does not update the frame under pandas copy-on-write.
disease_info_df = (
    pd.read_csv(disease_info_file, sep='\t')
    .rename(columns={'# MESH_ID': 'Disease Mesh ID', 'Definitions': 'Description'})
    .fillna({'Description': ''})
)


In [318]:
# Sanity check: number of missing values in each column of the disease table.
disease_info_df.isna().sum()

Disease Mesh ID       0
Name                  0
Description           0
Synonyms           2166
dtype: int64

### Extract entity-name mappings (Disease)

In [314]:
# Build a long-format (Disease Mesh ID, Name, isCommon) table: one row per disease name.

# Common names: drop rows without a name, trim whitespace, flag as common.
# Written as a single chain so no column is ever assigned on a slice of
# `disease_info_df` (doing so triggers pandas' SettingWithCopyWarning).
disease_com_df = (
    disease_info_df[['Disease Mesh ID', 'Name']]
    .dropna(subset=['Name'])
    .assign(Name=lambda df: df['Name'].str.strip(), isCommon=True)
)

# Synonyms are stored as one '|'-delimited string per disease; split them into lists.
# A one-character pattern is treated as a literal by `str.split`, so no escaping is
# needed ("\|" in a normal string literal is an invalid escape sequence).
disease_syn_df = (
    disease_info_df[['Disease Mesh ID', 'Synonyms']]
    .rename(columns={'Synonyms': 'Name'})
    .assign(Name=lambda df: df['Name'].str.split('|'))
)

# Explode to one row per synonym, trim whitespace, drop diseases without synonyms, and
# drop any synonym that already appears as a common name.
# NOTE: the exclusion is global - a synonym equal to *any* disease's common name is removed.
disease_syn_expl_df = (
    disease_syn_df
    .explode('Name')
    .assign(Name=lambda df: df['Name'].str.strip())
    .dropna(subset=['Name'])
    .loc[lambda df: ~df['Name'].isin(disease_com_df['Name'])]
    .assign(isCommon=False)
)

# Common names first, then synonyms; the original row index of each disease is kept.
disease_all_names_df = pd.concat((disease_com_df, disease_syn_expl_df))
disease_all_names_df

Unnamed: 0,Disease Mesh ID,Name,isCommon
0,MESH:C538288,10p Deletion Syndrome (Partial),True
1,MESH:C535484,13q deletion syndrome,True
2,MESH:C579849,15q24 Microdeletion,True
3,MESH:C579850,16p11.2 Deletion Syndrome,True
4,MESH:C567076,"17,20-Lyase Deficiency, Isolated",True
...,...,...,...
11331,MESH:D020096,Entomophthoramycoses,False
11331,MESH:D020096,Entomophthoramycosis,False
11331,MESH:D020096,Phycomycoses,False
11331,MESH:D020096,Phycomycosis,False


In [309]:
# Preview of the per-disease synonym lists (NaN where a disease has no synonyms).
# NOTE(review): this rebinds `disease_syn_df` to the *unsplit* synonym strings; only the
# displayed expression below is split.
disease_syn_df = disease_info_df[['Disease Mesh ID', 'Synonyms']].rename(columns={'Synonyms': 'Name'})
# Literal one-character separator: avoids the invalid "\|" escape sequence.
disease_syn_df['Name'].str.split('|')

0                                                                                                              [Chromosome 10, 10p- Partial, Chromosome 10, monosomy 10p, Chromosome 10, Partial Deletion (short arm), Monosomy 10p]
1        [Chromosome 13q deletion, Chromosome 13q deletion syndrome, Chromosome 13q monosomy, Chromosome 13q syndrome, Deletion 13q, Deletion 13q syndrome, Monosomy 13q, Monosomy 13q syndrome, Orbeli's syndrome, Orbeli syndrome]
2                                                                                                                                          [15q24 Deletion, 15q24 Microdeletion Syndrome, Interstitial Deletion of Chromosome 15q24]
3                                                                                                                                                                                                                                NaN
4                                                                                   

## Write Formatted Data

In [319]:
# Persist every formatted table as a tab-separated file (no index column) under `output/`.
for _frame, _out_path in (
    (disease_drug_df, f'{data_dir}/output/disease-drug.tsv'),
    (drug_info_df, f'{data_dir}/output/drug.tsv'),
    (disease_info_df, f'{data_dir}/output/disease.tsv'),
    (drug_all_names_df, f'{data_dir}/output/drug-name.tsv'),
    (disease_all_names_df, f'{data_dir}/output/disease-name.tsv'),
):
    _frame.to_csv(_out_path, index=False, sep='\t')

# Provider Drug Dataset

# Drug Rating Dataset

In [None]:
drug_ratings_df = pd.re