In [1]:
# import necessary packages
import pandas as pd

## Preparing the Drug Repurposing Hub Indications

#### *1. Import the indications from the Drug Repurposing Hub*

This file was taken from the Broad Institute's Drug Repurposing Hub:
https://repo-hub.broadinstitute.org/repurposing#download-data

In [2]:
# import the drug repurposing hub indications
file_path = '~/LINCS/ref_data/drug_ind/repurposing_drugs_20200324.txt'
drug_repo = pd.read_csv(file_path, sep='\t', usecols=['pert_iname', 'indication'], skiprows=9)
# remove drugs that do not have any indication
drug_repo = drug_repo[drug_repo['indication'].notna()]
drug_repo.head()

Unnamed: 0,pert_iname,indication
0,(R)-(-)-apomorphine,Parkinson's Disease
32,abacavir,human immunodeficiency virus (HIV-1)
34,abamectin,gastrointestinal parasites
39,abemaciclib,breast cancer
40,abiraterone,prostate cancer


In [3]:
# number of unique drugs available with indications
len(drug_repo['pert_iname'].unique())

2222

In [4]:
# split the 'indication' column and create a new series with duplicated rows for each indication
inds = drug_repo['indication'].str.split('|', expand=True).stack().reset_index(level=1, drop=True).rename('indication')
# create a new dataframe that maps each drug to one indication
drug_repo_ind = drug_repo.drop(columns='indication').join(inds).reset_index()
# keep only the name of drug, drug id, and the indication
drug_repo_ind = drug_repo_ind[['pert_iname', 'indication']]
drug_repo_ind.head()

Unnamed: 0,pert_iname,indication
0,(R)-(-)-apomorphine,Parkinson's Disease
1,abacavir,human immunodeficiency virus (HIV-1)
2,abamectin,gastrointestinal parasites
3,abemaciclib,breast cancer
4,abiraterone,prostate cancer


#### *2. Import the standardized format for the indications (created by Panos)*

In [5]:
# import the mesh strings for the indication in the Drug Repurposing Hub
file_path = '~/LINCS/ref_data/drug_ind/indications_to_mesh_normalizedString.txt'
ind_mesh_terms = pd.read_csv(file_path, usecols=['indication', 'mesh_term'], sep='\t')
# remove indications without any mesh terms
ind_mesh_terms = ind_mesh_terms[ind_mesh_terms['mesh_term'].notna()]
# remove mesh code
ind_mesh_terms.head()

Unnamed: 0,indication,mesh_term
0,Parkinson's Disease,Parkinson Disease
1,human immunodeficiency virus (HIV-1),HIV
3,breast cancer,Breast Neoplasms
4,prostate cancer,Prostatic Neoplasms
5,mantle cell lymphoma (MCL),"Lymphoma, Mantle-Cell"


#### *3. Replace the Drug Repurposing Hub indications with the standardized terms*

In [6]:
# create dataframe that contains the mesh terms for indications available for the drugs
drug_repo_mesh = pd.merge(drug_repo_ind, ind_mesh_terms, on='indication', how='inner').drop('indication', axis=1)
# rename the columns
drug_repo_mesh.columns = ['pert_iname', 'indication']
drug_repo_mesh.head()

Unnamed: 0,pert_iname,indication
0,(R)-(-)-apomorphine,Parkinson Disease
1,amantadine,Parkinson Disease
2,benserazide,Parkinson Disease
3,benztropine-mesylate,Parkinson Disease
4,biperiden,Parkinson Disease


In [7]:
# number of unique drugs available with standard indications
len(drug_repo_mesh['pert_iname'].unique())

1944

#### *4. Add the indications of the drugs that were used in LINCS dataset*

In [8]:
# import the perturbagens used in the LINCS dataset
file_path = '~/LINCS/ref_data/GSE70138_Broad_LINCS_pert_info.txt'
LINCS_drugs = pd.read_csv(file_path, sep='\t', usecols=['inchi_key', 'pert_id', 'pert_iname', 'pert_type'])
# remove the duplicate rows of drugs
LINCS_drugs.drop_duplicates(subset=['pert_iname'], keep='first', inplace=True)
# keep only the drugs used
LINCS_drugs = LINCS_drugs[LINCS_drugs['pert_type'] == 'trt_cp'].drop(columns=['pert_type', 'pert_id'], axis=1)
LINCS_drugs.head()

Unnamed: 0,inchi_key,pert_iname
0,GYBXAGDWMCJZJK-UHFFFAOYSA-N,10-DEBC
1,PHEDXBVPIONUQT-RGYGYFBISA-N,phorbol-myristate-acetate
2,QAOBBBBDJSWHMU-WMBBNPMCSA-N,"16,16-dimethylprostaglandin-e2"
3,DOMWKUIIPQCAJU-JKPPDDDBSA-N,17-hydroxyprogesterone-caproate
4,WWVANQJRLPIHNS-ZKWXMUAHSA-N,2-iminobiotin


In [9]:
# add the indications for the drugs available in the Drug Repurposing Hub
LINCS_drug_repo = pd.merge(LINCS_drugs.drop('inchi_key', axis=1), drug_repo_mesh, on='pert_iname', how='inner')
# save file to the same directory
#LINCS_drug_repo.to_csv('~/LINCS/ref_data/drug_ind/processed/LINCS_drug_repo.txt', index=False)
LINCS_drug_repo.head()

Unnamed: 0,pert_iname,indication
0,5-aminolevulinic-acid,"Keratosis, Actinic"
1,5-aminolevulinic-acid,Glioma
2,abacavir,HIV
3,abiraterone-acetate,Prostatic Neoplasms
4,acarbose,Diabetes Mellitus


In [10]:
# show the number of drugs available
len(LINCS_drug_repo['pert_iname'].unique())

800

In [11]:
# show the number of unique indications available
len(LINCS_drug_repo['indication'].unique())

341

In [12]:
LINCS_drug_repo.shape

(1368, 2)

#### *5. Create a collapsed version of indications, with each row being all indications for one unique drug*

In [13]:
# create another version with all indications for one drug
collapsed_drug_repo = LINCS_drug_repo.groupby('pert_iname')['indication'].agg(lambda row: '|'.join(row)).reset_index()
# save file to the same directory
#collapsed_drug_repo.to_csv('~/LINCS/ref_data/drug_ind/processed/collapsed_drug_repo_lincs_ind.txt', index=False)
collapsed_drug_repo.head()

Unnamed: 0,pert_iname,indication
0,5-aminolevulinic-acid,"Keratosis, Actinic|Glioma"
1,L-citrulline,Hypertension|Erectile Dysfunction
2,SN-38,Colorectal Neoplasms
3,abacavir,HIV
4,abiraterone-acetate,Prostatic Neoplasms


## Importing the Known Indications of LINCS drugs from RxNorm

#### *1. Get the InchiKeys for the drugs from the RXNorm API*
    a. Extracted the DrugBank IDs from the rxnorm mapped drugs to their indication using the following bash command:
    
    cut -f1 all_drugs_rxnorm_indications.txt | tail -n +2 > rxnorm_drugbank_id.txt

    b. Get the respective InchiKeys for the drugs using [PubChem](https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange.cgi), saved as rxnorm_drugbank_inchikey.txt


#### *2. Import the clinical indications available from RXNorm API*

In [14]:
# import the indications from rxnorm API
file_path = '~/LINCS/ref_data/drug_ind/all_drugs_rxnorm_indications.txt'
rxnorm_ind = pd.read_csv(file_path, sep='\t', usecols=['drug_name','indication'])
rxnorm_ind.head()

Unnamed: 0,drug_name,indication
0,Lepirudin,"Angina, Unstable"
1,Lepirudin,Thrombocytopenia
2,Lepirudin,Thromboembolism
3,Lepirudin,Myocardial Ischemia
4,Cetuximab,"Carcinoma, Squamous Cell"


In [15]:
# read in the inchikeys for the rxnorm indications
file_path = '~/LINCS/ref_data/drug_ind/rxnorm_drugbank_inchikey.txt'
drugbank_inchikey = pd.read_csv(file_path, sep='\t', names=['drugbank_id','inchi_key'])
# add the inchi keys to the drugs in RxNorm
rxnorm_ind_inchi = pd.concat([rxnorm_ind, drugbank_inchikey], axis=1)
# remove the the drugs without an inchikey
rxnorm_ind_inchi = rxnorm_ind_inchi[rxnorm_ind_inchi['inchi_key'].notna()]
rxnorm_ind_inchi.head()

Unnamed: 0,drug_name,indication,drugbank_id,inchi_key
13,Bivalirudin,"Angina, Unstable",DB00006,OIRCOABEOLEUMC-GEJPAHFPSA-N
14,Bivalirudin,Thromboembolism,DB00006,OIRCOABEOLEUMC-GEJPAHFPSA-N
15,Bivalirudin,Myocardial Ischemia,DB00006,OIRCOABEOLEUMC-GEJPAHFPSA-N
16,Leuprolide,Breast Neoplasms,DB00007,GFIJNRVAKGFPGQ-LIJARHBVSA-N
17,Leuprolide,Endometriosis,DB00007,GFIJNRVAKGFPGQ-LIJARHBVSA-N


#### *3. Add the clinical indications to drugs used in the LINCS dataset*

In [16]:
# add the clinical indications for drugs available in the LINCS dataset
LINCS_rxnorm = LINCS_drugs.merge(rxnorm_ind_inchi, on='inchi_key', how='left').drop(columns=['inchi_key', 'drug_name'], axis=1)
# remove drugs not associated with any indications
LINCS_rxnorm = LINCS_rxnorm[LINCS_rxnorm['indication'].notna()]
LINCS_rxnorm.head()

Unnamed: 0,pert_iname,indication,drugbank_id
12,5-aminolevulinic-acid,"Keratosis, Actinic",DB00855
13,fluorouracil,Breast Neoplasms,DB00544
14,fluorouracil,"Carcinoma, Basal Cell",DB00544
15,fluorouracil,Colonic Neoplasms,DB00544
16,fluorouracil,Head and Neck Neoplasms,DB00544


In [17]:
# number of unique LINCS drugs in rxnorm
len(LINCS_rxnorm['pert_iname'].unique())

580

In [18]:
# number of known LINCS drug-indication pairs from RxNorm
len(LINCS_rxnorm)

1946

## Import the Known and Clinical Indications of Drugs from AACT

#### *a. Import the known the clinical drug-indications from AACT with InChi keys*

In [19]:
# import the indications from AACT
file_path = '~/LINCS/ref_data/drug_ind/clinical_trials_phase_indication_drugDBid.txt'
aact_ind = pd.read_csv(file_path, sep='\t', usecols=['phase', 'indication', 'drug_drugbank_id', 'drug_drugbank_name'])

# remove drug-indication pairs not in a phase
aact_ind = aact_ind[aact_ind['phase'].notna()]
aact_ind = aact_ind[~(aact_ind['phase'] == 'Not Applicable')]
                    
# remove duplicate rows
aact_ind = aact_ind.drop_duplicates()

# rename the drug_drugbank_name
aact_ind.rename(columns={'drug_drugbank_id': 'drugbank_id'}, inplace=True)

# add the drug bank ids to the drugs in AACT
aact_ind_inchi = pd.merge(aact_ind, drugbank_inchikey.drop_duplicates(), on='drugbank_id', how='inner')

# remove drugs that do not have an inchi key
aact_ind_inchi = aact_ind_inchi[aact_ind_inchi['inchi_key'].notna()]

In [20]:
# store the AACT indications of drugs used in the LINCS dataset
LINCS_aact = pd.merge(LINCS_drugs, aact_ind_inchi.drop('drug_drugbank_name', axis=1), on='inchi_key', how='inner')

# remove inchi key column
LINCS_aact.drop('inchi_key', axis=1, inplace=True)

# store boolean series whether each drug-indication pair is known (in Phase 4)
is_known_ind = LINCS_aact['phase'] == 'Phase 4'

# store the clinical drug-indications (in Phase 1-3)
LINCS_aact_clin = LINCS_aact[~is_known_ind].drop('phase', axis=1)
# remove duplicate drug-indication pairs
LINCS_aact_clin.drop_duplicates(inplace=True)

# store the known drug-indication pairs
LINCS_aact_known = LINCS_aact[is_known_ind].drop('phase', axis=1)

# combine the known indications from AACT and RXNorm for LINCS drugs
aact_rxnorm_known = pd.concat([LINCS_rxnorm, LINCS_aact_known], sort=True)
# remove duplicate drug-indication pairs
aact_rxnorm_known.drop_duplicates(inplace=True)

# number of unique clinical drug-indication pairs 
print('Number of unique clinical drug-indication pairs: ' + str(len(LINCS_aact_clin)))

# number of unique known drug-indication pairs
print('Number of unique known drug-indication pairs: ' + str(len(aact_rxnorm_known)))

Number of unique clinical drug-indication pairs: 24741
Number of unique known drug-indication pairs: 10393


In [21]:
# store the clinical drug-indication pairs as a txt file
#LINCS_aact_clin.to_csv('~/LINCS/ref_data/drug_ind/processed/LINCS_clinical_ind (AACT).txt', index=False)
# store known drug-indication pairs as a txt file
#aact_rxnorm_known.to_csv('~/LINCS/ref_data/drug_ind/processed/LINCS_known_ind (RxNORM + AACT).txt', index=False)

In [22]:
# create a collapsed version with one row per unique drug, containining all the indications for that drug
collapsed_LINCS_aact_clin = LINCS_aact_clin.groupby('pert_iname')['indication'].agg(lambda row: '|'.join(row)).reset_index()
collapsed_aact_rxnorm_known = aact_rxnorm_known.groupby('pert_iname')['indication'].agg(lambda row: '|'.join(row)).reset_index()

print('Number of unique drugs available in known drug-indications dataset: ' + str(len(collapsed_LINCS_aact_clin)))
print('Number of unique drugs available in clinical drug-indications dataset: ' + str(len(collapsed_aact_rxnorm_known)))

Number of unique drugs available in known drug-indications dataset: 624
Number of unique drugs available in clinical drug-indications dataset: 632


In [23]:
# store the collapsed dataframes as txt files
#collapsed_LINCS_aact_clin.to_csv('~/LINCS/ref_data/drug_ind/processed/collapsed_LINCS_clinical_ind (AACT).txt', index=False)
#collapsed_aact_rxnorm_known.to_csv('~/LINCS/ref_data/drug_ind/processed/collapsed_LINCS_known_ind (RxNORM + AACT).txt', index=False)

## Number of LINCS drugs in both Drug Repurposing Hub and Clinical Drug-Indications (AACT)

In [27]:
# create a dataframe containing the drug-indication pairs shared across both datasets
intersection_ind = pd.merge(LINCS_drug_repo, LINCS_aact_clin.drop('drugbank_id', axis=1), 
                            on=['pert_iname', 'indication'], how='inner')

# report number of drug-indications pairs shared
print('Number of drug-indications shared between DRH and clincial drug-indications dataset: ' + str(len(intersection_ind)))
print('Number of drug-indication pairs only in Drug Repurposing Hub: ' + str(len(LINCS_drug_repo) - len(intersection_ind)))

Number of drug-indications shared between DRH and clincial drug-indications dataset: 488
Number of drug-indication pairs only in Drug Repurposing Hub: 880


In [28]:
# remove these drug-indication pairs from drug repurposing hub
# merge LINCS_drug_repo and aact_rxnorm_known
merged_df = pd.merge(LINCS_drug_repo, LINCS_aact_clin.drop('drugbank_id', axis=1),
                     on=['pert_iname', 'indication'], how='left', indicator=True)

# filter rows that only exist in LINCS_drug_repo
drug_repo_not_in_clin = merged_df[merged_df['_merge'] == 'left_only'].drop(columns='_merge')
drug_repo_not_in_clin.shape

(880, 2)

In [33]:
LINCS_aact_clin.head()

Unnamed: 0,pert_iname,indication,drugbank_id
0,2-methoxyestradiol,Multiple Myeloma,DB02342
1,2-methoxyestradiol,"Neoplasms, Plasma Cell",DB02342
2,2-methoxyestradiol,Glioblastoma,DB02342
3,2-methoxyestradiol,Carcinoid Tumor,DB02342
4,2-methoxyestradiol,Prostatic Neoplasms,DB02342


In [32]:
drug_repo_not_in_clin.head()

Unnamed: 0,pert_iname,indication
2,abacavir,HIV
3,abiraterone-acetate,Prostatic Neoplasms
4,acarbose,Diabetes Mellitus
5,acebutolol,Hypertension
8,acexamic-acid,Wound Healing


In [30]:
# save drug repurposing hub without the shared drug-indication pairs in RxNorm and AACT
#drug_repo_not_in_clin.to_csv('~/LINCS/ref_data/drug_ind/normalized/LINCS_drug_repo_not_in_clin.txt', index=False)

In [31]:
# create another version with all indications for one drug
collapsed_drug_repo_not_in_clin = drug_repo_not_in_clin.groupby('pert_iname')['indication'].agg(lambda row: '|'.join(row)).reset_index()
# save file to the same directory
#collapsed_drug_repo_not_in_clin.to_csv('~/LINCS/ref_data/drug_ind/normalized/collapsed_LINCS_drug_repo_not_in_clin.txt', index=False)
collapsed_drug_repo.head()

Unnamed: 0,pert_iname,indication
0,5-aminolevulinic-acid,"Keratosis, Actinic|Glioma"
1,L-citrulline,Hypertension|Erectile Dysfunction
2,SN-38,Colorectal Neoplasms
3,abacavir,HIV
4,abiraterone-acetate,Prostatic Neoplasms


## Checking if clinical drug indications dataset contain drug indication pairs from the known dataset

In [None]:
# store the drug-indication pairs in both the known and clinical dataset
shared_drug_ind = pd.merge(LINCS_aact_clin.drop('drugbank_id', axis=1), 
                  aact_rxnorm_known.drop('drugbank_id', axis=1), 
                  on=['pert_iname', 'indication'], how='inner')

print('Number of clinical drug-indications pairs in known dataset: ' + str(len(shared)))

In [None]:
# remove these drug-indication pairs from known dataset
# merge known and clinical datasets
merged_df = pd.merge(aact_rxnorm_known.drop('drugbank_id', axis=1), LINCS_aact_clin.drop('drugbank_id', axis=1),
                     on=['pert_iname', 'indication'], how='left', indicator=True)

# remove rows shared across the known and clinical datasets
aact_rxnorm_known_not_in_clin = merged_df[merged_df['_merge'] == 'left_only'].drop(columns='_merge')
aact_rxnorm_known_not_in_clin.shape

In [None]:
# store this data as a txt file
#aact_rxnorm_known_not_in_clin.to_csv('~/LINCS/ref_data/drug_ind/normalized/aact_rxnorm_known_not_in_clin.txt', index=False)

In [None]:
# create another version with all indications for one drug
collapsed_aact_rxnorm_known_not_in_clin = aact_rxnorm_known_not_in_clin.groupby('pert_iname')['indication'].agg(lambda row: '|'.join(row)).reset_index()
# save file to the same directory
#collapsed_aact_rxnorm_known_not_in_clin.to_csv('~/LINCS/ref_data/drug_ind/normalized/collapsed_aact_rxnorm_known_not_in_clin.txt', index=False)
collapsed_drug_repo.head()