# Categorize Drugs by Classes #

Create one unified file for categorizing the drugs into different classes and mechanisms

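Pre-requisites:
- Download the NCI ALMANAC dataset (this notebook reads 'data/NCI-ALMANAC/ComboCompoundNames_small.txt')
- The manually curated files 'data_processed/drug_to_therapy_class_manual.csv' and 'data_processed/drug_to_class.tsv' must be present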

In [1]:
import pandas as pd

In [2]:
# Load the NSC ID -> drug name table. The file is tab-separated and its first
# line is read as data, so the column names are supplied here.
nci_drug_id_to_name_fn = 'data/NCI-ALMANAC/ComboCompoundNames_small.txt'
nci_drug_id_to_name = pd.read_csv(
    nci_drug_id_to_name_fn,
    sep='\t',
    header=None,
    names=['NSC_ID', 'Drug_Name'],
)

nci_drug_id_to_name

Unnamed: 0,NSC_ID,Drug_Name
0,740,Methotrexate
1,750,Busulfan
2,752,Thioguanine
3,752,6-Thioguanine
4,755,Mercaptopurine
...,...,...
103,761431,vemurafenib
104,707389,Eribulin mesylate
105,737754,Pazopanib hydrochloride
106,753082,vemurafenib


## Manually curated drug-to-therapy-class and detailed drug-class mappings ##
- Stored in 'data_processed/drug_to_therapy_class_manual.csv' 
- Stored in 'data_processed/drug_to_class.tsv'

In [3]:
# Load the manually curated drug name -> therapy class table. The first line
# of the CSV is read as data, so the column names are supplied here.
nci_name_to_therapy_fn = 'data_processed/drug_to_therapy_class_manual.csv'
nci_name_to_therapy = pd.read_csv(
    nci_name_to_therapy_fn,
    header=None,
    names=['Drug_Name', 'Therapy_Class'],
)

nci_name_to_therapy

Unnamed: 0,Drug_Name,Therapy_Class
0,Methotrexate,Chemotherapy
1,Busulfan,Chemotherapy
2,Thioguanine,Chemotherapy
3,6-Thioguanine,Chemotherapy
4,Mercaptopurine,Chemotherapy
...,...,...
103,vemurafenib,Targeted
104,Eribulin mesylate,Chemotherapy
105,Pazopanib hydrochloride,Targeted
106,vemurafenib,Targeted


In [4]:
# Load the manually curated drug name -> specific class table (tab-separated).
# The file's own header row is consumed and its labels are replaced with the
# column names used throughout this notebook.
nci_drug_id_to_class_fn = 'data_processed/drug_to_class.tsv'
nci_drug_id_to_class = pd.read_csv(
    nci_drug_id_to_class_fn,
    sep='\t',
    header=0,
).set_axis(['Drug_Name', 'Specific_Class'], axis=1)

nci_drug_id_to_class

Unnamed: 0,Drug_Name,Specific_Class
0,Methotrexate,Antimetabolite
1,Busulfan,Alkylating agent
2,Thioguanine,Antimetabolite
3,6-Thioguanine,Antimetabolite
4,Mercaptopurine,Antimetabolite
...,...,...
103,vemurafenib,BRAF inhibitor
104,Eribulin mesylate,Antimicrotubule agent
105,Pazopanib hydrochloride,Tyrosine kinase inhibitor
106,vemurafenib,BRAF inhibitor


In [5]:
# Combine NSC_ID, Drug_Name, Therapy_Class, and Specific_Class into one new dataframe.
#
# The two annotation tables carry no NSC_ID, and drug names are not unique
# (the same name can appear under more than one NSC_ID), so rows are paired by
# position rather than joined on Drug_Name. Verify that pairing first: each
# annotation table must list the same drug names, in the same order, as the
# NSC_ID table. Otherwise a drug would silently receive another drug's class.
def _normalized_names(frame):
    """Return Drug_Name as stripped, lower-cased strings on a fresh 0..n-1 index."""
    return (
        frame['Drug_Name']
        .astype(str)
        .str.strip()
        .str.lower()
        .reset_index(drop=True)
    )


expected_names = _normalized_names(nci_drug_id_to_name)
for table_label, annotation_table in (
    ('therapy class', nci_name_to_therapy),
    ('specific class', nci_drug_id_to_class),
):
    annotation_names = _normalized_names(annotation_table)
    if len(annotation_names) != len(expected_names):
        raise ValueError(
            f"The {table_label} table has {len(annotation_names)} rows but the "
            f"NSC_ID table has {len(expected_names)}; cannot pair rows by position."
        )
    mismatched_rows = expected_names.index[expected_names != annotation_names].tolist()
    if mismatched_rows:
        raise ValueError(
            f"Drug_Name differs between the NSC_ID table and the {table_label} "
            f"table at row positions {mismatched_rows}."
        )

# .assign returns a new frame, so nci_drug_id_to_name keeps its original two
# columns. .to_numpy() makes the (now validated) positional pairing explicit.
merged_df = nci_drug_id_to_name.assign(
    Therapy_Class=nci_name_to_therapy['Therapy_Class'].to_numpy(),
    Specific_Class=nci_drug_id_to_class['Specific_Class'].to_numpy(),
)

merged_df

Unnamed: 0,NSC_ID,Drug_Name,Therapy_Class,Specific_Class
0,740,Methotrexate,Chemotherapy,Antimetabolite
1,750,Busulfan,Chemotherapy,Alkylating agent
2,752,Thioguanine,Chemotherapy,Antimetabolite
3,752,6-Thioguanine,Chemotherapy,Antimetabolite
4,755,Mercaptopurine,Chemotherapy,Antimetabolite
...,...,...,...,...
103,761431,vemurafenib,Targeted,BRAF inhibitor
104,707389,Eribulin mesylate,Chemotherapy,Antimicrotubule agent
105,737754,Pazopanib hydrochloride,Targeted,Tyrosine kinase inhibitor
106,753082,vemurafenib,Targeted,BRAF inhibitor


In [6]:
# Count missing values per column of merged_df (all zeros means nothing is missing).
merged_df.isnull().sum()

NSC_ID            0
Drug_Name         0
Therapy_Class     0
Specific_Class    0
dtype: int64

In [7]:
# Number of rows in each distinct Therapy_Class value.
merged_df.loc[:, 'Therapy_Class'].value_counts()

Therapy_Class
Chemotherapy    67
Targeted        23
Other           18
Name: count, dtype: int64

In [8]:
# Flag every row whose NSC_ID has already appeared earlier in the table.
is_repeat_id = merged_df['NSC_ID'].duplicated()

# Report how many repeated NSC_IDs there are, then which ones.
print(is_repeat_id.sum())
duplicated_ids = merged_df.loc[is_repeat_id, 'NSC_ID']
print(duplicated_ids)

# Keep only the first row for each NSC_ID and confirm no repeats remain.
merged_df = merged_df.drop_duplicates(subset='NSC_ID')
print("Dropped duplicates")
print(merged_df['NSC_ID'].duplicated().sum())

3
3       752
17    24559
28    49842
Name: NSC_ID, dtype: int64
Dropped duplicates
0


In [9]:
# Write the merged table to CSV with a header row and without the index column.
nci_drug_id_to_therapy_class_fn = 'data_processed/almanac_nsc_to_drug_types.csv'
merged_df.to_csv(
    path_or_buf=nci_drug_id_to_therapy_class_fn,
    header=True,
    index=False,
)