In [14]:
import xml.etree.ElementTree as ET
import pandas as pd

tree = ET.parse('../data/indication_gene_link/drugbank.xml')
root = tree.getroot()

drug_data = []

for drug in root.findall('{http://www.drugbank.ca}drug'):
    drugbank_id = drug.find('{http://www.drugbank.ca}drugbank-id').text if drug.find('{http://www.drugbank.ca}drugbank-id') is not None else "N/A"
    generic_name = drug.find('{http://www.drugbank.ca}name').text if drug.find('{http://www.drugbank.ca}name') is not None else "N/A"
    indication = drug.find('{http://www.drugbank.ca}indication').text if drug.find('{http://www.drugbank.ca}indication') is not None else "N/A"

    drug_data.append({
        "DrugBank ID": drugbank_id,
        "Generic Name": generic_name,
        "Indication": indication
    })

df = pd.DataFrame(drug_data)


In [15]:
import pandas as pd

filtered_df = pd.DataFrame(columns=df.columns.tolist() + ['Disease'])

disease_synonyms = {
    "Attention Deficit Hyperactivity Disorder": ["ADHD", "Attention", "Hyperkinetic Syndrome", "Brain Dysfunction"],
    "Alzheimer’s Disease": ["Alzheimer"],
    "Anxiety Disorder": ["Anxiety", "Hypervigilance", "Nervousness", "Anxiousness", "Anxieties"],
    "Autism Spectrum Disorder": ["ASD", "Autism", "Autistic"],
    "Bipolar Disorder": ["Manic Depression", "Bipolar", "BPAD", "Bipolar disorder"],
    "Eating Disorders": ["Eating", "Feeding"],
    "Major Depressive Disorder": ["Depressive", "MDD", "Involutional Paraphrenia", "depression"],
    "OCD & Tourette Syndrome": ["Obsessive", "OCD", "Tourette", "Compulsive"],
    "Post Traumatic Stress Disorder": ["Post Traumatic Stress Disorder", "PTSD", "Post-Traumatic Stress"],
    "Schizophrenia": ["Schizoaffective Disorder", "Schizophrenia"],
    "Substance Use Disorders": ["Substance Use Disorders", "SUD", "Drug Use", "Drug Abuse", "Substance Dependence", "Opioid dependence", "Opioid use","Cannabis use disorder", "Alcohol use", "Alcohol dependence"]
}


def find_disease(indication):
    for disease, synonyms in disease_synonyms.items():
        if indication:
            for synonym in [disease] + synonyms:
                if synonym.lower() in indication.lower():
                    return disease
    return "None"

df['Disease'] = df['Indication'].apply(find_disease)

filtered_df = df[df['Disease'] != "None"]

filtered_df.head()

Unnamed: 0,DrugBank ID,Generic Name,Indication,Disease
1,DB00002,Cetuximab,Cetuximab indicated for the treatment of local...,Eating Disorders
64,DB00068,Interferon beta-1b,Interferon beta-1b is a drug used for the trea...,Substance Use Disorders
100,DB00109,Enfuvirtide,Enfuvirtide is an antiretroviral drug used in ...,Substance Use Disorders
105,DB00114,Pyridoxal phosphate,For nutritional supplementation and for treati...,Eating Disorders
107,DB00116,Tetrahydrofolic acid,"For nutritional supplementation, also for trea...",Eating Disorders


In [16]:
unique_diseases = filtered_df['Disease'].unique()

print(unique_diseases)

['Eating Disorders' 'Substance Use Disorders' 'Major Depressive Disorder'
 'Schizophrenia' 'Alzheimer’s Disease'
 'Attention Deficit Hyperactivity Disorder' 'Anxiety Disorder'
 'Bipolar Disorder' 'OCD & Tourette Syndrome' 'Autism Spectrum Disorder']


In [17]:
filtered_df.to_csv('../data/indication_gene_link/filtered_drugbank.csv', index=False)

In [18]:
indication_names_by_disease = filtered_df.groupby('Disease')['Indication'].apply(lambda x: '; '.join(x.unique())).reset_index()
indication_names_df = pd.DataFrame(indication_names_by_disease)
indication_names_df.columns = ['Disease', 'Indication']
indication_names_df.head(7)
indication_names_df.to_csv('../data/indication_gene_link/dictionary_DB.csv', index=False)