In [12]:
import re

import numpy as np
import pandas as pd

# Columns kept from the indication table: molecule identifier and name plus
# the MESH / EFO disease annotations used for the disease matching below.
columns = [
    'Parent Molecule ChEMBL ID',
    'MESH ID',
    'MESH Heading',
    'EFO IDs',
    'EFO Terms',
    'Parent Molecule Name',
]

# The file is parsed with ';' as the field separator, and only the columns
# listed above are loaded.
indication_df = pd.read_csv(
    '../data/indication_gene_link/chembl_indication.csv',
    sep=';',
    usecols=columns,
)

indication_df.head()

Unnamed: 0,Parent Molecule ChEMBL ID,Parent Molecule Name,MESH ID,MESH Heading,EFO IDs,EFO Terms
0,CHEMBL1201784,HEXAMINOLEVULINATE,D001749,Urinary Bladder Neoplasms,MONDO:0001187,urinary bladder cancer
1,CHEMBL2221250,CAPREOMYCIN,D010019,Osteomyelitis,EFO:0003102,osteomyelitis
2,CHEMBL651,METHADONE,D009293,Opioid-Related Disorders,EFO:0005611|EFO:0010702,opioid dependence|opioid use disorder
3,CHEMBL1822792,MK-2461,D009369,Neoplasms,EFO:0000616|EFO:0000311,neoplasm|cancer
4,CHEMBL2146883,COBIMETINIB,D013274,Stomach Neoplasms,EFO:0000503,gastric adenocarcinoma


In [13]:
# Main disease label -> search terms that are looked up (case-insensitively)
# in the MESH heading / EFO terms of an indication.
# Insertion order matters: find_main_disease() walks this mapping from the top
# and returns the first disease with a matching term.
disease_synonyms = {
    "Attention Deficit Hyperactivity Disorder": [
        "ADHD",
        "Attention",
        "Hyperkinetic Syndrome",
        "Brain Dysfunction",
    ],
    "Alzheimer’s Disease": [
        "Alzheimer",
    ],
    "Anxiety Disorder": [
        "Anxiety",
        "Hypervigilance",
        "Nervousness",
        "Anxiousness",
        "Anxieties",
    ],
    "Autism Spectrum Disorder": [
        "ASD",
        "Autism",
        "Autistic",
    ],
    "Bipolar Disorder": [
        "Manic Depression",
        "Bipolar",
        "BPAD",
        "Bipolar disorder",
    ],
    "Eating Disorders": [
        "Eating",
        "Feeding",
    ],
    "Major Depressive Disorder": [
        "Depressive",
        "MDD",
        "Involutional Paraphrenia",
        "depression",
    ],
    "OCD & Tourette Syndrome": [
        "Obsessive",
        "OCD",
        "Tourette",
        "Compulsive",
    ],
    "Post Traumatic Stress Disorder": [
        "Post Traumatic Stress Disorder",
        "PTSD",
        "Post-Traumatic Stress",
    ],
    "Schizophrenia": [
        "Schizoaffective Disorder",
        "Schizophrenia",
    ],
    "Substance Use Disorders": [
        "Substance Use Disorders",
        "SUD",
        "Drug Use",
        "Drug Abuse",
        "Substance Dependence",
        "Opioid dependence",
        "Opioid use",
        "Cannabis use disorder",
        "Alcohol use",
        "Alcohol dependence",
    ],
}

def find_main_disease(indication_name, synonym_map=None):
    """Map a free-text indication name onto one of the main disease labels.

    The name is compared case-insensitively against every main disease label
    and its synonyms. Diseases are tried in the mapping's iteration order and,
    within a disease, the label itself is tried before its synonyms; the first
    hit wins.

    A term only counts as a hit when it appears as a whole term, i.e. it is
    not directly preceded or followed by a letter, digit or underscore. Plain
    substring containment would let short terms match inside unrelated words
    (e.g. "SUD" inside "Sudden", "Eating" inside "Sweating"). As a
    consequence, inflected forms (plurals etc.) only match if they are listed
    as synonyms themselves.

    Parameters
    ----------
    indication_name : str or any
        Text to classify. Non-string values (e.g. NaN for a missing cell)
        never match.
    synonym_map : dict[str, list[str]], optional
        Mapping of main disease label -> list of synonyms. Defaults to the
        module-level ``disease_synonyms``.

    Returns
    -------
    str or None
        The matching main disease label, or None when nothing matches.
    """
    if not isinstance(indication_name, str):
        return None
    if synonym_map is None:
        synonym_map = disease_synonyms

    text = indication_name.lower()
    for main_disease, synonyms in synonym_map.items():
        for term in (main_disease, *synonyms):
            # Lookarounds instead of \b so that terms starting or ending with
            # punctuation are still anchored correctly.
            pattern = r'(?<!\w)' + re.escape(term.lower()) + r'(?!\w)'
            if re.search(pattern, text):
                return main_disease
    return None  # Return None to indicate no match was found

# Classify each indication twice: once from its MESH heading and once from
# its EFO terms.
mesh_disease = indication_df['MESH Heading'].apply(find_main_disease)
efo_disease = indication_df['EFO Terms'].apply(find_main_disease)

# The MESH-based label takes precedence; the EFO-based label is only used for
# rows where the MESH heading produced no match. assign() builds a new frame,
# so indication_df itself is not modified. Rows without any label are dropped.
filtered_indications = (
    indication_df
    .assign(Disease=mesh_disease.where(mesh_disease.notna(), efo_disease))
    .dropna(subset=['Disease'])
)

# 'filtered_indications' now holds only the matched rows, with 'Disease' populated
print(filtered_indications)

      Parent Molecule ChEMBL ID Parent Molecule Name  MESH ID  \
2                     CHEMBL651            METHADONE  D009293   
37                CHEMBL2028019          CARIPRAZINE  D001714   
89                  CHEMBL43048       MIDOMAFETAMINE  D001007   
109               CHEMBL2111047           PROLINTANE  D001289   
111               CHEMBL1237021           LURASIDONE  D003866   
...                         ...                  ...      ...   
45466             CHEMBL2108308          TERTOMOTIDE  D000544   
45480                 CHEMBL490           PAROXETINE  D001714   
45521             CHEMBL1232801         FOLINIC ACID  D012559   
45543                 CHEMBL809           SERTRALINE  D012559   
45552               CHEMBL36715            PIRACETAM  D001007   

                                        MESH Heading                  EFO IDs  \
2                           Opioid-Related Disorders  EFO:0005611|EFO:0010702   
37                                  Bipolar Disorder  EFO

In [14]:
# Lookup table joined on its 'ChEMBLID' column below; it supplies 'EnsemblID'.
gene_df = pd.read_csv('../data/indication_gene_link/Chembl_drug_targets_ChEMBLID_UniProtIDs_EnsemblIDs.csv')

# Mechanism table (';'-separated), reduced to the molecule / target link
# columns and then joined with the disease-labelled indications on the
# molecule ID. merge() is called with its default join type.
target_df = (
    pd.read_csv('../data/indication_gene_link/chembl_mechnism.csv', sep=';')[
        ["Parent Molecule ChEMBL ID", "Mechanism of Action", "Target ChEMBL ID"]
    ]
    .merge(
        filtered_indications[
            [
                'Parent Molecule ChEMBL ID',
                'Parent Molecule Name',
                'MESH Heading',
                'EFO Terms',
                'EFO IDs',
                'Disease',
            ]
        ],
        on='Parent Molecule ChEMBL ID',
    )
)

# Attach the gene identifiers through the target ID and keep only the columns
# needed for the drug -> gene -> disease link table.
merged_df = target_df.merge(
    gene_df,
    left_on="Target ChEMBL ID",
    right_on="ChEMBLID",
)[
    [
        'Parent Molecule ChEMBL ID',
        'Parent Molecule Name',
        'EnsemblID',
        'MESH Heading',
        'EFO Terms',
        'EFO IDs',
        'Disease',
    ]
]

merged_df.head(30)

Unnamed: 0,Parent Molecule ChEMBL ID,Parent Molecule Name,EnsemblID,MESH Heading,EFO Terms,EFO IDs,Disease
0,CHEMBL134,CLONIDINE,ENSG00000150594,Attention Deficit Disorder with Hyperactivity,attention deficit hyperactivity disorder,EFO:0003888,Attention Deficit Hyperactivity Disorder
1,CHEMBL134,CLONIDINE,ENSG00000184160,Attention Deficit Disorder with Hyperactivity,attention deficit hyperactivity disorder,EFO:0003888,Attention Deficit Hyperactivity Disorder
2,CHEMBL134,CLONIDINE,ENSG00000274286,Attention Deficit Disorder with Hyperactivity,attention deficit hyperactivity disorder,EFO:0003888,Attention Deficit Hyperactivity Disorder
3,CHEMBL134,CLONIDINE,ENSG00000150594,"Child Development Disorders, Pervasive",autism spectrum disorder,EFO:0003756,Autism Spectrum Disorder
4,CHEMBL134,CLONIDINE,ENSG00000184160,"Child Development Disorders, Pervasive",autism spectrum disorder,EFO:0003756,Autism Spectrum Disorder
5,CHEMBL134,CLONIDINE,ENSG00000274286,"Child Development Disorders, Pervasive",autism spectrum disorder,EFO:0003756,Autism Spectrum Disorder
6,CHEMBL134,CLONIDINE,ENSG00000150594,Opioid-Related Disorders,opioid dependence,EFO:0005611,Substance Use Disorders
7,CHEMBL134,CLONIDINE,ENSG00000184160,Opioid-Related Disorders,opioid dependence,EFO:0005611,Substance Use Disorders
8,CHEMBL134,CLONIDINE,ENSG00000274286,Opioid-Related Disorders,opioid dependence,EFO:0005611,Substance Use Disorders
9,CHEMBL134,CLONIDINE,ENSG00000150594,"Stress Disorders, Post-Traumatic",post-traumatic stress disorder,EFO:0001358,Post Traumatic Stress Disorder


In [15]:
# Persist the drug / gene / disease link table; the row index is not written.
merged_df.to_csv(
    '../data/indication_gene_link/chembl_chembl_2.csv',
    index=False,
)

In [16]:
# For every main disease, collect the distinct MESH headings that were mapped
# onto it, joined into one '; '-separated string.
indication_names_by_disease = (
    merged_df.groupby('Disease')['MESH Heading']
    # Missing headings are dropped before joining: str.join raises a TypeError
    # on NaN, which can occur for rows that were matched via their EFO terms.
    .apply(lambda headings: '; '.join(headings.dropna().unique()))
    .reset_index()
)

# reset_index() already returns a DataFrame; copy it under its own name and
# pin the column names of the exported dictionary explicitly.
indication_names_df = indication_names_by_disease.copy()
indication_names_df.columns = ['Disease', 'MESH Heading']

indication_names_df.to_csv('../data/indication_gene_link/dictionary_chembl.csv', index=False)

# Preview goes last: only the final expression of a cell is displayed.
indication_names_df.head(7)