# Identifying mechanism of action (MoA) of the top significant drugs after drug proximity analysis

After drug proximity analysis, the mechanism of action (MoA) of the top-ranked significant drugs is retrieved; this guides the choice of candidates for wet-lab validation. The `top_percent` variable sets the fraction of drugs (taken from the top of each results table) for which the MoA is retrieved.

In [166]:
import pandas as pd
import csv
import requests
from tqdm import tqdm

In [167]:
# Fraction (0-1) of each significant-drug table to annotate with a MoA.
# The tables are truncated with .head(), so e.g. 0.1 keeps the first 10% of the
# rows of each table and the remaining drugs are left out of the MoA analysis.
# NOTE(review): this only selects the top drugs by z-value if the input CSVs are
# already ranked by z-score - confirm against the proximity analysis output.
top_percent = 1.0

## For the results from drug proximity analysis run on the whole gene list (1000 iterations)

In [168]:
# ---------------------- STEP 1: Load significant drug data ---------------------- #

# Read the significant-drug tables for steps 1-3; the first CSV column becomes the index
significant_drugs_whole_1000_step1 = pd.read_csv(
    "../results/humanPVATsn/network_analysis/proximity_significant_drugs/significant_drugs_whole_1000_step1.csv",
    index_col=0,
)
significant_drugs_whole_1000_step2 = pd.read_csv(
    "../results/humanPVATsn/network_analysis/proximity_significant_drugs/significant_drugs_whole_1000_step2.csv",
    index_col=0,
)
significant_drugs_whole_1000_step3 = pd.read_csv(
    "../results/humanPVATsn/network_analysis/proximity_significant_drugs/significant_drugs_whole_1000_step3.csv",
    index_col=0,
)

# Number of rows to keep per step: the `top_percent` fraction of each table, rounded
top_percent_step1, top_percent_step2, top_percent_step3 = (
    round(top_percent * len(table))
    for table in (
        significant_drugs_whole_1000_step1,
        significant_drugs_whole_1000_step2,
        significant_drugs_whole_1000_step3,
    )
)

# Keep the leading rows of each table as independent copies (a Source column is added to them later)
top_percent_drugs_step1 = significant_drugs_whole_1000_step1.head(top_percent_step1).copy()
top_percent_drugs_step2 = significant_drugs_whole_1000_step2.head(top_percent_step2).copy()
top_percent_drugs_step3 = significant_drugs_whole_1000_step3.head(top_percent_step3).copy()

# combined_DPI_with_source.csv holds the DPI list; each row indicates whether the
# interaction came from ChEMBL, from DrugBank, or from both
combined_dpi_with_source = pd.read_csv("../data/networks/combined_DPI_with_source.csv")

def determine_source(sources):
    """Collapse the source labels recorded for one drug into a single label.

    Returns "ChEMBL" when every label is "ChEMBL", "DrugBank" when every label
    is "DrugBank", and "Both" for any other combination of labels.
    """
    single_source_labels = {
        frozenset({"ChEMBL"}): "ChEMBL",
        frozenset({"DrugBank"}): "DrugBank",
    }
    # Anything that is not exactly one of the two single-source sets falls back to "Both"
    return single_source_labels.get(frozenset(sources), "Both")

# Collapse the Source labels of every Drug_Name into one label per drug
sources_per_drug = combined_dpi_with_source.groupby("Drug_Name")["Source"]
drug_to_source = sources_per_drug.apply(determine_source).to_dict()

# Annotate each top_percent_drugs table in place; drugs missing from the mapping get NaN
for df in (top_percent_drugs_step1, top_percent_drugs_step2, top_percent_drugs_step3):
    df["Source"] = df["drug"].map(drug_to_source)

### For drugs from DrugBank, get the MoA from pharmacologies.csv

In [170]:
# Keep drugs labelled "DrugBank" or "Both" (DrugBank has more accessible MoA than ChEMBL)
top_percent_drugs_step1_drugbank = top_percent_drugs_step1[top_percent_drugs_step1["Source"].isin(["DrugBank", "Both"])]
top_percent_drugs_step2_drugbank = top_percent_drugs_step2[top_percent_drugs_step2["Source"].isin(["DrugBank", "Both"])]
top_percent_drugs_step3_drugbank = top_percent_drugs_step3[top_percent_drugs_step3["Source"].isin(["DrugBank", "Both"])]

# ---------------------- STEP 2: Load DrugBank DPI ---------------------- #

# DrugBank DPI with IDs: lower-case the drug names and cast the IDs to int
drugbank_dpi_with_id = pd.read_csv("../data/networks/drugbank_DPI_with_ID.csv").assign(
    Drug_Name=lambda table: table["Drug_Name"].str.lower(),
    Drug_ID=lambda table: table["Drug_ID"].astype(int),
)

# Drug_Name -> Drug_ID lookup; the first row seen for each name wins
first_row_per_drug = drugbank_dpi_with_id.drop_duplicates("Drug_Name")
drug_name_to_id = first_row_per_drug.set_index("Drug_Name")["Drug_ID"].to_dict()

# ---------------------- STEP 3: Load and clean pharmacologies.csv ---------------------- #

# Load pharmacology MoA information.
# The file is parsed row by row with csv.reader: rows with fewer than five
# fields, or whose second field is not an integer (this includes any header
# row), are skipped instead of aborting the load.
# NOTE(review): a previous pd.read_csv attempt keyed on column 0 ("id"); this
# parser keys on column 1 - confirm that column 1 holds the integer drug ID
# used by drugbank_DPI_with_ID.csv.
moa_data = []
# newline="" lets the csv module handle line endings itself, which is required
# for quoted fields that contain embedded newlines to be parsed correctly.
with open("../data/networks/milner_drugbank_postgresql/pharmacologies.csv", "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    for row in reader:
        if len(row) < 5:
            continue  # too short to hold a mechanism_of_action field
        try:
            drug_id = int(row[1])
        except ValueError:
            continue  # skip malformed rows
        moa_data.append((drug_id, row[4]))  # drug_id, mechanism_of_action

pharmacologies_moa_clean = pd.DataFrame(moa_data, columns=["Drug_ID", "MoA"])

# ---------------------- STEP 4: Merge Drug_ID and MoA ---------------------- #

# drugs.csv is read without a header row; its first column is used as the
# integer ID and its third column as the DrugBank ID
drugs_df = pd.read_csv("../data/networks/milner_drugbank_postgresql/drugs.csv", header=None)

# Build the integer ID -> DrugBank ID lookup from (first column, third column) pairs
integer_id_to_drugbank_id = dict(
    drugs_df.iloc[:, [0, 2]].itertuples(index=False, name=None)
)

# Merge MoA into each top_percent_drugs_*_drugbank dataframe
def add_moa_column(df):
    """Return a copy of `df` with DrugBank `Drug_ID` and `MoA` columns added.

    Relies on the module-level lookups `drug_name_to_id`,
    `integer_id_to_drugbank_id` and `pharmacologies_moa_clean`. The input
    frame must have a `drug` column; it is not modified.
    """
    annotated = df.copy()

    # Drug name -> integer ID -> DrugBank ID
    integer_ids = annotated["drug"].map(drug_name_to_id)
    annotated["Integer_ID"] = integer_ids
    annotated["Drug_ID"] = integer_ids.map(integer_id_to_drugbank_id)

    # pharmacologies.csv is keyed by the integer ID, so join on that; the
    # right-hand key comes back as "Drug_ID_pharm" because of the suffixes
    with_moa = annotated.merge(
        pharmacologies_moa_clean,
        how="left",
        left_on="Integer_ID",
        right_on="Drug_ID",
        suffixes=("", "_pharm"),
    )

    # Keep the DrugBank ID, discard both integer ID columns
    return with_moa.drop(columns=["Integer_ID", "Drug_ID_pharm"])

# Annotate the DrugBank subset of every step
(
    top_percent_drugs_step1_drugbank_moa,
    top_percent_drugs_step2_drugbank_moa,
    top_percent_drugs_step3_drugbank_moa,
) = (
    add_moa_column(drugbank_subset.copy())
    for drugbank_subset in (
        top_percent_drugs_step1_drugbank,
        top_percent_drugs_step2_drugbank,
        top_percent_drugs_step3_drugbank,
    )
)

# ---------------------- STEP 5: Confirm MoA coverage ---------------------- #

# Report how many rows of each annotated table received a non-missing MoA
for step_label, annotated in (
    ("Step 1", top_percent_drugs_step1_drugbank_moa),
    ("Step 2", top_percent_drugs_step2_drugbank_moa),
    ("Step 3", top_percent_drugs_step3_drugbank_moa),
):
    print(f"{step_label} MoA matches:", annotated["MoA"].notna().sum(), "/", len(annotated))

Step 1 MoA matches: 440 / 440
Step 2 MoA matches: 530 / 530
Step 3 MoA matches: 422 / 422


### For drugs from ChEMBL, get the MoA from the ChEMBL mechanism API

In [160]:
# Keep the selected drugs whose only source is ChEMBL
top_percent_drugs_step1_chembl = top_percent_drugs_step1.loc[lambda table: table["Source"] == "ChEMBL"]
top_percent_drugs_step2_chembl = top_percent_drugs_step2.loc[lambda table: table["Source"] == "ChEMBL"]
top_percent_drugs_step3_chembl = top_percent_drugs_step3.loc[lambda table: table["Source"] == "ChEMBL"]

# Annotated ChEMBL DPI, used to translate drug names into ChEMBL IDs
chembl_dpi_with_meta = pd.read_csv("../data/networks/chembl_DPI_with_meta.csv")
# Compound names are lower-cased before they are used as the join key
chembl_dpi_with_meta['compound_name'] = chembl_dpi_with_meta['compound_name'].str.lower()

# One row per compound name (first occurrence wins) with its ChEMBL ID
compound_map = (
    chembl_dpi_with_meta
    .drop_duplicates(subset='compound_name', keep='first')
    .loc[:, ['compound_name', 'compound_chembl_id']]
)

def add_moa_column(df, compound_map, timeout=30):
    """Add ChEMBL ID and mechanism of action (MoA) columns to `df`.

    NOTE: this two-argument definition replaces the earlier single-argument
    `add_moa_column` used for the DrugBank tables.

    Args:
        df: table with a `drug` column holding the drug names.
        compound_map: table with `compound_name` and `compound_chembl_id`
            columns, used to translate drug names into ChEMBL IDs.
        timeout: seconds to wait for each ChEMBL request before giving up.

    Returns:
        `df` left-merged with `compound_chembl_id` and `MoA`. `MoA` is the
        "; "-joined list of mechanisms returned by ChEMBL, None when ChEMBL
        lists no mechanism, and missing when the drug has no ChEMBL ID or
        its request failed.
    """
    # Merge to get ChEMBL IDs
    df = df.merge(compound_map, left_on='drug', right_on='compound_name', how='left')

    # Query ChEMBL once per unique compound_chembl_id
    moa_data = []
    for chembl_id in tqdm(df['compound_chembl_id'].dropna().unique()):
        url = f"https://www.ebi.ac.uk/chembl/api/data/mechanism.json?molecule_chembl_id={chembl_id}"
        try:
            # Without a timeout a stalled connection would hang the whole loop
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Best effort: report the failure and leave this drug's MoA missing
            tqdm.write(f"Warning: ChEMBL request failed for {chembl_id}: {exc}")
            continue
        if response.status_code != 200:
            tqdm.write(f"Warning: ChEMBL returned HTTP {response.status_code} for {chembl_id}")
            continue

        mechanisms = response.json().get('mechanisms', [])
        moa_list = [mech.get('mechanism_of_action') for mech in mechanisms if mech.get('mechanism_of_action')]
        moa_str = '; '.join(moa_list) if moa_list else None
        moa_data.append({'compound_chembl_id': chembl_id, 'MoA': moa_str})

    # Explicit columns keep the merge key present even when nothing was
    # collected (no ChEMBL IDs, or every request failed); otherwise the merge
    # below would raise a KeyError on the empty frame.
    moa_df = pd.DataFrame(moa_data, columns=['compound_chembl_id', 'MoA'])
    df = df.merge(moa_df, on='compound_chembl_id', how='left')

    # Drop helper column
    return df.drop(columns=['compound_name'])

# Annotate the ChEMBL-only subset of every step (one ChEMBL query per unique ID)
(
    top_percent_drugs_step1_chembl_moa,
    top_percent_drugs_step2_chembl_moa,
    top_percent_drugs_step3_chembl_moa,
) = (
    add_moa_column(chembl_subset.copy(), compound_map)
    for chembl_subset in (
        top_percent_drugs_step1_chembl,
        top_percent_drugs_step2_chembl,
        top_percent_drugs_step3_chembl,
    )
)

# Preview the first rows of the step 1 result
print(top_percent_drugs_step1_chembl_moa.head())


100%|██████████| 201/201 [00:16<00:00, 12.17it/s]
100%|██████████| 249/249 [00:20<00:00, 12.44it/s]
100%|██████████| 214/214 [00:17<00:00, 12.22it/s]

                         drug  distance    z_score  Source compound_chembl_id  \
0             florbetaben f18  0.000000 -28.523016  ChEMBL       CHEMBL566752   
1  pioglitazone hydrochloride  0.500000 -20.175329  ChEMBL         CHEMBL1715   
2       rosiglitazone maleate  0.666667 -17.392767  ChEMBL          CHEMBL843   
3     memantine hydrochloride  1.000000 -11.827642  ChEMBL         CHEMBL1699   
4              metaproterenol  1.000000 -11.827642  ChEMBL          CHEMBL776   

                                                 MoA  
0                                   Diagnostic agent  
1  Peroxisome proliferator-activated receptor gam...  
2  Peroxisome proliferator-activated receptor gam...  
3  Glutamate [NMDA] receptor negative allosteric ...  
4                                               None  





In [171]:
# Double checking that the numbers add up: every selected drug should appear in
# exactly one of the DrugBank / ChEMBL annotated tables.
step_tables = [
    ("Step 1", top_percent_drugs_step1, top_percent_drugs_step1_drugbank_moa, top_percent_drugs_step1_chembl_moa),
    ("Step 2", top_percent_drugs_step2, top_percent_drugs_step2_drugbank_moa, top_percent_drugs_step2_chembl_moa),
    ("Step 3", top_percent_drugs_step3, top_percent_drugs_step3_drugbank_moa, top_percent_drugs_step3_chembl_moa),
]

print("Before MoA annotation:")
for step_label, before, _, _ in step_tables:
    print(f"{step_label} drugs: {len(before)}")

print("\nAfter DrugBank MoA annotation:")
for step_label, _, drugbank_moa, _ in step_tables:
    print(f"{step_label} DrugBank: {len(drugbank_moa)}")

print("\nAfter ChEMBL MoA annotation:")
for step_label, _, _, chembl_moa in step_tables:
    print(f"{step_label} ChEMBL: {len(chembl_moa)}")

# Make the check explicit instead of relying on reading the numbers by eye.
# A positive difference means drugs were lost (e.g. no Source was assigned);
# a negative one means rows were duplicated by one of the merges.
for step_label, before, drugbank_moa, chembl_moa in step_tables:
    difference = len(before) - (len(drugbank_moa) + len(chembl_moa))
    if difference != 0:
        print(f"WARNING - {step_label}: input rows minus (DrugBank + ChEMBL) rows = {difference}")

Before MoA annotation:
Step 1 drugs: 641
Step 2 drugs: 779
Step 3 drugs: 636

After DrugBank MoA annotation:
Step 1 DrugBank: 440
Step 2 DrugBank: 530
Step 3 DrugBank: 422

After ChEMBL MoA annotation:
Step 1 ChEMBL: 201
Step 2 ChEMBL: 249
Step 3 ChEMBL: 214


### Merge DrugBank and ChEMBL together after adding MoA column

In [175]:
def standardise_columns(df, source_type):
    """Return `df` reduced to the common column layout used for merging.

    The result has the columns Drug, Distance, z_score, MoA, Source and
    Drug_ID, in that order. Drug_ID is taken from `Drug_ID` when
    `source_type` is 'drugbank' and from `compound_chembl_id` when it is
    'chembl'. If that column is absent (or the source type is anything
    else), Drug_ID is set to None and a warning is printed.
    """
    id_column_by_source = {'drugbank': 'Drug_ID', 'chembl': 'compound_chembl_id'}

    # Work on a copy holding only the columns shared by both sources
    tidy = df[['drug', 'distance', 'z_score', 'Source', 'MoA']].copy()

    # Attach the source-specific identifier under the common name Drug_ID
    id_column = id_column_by_source.get(source_type)
    if id_column is not None and id_column in df.columns:
        tidy['Drug_ID'] = df[id_column]
    else:
        tidy['Drug_ID'] = None
        print(f"Warning: No ID column found for {source_type} data")

    # Only 'drug' and 'distance' actually change name; the rest already match
    tidy = tidy.rename(columns={'drug': 'Drug', 'distance': 'Distance'})

    return tidy[['Drug', 'Distance', 'z_score', 'MoA', 'Source', 'Drug_ID']]

def merge_step_data(drugbank_df, chembl_df, step_name):
    """Combine the DrugBank and ChEMBL annotated tables of one step.

    Both tables are brought to the common layout with `standardise_columns`,
    stacked, and sorted by ascending z_score. A warning is printed if a drug
    name occurs in both tables, followed by a one-line size summary labelled
    with `step_name`.
    """
    drugbank_clean = standardise_columns(drugbank_df, 'drugbank')
    chembl_clean = standardise_columns(chembl_df, 'chembl')

    # The two tables are expected to be disjoint; report any shared names
    overlap = set(drugbank_clean['Drug']) & set(chembl_clean['Drug'])
    if overlap:
        print(f"WARNING - {step_name}: Found {len(overlap)} overlapping drugs")
        print(f"Overlapping drugs: {list(overlap)[:5]}...")  # Show first 5

    # Stack the tables, then put the lowest (most significant) z_score first
    stacked = pd.concat([drugbank_clean, chembl_clean], ignore_index=True)
    merged = stacked.sort_values('z_score', ascending=True)

    n_drugbank, n_chembl = len(drugbank_clean), len(chembl_clean)
    print(f"{step_name} merged: {len(merged)} total drugs ({n_drugbank} DrugBank + {n_chembl} ChEMBL)")

    return merged

# Combine the DrugBank and ChEMBL annotations of each step
step1_merged, step2_merged, step3_merged = (
    merge_step_data(drugbank_moa, chembl_moa, step_label)
    for drugbank_moa, chembl_moa, step_label in (
        (top_percent_drugs_step1_drugbank_moa, top_percent_drugs_step1_chembl_moa, "Step 1"),
        (top_percent_drugs_step2_drugbank_moa, top_percent_drugs_step2_chembl_moa, "Step 2"),
        (top_percent_drugs_step3_drugbank_moa, top_percent_drugs_step3_chembl_moa, "Step 3"),
    )
)

# Write each merged table next to its input file, without the index column
for merged_table, output_path in (
    (step1_merged, "../results/humanPVATsn/network_analysis/proximity_significant_drugs/significant_drugs_whole_1000_step1_moa.csv"),
    (step2_merged, "../results/humanPVATsn/network_analysis/proximity_significant_drugs/significant_drugs_whole_1000_step2_moa.csv"),
    (step3_merged, "../results/humanPVATsn/network_analysis/proximity_significant_drugs/significant_drugs_whole_1000_step3_moa.csv"),
):
    merged_table.to_csv(output_path, index=False)


Step 1 merged: 641 total drugs (440 DrugBank + 201 ChEMBL)
Step 2 merged: 779 total drugs (530 DrugBank + 249 ChEMBL)
Step 3 merged: 636 total drugs (422 DrugBank + 214 ChEMBL)
