# Identifying mechanism of action (MoA) of the top significant drugs after drug proximity analysis

After running the drug proximity analysis, the MoA of the top-prioritised significant drugs is retrieved to guide the selection of candidates for wet-lab validation. Use the `top_percent` variable to choose what fraction of the significant drugs to annotate (e.g. `0.1` for the top 10%, `1.0` for all of them).

This is the **object-oriented programming (OOP)** approach which works better in this case.

In [1]:
import pandas as pd
import csv
import requests
from tqdm import tqdm

class MoAAnnotator:
    """
    Annotate significant drugs with mechanism of action (MoA) data.

    DrugBank drugs (and drugs present in both sources) are annotated from the
    local DrugBank PostgreSQL export; ChEMBL-only drugs are annotated by
    querying the ChEMBL web service.
    """

    # Columns of every annotated result table, in output order.
    OUTPUT_COLUMNS = ['Drug', 'Distance', 'z_score', 'MoA', 'Source', 'Drug_ID']

    # ChEMBL web-service endpoint listing the mechanisms of a molecule.
    CHEMBL_MECHANISM_URL = "https://www.ebi.ac.uk/chembl/api/data/mechanism.json"

    # Seconds to wait for a ChEMBL response before giving up on one compound.
    REQUEST_TIMEOUT = 30

    def __init__(self, data_dir="../data/networks", postgres_dir="../data/networks/milner_drugbank_postgresql"):
        """
        Initialise the MoA annotator and load all reference data from disk.

        Args:
            data_dir: Directory containing network data files
            postgres_dir: Directory containing DrugBank PostgreSQL files
        """
        self.data_dir = data_dir
        self.postgres_dir = postgres_dir

        # Load reference data
        self._load_reference_data()

    def _load_reference_data(self):
        """Load all reference datasets needed for MoA annotation."""
        # Load drug source mapping
        combined_dpi_with_source = pd.read_csv(f"{self.data_dir}/combined_DPI_with_source.csv")
        self.drug_to_source_map = self._create_drug_source_mapping(combined_dpi_with_source)

        # Load DrugBank reference data
        self._load_drugbank_data()

        # Load ChEMBL reference data
        self._load_chembl_data()

    def _create_drug_source_mapping(self, combined_dpi_with_source):
        """
        Build a mapping from drug name to its source label.

        Args:
            combined_dpi_with_source: DataFrame with "Drug_Name" and "Source" columns

        Returns:
            dict mapping each drug name to "ChEMBL", "DrugBank" or "Both"
        """
        # NOTE(review): names are used as-is here, whereas the DrugBank and
        # ChEMBL name maps are lower-cased - presumably the combined file is
        # already lower-case; confirm against the data.
        def determine_source(sources):
            sources_set = set(sources)
            if sources_set == {"ChEMBL"}:
                return "ChEMBL"
            elif sources_set == {"DrugBank"}:
                return "DrugBank"
            else:
                # Any other combination of labels is reported as "Both"
                return "Both"

        return (
            combined_dpi_with_source
            .groupby("Drug_Name")["Source"]
            .apply(determine_source)
            .to_dict()
        )

    def _load_drugbank_data(self):
        """
        Load DrugBank reference data.

        Sets:
            drug_name_to_id_drugbank: lower-cased drug name -> integer drug ID
            pharmacologies_moa_clean: DataFrame ["Drug_ID", "MoA"] with one row
                per integer drug ID and no blank MoA entries
            integer_id_to_drugbank_id_map: integer drug ID -> DrugBank ID
        """
        # Load DrugBank DPI with DrugBank IDs
        drugbank_dpi_with_id = pd.read_csv(f"{self.data_dir}/drugbank_DPI_with_ID.csv")
        drugbank_dpi_with_id["Drug_Name"] = drugbank_dpi_with_id["Drug_Name"].str.lower()
        drugbank_dpi_with_id["Drug_ID"] = drugbank_dpi_with_id["Drug_ID"].astype(int)

        # Map Drug_Name -> Drug_ID using drugbank_dpi_with_id
        self.drug_name_to_id_drugbank = (
            drugbank_dpi_with_id
            .drop_duplicates("Drug_Name")
            .set_index("Drug_Name")["Drug_ID"]
            .to_dict()
        )

        # Load and clean pharmacologies.csv.
        # newline="" lets the csv module handle line breaks embedded in quoted
        # free-text fields itself, as the csv documentation requires.
        moa_data = []
        with open(f"{self.postgres_dir}/pharmacologies.csv", "r", encoding="utf-8", newline="") as f:
            for row in csv.reader(f):
                if len(row) < 5:
                    continue
                try:
                    drug_id = int(row[1])
                except ValueError:
                    continue  # skip malformed rows (this also skips a header row)
                moa_text = row[4]  # mechanism_of_action
                # A blank field is "no MoA"; keeping it as "" would make
                # notna() count the drug as annotated.
                if not moa_text.strip():
                    continue
                moa_data.append((drug_id, moa_text))

        # One row per drug so the later left merge cannot duplicate drugs
        self.pharmacologies_moa_clean = (
            pd.DataFrame(moa_data, columns=["Drug_ID", "MoA"])
            .drop_duplicates("Drug_ID", keep="first")
            .reset_index(drop=True)
        )

        # Create mapping from integer ID to DrugBank ID using drugs.csv
        # (file has no header; first column is the key, third column the value)
        drugs_df = pd.read_csv(f"{self.postgres_dir}/drugs.csv", header=None)
        self.integer_id_to_drugbank_id_map = dict(zip(drugs_df.iloc[:, 0], drugs_df.iloc[:, 2]))

    def _load_chembl_data(self):
        """
        Load ChEMBL reference data.

        Sets:
            drug_name_to_chembl_id_map: DataFrame with one row per lower-cased
                "compound_name" and its "compound_chembl_id"
        """
        # Load annotated DPI to get mapping from drug name to ChEMBL ID, ensure lowercase for matching
        chembl_dpi_with_meta = pd.read_csv(f"{self.data_dir}/chembl_DPI_with_meta.csv")
        chembl_dpi_with_meta['compound_name'] = chembl_dpi_with_meta['compound_name'].str.lower()
        self.drug_name_to_chembl_id_map = chembl_dpi_with_meta[['compound_name', 'compound_chembl_id']].drop_duplicates('compound_name', keep='first')

    def get_top_percent_drugs(self, significant_drugs_df, top_percent):
        """
        Extract the top fraction of significant drugs.

        The first rows of the table are taken, so the input is assumed to be
        ordered with the most significant drugs first.

        Args:
            significant_drugs_df: DataFrame with significant drugs
            top_percent: Fraction of rows to keep, between 0 and 1 (e.g., 0.1 for 10%)

        Returns:
            Copy of the leading rows of significant_drugs_df

        Raises:
            ValueError: if top_percent is not between 0 and 1
        """
        # A value such as 10 (meant as "10%") would silently select every row
        if not 0 <= top_percent <= 1:
            raise ValueError(
                f"top_percent must be a fraction between 0 and 1 (e.g. 0.1 for 10%), got {top_percent!r}"
            )
        top_percent_count = round(top_percent * len(significant_drugs_df))
        return significant_drugs_df.head(top_percent_count).copy()

    def add_source_info(self, df):
        """Add a "Source" column to df (in place) from the drug -> source mapping and return df."""
        df["Source"] = df["drug"].map(self.drug_to_source_map)
        return df

    def separate_by_source(self, df):
        """
        Separate drugs by source (DrugBank vs ChEMBL).

        Drugs in both sources will be classified as DrugBank (clearer MoA
        definitions). Rows whose "Source" is missing end up in neither table.

        Returns:
            Tuple (drugbank_df, chembl_df)
        """
        drugbank_df = df[(df["Source"] == "DrugBank") | (df["Source"] == "Both")]
        chembl_df = df[df["Source"] == "ChEMBL"]
        return drugbank_df, chembl_df

    def add_drugbank_moa(self, df):
        """
        Add mechanism of action (MoA) column and DrugBank IDs.

        Args:
            df: DataFrame with a "drug" column (lower-case drug names)

        Returns:
            Copy of df with "Drug_ID" (DrugBank ID) and "MoA" columns added;
            both are missing for drugs that could not be matched
        """
        df = df.copy()
        # Map drug name to integer drug ID
        df["Integer_ID"] = df["drug"].map(self.drug_name_to_id_drugbank)
        # Convert integer ID to actual DrugBank ID
        df["Drug_ID"] = df["Integer_ID"].map(self.integer_id_to_drugbank_id_map)
        # Merge with pharmacologies_moa on integer ID (since pharmacologies.csv uses integer IDs).
        # The right-hand integer "Drug_ID" gets the "_pharm" suffix so it does
        # not clash with the DrugBank ID column created above.
        df = df.merge(self.pharmacologies_moa_clean, left_on="Integer_ID", right_on="Drug_ID", how="left", suffixes=('', '_pharm'))
        # Clean up: keep the DrugBank ID and drop the integer ID columns
        df = df.drop(columns=["Integer_ID", "Drug_ID_pharm"])
        return df

    def add_chembl_moa(self, df):
        """
        Add mechanism of action (MoA) column for ChEMBL drugs.

        Each unique ChEMBL ID is looked up once through the ChEMBL mechanism
        endpoint; multiple mechanisms are joined with "; ". A compound whose
        request fails is left without MoA instead of aborting the whole run.

        Args:
            df: DataFrame with a "drug" column (lower-case drug names)

        Returns:
            df with "compound_chembl_id" and "MoA" columns added
        """
        # Merge to get ChEMBL IDs
        df = df.merge(self.drug_name_to_chembl_id_map, left_on='drug', right_on='compound_name', how='left')

        # Query ChEMBL for unique compound_chembl_ids
        moa_data = []
        failed_requests = 0
        # A session reuses the HTTPS connection across the many small requests
        with requests.Session() as session:
            for chembl_id in tqdm(df['compound_chembl_id'].dropna().unique()):
                try:
                    response = session.get(
                        self.CHEMBL_MECHANISM_URL,
                        params={'molecule_chembl_id': chembl_id},
                        timeout=self.REQUEST_TIMEOUT,  # never hang on a stalled connection
                    )
                except requests.RequestException:
                    failed_requests += 1
                    continue
                if response.status_code != 200:
                    failed_requests += 1
                    continue
                try:
                    json_data = response.json()
                except ValueError:  # body was not valid JSON
                    failed_requests += 1
                    continue
                mechanisms = json_data.get('mechanisms', [])
                moa_list = [mech.get('mechanism_of_action') for mech in mechanisms if mech.get('mechanism_of_action')]
                moa_str = '; '.join(moa_list) if moa_list else None
                moa_data.append({'compound_chembl_id': chembl_id, 'MoA': moa_str})

        if failed_requests:
            print(f"Warning: {failed_requests} ChEMBL mechanism request(s) failed; those drugs have no MoA")

        # Explicit columns keep the merge valid even when nothing was retrieved
        moa_df = pd.DataFrame(moa_data, columns=['compound_chembl_id', 'MoA'])
        df = df.merge(moa_df, on='compound_chembl_id', how='left')

        # Drop helper column
        return df.drop(columns=['compound_name'])

    def standardise_columns(self, df, source_type):
        """
        Standardise column structure for merging.

        Args:
            df: DataFrame with 'drug', 'distance', 'z_score', 'Source' and 'MoA'
                columns plus the source-specific ID column
            source_type: 'drugbank' (ID taken from 'Drug_ID') or 'chembl'
                (ID taken from 'compound_chembl_id')

        Returns:
            New DataFrame with the columns listed in OUTPUT_COLUMNS
        """
        # Keep only essential columns
        essential_cols = ['drug', 'distance', 'z_score', 'Source', 'MoA']

        # Create a copy to work with
        df_clean = df[essential_cols].copy()

        # Add Drug_ID column
        if source_type == 'drugbank' and 'Drug_ID' in df.columns:
            df_clean['Drug_ID'] = df['Drug_ID']
        elif source_type == 'chembl' and 'compound_chembl_id' in df.columns:
            df_clean['Drug_ID'] = df['compound_chembl_id']
        else:
            # If no ID column found, set to None
            df_clean['Drug_ID'] = None
            print(f"Warning: No ID column found for {source_type} data")

        # Rename columns (the remaining columns already have their final names)
        df_clean = df_clean.rename(columns={'drug': 'Drug', 'distance': 'Distance'})

        # Reorder columns
        return df_clean[self.OUTPUT_COLUMNS]

    def merge_step_data(self, drugbank_df, chembl_df, step_name):
        """
        Merge DrugBank and ChEMBL data for a single step.

        Args:
            drugbank_df: DrugBank table returned by add_drugbank_moa
            chembl_df: ChEMBL table returned by add_chembl_moa
            step_name: Name for this step (for printing)

        Returns:
            Combined standardised DataFrame sorted by ascending z_score
        """
        # Standardise columns
        drugbank_clean = self.standardise_columns(drugbank_df, 'drugbank')
        chembl_clean = self.standardise_columns(chembl_df, 'chembl')

        # Verify no overlap
        overlap = set(drugbank_clean['Drug']).intersection(set(chembl_clean['Drug']))
        if overlap:
            print(f"WARNING - {step_name}: Found {len(overlap)} overlapping drugs")
            print(f"Overlapping drugs: {list(overlap)[:5]}...")  # Show first 5

        # Concatenate
        merged = pd.concat([drugbank_clean, chembl_clean], ignore_index=True)

        # Sort by z_score (most significant first)
        merged = merged.sort_values('z_score', ascending=True)

        print(f"{step_name} merged: {len(merged)} total drugs ({len(drugbank_clean)} DrugBank + {len(chembl_clean)} ChEMBL)")

        return merged

    def process_significant_drugs(self, significant_drugs_df, top_percent, step_name=""):
        """
        Complete pipeline to process significant drugs and add MoA annotations.

        Args:
            significant_drugs_df: DataFrame with significant drugs
            top_percent: Fraction of top drugs to process (e.g., 0.1 for 10%)
            step_name: Name for this step (for printing)

        Returns:
            DataFrame with MoA annotations and the columns listed in
            OUTPUT_COLUMNS (empty, but with those columns, when no drug could
            be assigned to a source)
        """
        # Extract top percentage of significant drugs
        top_percent_drugs = self.get_top_percent_drugs(significant_drugs_df, top_percent)

        # Add source info
        top_percent_drugs = self.add_source_info(top_percent_drugs)

        # Drugs without a known source fall out of both tables below; say so
        # instead of dropping them silently.
        unmapped_count = int(top_percent_drugs["Source"].isna().sum())
        if unmapped_count:
            print(f"WARNING - {step_name}: {unmapped_count} drugs have no known source and are excluded")

        # Separate by source
        drugbank_df, chembl_df = self.separate_by_source(top_percent_drugs)

        # Process DrugBank drugs
        if len(drugbank_df) > 0:
            drugbank_moa = self.add_drugbank_moa(drugbank_df.copy())
            print(f"{step_name} DrugBank MoA matches:", drugbank_moa["MoA"].notna().sum(), "/", len(drugbank_moa))
        else:
            drugbank_moa = pd.DataFrame()

        # Process ChEMBL drugs
        if len(chembl_df) > 0:
            chembl_moa = self.add_chembl_moa(chembl_df.copy())
        else:
            chembl_moa = pd.DataFrame()

        # Merge results
        if len(drugbank_moa) > 0 and len(chembl_moa) > 0:
            merged = self.merge_step_data(drugbank_moa, chembl_moa, step_name)
        elif len(drugbank_moa) > 0:
            merged = self.standardise_columns(drugbank_moa, 'drugbank')
        elif len(chembl_moa) > 0:
            merged = self.standardise_columns(chembl_moa, 'chembl')
        else:
            # Keep the standard header so a saved CSV is still well-formed
            merged = pd.DataFrame(columns=self.OUTPUT_COLUMNS)

        return merged

## Add MoA for whole and key gene lists

In [2]:
if __name__ == "__main__":
    # Driver: annotate the significant drugs of every differentiation step,
    # for both the whole and the key gene lists, and save one "_moa" CSV next
    # to each input file.

    # Initialise the annotator
    annotator = MoAAnnotator()

    # Set parameters
    top_percent = 1.0

    # Directory holding the proximity results (inputs) and the annotated outputs
    results_dir = "../results/humanPVATsn/network_analysis/proximity_significant_drugs"

    print("======== ADDING MOA FOR WHOLE GENE LISTS ========")

    # Load and process each step
    significant_drugs_whole_1000_step1 = pd.read_csv(f"{results_dir}/significant_drugs_whole_1000_step1.csv", index_col=0)
    significant_drugs_whole_1000_step2 = pd.read_csv(f"{results_dir}/significant_drugs_whole_1000_step2.csv", index_col=0)
    significant_drugs_whole_1000_step3 = pd.read_csv(f"{results_dir}/significant_drugs_whole_1000_step3.csv", index_col=0)
    significant_drugs_whole_1000_full_diff = pd.read_csv(f"{results_dir}/significant_drugs_whole_1000_full_diff.csv", index_col=0)

    # Process each step
    step1_merged = annotator.process_significant_drugs(significant_drugs_whole_1000_step1, top_percent, "Step 1")
    step2_merged = annotator.process_significant_drugs(significant_drugs_whole_1000_step2, top_percent, "Step 2")
    step3_merged = annotator.process_significant_drugs(significant_drugs_whole_1000_step3, top_percent, "Step 3")
    full_diff_merged = annotator.process_significant_drugs(significant_drugs_whole_1000_full_diff, top_percent, "Full Diff")

    # Save results
    step1_merged.to_csv(f"{results_dir}/significant_drugs_whole_1000_step1_moa.csv", index=False)
    step2_merged.to_csv(f"{results_dir}/significant_drugs_whole_1000_step2_moa.csv", index=False)
    step3_merged.to_csv(f"{results_dir}/significant_drugs_whole_1000_step3_moa.csv", index=False)
    full_diff_merged.to_csv(f"{results_dir}/significant_drugs_whole_1000_full_diff_moa.csv", index=False)

    print("======== ADDING MOA FOR KEY GENE LISTS ========")

    # Load and process each step
    significant_drugs_key_1000_step1 = pd.read_csv(f"{results_dir}/significant_drugs_key_1000_step1.csv", index_col=0)
    significant_drugs_key_1000_step2 = pd.read_csv(f"{results_dir}/significant_drugs_key_1000_step2.csv", index_col=0)
    significant_drugs_key_1000_step3 = pd.read_csv(f"{results_dir}/significant_drugs_key_1000_step3.csv", index_col=0)
    significant_drugs_key_1000_full_diff = pd.read_csv(f"{results_dir}/significant_drugs_key_1000_full_diff.csv", index_col=0)

    # Process each step. The key-list results get their own "_key_" names so
    # the whole-list "Full Diff" result above is not overwritten.
    step1_key_merged = annotator.process_significant_drugs(significant_drugs_key_1000_step1, top_percent, "Step 1")
    step2_key_merged = annotator.process_significant_drugs(significant_drugs_key_1000_step2, top_percent, "Step 2")
    step3_key_merged = annotator.process_significant_drugs(significant_drugs_key_1000_step3, top_percent, "Step 3")
    full_diff_key_merged = annotator.process_significant_drugs(significant_drugs_key_1000_full_diff, top_percent, "Full Diff")

    # Save results
    step1_key_merged.to_csv(f"{results_dir}/significant_drugs_key_1000_step1_moa.csv", index=False)
    step2_key_merged.to_csv(f"{results_dir}/significant_drugs_key_1000_step2_moa.csv", index=False)
    step3_key_merged.to_csv(f"{results_dir}/significant_drugs_key_1000_step3_moa.csv", index=False)
    full_diff_key_merged.to_csv(f"{results_dir}/significant_drugs_key_1000_full_diff_moa.csv", index=False)

Step 1 DrugBank MoA matches: 440 / 440


100%|██████████| 201/201 [00:15<00:00, 13.02it/s]


Step 1 merged: 641 total drugs (440 DrugBank + 201 ChEMBL)
Step 2 DrugBank MoA matches: 530 / 530


100%|██████████| 249/249 [00:19<00:00, 13.09it/s]


Step 2 merged: 779 total drugs (530 DrugBank + 249 ChEMBL)
Step 3 DrugBank MoA matches: 422 / 422


100%|██████████| 214/214 [00:18<00:00, 11.77it/s]


Step 3 merged: 636 total drugs (422 DrugBank + 214 ChEMBL)
Full Diff DrugBank MoA matches: 526 / 526


100%|██████████| 273/273 [00:20<00:00, 13.26it/s]


Full Diff merged: 799 total drugs (526 DrugBank + 273 ChEMBL)
Step 1 DrugBank MoA matches: 241 / 241


100%|██████████| 94/94 [00:06<00:00, 14.17it/s]


Step 1 merged: 335 total drugs (241 DrugBank + 94 ChEMBL)
Step 2 DrugBank MoA matches: 292 / 292


100%|██████████| 158/158 [00:11<00:00, 13.91it/s]


Step 2 merged: 450 total drugs (292 DrugBank + 158 ChEMBL)
Step 3 DrugBank MoA matches: 212 / 212


100%|██████████| 102/102 [00:07<00:00, 14.17it/s]


Step 3 merged: 314 total drugs (212 DrugBank + 102 ChEMBL)
Full Diff DrugBank MoA matches: 343 / 343


100%|██████████| 175/175 [00:11<00:00, 14.74it/s]


Full Diff merged: 518 total drugs (343 DrugBank + 175 ChEMBL)
