UPDATE: in the interest of time, this step is no longer being carried out. Nicholas will receive all significant drugs and their targets, and the in silico knockout will identify any promising targets and therefore drugs. This is because the in silico knockout is extremely fast to do.

## Identifying inhibition / activation of significant drugs after drug proximity analysis

After running drug proximity analysis, we need to find out whether the prioritised drugs inhibit or activate their target. This way we can do in silico knockout experiments and also in vitro validation down the line.

In [56]:
import pandas as pd
import sqlite3

In [3]:
z_score_thres = -1.96

### Significant drugs for step 1 of differentiation

In [35]:
# Load proximity results and filter for significant drugs
proximity_step1 = pd.read_csv("../results/humanPVATsn/network_analysis/proximity_step1.csv")
proximity_step1_significant = proximity_step1[proximity_step1["z_score"] <= z_score_thres]

# Print length before and after filtering for significance
print("Number of total drugs analysed for step 1 of differentiation:", len(proximity_step1))
print("Number of significant drugs for step 1 of differentiation", len(proximity_step1_significant))

# Extract significant drugs
significant_drugs_step1 = proximity_step1_significant["drug"].str.lower().unique()

# Save significant drugs to CSV file
step1_significant_drugs_path = "../results/humanPVATsn/network_analysis/proximity_significant_drugs/significant_drugs_step1.csv"
significant_drugs_step1_df = pd.DataFrame(significant_drugs_step1, columns=["drug"])
significant_drugs_step1_df.to_csv(step1_significant_drugs_path, index=False, header=False)
print("Significant drugs for step 1 saved to", step1_significant_drugs_path)

Number of total drugs analysed for step 1 of differentiation: 2244
Number of significant drugs for step 1 of differentiation 668
Significant drugs for step 1 saved to ../results/humanPVATsn/network_analysis/proximity_significant_drugs/significant_drugs_step1.csv


In [30]:
# Load existing DPI mapping
combined_DPI_with_source = pd.read_csv("../data/networks/combined_DPI_with_source.csv")
combined_DPI_with_source["Drug_Name"] = combined_DPI_with_source["Drug_Name"].str.lower() # standardise to lower case to match drug proximity results

# Filter DPI to keep only targets of significant drugs
combined_dpi_with_source_significant = combined_DPI_with_source[combined_DPI_with_source["Drug_Name"].isin(significant_drugs_step1)]

# Print length of DPI before and after filtering
print("Number of total drug-target interactions in DPI:", len(combined_DPI_with_source))
print("Number of drug-target interactions involving significant drugs in DP in step 1:", len(combined_dpi_with_source_significant))

print(combined_dpi_with_source_significant.head())
print(combined_dpi_with_source_significant["Source"].value_counts())

Number of total drug-target interactions in DPI: 15951
Number of drug-target interactions involving significant drugs in DP in step 1: 5808
     Drug_Name Drug_Target  Source
5     abacavir       ABCG2  ChEMBL
6     abacavir         ALB  ChEMBL
8   abametapir        CCR1  ChEMBL
9   abametapir        CCR5  ChEMBL
10  abametapir        CCR8  ChEMBL
Source
ChEMBL      4897
DrugBank     562
Both         349
Name: count, dtype: int64


### Summarise source of significant genes
This function checks where the top "n_top" significant drugs come from (ChEMBL, DrugBank or both).

In [None]:
def summarise_top_drug_sources(proximity_df, dpi_path, z_score_threshold=-1.96, n_top=100, label="step"):
    """
    Summarise the source breakdown of the top N significant drugs based on z-score.

    Parameters:
        proximity_df: DataFrame containing 'drug' and 'z_score' columns.
        dpi_path: Path to combined_DPI_with_source.csv.
        z_score_threshold: z-score threshold for significance.
        n_top: Number of top significant drugs to include.
        label: Label to identify the context (e.g., 'step 1 of differentiation').

    Returns:
        A Series with counts for each drug source (ChEMBL only, DrugBank only, Both).
    """
    print(f"======== FOR {label.upper()}, GETTING TOP {n_top} MOST SIGNIFICANT DRUGS ========")

    # Filter significant drugs and get top N unique by z-score
    proximity_significant = proximity_df[proximity_df["z_score"] <= z_score_threshold]
    top_unique_drugs = (
        proximity_significant
        .sort_values("z_score")["drug"]
        .str.lower()
        .drop_duplicates()
        .head(n_top)
    )
    print(f"Total top {n_top} unique significant drugs: {len(top_unique_drugs)}")

    # Load DPI and standardise drug names
    dpi_df = pd.read_csv(dpi_path)
    dpi_df["Drug_Name"] = dpi_df["Drug_Name"].str.lower()

    # Count how many top drugs are mapped
    dpi_drugs = dpi_df["Drug_Name"].unique()
    n_mapped = sum(drug in dpi_drugs for drug in top_unique_drugs)
    n_unmapped = n_top - n_mapped

    print(f"Number of top {n_top} drugs that map to DPI: {n_mapped}")
    print(f"Number of top {n_top} drugs with no known gene targets: {n_unmapped}")

    # Filter DPI and collapse source info
    dpi_top = dpi_df[dpi_df["Drug_Name"].isin(top_unique_drugs)]
    unique_drugs_with_source = dpi_top.drop_duplicates(subset=["Drug_Name", "Source"])
    source_per_drug = (
        unique_drugs_with_source
        .groupby("Drug_Name")["Source"]
        .agg(lambda x: "Both" if set(x) == {"ChEMBL", "DrugBank"} else x.iloc[0])
    )

    # Count sources
    counts = source_per_drug.value_counts()
    print(f"\nBreakdown of top {n_top} significant drugs with DPI mapping:")
    print(f"- From ChEMBL only: {counts.get('ChEMBL', 0)}")
    print(f"- From DrugBank only: {counts.get('DrugBank', 0)}")
    print(f"- From both: {counts.get('Both', 0)}")

    return counts

In [55]:
# Summarise sources for step 1 significant drugs
step1_source_counts = summarise_top_drug_sources(
    proximity_df=proximity_step1,
    dpi_path="../data/networks/combined_DPI_with_source.csv",
    z_score_threshold=z_score_thres,
    n_top=100,
    label="step 1 of differentiation"
)

Total top 100 unique significant drugs: 100
Number of top 100 drugs that map to DPI: 100
Number of top 100 drugs with no known gene targets: 0

Breakdown of top 100 significant drugs with DPI mapping:
- From ChEMBL only: 28
- From DrugBank only: 60
- From both: 12


### ChEMBL MoA
This chunk adds information about whether the drug is an inhibitor or activator etc. for all significant drugs coming from the ChEMBL database.

#### Step 1

In [69]:
# Filter for ChEMBL drugs only
chembl_drugs = combined_dpi_with_source_significant[
    combined_dpi_with_source_significant["Source"].isin(["ChEMBL", "Both"])
]["Drug_Name"].unique()
print(f"Number of unique ChEMBL-related significant drugs for step 1: {len(chembl_drugs)}")

# Connect to ChEMBL SQLite database
conn = sqlite3.connect("../data/networks/chembl_35/chembl_35.db")

# Query ChEMBL for mechanism of action
moa_query = """
SELECT 
    m.compound_name, 
    c.accession AS uniprot_id, 
    m.action_type, 
    m.mechanism_of_action
FROM mechanism m
JOIN target_dictionary t ON m.target_chembl_id = t.target_chembl_id
JOIN target_components tc ON t.target_chembl_id = tc.target_chembl_id
JOIN component_sequences c ON tc.component_id = c.component_id
WHERE m.action_type IS NOT NULL
"""

moa_df = pd.read_sql_query(moa_query, conn)
conn.close()

Number of unique ChEMBL-related significant drugs for step 1: 415


DatabaseError: Execution failed on sql '
SELECT 
    m.compound_name, 
    c.accession AS uniprot_id, 
    m.action_type, 
    m.mechanism_of_action
FROM mechanism m
JOIN target_dictionary t ON m.target_chembl_id = t.target_chembl_id
JOIN target_components tc ON t.target_chembl_id = tc.target_chembl_id
JOIN component_sequences c ON tc.component_id = c.component_id
WHERE m.action_type IS NOT NULL
': no such table: mechanism

In [68]:
import sqlite3

conn = sqlite3.connect("../data/networks/chembl_35/chembl_35_sqlite/chembl_35.db")
cursor = conn.cursor()

# List all available tables in the database
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
conn.close()

for t in tables:
    print(t[0])

action_type
assay_type
chembl_id_lookup
confidence_score_lookup
curation_lookup
chembl_release
source
relationship_type
target_type
variant_sequences
bioassay_ontology
data_validity_lookup
activity_smid
activity_stds_lookup
assay_classification
atc_classification
bio_component_sequences
component_sequences
protein_classification
domains
go_classification
structural_alert_sets
products
frac_classification
hrac_classification
irac_classification
research_stem
organism_class
patent_use_codes
usan_stems
version
cell_dictionary
docs
target_dictionary
tissue_dictionary
molecule_dictionary
activity_supp
component_class
component_domains
component_go
component_synonyms
structural_alerts
defined_daily_dose
product_patents
protein_class_synonyms
research_companies
assays
compound_records
binding_sites
biotherapeutics
compound_properties
compound_structural_alerts
compound_structures
molecule_atc_classification
molecule_frac_classification
molecule_hierarchy
molecule_hrac_classification
molecule_