# Network analysis

This file contains the code to generate the three different inputs for network analysis in my MPhil brown adipocyte differentiation project.

In [None]:
import pandas as pd

## Create the combined PPI list

Load and reformat the downloaded interactomes.

In [None]:
# ====== 1. STRING ======
# Load reformatted STRING data (reformatting done in R using biomaRt)
string_PPI = pd.read_csv("../data/networks/string_PPI.csv", sep=",")


# ====== 2. BioGRID ======
# Load BioGRID data
biogrid_PPI = pd.read_csv("../data/networks/BIOGRID-MV-Physical-4.4.245.tab3.txt", sep="\t")

# Filter to keep only human interactions
biogrid_PPI = biogrid_PPI[
    (biogrid_PPI["Organism ID Interactor A"] == 9606) &
    (biogrid_PPI["Organism ID Interactor B"] == 9606)
]

# Filter and reformat
biogrid_PPI = biogrid_PPI[["Official Symbol Interactor A", "Official Symbol Interactor B"]]
biogrid_PPI.columns = ["GeneA", "GeneB"]
biogrid_PPI.to_csv("../data/networks/biogrid_PPI.csv", index=False)


# ====== 3. HINT ======
# Load HINT data
hint_PPI = pd.read_csv("../data/networks/HINT_HomoSapiens_binary_hq.txt", sep="\t")

# Filter and reformat
hint_PPI = hint_PPI[["Gene_A", "Gene_B"]]
hint_PPI.columns = ["GeneA", "GeneB"]
hint_PPI.to_csv("../data/networks/hint_PPI.csv", index=False)


  biogrid_PPI = pd.read_csv("../data/networks/BIOGRID-MV-Physical-4.4.245.tab3.txt", sep="\t")


In [47]:
# Create summary table
ppi_counts = pd.DataFrame({
    "Dataset": ["BIOGRID", "STRING", "HINT"],
    "Num_Interactions": [
        len(biogrid_PPI),
        len(string_PPI),
        len(hint_PPI)
    ]
})

# Print the table
print(ppi_counts)

   Dataset  Num_Interactions
0  BIOGRID            318015
1   STRING            207024
2     HINT            163435


Combine all PPI networks into one list without duplications

In [68]:

# Combine all PPI networks
combined_ppi = pd.concat([biogrid_PPI, string_PPI, hint_PPI], ignore_index=True)
print("Combined PPI: length", len(combined_ppi))

# Remove empty rows
combined_ppi = combined_ppi.dropna()
print("After removing rows with empty entry: length", len(combined_ppi))

# Remove self-loops
combined_ppi = combined_ppi[combined_ppi["GeneA"] != combined_ppi["GeneB"]]
print("After removing self-loops: length", len(combined_ppi))

# Remove duplicates in both directions
combined_ppi.drop_duplicates()
print("After removing duplicates: length", len(combined_ppi))
combined_ppi["sorted_pair"] = combined_ppi.apply(lambda row: tuple(sorted([row["GeneA"], row["GeneB"]])), axis=1)
combined_ppi = combined_ppi.drop_duplicates(subset="sorted_pair")
combined_ppi = combined_ppi.drop(columns="sorted_pair")
print("After removing bidirectional duplicates: length", len(combined_ppi))

# Save combined PPI
combined_ppi.to_csv("../data/networks/combined_PPI.csv", index=False)
print("Combined PPI network saved as 'combined_PPI.csv'")

Combined PPI: length 688474
After removing rows with empty entry: length 678651
After removing self-loops: length 666715
After removing duplicates: length 666715
After removing bidirectional duplicates: length 345547
Combined PPI network saved as 'combined_PPI.csv'


## Create the drug-gene interaction list

### DrugBank
From DrugBank, I take the files bonds.csv, polypeptides.csv, and bio_entities.csv and combine them to generate a drug-gene interaction list containing the drug bank ID and then the gene name of the protein it interacts with.

In [329]:

######### Load data #########

print("\nLoading dataframes...\n")

bonds = pd.read_csv("../data/networks/milner_drugbank_postgresql/bonds.csv", sep=",", header=None)
bonds.columns = [
    "id", "type", "drug_id", "biodb_id", "pdb_id", "position",
    "pharmacological_action", "antagonist", "agonist", "substrate",
    "inhibitor", "inducer", "other_action", "inducer_strength",
    "inhibitor_strength", "induction_clinically_sig", "inhibition_clinically_sig"
]

polypeptides = pd.read_csv("../data/networks/milner_drugbank_postgresql/polypeptides.csv", sep=",", header=None)
polypeptides.columns = [
    "uniprot_id", "name", "uniprot_name", "gene_name", "organism_id",
    "molecular_weight", "theoretical_pi", "general_function", "specific_function",
    "signal_regions", "transmembrane_regions", "pdb_ids", "genbank_gene_id",
    "genbank_protein_id", "genecard_id", "locus", "genatlas_id", "hgnc_id",
    "meta_cyc_id", "ncbi_sequence_ids", "tissue_specificity", "cofactor",
    "subunit", "cellular_location", "amino_acid_sequence", "gene_sequence"
]

bio_entities = pd.read_csv("../data/networks/milner_drugbank_postgresql/bio_entities.csv", sep=",", header=None)
bio_entities.columns = [
    "biodb_id", "name", "kind", "organism"
]

drugs = pd.read_csv("../data/networks/milner_drugbank_postgresql/drugs.csv", sep=",", header=None)
drugs.columns = [
    "id", "type", "drugbank_id", "name", "state", "description",
    "simple_description", "clinical_description", "cas_number",
    "protein_formula", "protein_weight", "investigational", "approved",
    "vet_approved", "experimental", "nutraceutical", "illicit", "withdrawn",
    "moldb_mono_mass", "moldb_inchi", "moldb_inchikey", "moldb_smiles",
    "moldb_average_mass", "moldb_formula", "synthesis_patent_id",
    "protein_weight_details", "biotech_kind"
]
# Rename "id" column to "drug_id" in drugs DataFrame
drugs.rename(columns={"id": "drug_id", "name": "drug_name"}, inplace=True)

# Print head of each DataFrame
print("Bonds DataFrame head:")
print(bonds.head())
print("\nPolypeptides DataFrame head:")
print(polypeptides.head())
print("\nBio Entities DataFrame head:")
print(bio_entities.head())
print("\nDrugs DataFrame head:")
print(drugs.head())


######### Filter dataframes #########

print("\nFiltering dataframes...\n")

# Keep only target bonds with pharmacological action
target_bonds = bonds[
    (bonds["type"] == "TargetBond") &
    (bonds["pharmacological_action"] == "yes")
].copy()
target_bonds = target_bonds[["drug_id", "biodb_id"]]
print("Bonds DataFrame head:")
print(target_bonds.head())

# Filter for human proteins, then keep protein name and gene name
polypeptides = polypeptides[polypeptides["organism_id"] == 154].copy()
polypeptides = polypeptides[["name", "gene_name"]]
print("\nPolypeptides DataFrame head:")
print(polypeptides.head())

# Keep only ID and name
bio_entities = bio_entities[bio_entities["kind"] == "protein"].copy()
bio_entities = bio_entities[["biodb_id", "name"]]
print("\nBio Entities DataFrame head:")
print(bio_entities.head())

# Filter for id, drug bank id, and name
drugs = drugs[["drug_id", "drugbank_id", "drug_name"]]
print("\nDrugs DataFrame head:")
print(drugs.head())


######### Merge dataframes #########

print("\nMerging dataframes...\n")

# Merge drug ID's with protein names
drug_protein = target_bonds.merge(bio_entities, on="biodb_id", how="left")

# Merge with gene names
drug_gene = drug_protein.merge(polypeptides, on="name", how="left")
print("Missing gene names:", drug_gene["gene_name"].isna().sum())

# Merge with drug bank ID's and drug names
drug_gene = drug_gene.merge(drugs, on="drug_id", how="left")
print("\nDrugs DataFrame head:")
print(drug_gene.head())

# Keep only drug and gene names
drug_gene_final = drug_gene[["drugbank_id", "drug_name", "gene_name"]]
# Remove empty rows
drug_gene_final = drug_gene_final.dropna()
print("\nFinal Drugs DataFrame head:")
print(drug_gene_final.head())

# Print the number of unique drugs and genes
print("\nNumber of unique drugs:", drug_gene_final["drugbank_id"].nunique())
print("Number of unique genes:", drug_gene_final["gene_name"].nunique())

# Print dimensions of the final DataFrame
print("\nFinal DataFrame dimensions:", drug_gene_final.shape)

# Save the final DataFrame to a CSV file
drug_gene_final.to_csv("../data/networks/drugbank_DPI_with_drug_names.csv", index=False)

# Remove column with drug names
drug_gene_no_name = drug_gene_final.drop(columns=["drug_name"])
drug_gene_no_name.columns = ["DrugID", "Drug_Target"]
drug_gene_no_name.to_csv("../data/networks/drugbank_DPI.csv", index=False)


Loading dataframes...

Bonds DataFrame head:
     id        type  drug_id   biodb_id  pdb_id  position  \
0     1  TargetBond        1  BE0000048     NaN         1   
1     2  TargetBond        2  BE0000767     NaN         1   
2  8430  TargetBond        2  BE0000901     NaN         2   
3  8432  TargetBond        2  BE0002094     NaN         3   
4  8433  TargetBond        2  BE0002095     NaN         4   

  pharmacological_action  antagonist  agonist  substrate  inhibitor  inducer  \
0                    yes           0        0          0          1        0   
1                    yes           0        0          0          0        0   
2                unknown           0        0          0          0        0   
3                unknown           0        0          0          0        0   
4                unknown           0        0          0          0        0   

  other_action inducer_strength inhibitor_strength  induction_clinically_sig  \
0          NaN            

### ChEMBL
From CheMBL, I downloaded the chembl_35_sqlite.tar.gz file from version 35.

In [313]:
import sqlite3
import pandas as pd

# Connect to the database
conn = sqlite3.connect("../data/networks/chembl_35/chembl_35_sqlite/chembl_35.db")

cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

# Query ChEMBL database for drug-protein interactions (DPI) keeping high confidence interactions
query = """
SELECT DISTINCT 
    activities.molregno,
    target_dictionary.chembl_id AS target_chembl_id
FROM 
    activities
JOIN 
    assays ON activities.assay_id = assays.assay_id
JOIN 
    target_dictionary ON assays.tid = target_dictionary.tid
JOIN 
    target_components ON target_dictionary.tid = target_components.tid
JOIN 
    component_sequences ON target_components.component_id = component_sequences.component_id
WHERE 
    activities.standard_relation = '='
    AND activities.standard_type IN ('IC50', 'Ki', 'Kd', 'EC50')
    AND activities.standard_value IS NOT NULL
    AND component_sequences.organism = 'Homo sapiens'
"""

# Execute the query and load the results into a DataFrame
chembl_dpi = pd.read_sql(query, conn)
print("Chembl DPI DataFrame head:")
print(chembl_dpi.head())
print("\nNumber of unique drugs:", chembl_dpi["molregno"].nunique())
print("Number of unique targets:", chembl_dpi["target_chembl_id"].nunique())
print("\nFinal Chembl DPI DataFrame dimensions:")
print(chembl_dpi.shape)


Chembl DPI DataFrame head:
   molregno target_chembl_id
0    179188        CHEMBL235
1     33632        CHEMBL259
2     33415        CHEMBL259
3     33415       CHEMBL3795
4    219299       CHEMBL3974

Number of unique drugs: 673271
Number of unique targets: 3768

Final Chembl DPI DataFrame dimensions:
(1088205, 2)


In [314]:
# Map ChEMBL target IDs to UniProt IDs
uniprot_map = pd.read_csv("../data/networks/chembl_35/chembl_35_sqlite/chembl_uniprot_mapping.txt", sep="\t", header=None, skiprows=1)
uniprot_map.columns = ["target_uniprot_id", "target_chembl_id", "target_name", "target_type"]
uniprot_map = uniprot_map[["target_chembl_id", "target_uniprot_id"]]
chembl_dpi_final = chembl_dpi.merge(uniprot_map, on="target_chembl_id", how="left")
# Drop rows with missing values
chembl_dpi_final = chembl_dpi_final.dropna()
print("Chembl DPI with UniProt mapping DataFrame head:\n", chembl_dpi_final.head())

# Save this DataFrame to a CSV file
chembl_dpi_final.to_csv("../data/networks/inter_chembl_DPI.csv", index=False)

Chembl DPI with UniProt mapping DataFrame head:
    molregno target_chembl_id target_uniprot_id
0    179188        CHEMBL235            P37231
1     33632        CHEMBL259            P32245
2     33415        CHEMBL259            P32245
3     33415       CHEMBL3795            Q01726
4    219299       CHEMBL3974            P25116


The mapping of target UniProt IDs to target gene names was done in R. The next step is to convert the drug molregno IDs to actual ChEMBL IDs.

In [323]:
# Load file
chembl_dpi = pd.read_csv("../data/networks/annotated_chembl_dpi.csv", sep=",")

# Load molregno to ChEMBL compound ID and name
compound_info_query = """
SELECT 
    molregno,
    chembl_id AS compound_chembl_id,
    pref_name AS compound_name
FROM 
    molecule_dictionary
"""

compound_info = pd.read_sql(compound_info_query, conn)

# Merge to get drug names and ChEMBL compound IDs
chembl_dpi = chembl_dpi.merge(compound_info, on="molregno", how="left")

# Remove duplicates
chembl_dpi = chembl_dpi.drop_duplicates()

# Save a subset containing just named drugs
named_drugs = chembl_dpi[chembl_dpi["compound_name"].notna()]

print("Final ChEMBL DataFrame:\n", chembl_dpi.head(10))
print("Shape of final ChEMBL DataFrame:\n", chembl_dpi.shape)

Final ChEMBL DataFrame:
    molregno target_chembl_id target_uniprot_id gene_name compound_chembl_id  \
0    179188        CHEMBL235            P37231     PPARG       CHEMBL111217   
1     33632        CHEMBL259            P32245      MC4R       CHEMBL415341   
2     33415        CHEMBL259            P32245      MC4R       CHEMBL437822   
3     33415       CHEMBL3795            Q01726      MC1R       CHEMBL437822   
4    219299       CHEMBL3974            P25116       F2R       CHEMBL268064   
5    187899       CHEMBL3974            P25116       F2R       CHEMBL115543   
6    219632       CHEMBL3974            P25116       F2R       CHEMBL337875   
7    219724       CHEMBL3974            P25116       F2R       CHEMBL130058   
8    219100       CHEMBL3974            P25116       F2R       CHEMBL132254   
9    210380        CHEMBL236            P41143     OPRD1       CHEMBL423694   

  compound_name  
0          None  
1          None  
2          None  
3          None  
4          None

#### (Failed) Mapping of ChEMBL IDs to DrugBank IDs
This was an attempt to map the ChEMBL IDs to DrugBank IDs to be able to merge the two DPI dataframes into a single final one. Because of the large number of ChEMBL IDs to convert (672,144), the UniChem API cannot be used for the ChEMBL to DrugBank conversion. Instead I download the full mapping from UniChem ([link](https://chembl.gitbook.io/unichem/downloads)).

In [309]:
print(len(chembl_dpi['compound_chembl_id'].unique()))

672144


In [310]:
# Read mapping file
mapping = pd.read_csv("../data/networks/chembl_to_drugbank_map.txt", sep="\t", skiprows=1, names=["chembl_id", "drugbank_id"])

# Merge with ChEMBL DPI to get DrugBank IDs
chembl_dpi_mapped = chembl_dpi.merge(mapping[["chembl_id", "drugbank_id"]], left_on="compound_chembl_id", right_on="chembl_id", how="left")
print(chembl_dpi_mapped.head())

   molregno target_chembl_id target_uniprot_id gene_name compound_chembl_id  \
0    179188        CHEMBL235            P37231     PPARG       CHEMBL111217   
1     33632        CHEMBL259            P32245      MC4R       CHEMBL415341   
2     33415        CHEMBL259            P32245      MC4R       CHEMBL437822   
3     33415       CHEMBL3795            Q01726      MC1R       CHEMBL437822   
4    219299       CHEMBL3974            P25116       F2R       CHEMBL268064   

  compound_name chembl_id drugbank_id  
0          None       NaN         NaN  
1          None       NaN         NaN  
2          None       NaN         NaN  
3          None       NaN         NaN  
4          None       NaN         NaN  


In [311]:
# Quality control

# Total number of compounds
total = len(chembl_dpi_mapped)

# Number of mapped DrugBank IDs (not NA)
mapped = chembl_dpi_mapped['drugbank_id'].notna().sum()

# Number of unmapped DrugBank IDs (NA)
unmapped = chembl_dpi_mapped['drugbank_id'].isna().sum()

print(f"Total compounds: {total}")
print(f"Mapped to DrugBank: {mapped}")
print(f"Not mapped (NA): {unmapped}")
print(f"Mapping success rate: {mapped/total:.2%}")

Total compounds: 1270105
Mapped to DrugBank: 34763
Not mapped (NA): 1235342
Mapping success rate: 2.74%


This mapping only has a 2.74% success rate, because ChEMBL and DrugBank lists have many unique compounds not present in the other. In the reference paper by Han et al. (2021), their drug-gene interaction list contains both ChEMBL IDs and DrugBank IDs, so I will make my list in the same way.

#### Save final csv

In [328]:
# Keep only compound ChEMBL ID and target gene name
chembl_dpi_final = chembl_dpi[["gene_name", "compound_chembl_id"]].copy()

# Switch columns to match DrugBank DPI format
chembl_dpi_final = chembl_dpi[["compound_chembl_id", "gene_name"]].copy()
chembl_dpi_final.columns = ["DrugID", "Drug_Target"]
print(chembl_dpi_final.head())

# Save final unmapped ChEMBL DPI DataFrame
chembl_dpi_final.to_csv("../data/networks/chembl_DPI.csv", index=False)

         DrugID Drug_Target
0  CHEMBL111217       PPARG
1  CHEMBL415341        MC4R
2  CHEMBL437822        MC4R
3  CHEMBL437822        MC1R
4  CHEMBL268064         F2R


## Merge DrugBank and ChEMBL DPI lists 

In [335]:
# Standardise column names for both lists
drugbank_dpi = pd.read_csv("../data/networks/drugbank_DPI.csv", sep=",")
chembl_dpi = pd.read_csv("../data/networks/chembl_DPI.csv", sep=",")

# Concatenate the two DataFrames
combined_dpi = pd.concat([drugbank_dpi, chembl_dpi], ignore_index=True)

# Drop duplicates
combined_dpi = combined_dpi.drop_duplicates()

# Save to CSV
combined_dpi.to_csv("../data/networks/combined_DPI.csv", index=False)
print(combined_dpi.head())
print(f"Total unique drug-gene pairs: {len(combined_dpi)}")

    DrugID Drug_Target
0  DB00001          F2
1  DB00002        EGFR
2  DB00003      DNASE1
3  DB00004       IL2RA
4  DB00004       IL2RB
Total unique drug-gene pairs: 1248771
