In [11]:

import os
import urllib.request
from bs4 import BeautifulSoup

# FTP URL
url = "http://ftp.ebi.ac.uk/pub/databases/opentargets/platform/25.03/output/association_overall_direct/"
save_dir = "association_by_datasource_direct"
os.makedirs(save_dir, exist_ok=True)

# Parse HTML for .parquet links
with urllib.request.urlopen(url) as response:
    soup = BeautifulSoup(response.read(), "html.parser")

# Download each .parquet file
for link in soup.find_all("a"):
    href = link.get("href")
    if href.endswith(".parquet"):
        full_url = url + href
        save_path = os.path.join(save_dir, href)
        print(f"Downloading {href}...")
        urllib.request.urlretrieve(full_url, save_path)


Downloading part-00000-67ea6339-0087-4bca-bb51-0de521275806-c000.snappy.parquet...
Downloading part-00001-67ea6339-0087-4bca-bb51-0de521275806-c000.snappy.parquet...
Downloading part-00002-67ea6339-0087-4bca-bb51-0de521275806-c000.snappy.parquet...
Downloading part-00003-67ea6339-0087-4bca-bb51-0de521275806-c000.snappy.parquet...
Downloading part-00004-67ea6339-0087-4bca-bb51-0de521275806-c000.snappy.parquet...
Downloading part-00005-67ea6339-0087-4bca-bb51-0de521275806-c000.snappy.parquet...
Downloading part-00006-67ea6339-0087-4bca-bb51-0de521275806-c000.snappy.parquet...
Downloading part-00007-67ea6339-0087-4bca-bb51-0de521275806-c000.snappy.parquet...
Downloading part-00008-67ea6339-0087-4bca-bb51-0de521275806-c000.snappy.parquet...
Downloading part-00009-67ea6339-0087-4bca-bb51-0de521275806-c000.snappy.parquet...
Downloading part-00010-67ea6339-0087-4bca-bb51-0de521275806-c000.snappy.parquet...
Downloading part-00011-67ea6339-0087-4bca-bb51-0de521275806-c000.snappy.parquet...
Down

### GRAPH THING => 
### Create a graph of nodes = genes 
### edges = the euclidean distance of the gene's disease vector

In [1]:
import os
import pandas as pd

# Set the path to your openTargets folder
folder_path = "./openTargets"

# List all files in the folder
parquet_files = [f for f in os.listdir(folder_path) if f.endswith('.parquet')]

# Initialize an empty list to hold individual DataFrames
df_list = []

# Read each Parquet file and append the resulting DataFrame to the list
for parquet_file in parquet_files:
    file_path = os.path.join(folder_path, parquet_file)
    df = pd.read_parquet(file_path)  # Read the Parquet file
    df_list.append(df)

# Concatenate all DataFrames into one
openTargets_df = pd.concat(df_list, ignore_index=True)

# Now combined_df contains all the data from the Parquet files
print(openTargets_df.head())  # Preview the first few rows
print(len(openTargets_df))

      diseaseId         targetId     score  evidenceCount
0  DOID_0050890  ENSG00000001084  0.031799              4
1  DOID_0050890  ENSG00000004142  0.002217              1
2  DOID_0050890  ENSG00000004478  0.002217              1
3  DOID_0050890  ENSG00000004948  0.002957              1
4  DOID_0050890  ENSG00000005381  0.002217              1
3628026


In [2]:
import pronto

# Load the ontology
ontology = pronto.Ontology("http://purl.obolibrary.org/obo/doid.obo")

# Example: map DOIDs to names
doid_to_name = {term.id: term.name for term in ontology.terms() if term.id.startswith("DOID")}
print(doid_to_name["DOID:0050890"]) 

synucleinopathy


In [3]:
import pandas as pd
df = pd.read_csv("./Ensembl/Homo_sapiens.GRCh38.113.gtf.gz", sep='\t', comment='#', header=None)

  df = pd.read_csv("./Ensembl/Homo_sapiens.GRCh38.113.gtf.gz", sep='\t', comment='#', header=None)


In [4]:
import pandas as pd
from tqdm import tqdm

# Assuming df is already loaded
# Define a function to extract the gene_id and gene_name from the attribute column
def extract_gene_info(attributes):
    # Split by semicolon and strip whitespace
    attributes = attributes.split(';')
    
    # Initialize the gene_id and gene_name as None
    gene_id = None
    gene_name = None
    
    # Loop through the key-value pairs and extract gene_id and gene_name
    for attribute in attributes:
        if 'gene_id' in attribute:
            gene_id = attribute.split('"')[1]  # Extract value between quotes
        elif 'gene_name' in attribute:
            gene_name = attribute.split('"')[1]  # Extract value between quotes
    
    return pd.Series([gene_id, gene_name])

# Use tqdm to apply the function with a progress bar
tqdm.pandas(desc="Extracting gene info")

# Apply the function to the 'attribute' column with tqdm progress bar
gene_info_df = df[8].progress_apply(extract_gene_info)

# Set column names
gene_info_df.columns = ['gene_id', 'gene_name']

# Drop duplicates based on the 'gene_id' and 'gene_name' pair
gene_info_df = gene_info_df.drop_duplicates()

# Display the new DataFrame
print(gene_info_df.head())


Extracting gene info: 100%|██████████| 4114450/4114450 [02:32<00:00, 26916.80it/s]


             gene_id gene_name
0    ENSG00000142611    PRDM16
220  ENSG00000284616      None
226  ENSG00000157911     PEX10
346  ENSG00000260972      None
349  ENSG00000224340  RPL21P21


In [8]:
import pandas as pd

# Load MONDO nodes
mondo_df = pd.read_csv("./mondo/mondo_nodes.tsv", sep="\t")

# Drop rows without xrefs
mondo_df = mondo_df.dropna(subset=["xref"])

xref_to_name = {}

# Iterate over rows to extract all xrefs and map them to MONDO name
for _, row in mondo_df.iterrows():
    name = row['name']
    xrefs = row['xref'].split('|')
    for xref in xrefs:
        xref_to_name[xref] = name

# OPTIONAL: include MONDO IDs too
for _, row in mondo_df.iterrows():
    mondo_id = row['id']
    name = row['name']
    xref_to_name[mondo_id] = name

# Now you can use it like this
openTargets_df['diseaseCommonName'] = openTargets_df['diseaseId'].str.replace("_", ":", regex=False).map(xref_to_name)


In [9]:
gene_id_to_name = dict(zip(gene_info_df['gene_id'], gene_info_df['gene_name']))

# Step 2: Vectorized operations (no swifter needed anymore!)
openTargets_df['geneCommonName'] = openTargets_df['targetId'].map(gene_id_to_name)


In [10]:
print(openTargets_df.head())
print(len(openTargets_df))
print(openTargets_df["geneCommonName"].unique())
print(openTargets_df["geneCommonName"].unique().shape)
print(openTargets_df["diseaseCommonName"].unique())
print(openTargets_df["diseaseCommonName"].unique().shape)
missing_disease_rows = openTargets_df[openTargets_df['diseaseCommonName'].isna()]
print(missing_disease_rows.head())
print("Number of missing disease names:", missing_disease_rows.shape[0])
print(openTargets_df['diseaseId'].str.startswith("EFO_").sum())


      diseaseId         targetId     score  evidenceCount diseaseCommonName  \
0  DOID_0050890  ENSG00000001084  0.031799              4   synucleinopathy   
1  DOID_0050890  ENSG00000004142  0.002217              1   synucleinopathy   
2  DOID_0050890  ENSG00000004478  0.002217              1   synucleinopathy   
3  DOID_0050890  ENSG00000004948  0.002957              1   synucleinopathy   
4  DOID_0050890  ENSG00000005381  0.002217              1   synucleinopathy   

  geneCommonName  
0           GCLC  
1        POLDIP2  
2          FKBP4  
3          CALCR  
4            MPO  
3628026
['GCLC' 'POLDIP2' 'FKBP4' ... 'ADAM24P' 'MIR6893' 'LINC00970']
(25553,)
['synucleinopathy' 'trypanosomiasis' 'giardiasis' ...
 'albinism-hearing loss syndrome' 'PEHO-like syndrome'
 'obsolete primary parathyroid hyperplasia']
(11853,)
         diseaseId         targetId     score  evidenceCount  \
15902  EFO_0000180  ENSG00000000460  0.003696              1   
15903  EFO_0000180  ENSG00000000938  0.0

In [12]:
openTargets_df = openTargets_df.dropna(subset=["diseaseCommonName", "geneCommonName"])
len(openTargets_df)

2842003

In [13]:
df_score_as_value = openTargets_df.pivot_table(
    index='geneCommonName',
    columns='diseaseCommonName',
    values='score',
    aggfunc='first'  # or 'mean'/'max' if multiple scores exist for same (gene, disease)
)
print(df_score_as_value.columns)

Index(['10q22.3q23.3 microduplication syndrome',
       '12q14 microdeletion syndrome', '14q11.2 microduplication syndrome',
       '14q12 microdeletion syndrome', '14q24.1q24.3 microdeletion syndrome',
       '15q overgrowth syndrome', '15q11q13 microduplication syndrome',
       '15q14 microdeletion syndrome', '16p13.11 microdeletion syndrome',
       '16p13.11 microduplication syndrome',
       ...
       'xeroderma pigmentosum-Cockayne syndrome complex',
       'xerosis and growth failure with immune and pulmonary dysfunction syndrome',
       'yellow nail syndrome', 'yolk sac tumor',
       'young-onset Parkinson disease', 'zebra body myopathy',
       'zinc deficiency, transient neonatal', 'zygodactyly type 1',
       'zygodactyly type 2', 'zygodactyly type 3'],
      dtype='object', name='diseaseCommonName', length=11852)


In [2]:
df_score_as_value.to_csv("./df_score_as_value.tsv", sep="\t")

NameError: name 'df_score_as_value' is not defined

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
from scipy.spatial.distance import pdist, squareform
from tqdm import tqdm

print("🔹 Loading dataframe...")
df = pd.read_csv("./df_score_as_value.tsv", sep="\t", index_col=0)

print("🔹 Extracting genes and score matrix...")
genes = df.iloc[:, 0].to_numpy()
score_matrix = df.iloc[:, 1:].fillna(0).to_numpy()

print(f"✅ Extracted {len(genes)} genes and score matrix of shape {score_matrix.shape}")

print("🔹 Computing pairwise Cosine distances...")
distances = pdist(score_matrix, metric='cosine')
distance_matrix = squareform(distances)

print("✅ Distance matrix computed.")
np.save("distance_matrix.npy", distance_matrix)

print("🔹 Building graph with nodes...")
G = nx.Graph()
G.add_nodes_from(genes)
print("✅ Nodes added to graph.")

print("🔹 Preparing edge list...")
i_upper, j_upper = np.triu_indices(len(genes), k=1)

edges = []
for i, j in tqdm(zip(i_upper, j_upper), total=len(i_upper), desc="Adding edges"):
    edges.append((genes[i], genes[j], {'weight': distance_matrix[i, j]}))

print(f"✅ Prepared {len(edges)} edges.")

print("🔹 Adding edges to the graph...")
G.add_edges_from(edges)
print("✅ Graph construction complete.")

print(f"📊 Final graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")


🔹 Loading dataframe...
🔹 Extracting genes and score matrix...
✅ Extracted 25307 genes and score matrix of shape (25307, 11851)
🔹 Computing pairwise Cosine distances...
✅ Distance matrix computed.
🔹 Building graph with nodes...
✅ Nodes added to graph.
🔹 Preparing edge list...


Adding edges:  77%|███████▋  | 245902743/320209471 [54:14<1:04:53, 19082.79it/s]

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from scipy.spatial.distance import pdist, squareform
from tqdm import tqdm

print("🔹 Loading dataframe...")
df = pd.read_csv("./df_score_as_value.tsv", sep="\t", index_col=0)

🔹 Loading dataframe...


In [2]:
print(df["bipolar disorder"])

geneCommonName
7SK              NaN
A1BG        0.003696
A1BG-AS1         NaN
A1CF             NaN
A2M         0.003696
              ...   
ZYG11A           NaN
ZYG11B           NaN
ZYX              NaN
ZZEF1            NaN
ZZZ3        0.070237
Name: bipolar disorder, Length: 25307, dtype: float64


In [3]:
print("🔹 Extracting genes and score matrix...")
genes = df.index.to_numpy()
print(genes)
score_matrix = df.iloc[:, 1:].fillna(0).to_numpy()
print(score_matrix)

🔹 Extracting genes and score matrix...
['7SK' 'A1BG' 'A1BG-AS1' ... 'ZYX' 'ZZEF1' 'ZZZ3']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [4]:

print("🔹 Computing pairwise Cosine distances...")
# distances = pdist(score_matrix, metric='cosine')
# sim_matrix = squareform(distances)
sim_matrix = np.load("distance_matrix.npy")
print("🔹 Building graph with nodes...")
G = nx.Graph()
G.add_nodes_from(genes)
print("✅ Nodes added to graph.")
print(list(G.nodes)[:10])

🔹 Computing pairwise Cosine distances...
🔹 Building graph with nodes...
✅ Nodes added to graph.
['7SK', 'A1BG', 'A1BG-AS1', 'A1CF', 'A2M', 'A2M-AS1', 'A2ML1', 'A2MP1', 'A3GALT2', 'A4GALT']


In [5]:
# Keep top-K per row
K = 50  # Adjust as needed
edges = []
print("🔹 Selecting top-K similar genes per gene...")
for i in tqdm(range(sim_matrix.shape[0])):
    top_k_idx = np.argpartition(sim_matrix[i], -K)[-K:]
    for j in top_k_idx:
        if i != j:
            weight = sim_matrix[i, j]
            edges.append((genes[i], genes[j], {'weight': weight}))
print(len(edges))
# Build the graph
print("🔹 Constructing graph...")
G = nx.Graph()
G.add_nodes_from(genes)
G.add_edges_from(edges)

print("✅ Graph construction complete.")
print(f"📊 Final graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

🔹 Selecting top-K similar genes per gene...


100%|██████████| 25307/25307 [00:14<00:00, 1733.48it/s]


1265350
🔹 Constructing graph...
✅ Graph construction complete.
📊 Final graph has 25307 nodes and 1262444 edges.


In [6]:
nx.write_weighted_edgelist(G, "50_gene_distance_graph.edgelist")

In [7]:
print(G)
print("Sample nodes:", list(G.nodes)[:10])
print("Number of NaN nodes:", sum(1 for n in G.nodes if str(n) == 'nan' or (isinstance(n, float) and math.isnan(n))))


Graph with 25307 nodes and 1262444 edges
Sample nodes: ['7SK', 'A1BG', 'A1BG-AS1', 'A1CF', 'A2M', 'A2M-AS1', 'A2ML1', 'A2MP1', 'A3GALT2', 'A4GALT']
Number of NaN nodes: 0


In [26]:
import community as community_louvain  # python-louvain
import networkx as nx
G = nx.read_edgelist("./50_gene_distance_graph.edgelist", data=(("weight", float),))
# Run Louvain community detection
partition = community_louvain.best_partition(G, weight='weight', resolution=2.0)  # returns: {gene: community_id}


#  Calculate centrality for nodes
degree_centrality = nx.degree_centrality(G)

# Group nodes by community
communities = {}
for node, comm_id in partition.items():
    if comm_id not in communities:
        communities[comm_id] = []
    communities[comm_id].append(node)

# For each community, sort by degree centrality (you can use other centrality measures)
for comm_id, nodes in communities.items():
    central_nodes = sorted(nodes, key=lambda x: degree_centrality[x], reverse=True)
    print(f"Community {comm_id} top nodes: {central_nodes[:5]}")  # Top 5 nodes
# # Invert the partition to get community → list of genes
# from collections import defaultdict

# communities = defaultdict(list)
# for gene, community_id in partition.items():
#     communities[community_id].append(gene)

# # Sort by community ID and print
# sorted_communities = dict(sorted(communities.items()))

# for cid, members in sorted_communities.items():
#     print(f"Community {cid}: {members}")
    
# with open("communities_resolution2.0.txt", "w") as f:
#     for cid, members in sorted_communities.items():
#         f.write(f"Community {cid}: {', '.join(members)}\n")


Community 9 top nodes: ['7SK', 'OR52P1', 'OR4C45', 'PACRG-AS1', 'OR52P2P']
Community 1 top nodes: ['SPDYE15', 'SPDYE21', 'SPDYE14', 'SPDYE7P', 'SPDYE13']
Community 2 top nodes: ['LINC02727', 'LINC02645', 'LINC02689', 'LINC02705', 'LINC02488']
Community 3 top nodes: ['LINC02359', 'LINC02284', 'LINC02295', 'LINC02294', 'LINC02404']
Community 4 top nodes: ['LINC01809', 'LINC01805', 'LINC01821', 'LINC01778', 'LINC01803']
Community 5 top nodes: ['LINC01476', 'LINC01492', 'LINC01400', 'LINC01465', 'LINC01491']
Community 6 top nodes: ['TMEM42', 'TMEM256-PLSCR3', 'KLF7-IT1', 'TBCEL-TECTA', 'TFAP2E-AS1']
Community 7 top nodes: ['PDE2A-AS1', 'PDE4DIPP6', 'PGBD4', 'PDCD6IP-DT', 'PAPPA-AS2']
Community 8 top nodes: ['PMCHL2', 'PNMA6E', 'PMS2P11', 'PP2D1', 'POLR2LP1']
Community 10 top nodes: ['LACTB2-AS1', 'KRT87P', 'KRTAP7-1', 'KRTAP9-3', 'KRT8P45']
Community 11 top nodes: ['MEIS1-AS3', 'MATN1-AS1', 'MARCKSL1P1', 'LINC00895', 'STPG2']
Community 12 top nodes: ['MYRF-AS1', 'MTATP6P16', 'MTCO2P16', 'M

In [25]:
# Calculate betweenness centrality considering edge weights
betweenness_centrality = nx.betweenness_centrality(G, weight='weight')

# Group nodes by community
for comm_id, nodes in communities.items():
    central_nodes = sorted(nodes, key=lambda x: betweenness_centrality[x], reverse=True)
    print(f"Community {comm_id} top nodes by betweenness: {central_nodes[:5]}")  # Top 5 nodes


KeyboardInterrupt: 

## Other goal = given a disease, normalize their scores aka have their score = score / total genes then see how our genes compare