In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt
import random

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
all_terms = pd.read_excel("All pathway annotations.xlsx")

In [7]:
term_gene = all_terms.groupby(['term_id', 'term_name', 'source'])['gene'].apply(set).reset_index()

In [9]:
# Master gene list
all_genes = set()
for geneset in term_gene['gene']:
    all_genes.update(geneset)
all_genes = sorted(list(all_genes))

In [11]:
# Map genes to terms
gene_idx = {gene: idx for idx, gene in enumerate(all_genes)}
term_idx = {term: idx for idx, term in enumerate(term_gene['term_id'])}

In [13]:
# Sparse binary matrix
mat = np.zeros((len(all_genes), len(term_gene)), dtype=np.int8)

for j, genes in enumerate(term_gene['gene']):
    indices = [gene_idx[g] for g in genes if g in gene_idx]
    mat[indices, j] = 1

print(f"Gene-term matrix size: {mat.shape}")

Gene-term matrix size: (20336, 23915)


In [15]:
# Kappa similarity matrix

n_terms = mat.shape[1]
kappa_matrix = np.zeros((n_terms, n_terms))

pbar = tqdm(total=(n_terms * (n_terms + 1)) // 2)

for i in range(n_terms):
    a = mat[:, i]
    for j in range(i, n_terms):
        b = mat[:, j]
        p0 = np.sum(a == b) / len(a)
        pe = (np.mean(a) * np.mean(b)) + ((1 - np.mean(a)) * (1 - np.mean(b)))
        kappa = (p0 - pe) / (1 - pe) if (1 - pe) != 0 else 0
        kappa_matrix[i, j] = kappa_matrix[j, i] = kappa
        pbar.update(1)
pbar.close()

  0%|                                                                     | 5562/285975570 [00:05<81:49:17, 970.85it/s]

KeyboardInterrupt: 

In [None]:
distance_matrix = 1 - kappa_matrix
np.fill_diagonal(distance_matrix, 0)

# Hierarchical clustering with average linkage
Z = linkage(squareform(distance_matrix), method='average')

# Cut at 0.75 distance threshold (Kappa > 0.25)
threshold = 1 - 0.25
clusters = fcluster(Z, t=threshold, criterion='distance')

term_gene['cluster'] = clusters

In [None]:
term_gene.to_excel("Clustered_Pathways_kappa0.25.xlsx", index=False)