In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

import pandas as pd
import scanpy as sc

sys.path.append("..")
from utils import read_aws_csv, read_aws_h5ad

## Data Preprocessing
- Ensure that the data is downloaded (see `../data` for instructions)
- Ensure that the data was filtered using `norman_filtering.ipynb`

In [27]:
data_path = "path to filtered norman.h5ad"
adata = read_aws_h5ad(data_path)

In [30]:
# get perturbation genes
pert_list = set()
for t in adata.obs["perturbation_name"]:
    if "+" not in t and t != "control":
        pert_list.add(t)

105

In [31]:
# subset to hvg
sc.pp.highly_variable_genes(adata, n_top_genes=5000, subset=False)

In [33]:
# ensure all perts are in adata
idx = []
for i in range(len(adata.var_names)):
    if (adata.var["highly_variable"][i]) or (adata.var_names[i] in pert_list):
        idx.append(i)
adata_subset = adata[:, idx]
adata_subset

View of AnnData object with n_obs × n_vars = 101578 × 5035
    obs: 'guide_identity', 'read_count', 'UMI_count', 'coverage', 'gemgroup', 'good_coverage', 'number_of_cells', 'guide_AHR', 'guide_ARID1A', 'guide_ARRDC3', 'guide_ATL1', 'guide_BAK1', 'guide_BCL2L11', 'guide_BCORL1', 'guide_BPGM', 'guide_C19orf26', 'guide_C3orf72', 'guide_CBFA2T3', 'guide_CBL', 'guide_CDKN1A', 'guide_CDKN1B', 'guide_CDKN1C', 'guide_CEBPA', 'guide_CEBPB', 'guide_CEBPE', 'guide_CELF2', 'guide_CITED1', 'guide_CKS1B', 'guide_CLDN6', 'guide_CNN1', 'guide_CNNM4', 'guide_COL1A1', 'guide_COL2A1', 'guide_CSRNP1', 'guide_DLX2', 'guide_DUSP9', 'guide_EGR1', 'guide_ELMSAN1', 'guide_ETS2', 'guide_FEV', 'guide_FOSB', 'guide_FOXA1', 'guide_FOXA3', 'guide_FOXF1', 'guide_FOXL2', 'guide_FOXO4', 'guide_GLB1L2', 'guide_HES7', 'guide_HK2', 'guide_HNF4A', 'guide_HOXA13', 'guide_HOXB9', 'guide_HOXC13', 'guide_IER5L', 'guide_IGDCC3', 'guide_IKZF3', 'guide_IRF1', 'guide_ISL2', 'guide_JUN', 'guide_KIAA1804', 'guide_KIF18B', 'guide_KI

## StringDB prior graph

In [35]:
# stringdb prior
stringdb_hq = read_aws_csv("s3://pert-spectra/references/StringDB.HQ.txt", sep="\t")

In [36]:
# ensg mapping
gene_name_df = read_aws_csv(
    "s3://pert-spectra/references/Homo_sapiens.gene_info", sep="\t"
)

In [37]:
gene_name_df

Unnamed: 0,#tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type
0,9606,1,A1BG,-,A1B|ABG|GAB|HYST2477,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410...,19,19q13.43,alpha-1-B glycoprotein,protein-coding,A1BG,alpha-1-B glycoprotein,O,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...,20240617,-
1,9606,2,A2M,-,A2MD|CPAMD5|FWP007|S863-7,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899...,12,12p13.31,alpha-2-macroglobulin,protein-coding,A2M,alpha-2-macroglobulin,O,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...,20240617,-
2,9606,3,A2MP1,-,A2MP,HGNC:HGNC:8|Ensembl:ENSG00000291190|AllianceGe...,12,12p13.31,alpha-2-macroglobulin pseudogene 1,pseudo,A2MP1,alpha-2-macroglobulin pseudogene 1,O,pregnancy-zone protein pseudogene,20240617,-
3,9606,9,NAT1,-,AAC1|MNAT|NAT-1|NATI,MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171...,8,8p22,N-acetyltransferase 1,protein-coding,NAT1,N-acetyltransferase 1,O,arylamine N-acetyltransferase 1|N-acetyltransf...,20240617,-
4,9606,10,NAT2,-,AAC2|NAT-2|PNAT,MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156...,8,8p22,N-acetyltransferase 2,protein-coding,NAT2,N-acetyltransferase 2,O,arylamine N-acetyltransferase 2|N-acetyltransf...,20240617,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193451,741158,8923215,trnD,-,-,-,MT,-,tRNA-Asp,tRNA,-,-,-,-,20200909,-
193452,741158,8923216,trnP,-,-,-,MT,-,tRNA-Pro,tRNA,-,-,-,-,20200909,-
193453,741158,8923217,trnA,-,-,-,MT,-,tRNA-Ala,tRNA,-,-,-,-,20200909,-
193454,741158,8923218,COX1,-,-,-,MT,-,cytochrome c oxidase subunit I,protein-coding,-,-,-,cytochrome c oxidase subunit I,20230818,-


In [38]:
gene_to_ensg = {}
gene_name_mapping = {}
for symbol, synonyms, ref in zip(
    gene_name_df["Symbol"], gene_name_df["Synonyms"], gene_name_df["dbXrefs"]
):
    syn_set = set(synonyms.split("|")).union(set([symbol]))
    refs = ref.split("|")
    ensg = None
    for r in refs:
        label = r.split(":")
        if label[0] == "Ensembl":
            ensg = label[1]
            break
    if ensg is None:
        continue
    assert ensg[:4] == "ENSG"
    gene_to_ensg[symbol] = ensg
    gene_name_mapping[symbol] = syn_set
    # make sure name mapping goes both ways
    for syn in syn_set:
        if syn in gene_name_mapping:
            gene_name_mapping[syn].add(symbol)
        else:
            gene_name_mapping[syn] = set([symbol])

In [49]:
adata_subset.var["gene_symbols"] = adata_subset.var_names.copy()
adata_subset.var_names = adata_subset.var["index"].copy()
norman_ensg_to_gene = {}
for symbol, ensgid in zip(adata_subset.var["gene_symbols"], adata_subset.var_names):
    norman_ensg_to_gene[ensgid] = symbol

In [50]:
dataset_measured_genes = set(adata_subset.var["gene_symbols"])
dataset_measured_ensg = set(adata_subset.var_names)

In [54]:
# first, check if gene name has ensembl id
# if it does, use that to match
# else, use gene name mapping
edges = []
for index, row in stringdb_hq.iterrows():
    gene1 = row["i_genes"]
    gene2 = row["j_genes"]
    if gene1 not in gene_to_ensg:
        # neither gene has ensg
        if gene2 not in gene_to_ensg:
            gene1_syn = set([gene1])
            gene2_syn = set([gene2])
            if gene1 in gene_name_mapping:
                gene1_syn = gene_name_mapping[gene1].union(set([gene1]))
            if gene2 in gene_name_mapping:
                gene2_syn = gene_name_mapping[gene2].union(set([gene2]))

            alias_intersect_1 = gene1_syn.intersection(dataset_measured_genes)
            alias_intersect_2 = gene2_syn.intersection(dataset_measured_genes)

            if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):
                gene1 = (
                    gene1
                    if gene1 in dataset_measured_genes
                    else list(alias_intersect_1)[0]
                )
                gene2 = (
                    gene2
                    if gene2 in dataset_measured_genes
                    else list(alias_intersect_2)[0]
                )
                edges.append([gene1, gene2, row["x"]])

        # gene1 does not have ensemblid, gene2 does have ensemblid
        else:
            ensg2 = gene_to_ensg[gene2]
            gene1_syn = set([gene1])
            gene2_syn = set([ensg2])
            if gene1 in gene_name_mapping:
                gene1_syn = gene_name_mapping[gene1].union(set([gene1]))

            alias_intersect_1 = gene1_syn.intersection(dataset_measured_genes)
            alias_intersect_2 = gene2_syn.intersection(dataset_measured_ensg)

            if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):
                gene1 = (
                    gene1
                    if gene1 in dataset_measured_genes
                    else list(alias_intersect_1)[0]
                )
                gene2 = norman_ensg_to_gene[list(alias_intersect_2)[0]]
                edges.append([gene1, gene2, row["x"]])

    else:
        # gene1 has ensemblid, gene2 does not have ensemblid
        if gene2 not in gene_to_ensg:
            ensg1 = gene_to_ensg[gene1]
            gene1_syn = set([ensg1])
            gene2_syn = set([gene2])
            if gene2 in gene_name_mapping:
                gene2_syn = gene_name_mapping[gene2].union(set([gene2]))

            alias_intersect_1 = gene1_syn.intersection(dataset_measured_ensg)
            alias_intersect_2 = gene2_syn.intersection(dataset_measured_genes)

            if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):
                gene1 = norman_ensg_to_gene[list(alias_intersect_1)[0]]
                gene2 = (
                    gene2
                    if gene2 in dataset_measured_genes
                    else list(alias_intersect_2)[0]
                )
                edges.append([gene1, gene2, row["x"]])

        # both genes have ensmblid
        else:
            ensg1 = gene_to_ensg[gene1]
            ensg2 = gene_to_ensg[gene2]
            gene1_syn = set([ensg1])
            gene2_syn = set([ensg2])

            alias_intersect_1 = gene1_syn.intersection(dataset_measured_ensg)
            alias_intersect_2 = gene2_syn.intersection(dataset_measured_ensg)

            if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):
                gene1 = norman_ensg_to_gene[list(alias_intersect_1)[0]]
                gene2 = norman_ensg_to_gene[list(alias_intersect_2)[0]]
                edges.append([gene1, gene2, row["x"]])

In [74]:
norman_network = pd.DataFrame(edges, columns=["gene1", "gene2", "score"])
norman_network

Unnamed: 0,gene1,gene2,score
0,MISP,USH1C,0.212
1,TUBB1,USH1C,0.184
2,CRYM,USH1C,0.289
3,LIN7B,USH1C,0.152
4,TAX1BP3,USH1C,0.919
...,...,...,...
223491,N4BP2,ZNF518A,0.179
223492,PHIP,ZNF518A,0.160
223493,BRD2,ZNF518A,0.228
223494,GSTP1,ZNF518A,0.165


In [77]:
# add genes measured in Norman with no connections found in StringDB
missing_genes = dataset_measured_genes.difference(set(norman_network["gene1"].unique()))
for g in missing_genes:
    norman_network = pd.concat(
        [pd.DataFrame([[g, g, 0]], columns=norman_network.columns), norman_network],
        ignore_index=True,
    )
norman_network

Unnamed: 0,gene1,gene2,score
0,ZNF720,ZNF720,0.000
1,LINC00467,LINC00467,0.000
2,RP3-473L9.4,RP3-473L9.4,0.000
3,RP11-380O24.1,RP11-380O24.1,0.000
4,LINC01597,LINC01597,0.000
...,...,...,...
225103,N4BP2,ZNF518A,0.179
225104,PHIP,ZNF518A,0.160
225105,BRD2,ZNF518A,0.228
225106,GSTP1,ZNF518A,0.165


In [79]:
# create one-hot-encoding mapping based on adata var structure
adata_subset.var_names = adata_subset.var["gene_symbols"]


def map_gene_to_onehot(name: str):
    return adata_subset.var_names.get_loc(name)


norman_network["gene1"] = norman_network["gene1"].apply(map_gene_to_onehot)
norman_network["gene2"] = norman_network["gene2"].apply(map_gene_to_onehot)
norman_network = norman_network.sort_values(by=["gene1"])

In [82]:
# create weighted adj matrix
import networkx

edgeList = norman_network.values.tolist()
G = networkx.DiGraph()
for i in range(len(edgeList)):
    G.add_edge(edgeList[i][0], edgeList[i][1], weight=edgeList[i][2])
A = networkx.adjacency_matrix(
    G, nodelist=[i for i in range(len(adata_subset.var_names))]
).toarray()

In [84]:
# turn to sparse
from scipy import sparse

sA = sparse.csr_matrix(A)

In [85]:
# write to adata
adata_subset.uns["sparse_gene_network"] = sA
adata_subset.write_h5ad("../norman_adata_spectra.h5ad")