In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

import pandas as pd
import scanpy as sc

sys.path.append("..")
from utils import inhouse_preprocess, read_aws_csv, read_aws_h5ad

## Data Preprocessing
- Ensure that the data is downloaded or fetch directly from s3 (see `../data` for instructions)

In [3]:
data_path = "s3://pert-spectra/data/inhouse.h5ad"
adata = read_aws_h5ad(data_path)

In [None]:
adata = inhouse_preprocess(adata)

In [5]:
# subset to hvg
sc.pp.highly_variable_genes(adata, n_top_genes=5000, subset=False)

In [7]:
# ensure all perts are in adata
idx = []
pert_list = adata.obs["condition"].unique()
for i in range(len(adata.var_names)):
    if (adata.var["highly_variable"][i]) or (adata.var_names[i] in pert_list):
        idx.append(i)
adata = adata[:, idx]

## Generating StringDB prior graph

In [8]:
# stringdb prior
stringdb_hq = read_aws_csv("s3://pert-spectra/references/StringDB.HQ.txt", sep="\t")

In [9]:
# ensg mapping
gene_name_df = read_aws_csv(
    "s3://pert-spectra/references/Homo_sapiens.gene_info", sep="\t"
)
gene_to_ensg = {}
gene_name_mapping = {}
for symbol, synonyms, ref in zip(
    gene_name_df["Symbol"], gene_name_df["Synonyms"], gene_name_df["dbXrefs"]
):
    syn_set = set(synonyms.split("|")).union(set([symbol]))
    refs = ref.split("|")
    ensg = None
    for r in refs:
        label = r.split(":")
        if label[0] == "Ensembl":
            ensg = label[1]
            break
    if ensg is None:
        continue
    assert ensg[:4] == "ENSG"
    gene_to_ensg[symbol] = ensg
    gene_name_mapping[symbol] = syn_set
    # make sure name mapping goes both ways
    for syn in syn_set:
        if syn in gene_name_mapping:
            gene_name_mapping[syn].add(symbol)
        else:
            gene_name_mapping[syn] = set([symbol])

In [10]:
adata.var["gene_symbols"] = adata.var_names
adata.var_names = [i.split(".")[0] for i in adata.var["gene_id"]]
inhouse_ensg_to_gene = {}
for symbol, ensgid in zip(adata.var["gene_symbols"], adata.var_names):
    inhouse_ensg_to_gene[ensgid] = symbol

  adata.var['gene_symbols'] = adata.var_names


In [11]:
dataset_measured_genes = set(adata.var["gene_symbols"])
dataset_measured_ensg = set(adata.var_names)

In [12]:
# first, check if gene name has ensembl id
# if it does, use that to match
# else, use gene name mapping
edges = []
for index, row in stringdb_hq.iterrows():
    gene1 = row["i_genes"]
    gene2 = row["j_genes"]
    if gene1 not in gene_to_ensg:
        # neither gene has ensg
        if gene2 not in gene_to_ensg:
            gene1_syn = set([gene1])
            gene2_syn = set([gene2])
            if gene1 in gene_name_mapping:
                gene1_syn = gene_name_mapping[gene1].union(set([gene1]))
            if gene2 in gene_name_mapping:
                gene2_syn = gene_name_mapping[gene2].union(set([gene2]))

            alias_intersect_1 = gene1_syn.intersection(dataset_measured_genes)
            alias_intersect_2 = gene2_syn.intersection(dataset_measured_genes)

            if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):
                gene1 = (
                    gene1
                    if gene1 in dataset_measured_genes
                    else list(alias_intersect_1)[0]
                )
                gene2 = (
                    gene2
                    if gene2 in dataset_measured_genes
                    else list(alias_intersect_2)[0]
                )
                edges.append([gene1, gene2, row["x"]])

        # gene1 does not have ensemblid, gene2 does have ensemblid
        else:
            ensg2 = gene_to_ensg[gene2]
            gene1_syn = set([gene1])
            gene2_syn = set([ensg2])
            if gene1 in gene_name_mapping:
                gene1_syn = gene_name_mapping[gene1].union(set([gene1]))

            alias_intersect_1 = gene1_syn.intersection(dataset_measured_genes)
            alias_intersect_2 = gene2_syn.intersection(dataset_measured_ensg)

            if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):
                gene1 = (
                    gene1
                    if gene1 in dataset_measured_genes
                    else list(alias_intersect_1)[0]
                )
                gene2 = inhouse_ensg_to_gene[list(alias_intersect_2)[0]]
                edges.append([gene1, gene2, row["x"]])

    else:
        # gene1 has ensemblid, gene2 does not have ensemblid
        if gene2 not in gene_to_ensg:
            ensg1 = gene_to_ensg[gene1]
            gene1_syn = set([ensg1])
            gene2_syn = set([gene2])
            if gene2 in gene_name_mapping:
                gene2_syn = gene_name_mapping[gene2].union(set([gene2]))

            alias_intersect_1 = gene1_syn.intersection(dataset_measured_ensg)
            alias_intersect_2 = gene2_syn.intersection(dataset_measured_genes)

            if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):
                gene1 = inhouse_ensg_to_gene[list(alias_intersect_1)[0]]
                gene2 = (
                    gene2
                    if gene2 in dataset_measured_genes
                    else list(alias_intersect_2)[0]
                )
                edges.append([gene1, gene2, row["x"]])

        # both genes have ensmblid
        else:
            ensg1 = gene_to_ensg[gene1]
            ensg2 = gene_to_ensg[gene2]
            gene1_syn = set([ensg1])
            gene2_syn = set([ensg2])

            alias_intersect_1 = gene1_syn.intersection(dataset_measured_ensg)
            alias_intersect_2 = gene2_syn.intersection(dataset_measured_ensg)

            if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):
                gene1 = inhouse_ensg_to_gene[list(alias_intersect_1)[0]]
                gene2 = inhouse_ensg_to_gene[list(alias_intersect_2)[0]]
                edges.append([gene1, gene2, row["x"]])

In [13]:
inhouse_network = pd.DataFrame(edges, columns=["gene1", "gene2", "score"])
inhouse_network

Unnamed: 0,gene1,gene2,score
0,GPC1,HS3ST1,0.949
1,SLCO2B1,HS3ST1,0.172
2,SDC4,HS3ST1,0.930
3,HSPG2,HS3ST1,0.941
4,GPC6,HS3ST1,0.949
...,...,...,...
210287,KRT8,MUC5AC,0.575
210288,GAN,MUC5AC,0.270
210289,MYC,MUC5AC,0.266
210290,GCNT7,MUC5AC,0.905


In [14]:
# add genes measured in inhouse with no connections found in StringDB
missing_genes = dataset_measured_genes.difference(
    set(inhouse_network["gene1"].unique())
)
for g in missing_genes:
    inhouse_network = pd.concat(
        [pd.DataFrame([[g, g, 0]], columns=inhouse_network.columns), inhouse_network],
        ignore_index=True,
    )
inhouse_network

Unnamed: 0,gene1,gene2,score
0,ENSG00000228541,ENSG00000228541,0.000
1,STARD13-AS,STARD13-AS,0.000
2,CLDND2,CLDND2,0.000
3,ENSG00000230333,ENSG00000230333,0.000
4,ENSG00000246090,ENSG00000246090,0.000
...,...,...,...
211669,KRT8,MUC5AC,0.575
211670,GAN,MUC5AC,0.270
211671,MYC,MUC5AC,0.266
211672,GCNT7,MUC5AC,0.905


In [15]:
# create one-hot-encoding mapping based on adata var structure
adata.var_names = adata.var["gene_symbols"]


def map_gene_to_onehot(name: str):
    return adata.var_names.get_loc(name)


inhouse_network["gene1"] = inhouse_network["gene1"].apply(map_gene_to_onehot)
inhouse_network["gene2"] = inhouse_network["gene2"].apply(map_gene_to_onehot)
inhouse_network = inhouse_network.sort_values(by=["gene1"])

In [16]:
# create weighted adj matrix
import networkx

edgeList = inhouse_network.values.tolist()
G = networkx.DiGraph()
for i in range(len(edgeList)):
    G.add_edge(edgeList[i][0], edgeList[i][1], weight=edgeList[i][2])
A = networkx.adjacency_matrix(
    G, nodelist=[i for i in range(len(adata.var_names))]
).toarray()

In [17]:
# turn to sparse
from scipy import sparse

sA = sparse.csr_matrix(A)

In [None]:
# write to adata
adata.uns["sparse_gene_network"] = sA
adata.write_h5ad("../inhouse_adata_spectra.h5ad")