# From knowledge graphs to summarization prompts

In [13]:
import os
import re
import json
import itertools
from collections import defaultdict
from tqdm.auto import tqdm

import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


## Genes of interest

This is the set of genes which we would like to describe.

**Note**: Replace `all_genes` with your own list.

In [8]:
with open("../perturbqa/datasets/k562_gw_mapping_full.json") as f:
    all_genes = [g for g in json.load(f) if g != "non-targeting"]

## Load raw graphs

Files that return tuples are bipartite graphs.

In [9]:
fp_ensembl = "../perturbqa/datasets/kg/ensembl.json"
with open(fp_ensembl) as f:
    ensembl_data = json.load(f)
print(len(ensembl_data))

fp_uniprot = "../perturbqa/datasets/kg/uniprot.json"
with open(fp_uniprot) as f:
    uniprot_data = json.load(f)
print(len(uniprot_data))

fp_bioplex = "../perturbqa/datasets/kg/bioplex.json"
with open(fp_bioplex) as f:
    bioplex_data = json.load(f)
print(len(bioplex_data))

fp_go = "../perturbqa/datasets/kg/go.json"
with open(fp_go) as f:
    genes_to_go, go_to_genes, go_to_go = json.load(f)
print(len(genes_to_go), len(go_to_genes), len(go_to_go))

fp_reactome = "../perturbqa/datasets/kg/reactome.json"
with open(fp_reactome) as f:
    gene_to_reaction, reaction_to_gene = json.load(f)
print(len(gene_to_reaction), len(reaction_to_gene))

fp_corum = "../perturbqa/datasets/kg/corum.json"
with open(fp_corum) as f:
    gene_to_complex, complex_to_gene = json.load(f)
print(len(gene_to_complex), len(complex_to_gene))

fp_string = "../perturbqa/datasets/kg/string.json"
with open(fp_string) as f:
    string_data = json.load(f)
print(len(string_data))

55604
548403
13310
19441 18627 42094
10373 14809
4431 3580
18479


In [10]:
all_graphs = {
	"go": {**genes_to_go, **go_to_genes},
	"reactome": {**gene_to_reaction, **reaction_to_gene},
	"corum": {**gene_to_complex, **complex_to_gene},
	"bioplex": bioplex_data,
	"string": string_data,
}

# remove most common
common = defaultdict(list)
max_degree = 1000
for graph_name, graph in all_graphs.items():
    for k, v in graph.items():
        if len(v) > max_degree:
            common[graph_name].append(k)
    print(graph_name, len(common[graph_name]))

for graph_name in ["go", "string"]:
    for common_term in common[graph_name]:
        del all_graphs[graph_name][common_term]
    all_graphs[graph_name] = {key:[v for v in val if v not in common[graph_name]] for key, val in all_graphs[graph_name].items()}

go 24
reactome 4
corum 0
bioplex 0
string 47


## Convert into descriptions

Ensembl

In [11]:
ensembl_dict = {entry["name"]: entry["description"] for entry in ensembl_data.values()}

def ensembl_to_text(gene):
    if gene not in ensembl_dict:
        return []
    desc = ensembl_dict[gene]
    text = f"Description of gene: {desc}"
    text = text.split("[Source:")[0]  # trim this off
    return [text]

UniProt

In [14]:
uniprot_dict = defaultdict(list)
for entry in tqdm(uniprot_data):
    genes = entry["gene"]
    if type(genes) is str:
        genes = [genes]
    for gene in genes:
        if gene not in all_genes:
            continue
        uniprot_dict[gene].append(entry)
uniprot_dict = dict(uniprot_dict)
print(len(uniprot_dict))

100%|██████████| 548403/548403 [02:05<00:00, 4352.62it/s]

8161





In [15]:
def uniprot_to_text(gene):
    if gene not in uniprot_dict:
        return []
    description = []
    entries = uniprot_dict[gene]

    proteins = _combine_entries(entries, "protein")
    if len(proteins) > 0 :
        description.append(f"Gene products: {', '.join(proteins)}")

    function = _combine_entries(entries, "function")
    if len(function) > 0:
        description.append(f"Functions: {', '.join(function)}")

    subunit = _combine_entries(entries, "subunit")
    if len(subunit) > 0:
        description.append(f"Quaternary structure: {' '.join(subunit)}")

    interaction = _combine_entries(entries, "interaction")
    if len(interaction) > 0:
        description.append(f"Interacts with: {', '.join(interaction)}")

    description = [re.sub(" \(PubMed.*\)", "", d) for d in description]
    return description


def uniprot_to_text_long(gene):
    if gene not in uniprot_dict:
        return []
    description = []
    entries = uniprot_dict[gene]

    proteins = _combine_entries(entries, "protein")
    if len(proteins) > 0 :
        description.extend(f"Gene products: {x.strip()}" for x in proteins)

    function = _combine_entries(entries, "function")
    if len(function) > 0:
        for val in function:
            if ")." in val:
                description.extend(f"Functions: {x.strip()})" for x in val.split(").")[:-1])
            else:
                description.append(f"Functions: {val.strip()}")

    subunit = _combine_entries(entries, "subunit")
    if len(subunit) > 0:
        for val in subunit:
            if ")." in val:
                description.extend(f"Quaternary structure: {x.strip()})" for x in val.split(").")[:-1])
            else:
                description.append(f"Quaternary structure: {val.strip()}")

    interaction = _combine_entries(entries, "interaction")
    if len(interaction) > 0:
        for val in interaction:
            if ")." in val:
                description.extend(f"Interacts with: {x.strip()})" for x in val.split(").")[:-1])
            else:
                description.append(f"Interacts with: {val.strip()}")

    return description

def _combine_entries(entries, key):
    if type(entries) is dict:
        entries = [entries]
    vals = [entry[key] for entry in entries if len(entry[key]) > 0]
    vals = [[x] if type(x) is str else x for x in vals]
    if len(vals) == 0:
        return []
    vals = itertools.chain(*vals)
    vals = sorted(set(vals))
    return vals

BioPlex

In [16]:
def translate_bioplex(g1, rels):
    # only 2 types of cells
    cell_to_g2s = defaultdict(list)
    for rel in rels:
        g2, celltype = rel
        cell_to_g2s[celltype].append(g2)
    desc = []
    for celltype, g2s in cell_to_g2s.items():
        desc.append(f"In {celltype} cells, {g1} may form a complex with {', '.join(g2s)}")
    return desc

def translate_bioplex_long(g1, rels):
    # only 2 types of cells
    cell_to_g2s = defaultdict(list)
    for rel in rels:
        g2, celltype = rel
        cell_to_g2s[celltype].append(g2)
    desc = []
    for celltype, g2s in cell_to_g2s.items():
        for g2 in g2s:
            desc.append(f"In {celltype} cells, {g1} may form a complex with {g2}")
    return desc

Gene ontology

In [17]:
fp_go = "../perturbqa/datasets/kg/go_dict.json"
with open(fp_go) as f:
    go_desc = json.load(f)
print(len(go_desc))

47914


In [18]:
go_to_english = {
    "NOT|acts_upstream_of_or_within": "does not act upstream of or within",
    "NOT|colocalizes_with": "does not colocalize with",
    "NOT|contributes_to": "does not contribute to",
    "NOT|enables": "does not enable",
    "NOT|involved_in": "is not involved in",
    "NOT|is_active_in": "is not active in",
    "NOT|located_in": "is not located in",
    "NOT|part_of": "is not part of",
    "acts_upstream_of": "acts upstream of",
    "acts_upstream_of_negative_effect": "acts upstream of negative effect",
    "acts_upstream_of_or_within": "acts upstream of or within",
    "acts_upstream_of_or_within_negative_effect": "acts upstream of or within negative effect",
    "acts_upstream_of_or_within_positive_effect": "acts upstream of or within positive effect",
    "acts_upstream_of_positive_effect": "acts upstream of positive effect",
    "colocalizes_with": "colocalizes with",
    "contributes_to": "contributes to",
    "enables": "enables",
    "involved_in": "involved in",
    "is_active_in": "is active in",
    "located_in": "is located in",
    "part_of": "is part of",
}

In [19]:
def translate_go_rel(rel):
    if rel in go_to_english:
        rel = go_to_english[rel]
    elif rel in go_desc:
        rel = go_desc[rel]
    return rel

In [20]:
def translate_go(g1, rels):
    desc = []
    for rel in rels:
        go_term, rel = rel
        desc.append(f"{g1} {translate_go_rel(rel)} {translate_go_rel(go_term)}.")
    desc = [d for d in desc if "molecular_function" not in d and "biological_process" not in d and "protein_binding" not in d]
    return desc

Reactome

In [21]:
def translate_reactome(g1, rels):
    # only 2 types of locs
    loc_to_g2s = defaultdict(list)
    for rel in rels:
        g2, loc1 = rel
        loc_to_g2s[loc1].append(g2)
    desc = []
    for loc1, g2s in loc_to_g2s.items():
        desc.append(f"In the {loc1}, {g1} enables {', '.join(g2s)}")
    return desc

def translate_reactome_long(g1, rels):
    # only 2 types of locs
    loc_to_g2s = defaultdict(list)
    for rel in rels:
        g2, loc1 = rel
        loc_to_g2s[loc1].append(g2)
    desc = []
    for loc1, g2s in loc_to_g2s.items():
        for g2 in g2s:
            desc.append(f"In the {loc1}, {g1} enables {g2}")
    return desc

CORUM

**Note**: CORUM is licensed under CC-BY-NC.

In [22]:
complex_to_desc = {}
df_corum = pd.read_csv("../perturbqa/datasets/kg/corum_human_5.1.txt", sep="\t")
df_corum = df_corum.replace({float("nan"): None}).to_dict(orient="records")

for item in df_corum:
    complex_name = item["complex_name"]
    cell_line = item["complex_name"]
    complex_comment = item["comment_complex"]
    subunit_comment = item["comment_members"]
    disease_comment = item["comment_disease"]
    desc = f"{complex_name}"
    if cell_line is not None:
        desc = desc + f" in {cell_line}."
    if complex_comment is not None:
        desc = desc + f" {complex_comment}"
    if subunit_comment is not None:
        desc = desc + f" {subunit_comment}"
    if disease_comment is not None:
        desc = desc + f" {disease_comment}"
    complex_to_desc[complex_name] = desc

In [23]:
def translate_corum(g1, rels):
    desc = []
    for rel in rels:
        complex1, celltype1 = rel
        desc.append(f"{g1} is a member of {complex1} in {celltype1}.")
    return desc

STRING

In [24]:
string_evidence = {
    "database": "database evidence in humans",
    "database_transferred": "database evidence in other animals",
    "experiments": "experimental evidence in humans",
    "experiments_transferred": "experimental evidence in other animals",
    "textmining": "literature evidence in humans",
    "textmining_transferred": "literature evidence in other animals",
}

In [25]:
def translate_string(g1, rel):
    desc = []
    # types of evidence
    rel_to_g2s = defaultdict(list)
    for rel in rels:
        g2, rel1 = rel
        rel1 = ", ".join([string_evidence[e] for e in rel1])
        rel_to_g2s[rel1].append(g2)
    desc = []
    for rel1, g2s in rel_to_g2s.items():
        desc.append(f"Based on evidence from {rel1}, {g1} may physically interact with {', '.join(g2s)}.")
    return desc

def translate_string_long(g1, rel):
    desc = []
    # types of evidence
    rel_to_g2s = defaultdict(list)
    for rel in rels:
        g2, rel1 = rel
        rel1 = ", ".join([string_evidence[e] for e in rel1])
        rel_to_g2s[rel1].append(g2)
    desc = []
    for rel1, g2s in rel_to_g2s.items():
        for g2 in g2s:
            desc.append(f"Based on evidence from {rel1}, {g1} may physically interact with {g2}.")
    return desc

## Serialize everything

In [26]:
translate = {
    "go": translate_go,
    "corum": translate_corum,
    "reactome": translate_reactome,
    "bioplex": translate_bioplex,
    "string": translate_string,

    "ensembl": ensembl_to_text,
    "uniprot": uniprot_to_text,
    
    
    #### option to generate longer descriptions if desired

    # "reactome": translate_reactome_long,
    # "bioplex": translate_bioplex_long,
    # "string": translate_string_long
    
    # "uniprot": uniprot_to_text_long,
}

In [27]:
desc = defaultdict(set)
max_items = 50
for gene in all_genes:
    # decreasing order of desired-ness
    for db_name in ["ensembl", "uniprot"]:
        desc[gene].update(translate[db_name](gene))
    
    for graph_name in ["reactome", "corum", "go", "bioplex", "string"]:
        graph = all_graphs[graph_name]
        if gene not in graph:
            continue
        rels = graph[gene]
        # tuple mashing
        if graph_name == "string":
            rels = [(r[0], tuple(r[1])) for r in rels]
        if type(rels[0]) is list:
            rels = [tuple(r) for r in rels]

        # if too many, skip lowest fidelity
        if len(rels) > 50 and graph_name == "string":
            rels = [r for r in rels if len(r[1]) > 1]
        if len(rels) > 50 and graph_name == "bioplex":
            continue
        rels = translate[graph_name](gene, rels)
        desc[gene].update(rels)
        if len(desc[gene]) > max_items:
            break

print(len(all_genes), len(desc))

8454 8454


## Example prompt creation

You may copy the appropriate template from `examples/summer/prompts` 

In [28]:
prompt_perturbation = f"""You are an expert molecular biologist who studies how genes are related using Perturb-seq.

Task: You are writing a brief overview of the human gene {{gene}}, with a focus on its molecular and cellular functions. You will be provided a set of database entries about the gene. Ensure that your overview remains faithful to this domain knowledge.

Format:
- Write one to two sentences describing the primary molecular and cellular function of gene {{gene}}.
- Write one sentence describing the potential downstream impact of perturbing gene {{gene}} via gene knockdown.

Constraints:
- Maintain a professional tone throughout.
- Do not comment on your own writing.
- Do not add any notes or references. Do not make up additional information.
- Do not discuss the importance or impact of the gene. Focus only on its function.

Domain knowledge:
{{entries}}

Downstream effects of perturbing {{gene}} via gene knockdown:
"""

In [29]:
gene = "ABCE1"
entries = "\n".join(["- " + d for d in desc[gene]])

print(prompt_perturbation.format(gene=gene, entries=entries))

You are an expert molecular biologist who studies how genes are related using Perturb-seq.

Task: You are writing a brief overview of the human gene ABCE1, with a focus on its molecular and cellular functions. You will be provided a set of database entries about the gene. Ensure that your overview remains faithful to this domain knowledge.

Format:
- Write one to two sentences describing the primary molecular and cellular function of gene ABCE1.
- Write one sentence describing the potential downstream impact of perturbing gene ABCE1 via gene knockdown.

Constraints:
- Maintain a professional tone throughout.
- Do not comment on your own writing.
- Do not add any notes or references. Do not make up additional information.
- Do not discuss the importance or impact of the gene. Focus only on its function.

Domain knowledge:
- ABCE1 is active in cytosolic ribosome.
- ABCE1 involved in translational initiation.
- Functions: (Microbial infection) May act as a chaperone for post-translational