# Notebook 12
### Computing the protein interaction likelihood ratio based on Gene Ontology overlap

In [None]:
import pandas as pd
import numpy as np
import random, pprint

In [None]:
# Read the tab-separated protein-information file into a dataframe.
# The TSV file must already have been saved at the path below.
proteome_tsv_path = "/content/uniprotkb_proteome_UP000005640_2024_01_29.tsv"
prot_data = pd.read_csv(proteome_tsv_path, sep="\t")

In [None]:
# Drop the columns that are not needed for this analysis.
inessential_columns = [
    'Reviewed',
    'Entry Name',
    'Protein names',
    'Organism',
    'Length',
    'Gene Ontology (cellular component)',
    'Gene Ontology (molecular function)',
    'Entry',
]
# errors='ignore' keeps this cell re-runnable: a second execution finds the
# columns already gone and does nothing, where repeated `del` statements
# would raise KeyError.
prot_data = prot_data.drop(columns=inessential_columns, errors='ignore')


In [None]:
# preview the first six rows of prot_data
prot_data.head(6)

Unnamed: 0,Gene Names,Gene Ontology (biological process)
0,DMD,bone development [GO:0060348]; cardiac muscle ...
1,DGKI,excitatory postsynaptic potential [GO:0060079]...
2,BOLA2-SMG1P6,
3,CYP2D7,arachidonic acid metabolic process [GO:0019369...
4,PTGS1,prostaglandin biosynthetic process [GO:0001516...
5,HNF1A,apoptotic nuclear changes [GO:0030262]; bile a...


In [None]:
# Make a dictionary relating each gene name to a list of Gene Ontology
# biological process (GO BP) annotation terms for that gene.
#
# Each row may list several space-separated gene names (synonyms); every one
# of them receives the row's GO BP terms. A gene name can appear in more than
# one row, so terms are merged (order-preserving, without duplicates) rather
# than letting a later row overwrite -- and possibly erase -- the
# annotations collected from an earlier row.
gene_names_column = prot_data['Gene Names']
go_bp_column = prot_data['Gene Ontology (biological process)']

gene_names_dict = {}
# Columns are accessed by label: integer keys on a label-indexed row
# (prot[0], prot[1]) are deprecated in pandas.
for gene_names, go_bp in zip(gene_names_column, go_bp_column):
    if pd.isna(gene_names):
        # no gene name to attach the annotations to
        continue
    if pd.isna(go_bp):
        go_bp_list = []
    else:
        # terms are ';'-separated; strip padding and skip empty fragments
        go_bp_list = [bp.strip() for bp in go_bp.split(';') if bp.strip()]
    # split() without an argument never yields empty names on repeated spaces
    for gene_name in gene_names.split():
        # each gene gets its own list, so synonyms never share one list object
        merged_terms = gene_names_dict.setdefault(gene_name, [])
        for term in go_bp_list:
            if term not in merged_terms:
                merged_terms.append(term)

# The string-type biological process column is no longer needed, since its
# content now lives in gene_names_dict.
# NOTE: this deletion means the cell can only be run once per load of prot_data.
del prot_data['Gene Ontology (biological process)']


In [None]:
# Make a dataframe containing just two columns - "gene" and "bp" -
# where "gene" contains the gene symbol and "bp" contains the list of GO
# biological process term annotations for that gene symbol.
# dict.keys() and dict.values() iterate in the same order, so row i pairs each
# gene with its own term list. Building the columns directly also works for an
# empty dictionary, where indexing a zip(*...) transposition raises IndexError.
go_df = pd.DataFrame({
    'gene': tuple(gene_names_dict.keys()),
    'bp': tuple(gene_names_dict.values()),
})


In [None]:
# Make a dictionary relating gene symbol to the list of GO biological process
# term annotations for that gene. Rows sharing a gene symbol have their term
# lists concatenated; genes are inserted in sorted order.
# This is done in plain Python rather than groupby(...).apply(sum): pandas
# currently swaps the builtin `sum` for its own groupby sum, but that
# substitution is deprecated, and the builtin itself cannot add lists
# (it starts from the integer 0).
gene_to_go = {}
gene_term_rows = sorted(zip(go_df['gene'], go_df['bp']), key=lambda row: row[0])
for gene_symbol, bp_terms in gene_term_rows:
    gene_to_go.setdefault(gene_symbol, []).extend(bp_terms)

# Make a dictionary relating each GO biological process term to the genes that
# are annotated with that term.
go_to_gene = {}
for gene_name, bp_list in gene_to_go.items():
    # dict.fromkeys de-duplicates while keeping order, so a term repeated in a
    # gene's list does not count that gene twice towards the term's size
    for go_term in dict.fromkeys(bp_list):
        go_to_gene.setdefault(go_term, []).append(gene_name)


In [None]:
# load the protein-protein interaction data from the SIF file

!curl https://csx46.s3-us-west-2.amazonaws.com/PathwayCommons9.All.hgnc.sif.gz --output PathwayCommons9.All.hgnc.sif.gz
!gunzip -f PathwayCommons9.All.hgnc.sif.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 5930k  100 5930k    0     0  9505k      0 --:--:-- --:--:-- --:--:-- 9503k


In [None]:
# Read the decompressed SIF file: tab-separated, with the three column names supplied here
sif_column_names = ["species1", "interaction_type", "species2"]
sif_data = pd.read_csv("PathwayCommons9.All.hgnc.sif", sep="\t", names=sif_column_names)

In [None]:
# Process the protein-protein interaction data to eliminate duplicates.
# Keep only the two interaction types treated as protein-protein interactions.
interaction_types_ppi = {"interacts-with", "in-complex-with"}
interac_ppi = sif_data[sif_data.interaction_type.isin(interaction_types_ppi)].copy()

# Put every pair into canonical order (species1 <= species2) so that A-B and
# B-A become the same row. .values strips the column labels so the swapped
# columns are assigned by position rather than re-aligned by name.
boolean_vec = interac_ppi['species1'] > interac_ppi['species2']
interac_ppi.loc[boolean_vec, ['species1', 'species2']] = interac_ppi.loc[boolean_vec, ['species2', 'species1']].values

# With all pairs in canonical order, dropping duplicate rows leaves each
# unordered pair exactly once.
interac_ppi_unique = interac_ppi[["species1", "species2"]].drop_duplicates()


In [None]:
# For every unique pair of interacting proteins (mapped to gene names), record
# the size (number of annotated genes) of the smallest GO biological process
# term shared by the two genes, or count the pair as having no shared term.
size_smallest_shared_bp_int = []
no_shared_bp_int = 0
int_set = set()  # "g1-g2" keys of the interacting pairs, used later to recognise interacting pairs
pair_columns = zip(interac_ppi_unique['species1'], interac_ppi_unique['species2'])
for g1, g2 in pair_columns:
    int_set.add(g1 + '-' + g2)
    # genes without an entry in gene_to_go are treated as having no annotations
    shared_terms = set(gene_to_go.get(g1, [])) & set(gene_to_go.get(g2, []))
    if not shared_terms:
        no_shared_bp_int += 1
        continue
    smallest_size = min(len(go_to_gene[term]) for term in shared_terms)
    size_smallest_shared_bp_int.append(smallest_size)

In [None]:
# Calculate, for ten million random pairs of non-interacting proteins (mapped
# to gene names), the size of the smallest shared GO biological process
# annotation for the genes.
size_smallest_shared_bp_no_int = []
no_shared_bp_no_int = 0
all_genes = list(gene_to_go.keys())
# Build each gene's term set once instead of twice per sampled pair.
go_term_sets = {gene: set(terms) for gene, terms in gene_to_go.items()}
# A dedicated, seeded generator makes the sampling reproducible across runs.
rng = random.Random(12345)
Nnoint = 10000000
for _ in range(Nnoint):
    g1 = rng.choice(all_genes)
    g2 = g1
    # Redraw g2 until the pair is two distinct genes that do not interact.
    # int_set keys are stored as "smaller-larger" (the interaction pairs were
    # put in sorted order before the keys were built), so the lookup key must
    # be ordered the same way; testing only g1 + '-' + g2 would miss every
    # interacting pair drawn in the opposite order.
    # NOTE(review): gene symbols can themselves contain '-', so these string
    # keys are not guaranteed unique per pair -- confirm whether this matters.
    while g2 == g1 or (min(g1, g2) + '-' + max(g1, g2)) in int_set:
        g2 = rng.choice(all_genes)
    go12_terms = go_term_sets[g1] & go_term_sets[g2]
    if go12_terms:
        size_min_term = min(len(go_to_gene[t]) for t in go12_terms)
        size_smallest_shared_bp_no_int.append(size_min_term)
    else:
        no_shared_bp_no_int += 1

In [None]:
# Calculate the likelihood ratios using the same binning, based on GO
# biological process gene-set size, that Reading12 used.
# NumPy's histogram function does the per-bin counting.
breaks = [0, 10, 50, 100, 500, 1000, 5000]
Nint = len(interac_ppi_unique)

# likelihood ratio for pairs that share no GO biological process term
frac_no_shared_int = no_shared_bp_int / Nint
frac_no_shared_no_int = no_shared_bp_no_int / Nnoint
l_no_shared = frac_no_shared_int / frac_no_shared_no_int

# per-bin ratio: (fraction of interacting pairs) / (fraction of non-interacting pairs)
hist_int = np.histogram(size_smallest_shared_bp_int, bins=breaks)
hist_no_int = np.histogram(size_smallest_shared_bp_no_int, bins=breaks)
l_ratios = (hist_int[0]/Nint)/(hist_no_int[0]/Nnoint)

# label each bin "<lower>-<upper>" and pair it with its ratio
bin_labels = [f"{lower}-{upper}" for lower, upper in zip(breaks[:-1], breaks[1:])]
l_ratios_res = [('no relation', l_no_shared)] + list(zip(bin_labels, l_ratios))


In [None]:
# pretty-print the list of (bin label, likelihood ratio) tuples
pprint.pprint(l_ratios_res)

[('no relation', 0.9821131025446865),
 ('0-10', 7.009383461640098),
 ('10-50', 4.4480223036432704),
 ('50-100', 2.7503962862833395),
 ('100-500', 3.6085925344746212),
 ('500-1000', 1.9461299631233013),
 ('1000-5000', 0.6328541523315716)]


The output likelihood ratio tends to decrease as the size of the smallest Gene Ontology Biological Process term shared by the protein pair increases. The trend is not strictly monotonic in the output above: the 100-500 bin has a higher ratio than the 50-100 bin.