In [1]:
import warnings
warnings.filterwarnings('ignore')

import re
import os
import sys
import json
import tqdm
import pickle
import urllib
import requests
import textwrap
import functools
import traceback
import itertools
import matplotlib
import numpy as np
import pandas as pd
from Bio import SeqIO
import seaborn as sns
import networkx as nx
import plotly.express as px
from functools import reduce
import matplotlib.pyplot as plt
from multiprocessing import Pool
from sklearn.metrics import auc
from sklearn.metrics import roc_curve

from pygosemsim import graph
from pygosemsim import download
from pygosemsim import term_set
from pygosemsim import similarity

from utilities.funcs import *

# Variables

In [2]:
# Analysis-wide settings.
goslim_subset = 'goslim_generic'
level = 'at2759'
odb_levels = 'https://v101.orthodb.org/download/odb10v1_levels.tab.gz'

# Download the OrthoDB levels table (headerless, gzip-compressed) and build a
# lookup keyed by "at<first column>" whose values come from the second column.
levels_table = pd.read_table(odb_levels, compression='gzip', header=None)
levels = {
    f'at{taxid}': name
    for taxid, name in zip(levels_table[0], levels_table[1])
}

# Entry of the lookup for the level analysed in this notebook
# (None when `level` is not listed in the table).
levelname = levels.get(level)

## File paths

In [3]:
# ---- Input files ------------------------------------------------------------
# Both are read by the "Processing files" cell (transitionstoogs / clustersfromraw).
transitions_file = 'Eukaryota.transitions.mini'
clusters_raw_file = 'Eukaryota.clusters.mini'

# ---- Remote OrthoDB tables --------------------------------------------------
odb_og2genes = 'https://v101.orthodb.org/download/odb10v1_OG2genes.tab.gz'
odb_genexrefs = 'https://v101.orthodb.org/download/odb10v1_gene_xrefs.tab.gz'

# ---- Output files (all derived from the input names) -----------------------
# <transitions_file>.ogs
transitions_ogs_file = f'{transitions_file}.ogs'
# <clusters_raw_file>.grouped
clusters_file = f'{clusters_raw_file}.grouped'
# <text before the first '.' of transitions_file>.ogs
allogs_file = f"{transitions_file.split('.')[0]}.ogs"

# # goslim infiles
# goobo = 'external/go.obo'

# # mapping files containing: orthogroups --> GO from Uniprot
# oguni = f'external/og_uni_{levelname}.tsv'
# unigo = 'Eukaryota.raxml.RL.csv.num.transitions.annotated.unigo'
# ogunigo = f'{transitions_file}.unigo'

# pygosemsim_file = f'{ogunigo_file}.pygosemsim'
# ogunigo_slim_file = f'{ogunigo_file}.slim'

# # orthoDB data output file
# ogunilevel = f'external/og_uni_go_{levelname}.tsv'
# # subset files for GOs slimming 
# goslim_outfile = 'external/annotations.generic'
# odbfile = 'external/odbinfos.pickle'

# Processing files

In [4]:
# Transition pairs -> single, most significant OGs.
# The table is built once and reused for both outputs below (the original
# called transitionstoogs() twice on the same input file).
transitionsogs = transitionstoogs(transitions_file)
transitionsogs.to_csv(transitions_ogs_file, sep='\t', index=None)

# Raw clusters file -> dataframe, written to `clusters_file`.
clustersfromraw(clusters_raw_file, transitions_file).to_csv(clusters_file, sep='\t', index=None)

# List of all OGs with their names.
# rename() returns a new frame, so the column labels are not overwritten on a
# slice of `transitionsogs`.
allogsdf = transitionsogs[['og','ogname']].rename(columns={'ogname': 'name'})

allogsdf.to_csv(allogs_file, sep='\t', index=None)

# GO

## GO Mapping

In [5]:
oguni = f'external/og_uni_{levelname}.tsv'
ogunigo = f'external/og_uni_go_{levelname}.tsv'

!wget external/http://release.geneontology.org/2020-01-01/annotations/goa_uniprot_all.gaf.gz
goa = 'external/goa_uniprot_all.gaf.gz'

xref = pd.read_table(odb_og2genes, compression='gzip', header=None)
og2genes = pd.read_table(ogb_genexrefs, compression='gzip', header=None)
og2genes = og2genes[og2genes[2]=='UniProt']
merged = pd.merge(og2genes, xref.rename(columns={0:3,1:0}))
merged2 = merged[merged[3].str.contains(level)]
merged2.columns = ['gene','id','db','og']
merged2.to_csv(oguni, sep='\t', index=None)

!python3 utilities/oguni2chunk goa oguni ogunigo

oggo = pd.read_table(ogunigo)
ex = ['IDA', 'IMP', 'IPI', 'IEP', 'IGI']

oggo = oggo[oggo['og'].isin(set(oggo[oggo['evidence'].isin(ex)]['og']))]

oggo[~oggo['go'].isin(['GO:0008150', 'GO:0005575', 'GO:0003674'])
    ].to_csv(ogunigo.replace('.tsv', '.experimental.tsv'), 
             sep ='\t', index=False)

transitions = pd.read_table(transitions_file, header=None)
oggo = pd.read_table(ogunigo.replace('.tsv', '.experimental.filt.tsv')
oggo = oggo[oggo['og'].isin(set(transitions[0].tolist()+transitions[1].tolist()))]
oggo.to_csv(f'{transitions_file}.unigo.experimental', sep='\t', index=None)

In [10]:
###### for the brief version (we'll attach this file in the folder 'external')
# Same input files as configured in the "File paths" cell.
trans = pd.read_table(transitions_file)
clu = pd.read_table(clusters_raw_file)
# Every orthogroup that takes part in a transition.
ogs = set(trans['Orthogroup1'].tolist()+trans['Orthogroup2'].tolist())

# `oguni_file` was never defined; it is the og/UniProt table path built from
# `levelname` (same pattern as in the GO mapping cell).
oguni_file = f'external/og_uni_{levelname}.tsv'
# NOTE: as in the original, `oguni` is rebound here from the path to the loaded table.
oguni = pd.read_table(oguni_file)

# Write the subset to "<oguni_file without .tsv>_mini.tsv". The original called
# .replace() on the DataFrame and passed sep/index to it (misplaced parenthesis).
oguni[oguni['og'].isin(ogs)
     ].to_csv(oguni_file.replace('.tsv', '_mini.tsv'), sep='\t', index=None)



In [11]:
# Subset the filtered og/GO table to the transition orthogroups (`ogs`) and
# write it next to the source file as "<name>_mini.tsv".
ogunigo_filt_file = '../../og_uni_go_Eukaryota_experimental_filt.tsv'
ogunigo = pd.read_table(ogunigo_filt_file)
# The closing parenthesis of .replace() was missing, so sep/index were being
# passed to str.replace instead of to_csv.
ogunigo[ogunigo['og'].isin(ogs)].to_csv(ogunigo_filt_file.replace('.tsv', '_mini.tsv'), sep='\t', index=None)
                                        
                                        
                                        
                                        

Unnamed: 0,og,go,type,evidence
0,1002357at2759,GO:0004177,F,IEA
1,1002357at2759,GO:0006464,P,IEA
2,1002357at2759,GO:0006508,P,IEA
3,1002357at2759,GO:0008233,F,IEA
4,1002357at2759,GO:0016787,F,IEA
...,...,...,...,...
1014341,204282at2759,GO:0003690,F,IEA
1014342,204282at2759,GO:0006355,P,IEA
1014419,754103at2759,GO:0004555,F,IEA
1014420,754103at2759,GO:0005991,P,IEA


## PyGOSemSim

In [None]:
# Fetch the ontology files and build the GO graphs used by calcsim().
try:
    download.obo("go-basic")
    download.download("goslim_chembl.obo",
        "http://www.geneontology.org/ontology/subsets/goslim_chembl.obo")
    G = graph.from_resource("go-basic")
    G_chembl = graph.from_resource("goslim_chembl")
    similarity.precalc_lower_bounds(G)
except ValueError:
    # Still non-fatal, as before, but no longer silent: if `G` was not built,
    # calcsim() catches the resulting error and scores every pair as 0.
    # Written to stderr because warnings are globally ignored in this notebook.
    print("GO resources could not be fully prepared; `G`/`G_chembl` may be "
          "missing and similarity scores may silently be 0.", file=sys.stderr)
    traceback.print_exc()

def calcsim(indexgo1go2):
    """Score the similarity between two collections of GO terms.

    Parameters
    ----------
    indexgo1go2 : sequence
        ``[index, go1, go2]``: an identifier that is returned unchanged,
        followed by the two GO-term collections passed to
        ``term_set.sim_bma``.

    Returns
    -------
    list
        ``[index, score]``. ``score`` is the value returned by
        ``term_set.sim_bma`` using ``similarity.resnik`` on the module-level
        graph ``G``, or ``0`` when the computation raises an exception.
    """
    try:
        sf = functools.partial(term_set.sim_func, G, similarity.resnik)
        si = term_set.sim_bma(indexgo1go2[1], indexgo1go2[2], sf)
    except Exception:
        # Best-effort scoring: a failing pair counts as 0. Only Exception is
        # caught so that KeyboardInterrupt / SystemExit still propagate (a bare
        # `except:` made the computation impossible to interrupt).
        si = 0
    return [indexgo1go2[0], si]

# GO annotations of the transition orthogroups (columns used: og, go, type).
# NOTE(review): the GO mapping cell writes "<transitions_file>.unigo.experimental"
# (without ".filt"); the file read here is presumably produced by a separate
# filtering step -- confirm.
oggo_df = pd.read_table(f"{transitions_file}.unigo.experimental.filt")


def _go_terms_by_og(annotations):
    """Return {og: list of distinct GO ids} for the given annotation rows."""
    return dict(annotations.groupby('og')['go'].apply(lambda x: list(set(x))).reset_index().values)


# One lookup for all annotations and one per value of the 'type' column (P / C / F).
oggo = _go_terms_by_og(oggo_df)
oggo_bp = _go_terms_by_og(oggo_df[oggo_df['type']=='P'])
oggo_cc = _go_terms_by_og(oggo_df[oggo_df['type']=='C'])
oggo_mf = _go_terms_by_og(oggo_df[oggo_df['type']=='F'])

# Columns holding the two orthogroups of each transition. `transitions` is read
# with header=None in the GO mapping cell, so 'og1'/'og2' labels may not exist;
# in that case fall back to the first two columns (the ones that cell also uses).
og_cols = (['og1', 'og2'] if {'og1', 'og2'}.issubset(transitions.columns)
           else list(transitions.columns[:2]))

# Never start more workers than the original 500, nor more than there are CPUs.
n_workers = min(500, os.cpu_count() or 1)

dfs = []
for OGGO, typ in zip([oggo_bp, oggo_cc, oggo_mf],
                     ['BP', 'CC', 'MF']):
    tag = typ.lower()

    # Replace each orthogroup by its GO-term list; pairs where either side has
    # no annotation of this type are dropped. The original row label is kept in
    # 'index' so scores can be matched back to `transitions`.
    # (Column-wise Series.map replaces the deprecated DataFrame.applymap, and a
    # separate name keeps `oggo_df` bound to the annotation table.)
    pairs = transitions[og_cols].apply(lambda col: col.map(OGGO.get)).reset_index()
    pairs.columns = ['index','go1', 'go2']
    pairs = pairs.dropna()

    # The context manager shuts the worker pool down once the scores are in;
    # the original pool was never closed.
    with Pool(n_workers) as pool:
        simscore = dict(pool.map(calcsim, pairs.values.tolist()))

    pairs['simscore'] = pairs['index'].map(simscore.get)
    pairs = (pairs.drop('index', axis=1)
                  .reset_index(drop=True)
                  .rename(columns={'simscore':f'{tag}sim',
                                   'go1':f'go1_{tag}',
                                   'go2':f'go2_{tag}'}))
    pairs[f'len_go1_{tag}'] = pairs[f'go1_{tag}'].apply(len)
    pairs[f'len_go2_{tag}'] = pairs[f'go2_{tag}'].apply(len)

    # Put the scored pairs next to their transition rows (same order as the scores).
    dfs.append(pd.concat([transitions.loc[list(simscore.keys())].reset_index(drop=True), pairs], axis=1))

# Outer-merge the three per-type tables on the columns they share.
total = (dfs[0]
 .merge(dfs[1], how='outer')
 .merge(dfs[2], how='outer')
)
total.to_csv(f"{transitions_file}.unigo.experimental.pygosemsim", sep='\t', index=False)