In [1]:
import pandas as pd
import ete3
import re

# Handle to ete3's NCBI taxonomy interface; the cells below call
# ncbi.get_lineage() and ncbi.get_rank() on it.
ncbi = ete3.NCBITaxa()

# IPython magic: change the working directory so that the relative file names
# used in the following cells resolve.
# NOTE(review): this path is specific to one user's environment — adjust it
# (or make it configurable) before running elsewhere.
%cd ~/work/eggNOGbyClass/

/nobackup1c/users/payette/eggNOGbyClass


In [3]:
# Tab-separated genome table; its first column is used as the row index.
sampled_genomes = pd.read_csv('genomes.tab', index_col=0, sep='\t')

In [4]:
# Genomes that make up the ABCDG tree; later cells read its 'TaxID' column.
ABCDG_sampled_genomes = pd.read_excel('ABCDG_tree_taxa.xlsx')  # ABCDG tree genomes

In [5]:
# Drop taxid 2762020: per the original note, this taxon is excluded because it
# caused an error. Rows with a missing TaxID are kept by this comparison.
is_usable_taxon = ABCDG_sampled_genomes['TaxID'].ne(2762020)
ABCDG_sampled_genomes = ABCDG_sampled_genomes[is_usable_taxon]

In [7]:
# Quick look: distinct species taxids among the first five rows, ascending.
first_species_taxids = sampled_genomes['species_taxid'].head()
sorted(first_species_taxids.unique())

[1097.0, 1219.0, 33072.0, 198252.0]

In [8]:
# Build one {rank name: taxid} record per genome of the ABCDG tree and create
# the frame in a single call. Growing a DataFrame row by row with
# DataFrame.append is quadratic and the method no longer exists in pandas >= 2.0.
lineage_records = []
for taxid in ABCDG_sampled_genomes['TaxID'].unique():
    if pd.isna(taxid):
        continue  # no taxid to look up
    # ncbi.get_rank() maps taxid -> rank name; invert it to rank name -> taxid.
    # If a rank name occurs more than once in a lineage, the later entry wins;
    # only the named ranks selected below are kept anyway.
    rank_of_taxid = ncbi.get_rank(ncbi.get_lineage(taxid))
    lineage_records.append({tax_rank: tmp_taxid
                            for tmp_taxid, tax_rank in rank_of_taxid.items()})

lineages = pd.DataFrame(lineage_records)

# Keep only the ranks of interest (absent ranks become NaN columns) and cast to
# float64 so every rank column has the same dtype whether or not it holds NaN.
lineages = lineages.reindex(columns=['class', 'family',  'genus', 'phylum',
                                     'order', 'species', 'superkingdom']).astype('float64')

# Restrict to superkingdom taxid 2 (presumably NCBI "Bacteria" — confirm).
lineages = lineages.query('superkingdom == 2').copy()

# For phylum taxid 1224 (presumably Proteobacteria — confirm) use the class
# taxid in place of the phylum, so those genomes are grouped at class level.
is_phylum_1224 = lineages['phylum'] == 1224
lineages.loc[is_phylum_1224, 'phylum'] = lineages.loc[is_phylum_1224, 'class']

In [10]:
# Tab-separated taxid table: '#' lines are treated as comments, there is no
# header row, and the 'Taxid' column becomes the index.
taxid_info_columns = ['Taxid', 'Sci.Name', 'Rank', 'Named Lineage', 'Taxid Lineage']
eggNOG_sample = pd.read_csv('e5.taxid_info.tsv', sep='\t', comment='#',
                            header=None, names=taxid_info_columns, index_col=0)

In [11]:
# Build one {rank name: taxid} record per eggNOG taxid and create the frame in
# a single call, indexed by taxid. Growing a DataFrame row by row with
# DataFrame.append is quadratic and the method no longer exists in pandas >= 2.0.
eggNOG_taxids = []
eggNOG_records = []
for taxid in eggNOG_sample.index.unique():
    if pd.isna(taxid):
        continue  # no taxid to look up
    # ncbi.get_rank() maps taxid -> rank name; invert it to rank name -> taxid.
    # If a rank name occurs more than once in a lineage, the later entry wins;
    # only the named ranks selected below are kept anyway.
    rank_of_taxid = ncbi.get_rank(ncbi.get_lineage(taxid))
    eggNOG_taxids.append(taxid)
    eggNOG_records.append({tax_rank: tmp_taxid
                           for tmp_taxid, tax_rank in rank_of_taxid.items()})

eggNOG_lineage = pd.DataFrame(eggNOG_records, index=eggNOG_taxids)

# Keep only the ranks of interest (absent ranks become NaN columns) and cast to
# float64 so every rank column has the same dtype whether or not it holds NaN.
eggNOG_lineage = eggNOG_lineage.reindex(columns=['class', 'family',  'genus', 'phylum',
                                                 'order', 'species', 'superkingdom']).astype('float64')

# Restrict to superkingdom taxid 2 (presumably NCBI "Bacteria" — confirm).
eggNOG_lineage = eggNOG_lineage.query('superkingdom == 2').copy()

# For phylum taxid 1224 (presumably Proteobacteria — confirm) use the class
# taxid in place of the phylum, so those taxa are grouped at class level.
is_phylum_1224 = eggNOG_lineage['phylum'] == 1224
eggNOG_lineage.loc[is_phylum_1224, 'phylum'] = eggNOG_lineage.loc[is_phylum_1224, 'class']





In [12]:
# Headerless, tab-separated members table; only file columns 1-4 are loaded
# and given the names below.
member_columns = ['group_id', 'num_proteins', 'num_taxa', 'members']
eggNOG_groups = pd.read_csv('2_members.tsv', sep='\t', header=None,
                            usecols=[1, 2, 3, 4], names=member_columns)

# Extract the taxid of every member of each group into a new 'taxa' column
# (one list of ints per group, one entry per member, duplicates kept).
# The pattern captures the digits that precede the first '.' of each
# comma-separated entry. It is written as a raw string because '\d' and '\.'
# are invalid escape sequences in a normal string literal.
member_taxid_pattern = re.compile(r'(\d+)\.(?:[^,]+)')

# assign() adds (or overwrites) the column, so this step can be re-run safely.
eggNOG_groups = eggNOG_groups.assign(
    taxa=eggNOG_groups.members.map(
        lambda cell: [int(taxid) for taxid in member_taxid_pattern.findall(cell)]
    )
)

In [13]:
def get_phyla_overlap(taxa):
    """Return the phyla shared by one group's taxa and the sampled tree genomes.

    Parameters
    ----------
    taxa : list of int
        Taxids of the group's members; duplicates are allowed.

    Returns
    -------
    set
        Phylum taxids that occur both among ``taxa`` (looked up in the
        module-level ``eggNOG_lineage`` table) and in ``lineages.phylum``.
        Missing (NaN) phyla are never part of the result.

    Notes
    -----
    Taxids that are not in ``eggNOG_lineage`` (e.g. removed by its
    superkingdom filter) are ignored instead of raising ``KeyError``.
    Assumes the index of ``eggNOG_lineage`` is unique.
    """
    # reindex() yields NaN for unknown labels, whereas .loc[list] raises
    # KeyError on them in pandas >= 1.0; the NaNs are dropped right after.
    group_phyla = set(eggNOG_lineage['phylum'].reindex(taxa).dropna().unique())
    tree_phyla = lineages['phylum'].dropna().unique()
    return group_phyla.intersection(tree_phyla)

# Keep only the groups whose members cover more than one phylum that is also
# present among the sampled tree genomes.
spans_multiple_phyla = eggNOG_groups['taxa'].map(
    lambda cell: len(get_phyla_overlap(cell)) > 1
)
eggNOG_target_groups = eggNOG_groups[spans_multiple_phyla]

In [14]:
# Headerless, tab-separated trees table: file columns 1-3 are loaded and
# 'group_id' becomes the index. The rows are then aligned to the target
# groups' ids; ids without a row in the file become all-NaN rows.
tree_columns = ['group_id', 'fast', 'tree']
eggNOG_trees = (
    pd.read_csv('2_trees.tsv', sep='\t', header=None,
                usecols=[1, 2, 3], names=tree_columns, index_col=0)
    .reindex(index=eggNOG_target_groups.group_id)
)

In [15]:
# Keep groups with fewer than three proteins per taxon
# (num_proteins < 3 * num_taxa), then pick the matching tree rows.
under_three_per_taxon = (
    eggNOG_target_groups['num_proteins'] < eggNOG_target_groups['num_taxa'] * 3
)
working_groups = eggNOG_target_groups[under_three_per_taxon]
working_trees = eggNOG_trees.loc[working_groups['group_id']]

In [16]:
# Write the three result tables as gzip-compressed parquet files using the
# fastparquet engine, in the order listed.
parquet_outputs = {
    'working_eggNOG_groups.parquet': working_groups,
    'working_eggNOG_trees.parquet': working_trees,
    'eggNOG_taxonomy.parquet': eggNOG_lineage,
}
for parquet_path, frame in parquet_outputs.items():
    frame.to_parquet(parquet_path, compression='gzip', engine='fastparquet')

In [None]:
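# Disabled alternative export: same three files, but without pinning the
# parquet engine (pandas then chooses the engine itself).
#working_groups.to_parquet('working_eggNOG_groups.parquet', compression='gzip')
#working_trees.to_parquet( 'working_eggNOG_trees.parquet',  compression='gzip')
#eggNOG_lineage.to_parquet('eggNOG_taxonomy.parquet', compression='gzip')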