In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from Bio import SeqIO

from venn import venn



In [19]:
def run_ortholog_map(spec_set, orth_map_dir_base, sc_annotations_df):
    
    for spec_source, (spec_name_source, eggnog_path_source, eggnog_fname_source) in spec_set.items(): 

        print(spec_source)  
        
        orth_map_dir = orth_map_dir_base + spec_source + os.sep + eggnog_path_source 
        
        orthtable = pd.read_table( orth_map_dir  + 'out.emapper.orthologs', skiprows = 4, skipfooter=3, engine='python')

        data = []    

        #Add genes that have no orthologs as type 'no_eggnog_orthologs'
            
        source_seqs = SeqIO.parse(orth_map_dir + 'queries.fasta', 'fasta' )

        source_ids = []

        for seq in source_seqs:
            source_ids.append(seq.id)

        genes_w_orths = set(orthtable['#query'])
        genes_no_orths = set(source_ids) - genes_w_orths
        print( 'For {} there are {} genes with orthologs and {} genes with no orthologs in eggnog'.format(spec_source, len(genes_w_orths), len(genes_no_orths)))

        genes_no_orths_list = list(genes_no_orths)
        genes_no_orths_list.sort()
        for source_genename in genes_no_orths_list: 
            data.append((source_genename, 'NONE', 'no_eggnog_orthologs' ))


        orthtable_target = orthtable[orthtable['species']=='Saccharomyces cerevisiae(4932)']
        #Add genes that have no S. cerevisae orthologs as type 'no_target_orthologs'

        genes_no_targ_orths = genes_w_orths - set(orthtable_target['#query'])

        print( 'There are {} genes with no orthologs in target species {}'.format(len(genes_no_targ_orths), spec_target))

        genes_no_targ_orths_list = list(genes_no_targ_orths)
        genes_no_targ_orths_list.sort()
        for source_genename in genes_no_targ_orths_list: 
            data.append((source_genename, 'NONE', 'no_target_orthologs' ))


        orthgroups = orthtable_target.groupby('orth_type')

        #for orth_type in orth_types:
        #Append data with source_genename, target_genename, orth_type

        for orth_type in ['one2one', 'many2one', 'one2many', 'many2many']: 
            orthtable_target_type = orthgroups.get_group(orth_type)

            for row in orthtable_target_type.iterrows():
                source_genename = row[1]['#query']
                row_orths_raw = row[1]['orthologs']
                row_orths = row_orths_raw.split(',')
                if orth_type in {'one2many', 'many2many'}:
                    if len(row_orths)==1: 
                        print("{} gene {} should have more than one ortholog, but only has one: {}".format(spec_source, source_genename,row_orths_raw )) 
                for row_orth in row_orths: 
                    if row_orth[0]=='*': 
                        data.append((source_genename, row_orth.split('*')[1],orth_type))
                    else: 
                        data.append((source_genename, row_orth[0],orth_type))

        orth_map = pd.DataFrame(data, columns=['source_genename', 'target_genename', 'orth_type'])

        print('{} orth-map complete'.format(spec_source))

        orth_map.to_csv(orth_map_dir_base + spec_source + '_' + spec_target + '.csv')


        #Map annotation from eggnog and S. cerevisiae onto ortholog file

        #Import appropriate columns from Eggnog
        eggnog_annotations = pd.read_table(orth_map_dir + 'out.emapper.annotations' , skiprows = 4, skipfooter=3, engine='python')
        eggnog_cols_to_keep = ['#query','eggNOG_OGs','max_annot_lvl','COG_category','Description', 'Preferred_name', 'GOs','EC', 'KEGG_ko', 'KEGG_Pathway', 'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass', 'BRITE', 'KEGG_TC','CAZy','BiGG_Reaction', 'PFAMs' ]


        #QC checks: 
        assert len(eggnog_annotations)==len(set(eggnog_annotations['#query'])), 'More than one annotation for a given query'

        genes_w_annotations = set(orthtable['#query'])
        assert len(set(eggnog_annotations['#query']) - genes_w_annotations)== 0, 'More annotations than genes with annotations'
        assert len(genes_w_annotations - set(eggnog_annotations['#query'])) == 0, 'More genes with annotations than annotations'
        orth_map_annot = orth_map.merge(eggnog_annotations.loc[:,eggnog_cols_to_keep], how = 'left', left_on='source_genename', right_on = '#query')

        #Add annotations from yeastmine
        orth_map_annot = orth_map_annot.merge(sc_annotations_df, how  = 'left', left_on='target_genename', right_index=True)

        print('{} annotations complete'.format(spec_source))
        orth_map_annot.to_csv(orth_map_dir_base + spec_source + '_' + spec_target + '_annot.csv')

    
    
    
    return

## Build Yeastmine annotation matrix for S.cer

In [2]:
#Build Yeastmine annotation matrix for S.cer
#Only need to do once.  

yeastmine_scer_annotation = pd.read_table(base_dir + os.sep + 'eggnog_mapper' + os.sep +  'annotation' + os.sep + 'yeastmine_annotation_query_20211007.tsv')
yeastmine_scer_annotation = yeastmine_scer_annotation.loc[~(yeastmine_scer_annotation['Gene.secondaryIdentifier'].isnull())]
yeastmine_scer_annotation_grouped = yeastmine_scer_annotation.groupby('Gene.secondaryIdentifier')
sc_annotations = {}
sc_genenames = list(set(yeastmine_scer_annotation['Gene.secondaryIdentifier']))
for sc_genename in sc_genenames:
    gene_group = yeastmine_scer_annotation_grouped.get_group(sc_genename)

    #unique columns: 

    unique_cols = ['Gene.symbol', 'Gene.secondaryIdentifier', 'Gene.name'] 
    gene_data = []
    for col in unique_cols:
        gene_group_entries = set(gene_group[col])
        assert len(gene_group_entries) == 1, 'unexpected nonunique item for {}: {}'.format(sc_genename, col)
        gene_data.append(list(gene_group_entries)[0])

    #nonunique columns: 

    non_unique_cols = ['Gene.pathways.identifier', 'Gene.pathways.name']
    for col in non_unique_cols:
        gene_group_entries = list(set(gene_group[col]))
        gene_group_entries = [item for item in gene_group_entries if not(pd.isnull(item))]  #remove nans
        gene_group_entries.sort()
        gene_data.append(gene_group_entries)


    xref_to_keep = ['EC number',
     'LoQAtE ID',
     'PANTHER ID',
     'PDB ID',
     'PDB ID Chain',
     'Pathway ID',
     'UniParc ID',
     'UniProtKB ID'
    ]

    gene_group_xref_group = gene_group.groupby('Gene.crossReferences.dbxreftype')

    for xref in xref_to_keep: 
    
        if xref in set(gene_group_xref_group.groups.keys()):
            gene_xref_entries = list(set(gene_group_xref_group.get_group(xref)['Gene.crossReferences.identifier']))
            gene_xref_entries.sort()
            gene_data.append(gene_xref_entries)
        
        else: 
            gene_data.append(None)
        
    sc_annotations[sc_genename] = gene_data

sc_annotations_df = pd.DataFrame.from_dict(sc_annotations, orient='index', columns = unique_cols + non_unique_cols + xref_to_keep)

sc_annotations_df.to_csv(base_dir + os.sep + 'eggnog_mapper' + os.sep +  'annotation' + os.sep + 'yeastmine_annotation_summary_20211007.csv')

## Make ortholog map file and map annotations for proeomes used for the proteomics experiment

In [None]:
orthtable = pd.read_table( orth_map_dir  + 'out.emapper.orthologs', skiprows = 4, skipfooter=3, engine='python')


In [5]:
orthseeds = pd.read_table( orth_map_dir  + 'out.emapper.seed_orthologs', skiprows = 5, skipfooter=3, engine='python')

Unnamed: 0,#qseqid,sseqid,evalue,bitscore,qstart,qend,sstart,send,pident,qcov,scov
0,sp|A0A1E3P8S8|EAT2_WICAA,1041607.K0KSN3,8.600000e-158,563.1,1,330,1,329,83.0,96.8,96.5
1,sp|A0A1E3P8S6|EAT1_WICAA,1041607.K0KPV8,9.500000e-169,599.7,1,386,1,390,75.4,98.7,99.2
2,tr|A0A1E3P6R1|A0A1E3P6R1_WICAA,1041607.K0KQF1,1.800000e-145,522.3,1,360,50,407,67.8,100.0,88.0
3,tr|A0A1E3P6B5|A0A1E3P6B5_WICAA,1041607.K0KMQ6,3.100000e-161,574.7,1,367,1,370,77.0,99.2,62.0
4,tr|A0A1E3NU06|A0A1E3NU06_WICAA,1041607.K0L0S0,2.700000e-23,114.4,2,68,181,247,86.6,77.0,24.9
...,...,...,...,...,...,...,...,...,...,...,...
5389,tr|A0A1E3P6J5|A0A1E3P6J5_WICAA,1041607.K0KED1,1.400000e-151,542.3,1,299,3,301,94.0,95.8,94.3
5390,tr|A0A1E3PA25|A0A1E3PA25_WICAA,1071379.XP_004181039.1,3.700000e-121,441.0,1,256,1,255,87.5,100.0,100.0
5391,tr|A0A1E3NYQ4|A0A1E3NYQ4_WICAA,1041607.K0KWB0,6.400000e-48,196.4,1,103,1,103,100.0,100.0,100.0
5392,tr|A0A1E3NZ49|A0A1E3NZ49_WICAA,1041607.K0KHH6,1.700000e-66,258.5,1,130,1,130,98.5,100.0,100.0


In [13]:
orthtable

Unnamed: 0,#query,orth_type,species,orthologs
0,sp|A0A1E3P8S8|EAT2_WICAA,many2one,Candida glabrata(5478),*XP_446354
1,sp|A0A1E3P8S8|EAT2_WICAA,many2one,Kazachstania africana(432096),*XP_003958079
2,sp|A0A1E3P8S8|EAT2_WICAA,many2one,Kazachstania naganishii(588726),*J7S748
3,sp|A0A1E3P8S8|EAT2_WICAA,many2one,Kluyveromyces lactis(28985),*XP_453001
4,sp|A0A1E3P8S8|EAT2_WICAA,many2one,Lachancea thermotolerans(381046),*XP_002553862
...,...,...,...,...
491243,tr|A0A1E3NZ49|A0A1E3NZ49_WICAA,one2many,Schizosaccharomyces pombe(4896),"*SPAC22A12,*SPAC5D6"
491244,tr|A0A1E3NZ49|A0A1E3NZ49_WICAA,one2many,Tetrapisispora phaffii(113608),"*XP_003683777,*XP_003686033"
491245,tr|A0A1E3NZ49|A0A1E3NZ49_WICAA,one2many,Torulaspora delbrueckii(4950),"*XP_003681614,*XP_003682322"
491246,tr|A0A1E3NZ49|A0A1E3NZ49_WICAA,one2many,Wickerhamomyces ciferrii(1041607),*K0KHH6


In [16]:
grouped = orthtable.groupby('#query')

In [32]:
a = groups.get_group('tr|A0A1E3P1J8|A0A1E3P1J8_WICAA')

In [35]:
# For each seed 
#for seqid, seed_id in zip(orthseeds['#qseqid'], orthseeds['sseqid']): 

seqid = 'sp|A0A1E3P8S8|EAT2_WICAA'
seed_id = '1041607.K0KSN3'

seq_orths = groups.get_group(seqid)

# Find the row for that seed

# make its orthtype, seed



In [36]:
seq_orths

Unnamed: 0,orth_type,species,orthologs
0,many2one,Candida glabrata(5478),*XP_446354
1,many2one,Kazachstania africana(432096),*XP_003958079
2,many2one,Kazachstania naganishii(588726),*J7S748
3,many2one,Kluyveromyces lactis(28985),*XP_453001
4,many2one,Lachancea thermotolerans(381046),*XP_002553862
5,many2one,Naumovozyma castellii(27288),*XP_003673468
6,many2one,Naumovozyma dairenensis(27289),*XP_003672691
7,many2one,Saccharomyces cerevisiae(4932),*YGR015C
8,many2one,Tetrapisispora phaffii(113608),*XP_003687705
9,many2one,Torulaspora delbrueckii(4950),*XP_003681095


In [None]:
#Check if there is more than one 

#If so check if it is a seed. 

#If so, 
# Determine status (one2one, etc).

In [23]:
a[a['species']=='Saccharomyces cerevisiae(4932)']

Unnamed: 0,orth_type,species,orthologs
133808,one2one,Saccharomyces cerevisiae(4932),*YBR135W
133842,one2many,Saccharomyces cerevisiae(4932),*YBR135W


In [25]:
spec_source = 'Wano'
spec_target = 'Scer'
eggnog_path_source = ''

#Read in and make dictionary of seeds
base_dir = os.path.normpath('C:/Users/heineib/Google Drive/Crick_LMS/projects/diverse_yeasts')
orth_map_dir_base = base_dir + os.sep + os.path.normpath('eggnog_mapper/proteomics_set') + os.sep


orth_map_dir = orth_map_dir_base + spec_source + os.sep + eggnog_path_source 

orthtable = pd.read_table( orth_map_dir  + 'out.emapper.orthologs', skiprows = 4, skipfooter=3, engine='python')

data = []    

#Add genes that have no orthologs as type 'no_eggnog_orthologs'

source_seqs = SeqIO.parse(orth_map_dir + 'queries.fasta', 'fasta' )

source_ids = []

for seq in source_seqs:
    source_ids.append(seq.id)

genes_w_orths = set(orthtable['#query'])
genes_no_orths = set(source_ids) - genes_w_orths
print( 'For {} there are {} genes with orthologs and {} genes with no orthologs in eggnog'.format(spec_source, len(genes_w_orths), len(genes_no_orths)))

genes_no_orths_list = list(genes_no_orths)
genes_no_orths_list.sort()
for source_genename in genes_no_orths_list: 
    data.append((source_genename, 'NONE', 'no_eggnog_orthologs' ))


orthtable_target = orthtable[orthtable['species']=='Saccharomyces cerevisiae(4932)']
#Add genes that have no S. cerevisae orthologs as type 'no_target_orthologs'

genes_no_targ_orths = genes_w_orths - set(orthtable_target['#query'])

print( 'There are {} genes with no orthologs in target species {}'.format(len(genes_no_targ_orths), spec_target))

genes_no_targ_orths_list = list(genes_no_targ_orths)
genes_no_targ_orths_list.sort()
for source_genename in genes_no_targ_orths_list: 
    data.append((source_genename, 'NONE', 'no_target_orthologs' ))


orthgroups = orthtable_target.groupby('orth_type')

#for orth_type in orth_types:
#Append data with source_genename, target_genename, orth_type

for orth_type in ['one2one', 'many2one', 'one2many', 'many2many']: 
    orthtable_target_type = orthgroups.get_group(orth_type)

    for row in orthtable_target_type.iterrows():
        source_genename = row[1]['#query']
        row_orths_raw = row[1]['orthologs']
        row_orths = row_orths_raw.split(',')
        if orth_type in {'one2many', 'many2many'}:
            if len(row_orths)==1: 
                print("{} gene {} should have more than one ortholog, but only has one: {}".format(spec_source, source_genename,row_orths_raw )) 
        for row_orth in row_orths: 
            if row_orth[0]=='*': 
                data.append((source_genename, row_orth.split('*')[1],orth_type))
            else: 
                data.append((source_genename, row_orth[0],orth_type))

orth_map = pd.DataFrame(data, columns=['source_genename', 'target_genename', 'orth_type'])


For Wano there are 5279 genes with orthologs and 1127 genes with no orthologs in eggnog
There are 1223 genes with no orthologs in target species Scer
Wano gene tr|A0A1E3P1J8|A0A1E3P1J8_WICAA should have more than one ortholog, but only has one: *YBR135W
Wano gene tr|A0A1E3P289|A0A1E3P289_WICAA should have more than one ortholog, but only has one: *YNL260C


In [31]:
orth_map

Unnamed: 0,source_genename,target_genename,orth_type
0,tr|A0A1E3NTW5|A0A1E3NTW5_WICAA,NONE,no_eggnog_orthologs
1,tr|A0A1E3NTW9|A0A1E3NTW9_WICAA,NONE,no_eggnog_orthologs
2,tr|A0A1E3NTX6|A0A1E3NTX6_WICAA,NONE,no_eggnog_orthologs
3,tr|A0A1E3NTY6|A0A1E3NTY6_WICAA,NONE,no_eggnog_orthologs
4,tr|A0A1E3NU15|A0A1E3NU15_WICAA,NONE,no_eggnog_orthologs
...,...,...,...
7160,tr|A0A1E3P8C7|A0A1E3P8C7_WICAA,YIL052C,many2many
7161,tr|A0A1E3NZX1|A0A1E3NZX1_WICAA,YHR203C,many2many
7162,tr|A0A1E3NZX1|A0A1E3NZX1_WICAA,YJR145C,many2many
7163,tr|A0A1E3NY82|A0A1E3NY82_WICAA,YBL002W,many2many


In [28]:
#orth_map[orth_map['source_genename']=='tr|A0A1E3P1J8|A0A1E3P1J8_WICAA']

orth_map[orth_map['source_genename']=='tr|A0A1E3NZ49|A0A1E3NZ49_WICAA']

Unnamed: 0,source_genename,target_genename,orth_type
1460,tr|A0A1E3NZ49|A0A1E3NZ49_WICAA,NONE,no_target_orthologs


In [2]:
#Seed ortholog error 
#I reported this error, and it is fixed in the master branch so that the ortholog file now lists seeds separately (instead of as many2one) (https://github.com/eggnogdb/eggnog-mapper/issues/328)
#Until it is updated in the master branch, this fixes the issue.  

spec_source = 'Wano'
eggnog_path_source = ''

#Read in and make dictionary of seeds
base_dir = os.path.normpath('C:/Users/heineib/Google Drive/Crick_LMS/projects/diverse_yeasts')
orth_map_dir_base = base_dir + os.sep + os.path.normpath('eggnog_mapper/proteomics_set') + os.sep
orth_map_dir = orth_map_dir_base + spec_source + os.sep + eggnog_path_source 

orthtable = pd.read_table( orth_map_dir  + 'out.emapper.orthologs', skiprows = 4, skipfooter=3, engine='python')


#For ortholog file extract info for each seed and replace category with seed. 

In [23]:
#Abbreviation: (full name, path to eggnog files, eggnog_name)
spec_set = {'Zrou': ('Zygosaccharomyces rouxii','', ''),
            'Wano': ('Wickerhamomyces anomalus','', ''),
            'Spom': ('Schizosaccharomyces pombe','', ''), 
            'Scer': ('Saccharomyces cerevisiae','R64-3-1_20210421/', 'Saccharomyces cerevisiae(4932)'), 
            'Kser': ('Kazachstania servazzii','', '' ), 
            'Kpha': ('Komagataella phaffii','', '' ), 
            'Kmar': ('Kluyveromyces marxianus','uniprot/', ''), 
            'Klac': ('Kluyveromyces lactis','', '' ), 
            'Lthe': ('Lachancea thermotolerans', '',''),
            'Hosm': ('Hanseniaspora osmophila','', '' ), 
            'Gcan': ('Galactomyces candidus','', ''), 
            'Dhan': ('Debaryomyces hansenii','', '' ), 
            'Ctro': ('Candida tropicalis','', ''), 
            'Calb': ('Candida albicans','', '' )
            }

base_dir = os.path.normpath('C:/Users/heineib/Google Drive/Crick_LMS/projects/diverse_yeasts')

orth_map_dir_base = base_dir + os.sep + os.path.normpath('eggnog_mapper/proteomics_set') + os.sep
#orth_map_dir = base_dir + os.sep + os.path.normpath('eggnog_mapper/stefan_denovo') + os.sep

run_ortholog_map(spec_set, orth_map_dir_base, sc_annotations_df)

Zrou
For Zrou there are 4812 genes with orthologs and 175 genes with no orthologs in eggnog
There are 327 genes with no orthologs in target species Scer
Zrou orth-map complete
Zrou annotations complete
Wano
For Wano there are 5279 genes with orthologs and 1127 genes with no orthologs in eggnog
There are 1223 genes with no orthologs in target species Scer
Wano gene tr|A0A1E3P1J8|A0A1E3P1J8_WICAA should have more than one ortholog, but only has one: *YBR135W
Wano gene tr|A0A1E3P289|A0A1E3P289_WICAA should have more than one ortholog, but only has one: *YNL260C
Wano orth-map complete
Wano annotations complete
Spom
For Spom there are 4235 genes with orthologs and 903 genes with no orthologs in eggnog
There are 1291 genes with no orthologs in target species Scer
Spom orth-map complete
Spom annotations complete
Scer
For Scer there are 5544 genes with orthologs and 1172 genes with no orthologs in eggnog
There are 10 genes with no orthologs in target species Scer
Scer gene YAL067C should have 

## Make ortholog mapping and map annoations for Stephan's Genomes

In [None]:
orth_map_dir_base = base_dir + os.sep + os.path.normpath('eggnog_mapper/stefan_denovo') + os.sep

spec_target = 'Scer'
spec_name_target, eggnog_path_target, eggnog_fname_target = spec_set[spec_target]



#Abbreviation: (full name, path to eggnog files, eggnog_name)
spec_set = {'Zrou': ('Zygosaccharomyces rouxii','', ''),
            'Wano': ('Wickerhamomyces anomalus','', ''),
            #'Spom': ('Schizosaccharomyces pombe','', ''), 
            'Lthe': ('Lachancea thermotolerans', '', ''),
            'Scer': ('Saccharomyces cerevisiae','', 'Saccharomyces cerevisiae(4932)'), 
            'Kser': ('Kazachstania servazzii','', '' ), 
            #'Kpha': ('Komagataella phaffii','', '' ), 
            'Kmar': ('Kluyveromyces marxianus','', ''), 
            'Kmar_nomito': ('Kluyveromyces marxianus no Mito','', ''), 
            #'Klac': ('Kluyveromyces lactis','', '' ), 
            'Hosm': ('Hanseniaspora osmophila','', '' ), 
            'Gcan': ('Galactomyces candidus','', ''), 
            'Dhan': ('Debaryomyces hansenii','', '' ), 
            #'Ctro': ('Candida tropicalis','', ''), 
            #'Calb': ('Candida albicans','', '' )
            }

run_ortholog_map(spec_set, orth_map_dir_base, sc_annotations_df)