In [186]:
import pandas as pd
from pathlib import Path
import argparse
from handling_acc_files import HelperMethod
from create_reference_from_tsv_and_pepxml import ReferenceWriter
from collections import defaultdict
from create_PSM_df import PSM_FDR
import pickle

In [187]:
taxon_graph = HelperMethod.load_taxa_graph(Path('/home/jules/Documents/databases/databases_tax2proteome/taxdump.tar.gz'))

Load taxon graph from harddrive.


In [93]:
path_to_identification_file = Path('/home/jules/Documents/Tax2Proteome/benchmarking/results_searchgui_xtandem_analyzer_bachelor_thesis/uniprot/x_tandem_tsv/Run1_U1_2000ng_uniprot_species_nr.t.xml_reduced.tsv')
path_to_reference = Path('/home/jules/Documents/Tax2Proteome/benchmarking/reference_files/Run1_U1_2000ng_uniprot_species_nr_reference.tsv')
level='species'
spectra_file='/home/jules/Documents/Tax2Proteome/benchmarking/spectra/Run1_U1_2000ng.mgf'

In [94]:
path_to_all_info_tsv = path_to_identification_file.parent.joinpath(path_to_identification_file.stem + '_' + path_to_reference.stem + '.tsv')
print(path_to_all_info_tsv)
result_df = ReferenceWriter.read_csv_with_generic_function(path_to_identification_file,
                                            ['Protein', 'decoy', 'taxID', f'taxID_{level}'])
reference_df = ReferenceWriter.read_csv_with_generic_function(path_to_reference,
                                            ['Ref_Peptide', 'Ref_ProteinAcc', 'Ref_Hyperscore', 'Ref_decoy','Ref_taxID_DB', f'Ref_taxID_{level}'])

/home/jules/Documents/Tax2Proteome/benchmarking/results_searchgui_xtandem_analyzer_bachelor_thesis/uniprot/x_tandem_tsv/Run1_U1_2000ng_uniprot_species_nr.t.xml_reduced_Run1_U1_2000ng_uniprot_species_nr_reference.tsv


In [88]:
class DeterminatorSpecificitySensitivity():

    def __init__(self, level, fdr_applied_df, reference_df, spectra_file):
        """
        :param fdr_applied_df:
        :param refernce_df: column names =
        :param spectra_file: 'Run1_U1_2000ng.mgf'
        """
        self.tax_level = ['species', 'genus', 'family', 'order']
        self.level = level
        self.result_df = fdr_applied_df[['Title', 'Peptide', 'Hyperscore', 'Protein', 'decoy', 'taxID', f'taxID_{level}']]
        self.reference_df = reference_df[['SpectraID', 'Ref_Peptide', 'Ref_Hyperscore', 'Ref_ProteinAcc', 'Ref_decoy', 'Ref_taxID_DB', f'Ref_taxID_{level}']]
        self.all_spectra_list = self.get_all_spectra_IDs(spectra_file)

    def create_df_with_all_spectra_reference_and_result_taxa(self, path_to_out):
        df_all_spectra = pd.DataFrame(self.all_spectra_list, columns=['SpectraID'])
        print('Number of different spectra: ', len(self.all_spectra_list))
        df_with_all_spectra_and_reference_and_results = pd.merge(df_all_spectra, self.result_df, how="outer", left_on='SpectraID', right_on='Title')
        df_with_all_spectra_and_reference_and_results = pd.merge(df_with_all_spectra_and_reference_and_results,
                                                                 self.reference_df, how="outer", left_on='SpectraID', right_on='SpectraID')
        print(f"write df_with_all_spectra_and_reference_and_results {path_to_out}... ")
        df_with_all_spectra_and_reference_and_results.to_csv(str(path_to_out), sep='\t')
        return df_with_all_spectra_and_reference_and_results

    def calculate_sensitivity(self, TP, FN):
        return TP/(TP+FN)

    def calculate_specificity(self, FP, TN):
        return TN/(TN + FP)

    def calculate_sensitivity_and_specificity(self, path_to_out):
        pd.set_option("display.max_rows", None, "display.max_columns", None)
        df_with_all_spectra_and_reference_and_results = self.create_df_with_all_spectra_reference_and_result_taxa(path_to_out)
        print('calculate TP, FP, TN, FN')
        TP, FP, TN, FN = self.get_true_positive_and_true_negative(df_with_all_spectra_and_reference_and_results)
        sensitivity = self.calculate_sensitivity(TP, FN)
        specificity = self.calculate_specificity(FP, TN)
        print(f'sensitivity: {sensitivity}, specificity: {specificity}')

    def load_ref_file(self, ref_file, level):
        level_to_column_nb_dict={'species': 4, 'genus': 5, 'family': 6, 'order': 7}
        spectraID_to_taxid_dict = defaultdict(list)
        with open(ref_file, 'r') as ref:
            ref.readline()
            for line in ref:
                fields = line.split()
                level_specific_taxid = fields[level_to_column_nb_dict[level]]
                spectraID = fields[0]
                spectraID_to_taxid_dict[spectraID].append(level_specific_taxid)
        return spectraID_to_taxid_dict

    def get_all_spectra_IDs(self, ident_file):
        all_spec_IDs = set()
        with open(ident_file, 'r') as ident_file:
            for line in ident_file:
                if line.startswith('TITLE'):
                    all_spec_IDs.add(line.split()[0].split('TITLE=')[1])
        return all_spec_IDs



In [155]:
class DeterminatorSpecificitySensitivity(DeterminatorSpecificitySensitivity):
    
    def check_for_TP(self, taxid_set, taxid_ref_set):
        # ignore Decoy crap entries from result_reduced (not contained in reference)
        decoy_set = {'DECOY', 'DECOY/CRAP', 0}
        taxid_set = taxid_set.difference(decoy_set)
        if len(taxid_set) == 0 and pd.isna(taxid_ref_set): #only decoy entries
            return False
        return taxid_set.issubset(taxid_ref_set)
    
    def check_for_FP(self, taxid_set, taxid_ref_set):
        for taxid in taxid_set:
            # ignore Decoy crap entries from result_reduced (not contained in reference)
            if taxid == 'DECOY/CRAP' or taxid == 'DECOY':
                continue
            else:
                if taxid not in taxid_ref_set: 
                    return True
        return False
                        
    def compare_tax_sets(self, taxid_set, taxid_ref_set, is_FP):
        if not pd.isna(taxid_set) and not pd.isna(taxid_ref_set):
            if is_FP:
                return self.check_for_FP(taxid_set, taxid_ref_set)  
            else:
                return self.check_for_TP(taxid_set, taxid_ref_set) 
        return False
    
    def check_taxid_in_reference(self, taxid_level_column, taxid_level_ref_column, is_FP):
        true_false_list = []
        for taxid_set, taxid_ref_set in zip(taxid_level_column, taxid_level_ref_column):
            true_false_list.append(self.compare_tax_sets(taxid_set, taxid_ref_set, is_FP))
        return true_false_list

    def get_true_positive_and_true_negative(self, df_with_all_spectra_and_reference_and_results):
        pd.set_option("display.max_rows", None, "display.max_columns", None)
        df_taxid = df_with_all_spectra_and_reference_and_results[['SpectraID','taxID','Ref_taxID_DB']]
        df_taxid_level = df_with_all_spectra_and_reference_and_results[['SpectraID', f'taxID_{self.level}',f'Ref_taxID_{self.level}']]
        
        df_TN = df_taxid[df_taxid.taxID != {'DECOY/CRAP'} & df_taxid.Ref_taxID_DB.isna()]
        df_TN = df_TN[df_TN.taxID.isna() & df_TN.Ref_taxID_DB.isna()]
        TN=len(set(df_TN.SpectraID.tolist()))
        df_TP = df_taxid_level[self.check_taxid_in_reference(df_taxid_level[f'taxID_{self.level}'].tolist(), df_taxid_level[f'Ref_taxID_{self.level}'].tolist(), is_FP=False)]
        TP = len(set(df_TP.SpectraID.tolist()))
        
        df_FN = df_taxid[(df_taxid.taxID.notna() ) & df_taxid.Ref_taxID_DB.isna()]
        df_FN = df_FN[df_FN.taxID != {'DECOY/CRAP'}]
        FN=len(set(df_FN.SpectraID.tolist()))
        
       
        df_FP = df_taxid_level[self.check_taxid_in_reference(df_taxid_level[f'taxID_{self.level}'].tolist(), df_taxid_level[f'Ref_taxID_{self.level}'].tolist(), is_FP=True)]
        FP = len(set(df_FP.SpectraID.tolist()))
        
        print(f"TP: {TP}, FP: {FP}, TN: {TN}, FN: {FN}")
        return TP, FP, TN, FN

In [180]:
class DeterminatorSpecificitySensitivity(DeterminatorSpecificitySensitivity):
    def calculate_sensitivity(self, TP, FN):
        return TP/(TP+FN)*100

    def calculate_specificity(self, FP, TN):
        return TN/(TN + FP)*100

In [103]:
psm = PSM_FDR(path_to_identification_file)
print(path_to_identification_file)
fdr_pos, number_psms, decoys = psm.determine_FDR_position(result_df, 0.05, True)
fdr_applied_df = result_df[0:fdr_pos]

/home/jules/Documents/Tax2Proteome/benchmarking/results_searchgui_xtandem_analyzer_bachelor_thesis/uniprot/x_tandem_tsv/Run1_U1_2000ng_uniprot_species_nr.t.xml_reduced.tsv
Number of PSMs: 47476
Number of decoys: 2498
double identified spectra 3322
Position FDR border/Number of PSMs: 53296


In [178]:
determinator = DeterminatorSpecificitySensitivity(level, fdr_applied_df, reference_df, spectra_file)

In [98]:
df_with_all_spectra_and_reference_and_results = determinator.create_df_with_all_spectra_reference_and_result_taxa(path_to_all_info_tsv)

Number of different spectra:  164414
write df_with_all_spectra_and_reference_and_results /home/jules/Documents/Tax2Proteome/benchmarking/results_searchgui_xtandem_analyzer_bachelor_thesis/uniprot/x_tandem_tsv/Run1_U1_2000ng_uniprot_species_nr.t.xml_reduced_Run1_U1_2000ng_uniprot_species_nr_reference.tsv... 


In [157]:
TP, FP, TN, FN = determinator.get_true_positive_and_true_negative(df_with_all_spectra_and_reference_and_results)

TP: 47526, FP: 0, TN: 114440, FN: 0


In [167]:

df_s = df_with_all_spectra_and_reference_and_results[df_with_all_spectra_and_reference_and_results.taxID != {'DECOY/CRAP'}]
number_spectra = (len(set(df_s.SpectraID)))
print(number_spectra)

161936


In [168]:
number_complete = (TP+TN)
print(number_complete)

161966


In [169]:
print(number_complete - number_spectra)

30


In [179]:
print(determinator.calculate_sensitivity(TP,FN))
print(determinator.calculate_specificity(FP,TN))

100.0
100.0


In [291]:
class HelperMethod():

    @staticmethod
    def load_taxa_graph(path_to_taxdump):
        """
        # Try load pre-builded taxonomy graph or built taxonomy graph now
        :param options: user input options
        :return: TaxonGraph object
        """

        if not (path_to_taxdump.parents[0] / 'taxon_graph_results').is_file():
            taxon_graph = TaxonGraph()
            print("Start building taxon graph.")
            taxon_graph.create_graph(str(path_to_taxdump))
            print("Taxon graph successfully build.")
            # save TaxonGraph to harddrive:
            try:
                with open(str(path_to_taxdump.parents[0] / 'taxon_graph_results'), 'wb') as handle:
                    pickle.dump(taxon_graph, handle, protocol=pickle.HIGHEST_PROTOCOL)
                    print('Safe taxon graph to location: %s' % str(
                        path_to_taxdump.parents[0] / 'taxon_graph_results'))
            except FileNotFoundError:
                print('Error open tax_graph.')
                exit(1)
        # load Taxon Graph
        else:
            try:
                print('Load taxon graph from harddrive.')
                with open(str(path_to_taxdump.parents[0] / 'taxon_graph_results'), 'rb') as handle:
                    taxon_graph = pickle.load(handle)
            except UnicodeDecodeError or EOFError:
                print(
                    "Failed opening path to taxon graph / taxon_graph is corrupted. Delete %s file."
                    % str(path_to_taxdump.parents[0] / 'taxon_graph'))
                exit(1)
        return taxon_graph

    @staticmethod
    def get_taxid_specific_spectra(df, columnname, taxid, level, taxongraph):
        taxid_level = taxongraph.find_level_up(taxid, level)
        return df[[df[columnname] == taxid_level]]

    @staticmethod
    def create_df_with_all_spectra_and_dfs_to_compare(all_spectra_list, df1, spectra_column_1, df2, spectra_column_2):
        df_all_spectra = pd.DataFrame(all_spectra_list, columns=['SpectraID'])
        df_with_all_spectra_and_df1 = pd.merge(df_all_spectra, df1, how="outer", left_on='SpectraID', right_on=spectra_column_1)
        df_with_all_spectra_and_df1_df2 = pd.merge(df_with_all_spectra_and_df1, df2, how="outer", left_on='SpectraID', right_on=spectra_column_2)
        return df_with_all_spectra_and_df1_df2
    
    
    @staticmethod
    def get_difference_between_two_df(all_spectra_list, df1, spectra_column_1, column_of_interest_1, df2, spectra_column_2, column_of_interest_2):
        df_with_all_spectra_and_df1_df2 = HelperMethod.create_df_with_all_spectra_and_dfs_to_compare(all_spectra_list, df1, spectra_column_1, df2, spectra_column_2)
        return df_with_all_spectra_and_df1_df2[df_with_all_spectra_and_df1_df2[column_of_interest_1] !=  df_with_all_spectra_and_df1_df2[column_of_interest_2]]
    
    @staticmethod
    def get_all_spectra_IDs(ident_file):
        all_spec_IDs = set()
        with open(ident_file, 'r') as ident_file:
            for line in ident_file:
                if line.startswith('TITLE'):
                    all_spec_IDs.add(line.split()[0].split('TITLE=')[1])
        return all_spec_IDs
    
    @staticmethod
    def get_rows_with_different_taxa(tax_column_1, tax_column_2, taxon_1, taxon_2):
        true_false_list=[]
        for tax_set_1, tax_set_2 in zip(tax_column_1, tax_column_2):
            if not pd.isna(tax_set_1) and not pd.isna(tax_set_2):
                if (taxon_1 in tax_set_1 and taxon_2 not in tax_set_2) or (taxon_1 not in tax_set_1 and taxon_2 in tax_set_2):
                    true_false_list.append(True)
                else:
                    true_false_list.append(False)
            elif pd.isna(tax_set_1) and not pd.isna(tax_set_2):
                if taxon_2 in tax_set2:
                    true_false_list.append(True)
                else:
                    true_false_list.append(False)                  
            elif not pd.isna(tax_set_1) and pd.isna(tax_set_2):
                if taxon_1 in tax_set1:
                    true_false_list.append(True)
                else:
                    true_false_list.append(False)
            else:
                true_false_list.append(False)
        return true_false_list
    
    @staticmethod
    def get_difference_between_two_df_for_one_taxon(all_spectra_list, df1, spectra_column_1, column_of_interest_1, df2,
                                      spectra_column_2, column_of_interest_2, taxon, level1, level2, taxon_graph):
        df_with_all_spectra_and_df1_df2 = HelperMethod.create_df_with_all_spectra_and_dfs_to_compare(all_spectra_list, df1, spectra_column_1, df2, spectra_column_2)
        # remove empty lines
        df_with_all_spectra_and_df1_df2 = df_with_all_spectra_and_df1_df2[df_with_all_spectra_and_df1_df2.Hyperscore_x.notna() & df_with_all_spectra_and_df1_df2.Hyperscore_y.notna()]
        taxon_1 = taxon_graph.find_level_up(taxon, level1)
        taxon_2 = taxon_graph.find_level_up(taxon, level2)
        df_difference=df_with_all_spectra_and_df1_df2[HelperMethod.get_rows_with_different_taxa(df_with_all_spectra_and_df1_df2[column_of_interest_1].tolist(), df_with_all_spectra_and_df1_df2[column_of_interest_2].tolist(), taxon_1, taxon_2)]
        return df_difference

In [225]:
# Bacillus subtilis = 1423
path_to_uniprot_reduced_result_species_nr = '/home/jules/Documents/Tax2Proteome/benchmarking/results_searchgui_xtandem_analyzer_bachelor_thesis/uniprot/x_tandem_tsv/Run1_U1_2000ng_uniprot_species_nr.t.xml_reduced.tsv'
path_to_uniprot_reduced_result_genus_nr = '/home/jules/Documents/Tax2Proteome/benchmarking/results_searchgui_xtandem_analyzer_bachelor_thesis/uniprot/x_tandem_tsv/Run1_U1_2000ng_uniprot_genus_nr.t.xml_reduced.tsv'
path_to_spectra_file='/home/jules/Documents/Tax2Proteome/benchmarking/spectra/Run1_U1_2000ng.mgf'
# load df
uniprot_species_nr = ReferenceWriter.read_csv_with_generic_function(path_to_uniprot_reduced_result_species_nr,
                                            ['Protein', 'decoy', 'taxID', f'taxID_species'])
uniprot_genus_nr = ReferenceWriter.read_csv_with_generic_function(path_to_uniprot_reduced_result_genus_nr,
                                            ['Protein', 'decoy', 'taxID', f'taxID_genus'])



In [226]:
fdr_pos_species, number_psms, decoys = psm.determine_FDR_position(uniprot_species_nr, 0.05, True)
fdr_pos_genus, number_psms, decoys = psm.determine_FDR_position(uniprot_genus_nr, 0.05, True)

Number of PSMs: 47476
Number of decoys: 2498
double identified spectra 3322
Position FDR border/Number of PSMs: 53296
Number of PSMs: 35571
Number of decoys: 1872
double identified spectra 4579
Position FDR border/Number of PSMs: 42022


In [241]:
def rows_containg_taxon(taxon, tax_set_column):
    true_false_list=[]
    for tax_set in tax_set_column:
        if not pd.isna(tax_set):
            true_false_list.append(taxon in tax_set)
    return true_false_list

In [244]:
uniprot_species_nr = uniprot_species_nr[0:fdr_pos_species]
uniprot_genus_nr = uniprot_genus_nr[0:fdr_pos_genus]
print(len(set(uniprot_species_nr[uniprot_species_nr.taxID != {'CRAP/DECOY'}]["Title"].tolist())))
print(len(set(uniprot_genus_nr[uniprot_genus_nr.taxID != {'CRAP/DECOY'}]["Title"].tolist())))
print('Spectra identification Bacillus subtilis: ',len(set(uniprot_species_nr[rows_containg_taxon(1423, uniprot_species_nr.taxID_species.tolist())]["Title"].tolist())))
print('Spectra identification Bacillus: ',len(set(uniprot_genus_nr[rows_containg_taxon(1386, uniprot_genus_nr.taxID_genus.tolist())]["Title"].tolist())))
all_spectra_set = HelperMethod.get_all_spectra_IDs(path_to_spectra_file)
all_spectra_species_genus_df = HelperMethod.create_df_with_all_spectra_and_dfs_to_compare(all_spectra_set, uniprot_species_nr, 'Title', uniprot_genus_nr, 'Title')
all_spectra_species_genus_df = all_spectra_species_genus_df[all_spectra_species_genus_df.Hyperscore_x.notna() & all_spectra_species_genus_df.Hyperscore_y.notna()]

49974
37443
Spectra identification Bacillus subtilis:  1010
Spectra identification Bacillus:  2577


In [268]:
# get difference by taxon
def rows_one_column_with_taxid_one_without(taxon_level1, taxon_level2, tax_set_column1, tax_set_column2):
    true_false_list=[]
    for tax_set1, tax_set2 in zip(tax_set_column1, tax_set_column2):
        if not pd.isna(tax_set1):
            if taxon_level1 in tax_set1 and taxon_level2 not in tax_set2:
                true_false_list.append(True)
            else:
                true_false_list.append(False)
        else:
            if pd.isna(tax_set2):
                true_false_list.append(True)
            else:
                true_false_list.append(False)
    return true_false_list


In [193]:
taxid_bacillus_subtilis_species = taxon_graph.find_level_up(1423, 'species')
taxid_bacillus_subtilis_genus = taxon_graph.find_level_up(1423, 'genus')
print(taxid_bacillus_subtilis_species, taxid_bacillus_subtilis_genus)

1423 1386


In [298]:
bacillus_difference_df = all_spectra_species_genus_df[HelperMethod.get_rows_with_different_taxa(all_spectra_species_genus_df['taxID_species'].tolist(), all_spectra_species_genus_df['taxID_genus'].tolist(), taxid_bacillus_subtilis_species, taxid_bacillus_subtilis_genus)]
print('number of spectra with difference for bacillus species, genus: ', len(set(bacillus_difference_df.SpectraID.tolist())))
# bacillus_difference_df[['SpectraID', 'Peptide_x', 'Peptide_y', 'Hyperscore_x', 'Hyperscore_y', 'taxID_species', 'taxID_genus']].head(30)
a = bacillus_difference_df[rows_one_column_with_taxid_one_without(1423, 1386, bacillus_difference_df.taxID_species.tolist(), bacillus_difference_df.taxID_genus.tolist())]
# a[['SpectraID', 'Peptide_x', 'Peptide_y', 'Hyperscore_x', 'Hyperscore_y', 'taxID_species', 'taxID_genus']].head(30)
print('number of spectra identified for bacillus subtilis species, but not bacillus genus: ', len(set(a.SpectraID.tolist())))
b = bacillus_difference_df[rows_one_column_with_taxid_one_without(1386, 1423, bacillus_difference_df.taxID_genus.tolist(), bacillus_difference_df.taxID_species.tolist())]
# b[['SpectraID', 'Peptide_x', 'Peptide_y', 'Hyperscore_x', 'Hyperscore_y', 'taxID_species', 'taxID_genus']].head(30)
print('number of spectra identified for bacillus genus, but not bacillus subtilis species: ', len(set(b.SpectraID.tolist())))
c = bacillus_difference_df[bacillus_difference_df.Peptide_y == bacillus_difference_df.Peptide_x]
# c[['SpectraID', 'Peptide_x', 'Peptide_y', 'Hyperscore_x', 'Hyperscore_y', 'taxID_species', 'taxID_genus']].head(30)
print('spectra with same identified peptide: ', len(set(c.SpectraID.tolist())))
b[['SpectraID', 'Peptide_x', 'Peptide_y', 'Hyperscore_x', 'Hyperscore_y', 'taxID_species', 'taxID_genus']].head(30)
d = bacillus_difference_df[bacillus_difference_df.Peptide_x.isna() & bacillus_difference_df.Peptide_y.notna()]
d[['SpectraID', 'Peptide_x', 'Peptide_y', 'Hyperscore_x', 'Hyperscore_y', 'taxID_species', 'taxID_genus']].head(30)
e = bacillus_difference_df[bacillus_difference_df.Peptide_y.isna() & bacillus_difference_df.Peptide_x.notna()]
print('spectra only identified by bacillus genus and no other species taxa: ', len(set(d.SpectraID.tolist())))
print('spectra only identified by bacillus species and not genus level db: ', len(set(e.SpectraID.tolist())))
b[['SpectraID', 'Peptide_x', 'Peptide_y', 'Hyperscore_x', 'Hyperscore_y', 'taxID_species', 'taxID_genus']].head(30)
from statistics import mean
mean_length = mean([len(tax_set) for tax_set in b.taxID_genus])
genus_taxa_list = [item for sublist in b.taxID_genus for item in sublist]
genus_taxa_2_number_dict = dict((x,genus_taxa_list.count(x)) for x in set(genus_taxa_list))
print('mean number of genus taxa identify spectra, also identified by bacillus genus and not by bacillus subtilis species', mean_length)
print(genus_taxa_2_number_dict)

number of spectra with difference for bacillus species, genus:  1858
number of spectra identified for bacillus subtilis species, but not bacillus genus:  259
number of spectra identified for bacillus genus, but not bacillus subtilis species:  1816
spectra with same identified peptide:  1366
spectra only identified by bacillus genus and no other species taxa:  0
spectra only identified by bacillus species and not genus level db:  0
mean number of genus taxa identify spectra, also identified by bacillus genus and not by bacillus subtilis species 3.932239965472594
{1822464: 322, 40323: 491, 265: 181, 270: 19, 914: 277, 535: 257, 286: 910, 561: 1084, 497726: 1, 590: 1153, 35798: 153, 106589: 266, 226: 346, 357: 109, 872: 82, 1386: 2317, 3052: 28, 379: 217, 'DECOY/CRAP': 132, 1279: 766}


In [260]:
df = all_spectra_species_genus_df[['SpectraID', 'Peptide_x', 'Hyperscore_x', 'taxID_species', 'Peptide_y', 'Hyperscore_y', 'taxID_genus']]


In [198]:
def taxon_identified_in_spectra(taxid_species_list, taxid_genus_list, taxid_species, taxid_genus):
    true_false_list = []
    for taxa_set_species, taxa_set_genus in zip(taxid_species_list, taxid_genus_list):
        if not pd.isna(taxa_set_species):
            if taxid_species in taxa_set_species:
                true_false_list.append(True)
                continue
        if not pd.isna(taxa_set_genus):
            if taxid_genus in taxa_set_genus:
                true_false_list.append(True) 
                continue
        true_false_list.append(False)
    return true_false_list

In [220]:
df = df[taxon_identified_in_spectra(df.taxID_species.tolist(), df.taxID_genus.tolist(), taxid_bacillus_subtilis_species, taxid_bacillus_subtilis_genus)]
df.head(30)
print(max(df.Hyperscore_x.tolist()))
print(min(df.Hyperscore_x.tolist()))
print(max(df.Hyperscore_y.tolist()))
print(min(df.Hyperscore_y.tolist()))

112.9
8.1
112.9
8.7


In [201]:
 def flatten_set( s):
        return {item for sublist in s for item in sublist}

In [207]:
taxids_species_bacillus = set(flatten_set([taxset for taxset in df.taxID_species.tolist() if not pd.isna(taxset)]))
taxids_species_bacillus.remove('DECOY/CRAP')
print(taxids_species_bacillus)
taxids_genus_bacillus = {taxon_graph.find_level_up(taxon, 'genus') for taxon in taxids_species_bacillus}
print(taxids_genus_bacillus)

{1280, 384, 40324, 36873, 266, 1176649, 28108, 1149133, 1423, 1231, 274, 915, 536, 1034015, 44577, 28901, 294, 3055, 881, 562, 119219, 329852, 1294143}
{1822464, 40323, 265, 270, 914, 535, 286, 561, 497726, 590, 35798, 11990, 106589, 226, 357, 872, 1386, 3052, 379, 1279}


In [None]:
import pandas as pd
path_to_uniprot_result_species_nr = '/home/jules/Documents/Tax2Proteome/benchmarking/results_searchgui_xtandem_analyzer_bachelor_thesis/uniprot/x_tandem_tsv/Run1_U1_2000ng_uniprot_species_nr.t.xml.tsv'
path_to_uniprot_result_genus_nr = '/home/jules/Documents/Tax2Proteome/benchmarking/results_searchgui_xtandem_analyzer_bachelor_thesis/uniprot/x_tandem_tsv/Run1_U1_2000ng_uniprot_genus_nr.t.xml.tsv'
