In [61]:
import pandas as pd
import numpy as np
from pathlib import Path
from create_reference_from_tsv_and_pepxml import ReferenceWriter
from create_PSM_df import PSM_FDR
from collections import defaultdict
from ReadAccTaxon import ReadAccTaxon
# load taxon graph
import sys  
sys.path.insert(0, '/home/jules/tax2proteome_projects/tax2proteome/')
from TaxonGraph import TaxonGraph
taxon_graph = TaxonGraph()
taxon_graph.create_graph("/home/jules/Documents/Metaproteomics/databases/databases_tax2proteome/taxdump.tar.gz")

path_to_bachelor_results = "/home/jules/Documents/Metaproteomics/Tax2Proteome/benchmarking/results_searchgui_xtandem_analyzer_bachelor_thesis/"

uniprot_nr_reduced_tsv = {
    'subspecies': path_to_bachelor_results + "/uniprot_kleiner/x_tandem_tsv/Run1_U1_2000ng_uniprot_subspecies.t.xml_new_reduced.tsv",
    'species': path_to_bachelor_results + "/uniprot_kleiner/x_tandem_tsv/Run1_U1_2000ng_uniprot_species_nr.t.xml_new_reduced.tsv",
    'genus': path_to_bachelor_results + "/uniprot_kleiner/x_tandem_tsv/Run1_U1_2000ng_uniprot_genus_nr.t.xml_new_reduced.tsv",
    'family': path_to_bachelor_results + "/uniprot_kleiner/x_tandem_tsv/Run1_U1_2000ng_uniprot_family_nr.t.xml_new_reduced.tsv",
}

Kleiner_2 = [("Desulfovibrio vulgaris", [881]), ("Thermus thermophilus", [274]),
             ("Stenotrophomonas maltophilia", [40324]), ("Chlamydomonas reinhardtii", [3055]),  
             ("Nitrososphaera viennensis", [1034015]), ("altermonas macleodii", [28108]), 
             ("chromobacterium violaceum", [536]), ("Paracoccus denitrificans", [266]),
             ("staphylococcus aureus", [1280]), ("bacillus subtilis", [1423])]

Kleiner_3 = [("Enterobacteriaceae", [28901, 562],  [543]), ("Nitrosomonadaceae", [44577, 915, 1231],  [206379]),
             ("Rhizobium", [1176649, 384],  [82115]), ("Pseudomonas", [294, 1149133, 1294143], [135621]),
             ("viruses", [10754, 101570, 1985310, 329852, 1977402], [10744, 10699, 10662, 11989, 10860]),
             ("paraburkholderia", [36873, 119219], [119060])]

def get_hit_rows2(decoy_column):
    return [True if d_set in  [{True, False}, {False}] else False for d_set in decoy_column]

def get_psm_and_df_in_fdr(file, fdr, remove_one_charged_spectra=False, columns=None):
    cs = ['Protein', 'Hyperscore', 'decoy', 'taxID']
    if columns:
        cs = cs + columns
    reduced_df = ReferenceWriter.read_csv_with_generic_function(file, cs, remove_one_charged_spectra)
    fdr_pos_result, number_psm_result, number_decoy_result, double_spectra_result, score_last_item_result = PSM_FDR.determine_FDR_position(reduced_df, fdr)
    return number_psm_result, reduced_df[0:fdr_pos_result]

def get_df_in_fdr_without_decoy(file, fdr, remove_one_charged_spectra=True, columns=None):
    df = get_psm_and_df_in_fdr(file, fdr, remove_one_charged_spectra, columns)[1]
    df = df[get_hit_rows2(df.decoy)]
    return df

def get_acc_rows(column, acc):
    return [True if acc in acc_list else False for acc_list in column ]

def get_taxa_rows(column, taxID):
    if type(taxID)==int:
        return [True if taxID in t_set else False for t_set in column]
    elif type(taxID)==list:
        return [True if len(set(taxID).intersection(set(taxa_set)))>0 else False for taxa_set in column]

def get_exclusive_rows(column, taxid):
    if type(taxid)==int:
        return [True if {taxid} == t_set else False for t_set in column]
    elif type(taxid)==list:
        return [True if len(set(t_set).difference(taxid))==0 else False for t_set in column]

def remove_acc_row(column, acc_to_remove_set):
    return [False if len(set(accs).difference(acc_to_remove_set))==0 else True for accs in column]

def remove_empty_rows(column):
    return [False if len(accs)==0 else True for accs in column]

def add_tax_information(acc_list, acc2tax_dict, level):
    tax_list=[]
    for acc in acc_list:
        try:
            tax_list.append(taxon_graph.find_level_up(acc2tax_dict[acc], level))
        except KeyError:
            tax_list.append('CRAP')
    return tax_list

def remove_all_accs_with_less_then_x_peptides(df, level, nb_peptides, acc2tax_dict):
    acc_to_peptide_dict=defaultdict(set)
    accs_to_remove= set()
    accs_to_keep = set()
    for index, row in df.iterrows():
        for acc in row['Protein']:
            acc_to_peptide_dict[acc].add(row["Peptide"])
    for acc, pep_set in acc_to_peptide_dict.items():
        if len(pep_set)<nb_peptides:
            accs_to_remove.add(acc)
        else:
            accs_to_keep.add(acc)
    df_no_one_hits=df[remove_acc_row(df.Protein, accs_to_remove)]
    df_no_one_hits.Protein = df_no_one_hits.Protein.apply(lambda acc_list: set(acc_list).intersection(accs_to_keep))
    df_no_one_hits.Protein = df_no_one_hits.Protein.apply(lambda acc_set: sorted(list(acc_set)))
    df_no_one_hits = df_no_one_hits[remove_empty_rows(df_no_one_hits.Protein)]
    df_no_one_hits[f"taxID_{level}"] = df_no_one_hits.Protein.apply(lambda acc_list: add_tax_information(acc_list, acc2tax_dict, level))

    return df_no_one_hits

def remove_unrelated_accs_and_taxa(df, taxids, level, acc_2_taxid_dict):
    df.Protein = df.Protein.apply(lambda acc_list: [acc for acc in acc_list if \
                                                    get_taxid_of_acc(acc, level, taxon_graph, acc_2_taxid_dict) in taxids])

    df[f"taxID_{level}"] = df.Protein.apply(lambda acc_list: [get_taxid_of_acc(acc, level, taxon_graph, acc_2_taxid_dict) for acc in acc_list])
    return df

def taxon_exclusive_spectra(df, df_no_one_hits, taxon, level, acc_2_taxid_dict):
    df_identified_spectra_for_taxa = df[get_taxa_rows(df[f"taxID_{level}"], taxon)]
    if df_identified_spectra_for_taxa.empty:
        df_identified_spectra_for_taxa = pd.DataFrame(columns=["Title", "Protein", f"taxID_species"])
    df_exclusive_spectra = df_identified_spectra_for_taxa[get_exclusive_rows(df_identified_spectra_for_taxa[f"taxID_{level}"], taxon)]
    if df_exclusive_spectra.empty:
        df_exclusive_spectra = pd.DataFrame(columns=["Title", "Protein", f"taxID_species"])
    exclusive_spectra = set(df_exclusive_spectra.Title)
    df_identified_spectra_for_taxa = remove_unrelated_accs_and_taxa(df_identified_spectra_for_taxa, taxon, level, acc_2_taxid_dict)
    all_identified_accs = set([item for sublist in df_identified_spectra_for_taxa.Protein for item in sublist])
    all_identified_spectra = set(df_identified_spectra_for_taxa.Title)

    df_identified_spectra_no_one_hits_for_taxa = df_no_one_hits[get_taxa_rows(df_no_one_hits[f"taxID_{level}"], taxon)]
    if df_identified_spectra_no_one_hits_for_taxa.empty:
        df_identified_spectra_no_one_hits_for_taxa = pd.DataFrame(columns=["Title", "Protein", f"taxID_species"])
    df_exclusive_spectra_no_one_hits = df_identified_spectra_no_one_hits_for_taxa[get_exclusive_rows(df_identified_spectra_no_one_hits_for_taxa[f"taxID_{level}"], taxon)]
    if df_exclusive_spectra_no_one_hits.empty:
        df_exclusive_spectra_no_one_hits = pd.DataFrame(columns=["Title", "Protein", f"taxID_species"])
    exclusive_spectra_no_one_hits = set(df_exclusive_spectra_no_one_hits.Title)
    df_identified_spectra_no_one_hits_for_taxa = remove_unrelated_accs_and_taxa(df_identified_spectra_no_one_hits_for_taxa, taxon, level, acc_2_taxid_dict)
    all_identified_spectra_no_one_hits = set(df_identified_spectra_no_one_hits_for_taxa.Title)
    all_identified_accs_no_one_hits = set([item for sublist in df_identified_spectra_no_one_hits_for_taxa.Protein for item in sublist])

    return all_identified_spectra, all_identified_spectra_no_one_hits, all_identified_accs, all_identified_accs_no_one_hits, exclusive_spectra, exclusive_spectra_no_one_hitsrn all_identified_spectra, all_identified_spectra_no_one_hits, all_identified_accs, all_identified_accs_no_one_hits, exclusive_spectra, exclusive_spectra_no_one_hits


def get_acc2taxid_dict(df_in_fdr_uniprot_species, df_in_fdr_uniprot_family):
    all_accs={item for sublist in df_in_fdr_uniprot_species.Protein for item in sublist}
    all_accs=all_accs.union({item for sublist in df_in_fdr_uniprot_family.Protein for item in sublist})
    final_accs = set()
    for acc in all_accs:
        try:
            final_accs.add(acc.split('|')[1])
        except:
            final_accs.add(acc)
    print(len(final_accs))
    acc2tax_reader=ReadAccTaxon("/home/jules/Documents/Metaproteomics/databases/databases_tax2proteome/", "uniprot")
    acc_to_taxid_dict = acc2tax_reader.read_acc2tax(final_accs)
    acc_to_taxid_dict = {key: int(taxid) for key, taxid in acc_to_taxid_dict.items()}
    return acc_to_taxid_dict

def get_taxids_of_accs(acc_list, level, taxon_graph, acc_to_taxid_dict):
    taxid_list = []
    for acc in acc_list:
        taxid_list.append(get_taxid_of_acc(acc, level, taxon_graph, acc_to_taxid_dict))
    return taxid_list

def get_taxid_of_acc(acc, level, taxon_graph, acc_to_taxid_dict):
    try:
        taxid = taxon_graph.find_level_up(acc_to_taxid_dict[acc], level)
    except KeyError:
        taxid = "CRAP"
    return taxid

def sort_taxid_and_acc_in_df(df, level, acc_to_taxid_dict):
    df.Protein = df.Protein.apply(lambda acc_set: sorted(list(acc_set)))
    df[f"taxID_{level}"] = df.Protein.apply(lambda acc_list: get_taxids_of_accs(acc_list, level, taxon_graph, acc_to_taxid_dict))
    return df

def spectra_identified_in_df(df_species, df_family, taxon_species, taxon_family):
    species_spectra = set(df_species[get_taxa_rows(df_species.taxID_species, taxon_species)].Title)
    family_spectra = set(df_family[get_taxa_rows(df_family.taxID_family, taxon_family)].Title)
    new_identified_spectra = family_spectra.difference(species_spectra)
    all_species_spectra = set(df_species.Title)
    spectra_in_species_df = new_identified_spectra.intersection(all_species_spectra)
    return new_identified_spectra, spectra_in_species_df

def load_spec_and_fam_df():
    fdr=0.05
    df_in_fdr_uniprot_species = get_df_in_fdr_without_decoy(uniprot_nr_reduced_tsv['species'], fdr, columns=['taxID_species'])
    df_in_fdr_uniprot_family =  get_df_in_fdr_without_decoy(uniprot_nr_reduced_tsv['family'], fdr, columns=['taxID_family'])
    df_in_fdr_uniprot_species.Protein = df_in_fdr_uniprot_species.Protein.apply(lambda acc_set: {acc.split('|')[1] \
                                                                                                 for acc in acc_set})
    df_in_fdr_uniprot_family.Protein = df_in_fdr_uniprot_family.Protein.apply(lambda acc_set: {acc.split('|')[1] \
                                                                                               for acc in acc_set})

    return df_in_fdr_uniprot_species, df_in_fdr_uniprot_family

def write_header(out):
    out.write(f"name\ttaxon_species\ttaxon_family\t")    
    out.write(f"all identified_spectra for taxa level species\tall identified_spectra for taxa level family\t")
    out.write(f"all identified_spectra for taxa level species no one hits\tall identified_spectra for taxa level family no one hits\t")
    out.write("all_identified_accs_spe\tall_identified_accs_spe_no_one_hits\t")
    out.write("all_identified_accs_fam\tall_identified_accs_fam_no_one_hits\t")
    out.write(f"exclusive spectra for taxa level species\texclusive spectra for taxa level family\t")
    out.write(f"intersection species and family exclusive spectra\t")
    out.write(f"exclusive spectra for taxa level species no one hits\texclusive spectra for taxa level family no one hits\t")
    out.write(f"intersection species and family exclusive spectra no one hits\t")
    out.write(f"new_identified_spectra in fam\tnew_identified_spectra in fam bereits in species_df\t")
    out.write(f"new_identified_spectra in fam no one hits\tnew_identified_spectra in fam bereits in species_df no one hit\n")
    
def write_results(out, name, taxon_species, taxon_family, 
                  all_identified_spectra_spe, all_identified_spectra_fam,
                  all_identified_spectra_no_one_hits_spe, all_identified_spectra_no_one_hits_fam,                  
                  all_identified_accs_spe, all_identified_accs_spe_no_one_hits,
                  all_identified_accs_fam, all_identified_accs_fam_no_one_hits,
                  exclusive_spectra_spe, exclusive_spectra_fam,
                  exclusive_spectra_no_one_hits_spe, exclusive_spectra_no_one_hits_fam,
                  new_identified_spectra, spectra_in_species_df,
                  new_identified_spectra_no_one_hits, spectra_in_species_df_no_one_hits):
    out.write(f"{name}\t{taxon_species}\t{taxon_family}\t")
    out.write(f"{len(all_identified_spectra_spe)}\t{len(all_identified_spectra_fam)}\t")
    out.write(f"{len(all_identified_spectra_no_one_hits_spe)}\t{len(all_identified_spectra_no_one_hits_fam)}\t")
    out.write(f"{len(all_identified_accs_spe)}\t{len(all_identified_accs_spe_no_one_hits)}\t")
    out.write(f"{len(all_identified_accs_fam)}\t{len(all_identified_accs_fam_no_one_hits)}\t")
    out.write(f"{len(exclusive_spectra_spe)}\t{len(exclusive_spectra_fam)}\t")
    out.write(f"{len(exclusive_spectra_spe.intersection(exclusive_spectra_fam))}\t")
    out.write(f"{len(exclusive_spectra_no_one_hits_spe)}\t{len(exclusive_spectra_no_one_hits_fam)}\t")
    out.write(f"{len(exclusive_spectra_no_one_hits_spe.intersection(exclusive_spectra_no_one_hits_fam))}\t")
    out.write(f"{len(new_identified_spectra)}\t{len(spectra_in_species_df)}\t")
    out.write(f"{len(new_identified_spectra_no_one_hits)}\t{len(spectra_in_species_df_no_one_hits)}\t")
    out.write("\n")


In [2]:
df_in_fdr_uniprot_species, df_in_fdr_uniprot_family = load_spec_and_fam_df()

In [3]:
acc_to_taxid_dict = get_acc2taxid_dict(df_in_fdr_uniprot_species, df_in_fdr_uniprot_family)

775100
Start reading accession2prot database file with 8 threads.
10% read.
20% read.
30% read.
40% read.
50% read.
60% read.
70% read.
80% read.
90% read.


In [4]:
df_in_fdr_uniprot_species = sort_taxid_and_acc_in_df(df_in_fdr_uniprot_species, 'species', acc_to_taxid_dict)
df_in_fdr_uniprot_family = sort_taxid_and_acc_in_df(df_in_fdr_uniprot_family, 'family', acc_to_taxid_dict)

In [76]:
df_in_fdr_uniprot_species_no_one_hits = remove_all_accs_with_less_then_x_peptides(df_in_fdr_uniprot_species.copy(deep=True),  
                                                                                  'species', 2, acc_to_taxid_dict)
df_in_fdr_uniprot_family_no_one_hits = remove_all_accs_with_less_then_x_peptides(df_in_fdr_uniprot_family.copy(deep=True),  
                                                                                 'family', 2, acc_to_taxid_dict)

In [68]:
with open("/home/jules/Documents/Metaproteomics/Tax2Proteome/benchmarking/results_analysis/results_per_taxa_more_then_1_hits.tsv", "w") as out:
        write_header(out)

with open("/home/jules/Documents/Metaproteomics/Tax2Proteome/benchmarking/results_analysis/results_per_taxa_more_then_1_hits.tsv", "a") as out:
    for name, taxon_species in Kleiner_2:
        taxon_family = [taxon_graph.find_level_up(taxon_species[0], "family")]
        
        all_identified_spectra_spe, all_identified_spectra_no_one_hits_spe, all_identified_accs_spe, all_identified_accs_spe_no_one_hits, exclusive_spectra_spe, exclusive_spectra_no_one_hits_spe = \
            taxon_exclusive_spectra(df_in_fdr_uniprot_species, df_in_fdr_uniprot_species_no_one_hits, taxon_species, 'species', acc_to_taxid_dict)
        all_identified_spectra_fam, all_identified_spectra_no_one_hits_fam, all_identified_accs_fam, all_identified_accs_fam_no_one_hits, exclusive_spectra_fam, exclusive_spectra_no_one_hits_fam = \
            taxon_exclusive_spectra(df_in_fdr_uniprot_family, df_in_fdr_uniprot_family_no_one_hits, taxon_family, 'family', acc_to_taxid_dict)
        new_identified_spectra, spectra_in_species_df = spectra_identified_in_df(df_in_fdr_uniprot_species,
                                                                                 df_in_fdr_uniprot_family, taxon_species, taxon_family)
        
        new_identified_spectra_no_one_hits, spectra_in_species_df_no_one_hits = spectra_identified_in_df(df_in_fdr_uniprot_species_no_one_hits,
                                                                                 df_in_fdr_uniprot_family_no_one_hits, taxon_species, taxon_family)
        write_results(out, name, taxon_species, taxon_family, 
                  all_identified_spectra_spe, all_identified_spectra_fam,
                  all_identified_spectra_no_one_hits_spe, all_identified_spectra_no_one_hits_fam,                  
                  all_identified_accs_spe, all_identified_accs_spe_no_one_hits,
                  all_identified_accs_fam, all_identified_accs_fam_no_one_hits,
                  exclusive_spectra_spe, exclusive_spectra_fam,
                  exclusive_spectra_no_one_hits_spe, exclusive_spectra_no_one_hits_fam,
                  new_identified_spectra, spectra_in_species_df,
                  new_identified_spectra_no_one_hits, spectra_in_species_df_no_one_hits)

    for name, taxon_species, taxon_family in Kleiner_3:
        all_identified_spectra_spe, all_identified_spectra_no_one_hits_spe, all_identified_accs_spe, all_identified_accs_spe_no_one_hits, exclusive_spectra_spe, exclusive_spectra_no_one_hits_spe = \
           taxon_exclusive_spectra(df_in_fdr_uniprot_species, df_in_fdr_uniprot_species_no_one_hits, taxon_species, 'species', acc_to_taxid_dict)
        all_identified_spectra_fam, all_identified_spectra_no_one_hits_fam, all_identified_accs_fam, all_identified_accs_fam_no_one_hits, exclusive_spectra_fam, exclusive_spectra_no_one_hits_fam = \
            taxon_exclusive_spectra(df_in_fdr_uniprot_family, df_in_fdr_uniprot_family_no_one_hits, taxon_family, 'family', acc_to_taxid_dict)
        new_identified_spectra, spectra_in_species_df = spectra_identified_in_df(df_in_fdr_uniprot_species,
                                                                                 df_in_fdr_uniprot_family, taxon_species, taxon_family)
        
        new_identified_spectra_no_one_hits, spectra_in_species_df_no_one_hits = spectra_identified_in_df(df_in_fdr_uniprot_species_no_one_hits,
                                                                                 df_in_fdr_uniprot_family_no_one_hits, taxon_species, taxon_family)
        write_results(out, name, taxon_species, taxon_family, 
                  all_identified_spectra_spe, all_identified_spectra_fam,
                  all_identified_spectra_no_one_hits_spe, all_identified_spectra_no_one_hits_fam,                  
                  all_identified_accs_spe, all_identified_accs_spe_no_one_hits,
                  all_identified_accs_fam, all_identified_accs_fam_no_one_hits,
                  exclusive_spectra_spe, exclusive_spectra_fam,
                  exclusive_spectra_no_one_hits_spe, exclusive_spectra_no_one_hits_fam,
                  new_identified_spectra, spectra_in_species_df,
                  new_identified_spectra_no_one_hits, spectra_in_species_df_no_one_hits)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"taxID_{level}"] = df.Protein.apply(lambda acc_list: [get_taxid_of_acc(acc, level, taxon_graph, acc_2_taxid_dict) for acc in acc_list])


In [57]:
vt = ("viruses", [10754, 101570, 1985310, 329852, 1977402], [10744, 10699, 10662, 11989, 10860])
all_identified_spectra_spe, all_identified_spectra_no_one_hits_spe, all_identified_accs_spe, all_identified_accs_spe_no_one_hits, exclusive_spectra_spe, exclusive_spectra_no_one_hits_spe = \
            taxon_exclusive_spectra(df_in_fdr_uniprot_species, df_in_fdr_uniprot_species_no_one_hits, vt[1], 'species', acc_to_taxid_dict)
all_identified_spectra_fam, all_identified_spectra_no_one_hits_fam, all_identified_accs_fam, all_identified_accs_fam_no_one_hits, exclusive_spectra_fam, exclusive_spectra_no_one_hits_fam = \
    taxon_exclusive_spectra(df_in_fdr_uniprot_family, df_in_fdr_uniprot_family_no_one_hits, vt[2], 'family', acc_to_taxid_dict)
new_identified_spectra, spectra_in_species_df = spectra_identified_in_df(df_in_fdr_uniprot_species,
                                                                        df_in_fdr_uniprot_family, 
                                                                         vt[1], vt[2])
new_identified_spectra_no_one_hits, spectra_in_species_df_no_one_hits = spectra_identified_in_df(df_in_fdr_uniprot_species_no_one_hits,
                                                                        df_in_fdr_uniprot_family_no_one_hits, 
                                                                         vt[1], vt[2])
print("all_identified_spectra_fam: ", len(all_identified_spectra_fam), "\nall_identified_spectra_no_one_hits_fam: ", 
      len(all_identified_spectra_no_one_hits_fam), "\nall_identified_accs_fam: ", len(all_identified_accs_fam), 
      "\nall_identified_accs_fam_no_one_hits: ", len(all_identified_accs_fam_no_one_hits), 
      "\nspectra_in_species_df no one hits: ", len(spectra_in_species_df),
      "\nexclusive_spectra_fam: ", len(exclusive_spectra_fam), "\nexclusive_spectra_no_one_hits_fam: ", len(exclusive_spectra_no_one_hits_fam))
print(        len(exclusive_spectra_no_one_hits_spe), len(exclusive_spectra_no_one_hits_fam),
               len(new_identified_spectra), len(spectra_in_species_df),
                  len(new_identified_spectra_no_one_hits), len(spectra_in_species_df_no_one_hits))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"taxID_{level}"] = df.Protein.apply(lambda acc_list: [get_taxid_of_acc(acc, level, taxon_graph, acc_2_taxid_dict) for acc in acc_list])


all_identified_spectra_fam:  175 
all_identified_spectra_no_one_hits_fam:  70 
all_identified_accs_fam:  280 
all_identified_accs_fam_no_one_hits:  7 
spectra_in_species_df no one hits:  153 
exclusive_spectra_fam:  29 
exclusive_spectra_no_one_hits_fam:  0
0 0 174 153 70 70


In [77]:
("staphylococcus aureus", [1280]), ("bacillus subtilis", [1423])
taxon_family_b = [taxon_graph.find_level_up(1423, "family")]
taxon_family_s = [taxon_graph.find_level_up(1280, "family")]        
all_identified_spectra_spe_b, all_identified_spectra_no_one_hits_spe_b, all_identified_accs_spe_b, \
all_identified_accs_spe_no_one_hits_b, exclusive_spectra_spe_b, exclusive_spectra_no_one_hits_spe_b = \
    taxon_exclusive_spectra(df_in_fdr_uniprot_species, df_in_fdr_uniprot_species_no_one_hits, [1423], 'species', acc_to_taxid_dict)
all_identified_spectra_fam_b, all_identified_spectra_no_one_hits_fam_b, all_identified_accs_fam_b, \
all_identified_accs_fam_no_one_hits_b, exclusive_spectra_fam_b, exclusive_spectra_no_one_hits_fam_b = \
    taxon_exclusive_spectra(df_in_fdr_uniprot_family, df_in_fdr_uniprot_family_no_one_hits, taxon_family_b, 'family', acc_to_taxid_dict)
new_identified_spectra_b, spectra_in_species_df_b = spectra_identified_in_df(df_in_fdr_uniprot_species,\
                                                                         df_in_fdr_uniprot_family, [1423], taxon_family_b)

new_identified_spectra_no_one_hits_b, spectra_in_species_df_no_one_hits_b = spectra_identified_in_df(df_in_fdr_uniprot_species_no_one_hits,\
                                                                         df_in_fdr_uniprot_family_no_one_hits, [1423], taxon_family_b)

all_identified_spectra_spe_s, all_identified_spectra_no_one_hits_spe_s, all_identified_accs_spe_s, \
all_identified_accs_spe_no_one_hits_s, exclusive_spectra_spe_s, exclusive_spectra_no_one_hits_spe_s = \
    taxon_exclusive_spectra(df_in_fdr_uniprot_species, df_in_fdr_uniprot_species_no_one_hits, [1280], 'species', acc_to_taxid_dict)
all_identified_spectra_fam_s, all_identified_spectra_no_one_hits_fam_s, all_identified_accs_fam_s, \
all_identified_accs_fam_no_one_hits_s, exclusive_spectra_fam_s, exclusive_spectra_no_one_hits_fam_s = \
    taxon_exclusive_spectra(df_in_fdr_uniprot_family, df_in_fdr_uniprot_family_no_one_hits, taxon_family_s, 'family', acc_to_taxid_dict)
new_identified_spectra_s, spectra_in_species_df_s = spectra_identified_in_df(df_in_fdr_uniprot_species,\
                                                                         df_in_fdr_uniprot_family, [1280], taxon_family_s)

new_identified_spectra_no_one_hits_s, spectra_in_species_df_no_one_hits_s = spectra_identified_in_df(df_in_fdr_uniprot_species_no_one_hits,\
                                                                         df_in_fdr_uniprot_family_no_one_hits, [1280], taxon_family_s)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"taxID_{level}"] = df.Protein.apply(lambda acc_list: [get_taxid_of_acc(acc, level, taxon_graph, acc_2_taxid_dict) for acc in acc_list])


In [83]:
("staphylococcus aureus", [1280]), ("bacillus subtilis", [1423])
taxon_family_b = taxon_graph.find_level_up(1423, "family")
taxon_family_s = taxon_graph.find_level_up(1280, "family")        
all_identified_spectra_spe_bs, all_identified_spectra_no_one_hits_spe_bs, all_identified_accs_spe_bs, \
all_identified_accs_spe_no_one_hits_bs, exclusive_spectra_spe_bs, exclusive_spectra_no_one_hits_spe_bs = \
    taxon_exclusive_spectra(df_in_fdr_uniprot_species, df_in_fdr_uniprot_species_no_one_hits, [1423, 1280], 'species', acc_to_taxid_dict)
all_identified_spectra_fam_bs, all_identified_spectra_no_one_hits_fam_bs, all_identified_accs_fam_bs, \
all_identified_accs_fam_no_one_hits_bs, exclusive_spectra_fam_bs, exclusive_spectra_no_one_hits_fam_bs = \
    taxon_exclusive_spectra(df_in_fdr_uniprot_family, df_in_fdr_uniprot_family_no_one_hits, [taxon_family_b, taxon_family_s], 'family', acc_to_taxid_dict)
new_identified_spectra_bs, spectra_in_species_df_bs = spectra_identified_in_df(df_in_fdr_uniprot_species,\
                                                                         df_in_fdr_uniprot_family, [1423, 1280], [taxon_family_b, taxon_family_s])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"taxID_{level}"] = df.Protein.apply(lambda acc_list: [get_taxid_of_acc(acc, level, taxon_graph, acc_2_taxid_dict) for acc in acc_list])


In [84]:
print(len(all_identified_spectra_fam_s))
print(len(all_identified_spectra_fam_b))
print(len(all_identified_spectra_fam_bs))
print(len(all_identified_spectra_fam_b.intersection(all_identified_spectra_fam_s)))
print(len(all_identified_spectra_no_one_hits_spe_s))
print(len(all_identified_spectra_no_one_hits_spe_b))
print(len(all_identified_spectra_no_one_hits_spe_bs))
print(len(all_identified_spectra_no_one_hits_spe_s.intersection(all_identified_spectra_no_one_hits_spe_b)))


2850
2352
4315
887
656
665
1216
105


In [44]:
display(df_in_fdr_uniprot_family[df_in_fdr_uniprot_family.Title.isin(exclusive_spectra_fam)])

Unnamed: 0.1,Unnamed: 0,Title,Peptide,Hyperscore,Protein,decoy,taxID,taxID_family
23534,23534,Run1_U1_2000ng.10379.10379.2,IVIGKDEKGEA,39.0,"[A0A022FSK6, A0A0A6Q732, A0A0B3SUN3, A0A0B5FAY...",{False},"{264198, 2027911, 2546443, 942865, 942866, 795...","[119060, 119060, 119060, 119060, 119060, 11906..."
23535,23535,Run1_U1_2000ng.10379.10379.2,LVIGKDEKGEA,39.0,"[A0A0B1YBD4, A0A0D5AJP6, A0A0F0E454, A0A0S4TXR...",{False},"{190721, 1235457, 105219, 29443, 266264, 12170...","[119060, 119060, 119060, 119060, 119060, 11906..."
23536,23536,Run1_U1_2000ng.10379.10379.2,LVLGAGDEKTR,39.0,"[A0A345MGD1, A0A345MHC5]",{False},"{2283304, 2283303}","[10699, 10699]"
29458,29458,Run1_U1_2000ng.79182.79182.2,VINTWADIINR,34.9,[V5UVF5],{False},{1433841},[3051]
29459,29459,Run1_U1_2000ng.79182.79182.2,VINTWADILNR,34.9,"[I3ULI4, I6LFI3]",{False},{351708},"[10662, 10662]"
29460,29460,Run1_U1_2000ng.79182.79182.2,VLNTWADIINR,34.9,"[A0A0S2IBL8, A0A0S2IBL9, A0A0S2ICE7, A0A0S2ICQ...",{False},"{47904, 309537, 47906, 47907, 163303, 1034604,...","[3051, 3051, 3051, 3051, 3051, 3051, 3051, 305..."
29980,29980,Run1_U1_2000ng.57535.57535.2,QILSDGIIK,34.6,[C4NTF3],{False},{490913},[10744]
32419,32419,Run1_U1_2000ng.145406.145406.4,TQSLELDELTAGSRTPIIASLSAFDAEAEIGSREASK,33.2,[V9QKJ3],{False},{1435411},[10699]
35670,35670,Run1_U1_2000ng.117233.117233.3,IMNSDPDRVNDTIDAMAAGVNRAVMNAGR,31.5,[A0A0H4J0Q5],{False},{1655019},[10699]
36044,36044,Run1_U1_2000ng.157374.157374.3,YVFSKCVDYEGETTLTAASDIKTLIK,31.3,[A0A2S1GLZ7],{False},{2163633},[10662]


In [8]:
display(df_in_fdr_uniprot_family_no_one_hits)

Unnamed: 0.1,Unnamed: 0,Title,Peptide,Hyperscore,Protein,decoy,taxID,taxID_family
0,0,Run1_U1_2000ng.150015.150015.2,AALESTLAAITESLKEGDAVQLVGFGTFK,126.4,"[A0A0A7A0Z4, A0A0C5W875, A0A0J8VSE1, A0A0T9Y4H...",{False},"{83333, 1354253, 42895, 2583577, 2591007, 2576...","[543, 543, 543, 543, 543, 543, 543, 543, 543, ..."
1,1,Run1_U1_2000ng.125600.125600.2,APEELAGKTEEDALVAYLQGLGLNR,125.3,"[A0A352S0R2, A0A356HDG4]",{False},{1873897},"[119060, 119060]"
2,2,Run1_U1_2000ng.74833.74833.2,IEIGKENTIIIDGAGDASAIEGR,121.9,"[A0A2A4MCF7, A0A316EV17, A0A375BVD9, A0A375E8A...",{False},"{164546, 82633, 942865, 1249621, 2030806, 1001...","[119060, 119060, 119060, 119060, 119060, 11906..."
3,3,Run1_U1_2000ng.149614.149614.2,AALESTLAAITESLKEGDAVQLVGFGTFK,116.8,"[A0A0A7A0Z4, A0A0C5W875, A0A0J8VSE1, A0A0T9Y4H...",{False},"{83333, 1354253, 42895, 2583577, 2591007, 9857...","[543, 543, 543, 543, 543, 543, 543, 543, 543, ..."
4,4,Run1_U1_2000ng.112688.112688.2,ALGLYQQFDEDKGVQDVGAALATLR,116.5,"[A0A2A4M1L7, A0A2L0XCX6, A0A356HVB4, L2ELR9, Q...",{False},"{1873897, 119219, 1249621, 2030806, 266264}","[119060, 119060, 119060, 119060, 119060]"
...,...,...,...,...,...,...,...,...
43287,43287,Run1_U1_2000ng.86174.86174.2,LNLVAALIR,28.3,"[A0A081CX36, A0A083ZEP6, A0A125P3W6, A0A135P60...",{False},"{1850373, 1822225, 313367, 366618, 195105, 106...","[82115, 82115, 82115, 82115, 82115, 82115, 821..."
43296,43296,Run1_U1_2000ng.82956.82956.2,LFGLTIDPER,28.3,"[A0A078L9Z5, A0A0D0LP41, A0A0F3WE62, A0A0F6RER...",{False},"{190485, 48664, 2484250, 656410, 2576416, 2576...","[543, 543, 543, 543, 543, 543, 543, 543, 543, ..."
43297,43297,Run1_U1_2000ng.82315.82315.2,YYIEAGVPIEIK,28.3,"[A0A366FJS1, I2B9X0, L0M2E3]",{False},"{630626, 693444, 1398493}","[543, 543, 543]"
43298,43298,Run1_U1_2000ng.82315.82315.2,YYLEAGVPIEIK,28.3,"[A0A0C5VX13, A0A156J3W1, A0A1E8DY51, A0A2T8X89...",{False},"{90371, 1173778, 2583579, 179997, 550, 913070,...","[543, 543, 543, 543, 543, 543, 543, 543, 543, ..."


In [None]:
for name, taxon_species in Kleiner_2:
        taxon_family = [taxon_graph.find_level_up(taxon_species[0], "family")]
        
        all_identified_spectra_spe, all_identified_spectra_no_one_hits_spe, all_identified_accs_spe, all_identified_accs_spe_no_one_hits, exclusive_spectra_spe, exclusive_spectra_no_one_hits_spe = \
            taxon_exclusive_spectra(df_in_fdr_uniprot_species, df_in_fdr_uniprot_species_no_one_hits, taxon_species, 'species', acc_to_taxid_dict)
        all_identified_spectra_fam, all_identified_spectra_no_one_hits_fam, all_identified_accs_fam, all_identified_accs_fam_no_one_hits, exclusive_spectra_fam, exclusive_spectra_no_one_hits_fam = \
            taxon_exclusive_spectra(df_in_fdr_uniprot_family, df_in_fdr_uniprot_family_no_one_hits, taxon_family, 'family', acc_to_taxid_dict)
        new_identified_spectra, spectra_in_species_df = spectra_identified_in_df(df_in_fdr_uniprot_species,
                                                                                 df_in_fdr_uniprot_family, taxon_species, taxon_family)
        
        new_identified_spectra_no_one_hits, spectra_in_species_df_no_one_hits = spectra_identified_in_df(df_in_fdr_uniprot_species_no_one_hits,
                                                                                 df_in_fdr_uniprot_family_no_one_hits, taxon_species, taxon_family)
     