In [1]:
from pyteomics import pepxml
from pathlib import Path
from collections import defaultdict
import pandas as pd
import ast
from create_PSM_df import PSM_FDR

In [20]:
def read_pepXML(path_to_spectrum_identification_file):
    """Collect the protein hits reported for every spectrum in a pepXML file.

    Parameters
    ----------
    path_to_spectrum_identification_file : str or path-like
        Location of the pepXML file; it is converted with ``str()`` before
        being handed to pyteomics.

    Returns
    -------
    collections.defaultdict(list)
        Maps ``'Run1_' + <spectrum attribute>`` to a list of
        ``(protein_acc, percolator_q_value, peptide)`` tuples, one tuple per
        protein entry of every search hit. Spectra without any protein entry
        get no key.

    Raises
    ------
    KeyError
        If a search hit has no ``'Percolator q-Value'`` search score.
    """
    spectra_to_accs_dict = defaultdict(list)
    # The reader keeps the underlying file open; the context manager makes sure
    # it is closed even when parsing aborts part-way through.
    with pepxml.PepXML(source=str(path_to_spectrum_identification_file), use_index=True,
                       retrieve_refs=False, iterative=True) as reader:
        for spectrum_query in reader.iterfind("spectrum_query"):
            spectrum_id = 'Run1_' + spectrum_query['spectrum']
            # A query without identifications simply contributes nothing.
            for search_hit in spectrum_query.get('search_hit', ()):
                percolator_q_value = search_hit['search_score']['Percolator q-Value']
                peptide = search_hit['peptide']
                for protein in search_hit['proteins']:
                    # Only the first whitespace-separated token of the protein
                    # description is used as the accession.
                    protein_acc = protein['protein'].split()[0]
                    if '_WP_' in protein_acc:
                        # Drop whatever precedes the 'WP_' part of the accession.
                        protein_acc = 'WP_' + protein_acc.split('WP_')[1]
                    spectra_to_accs_dict[spectrum_id].append((protein_acc, percolator_q_value, peptide))
    return spectra_to_accs_dict

In [7]:
def read_acc_to_taxid_file(self, path_to_custom_tax=None):
    """Read an accession-to-taxid table into a dictionary.

    The function lives at module level but still carries a ``self`` parameter
    that its body never uses. Both call shapes are accepted:

    * ``read_acc_to_taxid_file(anything, path)`` - the two-argument form
    * ``read_acc_to_taxid_file(path)`` - the single argument is taken as the path

    Parameters
    ----------
    self : object
        Ignored when ``path_to_custom_tax`` is given; otherwise interpreted as
        the path of the file to read.
    path_to_custom_tax : str or path-like, optional
        File with whitespace-separated fields per line; the second field is
        the accession and the last field is the taxid.

    Returns
    -------
    dict
        Accession -> taxid (taxid kept as the string found in the file). If an
        accession occurs more than once, the last line wins.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than two fields.
    """
    if path_to_custom_tax is None:
        # Called with the path as the only argument.
        path_to_custom_tax = self
    acc_to_tax_dict = {}
    with open(path_to_custom_tax, 'r') as tax:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in tax:
            fields = line.split()
            if not fields:
                # Blank lines carry no mapping.
                continue
            taxid = fields[-1]
            acc = fields[1]
            if '_WP_' in acc:
                # Drop whatever precedes the 'WP_' part of the accession.
                acc = 'WP_' + acc.split('WP_')[1]
            acc_to_tax_dict[acc] = taxid
    return acc_to_tax_dict

In [8]:
def write_Kleiner_spectrum_reference_file(spectra_to_accs_dict, path_to_custom_tax):
    """Write a tab-separated reference table with the taxids of each spectrum.

    For every spectrum the taxid of each hit protein is looked up, expanded to
    one taxid per taxonomic level, and the taxids of all hits are merged per
    level into a comma-separated list.

    NOTE(review): the body reads ``self.path_to_reference_output``,
    ``self.tax_level`` and ``self.determine_level_specific_taxIDs`` although
    ``self`` is not a parameter, so a global named ``self`` has to exist when
    this function runs - confirm where it is supposed to come from.

    Parameters
    ----------
    spectra_to_accs_dict : dict
        Spectrum identifier -> list of tuples whose first item is the protein
        accession and whose second item is the score written to the file.
    path_to_custom_tax : str or path-like
        Accession-to-taxid file passed on to ``read_acc_to_taxid_file``.

    Raises
    ------
    KeyError
        If a protein accession is missing from the accession-to-taxid file.
    """
    # read_acc_to_taxid_file is declared as (self, path_to_custom_tax) and never
    # uses its first argument; a placeholder keeps the call valid.
    acc_to_tax_dict = read_acc_to_taxid_file(None, path_to_custom_tax)
    print('writing ...')
    with open(self.path_to_reference_output, 'w') as output:
        # NOTE(review): the header announces a 'Ref_taxID_DB' column, but the
        # rows below contain no value for it, so data columns sit one position
        # to the left of their header - confirm the intended layout.
        # NOTE(review): the column is labelled 'Ref_Hyperscore' while the value
        # written is the second tuple item, whatever score the caller stored.
        output.write('SpectraID' + '\t' + 'Ref_ProteinAcc' + '\t' + 'Ref_Hyperscore' + '\t' + 'Ref_taxID_DB' + '\t'
                     + ('\t').join('Ref_taxid_' + level for level in self.tax_level) + '\n')
        for spectra, protein_list in spectra_to_accs_dict.items():
            if not protein_list:
                # Without a hit there is neither an accession nor a score to report.
                continue
            level_specific_taxids = []
            for protein in protein_list:
                taxid = acc_to_tax_dict[protein[0]]
                taxids_of_protein = self.determine_level_specific_taxIDs(taxid)
                if not level_specific_taxids:
                    # First hit: start one set per taxonomic level.
                    level_specific_taxids = [{int(level_taxid)} for level_taxid in taxids_of_protein]
                else:
                    for i, level_taxid in enumerate(taxids_of_protein):
                        level_specific_taxids[i].add(int(level_taxid))
            list_of_tax_str = [(', ').join([str(level_taxid) for level_taxid in taxid_set])
                               for taxid_set in level_specific_taxids]
            # read_pepXML already returns keys starting with 'Run1_'; adding the
            # prefix unconditionally would yield 'Run1_Run1_...'.
            spectrum_id = spectra if spectra.startswith('Run1_') else 'Run1_' + spectra
            # NOTE(review): only the last hit's accession and score are written,
            # while the taxid columns aggregate all hits - confirm this is intended.
            last_protein = protein_list[-1]
            output.write(spectrum_id + '\t' + last_protein[0] + '\t' + str(last_protein[1]) + '\t' +
                         ('\t').join(list_of_tax_str) + '\n')

In [38]:
def write_pep_xml_to_tsv(spectra_to_accs_peptide_score_dict,
                         output_path='/home/jules/Documents/Tax2Proteome/benchmarking/spectra/Run1_U1_2000ng.pep.xml.tsv'):
    """Dump the spectrum-to-hit mapping as a tab-separated file.

    Parameters
    ----------
    spectra_to_accs_peptide_score_dict : dict
        Spectrum title -> list of ``(protein_acc, score, peptide)`` tuples.
    output_path : str or path-like, optional
        Destination file; it is overwritten if it exists. Defaults to the
        location this notebook originally wrote to, so existing calls without
        the argument behave as before.

    The file starts with the header ``Title, ProteinAcc, Peptide, Ref_Score``
    followed by one row per tuple.
    """
    with open(output_path, 'w') as output:
        output.write('\t'.join(('Title', 'ProteinAcc', 'Peptide', 'Ref_Score')) + '\n')
        for spectrum, acc_and_score_list in spectra_to_accs_peptide_score_dict.items():
            for acc_score_peptide in acc_and_score_list:
                protein_acc = acc_score_peptide[0]
                score = acc_score_peptide[1]
                peptide = acc_score_peptide[2]
                # Column order differs from tuple order: peptide comes before the score.
                output.write('\t'.join((spectrum, protein_acc, peptide, str(score))) + '\n')
        
        

In [29]:
# Input locations for this run. Both are absolute paths on the author's
# machine and have to be adapted before the notebook can run elsewhere.
pep_xml_file = '/home/jules/Documents/Tax2Proteome/benchmarking/spectra/Run1_U1_2000ng.pep.xml'
# Accession-to-taxid table; not used in this cell.
custom_acc2tax_file_based_on_Kleiner_DB = '/home/jules/Documents/Tax2Proteome/benchmarking/Kleiner_ref_db/acc2tax_custom'
# Parse the pepXML file into {spectrum id: [(protein_acc, q_value, peptide), ...]}.
spectra_to_accs_peptide_score_dict = read_pepXML(pep_xml_file)

In [39]:
# Number of spectra that received at least one protein hit.
print(len(spectra_to_accs_peptide_score_dict))
# Spot-check the hits stored for a single spectrum.
print(spectra_to_accs_peptide_score_dict["Run1_U1_2000ng.3223.3223.3"])
# Export the full mapping as a TSV file.
write_pep_xml_to_tsv(spectra_to_accs_peptide_score_dict)

86317
[('K12_P52647', 0.02724, 'AINMMSRLEHVEEEK'), ('LT2_Q8ZP89', 0.02724, 'AINMMSRLEHVEEEK'), ('LT2_Q8ZLF5', 0.04747, 'HGMMANIEIKPTTGSGR'), ('LT2_Q8ZLF5', 0.04747, 'HGMMANIEIKPTTGSGR')]
