In [1]:
import pandas as pd
from pyteomics import fasta
from os import listdir, path
from collections import defaultdict
import subprocess
import shutil

In [None]:
# Create the decoy protein database with pyteomics
# (https://pypi.python.org/pypi/pyteomics): decoys are requested in
# 'reverse' mode and their names get the 'DECOY_' prefix.
target_fasta = '/home/labxxx/iprg2016/fasta/iPRG2016.fasta'
decoy_fasta = '/home/labxxx/iprg2016/fasta/iPRG2016_reverse.fasta'
fasta.write_decoy_db(target_fasta, decoy_fasta, mode='reverse', prefix='DECOY_')

In [None]:
# Convert raw files to mzML format using ProteoWizard msconvert
# (http://proteowizard.sourceforge.net/), started through wine.
#
# The argument list is handed to the program directly (no shell is involved),
# so nothing strips quote characters. The filter is therefore passed as one
# plain list element; wrapping it in literal double quotes would send those
# quotes to msconvert as part of the filter text.
# NOTE(review): the '*.raw' pattern is not expanded by a shell either; this
# relies on msconvert expanding the wildcard itself -- confirm.
#
# check_call raises subprocess.CalledProcessError when the converter exits
# with a non-zero status, so a failed conversion is not silently ignored.
subprocess.check_call(['wine', '/home/labxxx/pwiz/msconvert.exe', '/home/labxxx/iprg2016/raw_files/*.raw', '--mzML',
                       '-o', '/home/labxxx/iprg2016/mzml/', '--filter', 'peakPicking true 1-'])

In [12]:
# %%capture
# Convert mzml files to mgf format using the DeMix software (https://github.com/userbz/DeMix/)
# Note, that here we used in-house version of DeMix where the step of recalibration MS/MS spectra using the MSGF+ was skipped. 
try:
    del xml
except:
    pass

mzmlfolder = '/home/labxxx/iprg2016/mzml/'
mgfsfolder = '/home/labxxx/iprg2016/mgfs/'
for z in listdir(mzmlfolder):
    if z.endswith('.mzML'):
        input_mzml = path.join(mzmlfolder, z)
        %run '/home/labxxx/DeMix-master/run_demix_labxxx.py' $input_mzml
        

for z in listdir(mzmlfolder):
    if z.endswith('.mgf'):
        shutil.move(path.join(mzmlfolder, z), path.join(mgfsfolder, z.replace('.mzML.demix', '')))

In [None]:
# ALTERNATIVE WAY to obtain mgf files using only msconvert, without using the DeMix software.

# Convert raw files to mgf format using the Proteowizard (http://proteowizard.sourceforge.net/)

# subprocess.call(['wine', '/home/labxxx/pwiz/msconvert.exe', '/home/labxxx/iprg2016/raw_files/*.raw', '--mgf',
#                 '-o', '/home/labxxx/iprg2016/mgfs/', '--filter', '"MS2Deisotope"'])

In [13]:
# Process mgf files using the X!Tandem search engine (http://www.thegpm.org/tandem/)
# and pepxmltk converter (https://bitbucket.org/markmipt/pyteomics.pepxmltk)
pepxmlfolder = '/home/labxxx/iprg2016/output/'
mgfsfolder = '/home/labxxx/iprg2016/mgfs/'
for z in listdir(mgfsfolder):
    if z.endswith('.mgf'):
        input_mgf = path.join(mgfsfolder, z)
        !runtandem '--tandem2xml' '/usr/bin/pepxmltk.py' '/home/labxxx/iprg2016/params/tandem.xml' $pepxmlfolder '/home/labxxx/iprg2016/fasta/iPRG2016_reverse.fasta' $input_mgf

In [2]:
%%capture
# Process output X!Tandem pep.xml files using the MPscore post-search algorithm (https://bitbucket.org/markmipt/mp-score)

try:
    del xml
except:
    pass

pepxmlfolder = '/home/labxxx/iprg2016/output/'
mgfsfolder = '/home/labxxx/iprg2016/mgfs/'
for z in listdir(pepxmlfolder):
    if z.endswith('.pep.xml'):
        input_pepxml = path.join(pepxmlfolder, z)
        input_mgf = path.join(mgfsfolder, z.replace('.pep.xml', '.mgf'))
        %run '/home/labxxx/work/PycharmProjects/mp-score/MPscore.py' $input_pepxml $input_mgf '/home/labxxx/iprg2016/fasta/iPRG2016_reverse.fasta' '/home/labxxx/iprg2016/params/mpscore_prot.cfg'

In [5]:
# Process the output protein tables and write the txt file in iPRG style.

# Sample labels 'A1' ... 'D3': letters A-D, numbers 1-3 for each letter.
all_files = ['%s%d' % (letter, number) for letter in 'ABCD' for number in (1, 2, 3)]

# One (initially empty) per-sample dict for every FASTA entry whose
# description starts with 'HPRR'.
resultsdict = defaultdict(dict)
for entry in fasta.read('/home/labxxx/iprg2016/fasta/iPRG2016.fasta'):
    description = entry[0]
    if description.startswith('HPRR'):
        resultsdict[description] = {}

def sequence_is_unique(sequence, sequence_to_proteins):
    """Tell whether `sequence` maps to exactly one item.

    `sequence` is looked up in `sequence_to_proteins` by indexing, so a
    missing key behaves however that mapping's indexing behaves. Returns
    True when the stored collection has length 1, otherwise False.
    """
    matched = sequence_to_proteins[sequence]
    match_count = len(matched)
    return match_count == 1

# For every sample, read its protein table and store the q-value of each
# tracked protein; proteins that are absent from the table get 1.0.
for sample_name in all_files:
    proteins_table = pd.read_table('/home/labxxx/iprg2016/output/%s_proteins.csv' % (sample_name, ))

    # Map 'dbname' -> 'q-value'; when a name occurs more than once the last row wins.
    qmap = {prot: qvalue for qvalue, prot in proteins_table[['q-value', 'dbname']].values}

    for protein in resultsdict:
        resultsdict[protein][sample_name] = qmap.get(protein, 1.0)

# Write the results as a tab-separated text file: a header row with "FDR"
# followed by the quoted sample labels, then one row per protein with its
# quoted name and the value stored for each sample (1.0 when none is stored).
# The `with` block closes the file even if one of the writes raises.
with open('/home/labxxx/iprg2016/labxxx_results.txt', 'w') as outfile:
    outfile.write('"FDR"' + '\t' + '\t'.join('"' + ii + '"' for ii in all_files) + '\n')
    # dict.items() exists on both Python 2 and Python 3, whereas
    # dict.iteritems() is Python 2 only.
    for k, v in resultsdict.items():
        outfile.write('"' + k + '"' + '\t' + '\t'.join(str(v.get(ii, 1.0)) for ii in all_files) + '\n')