In [1]:
import pandas
from tqdm import tqdm

In [2]:
# ORF type lookup table, read once when this cell runs.
# The file is tab-separated with a header row; rows are indexed by the
# 'ORF_ID' column. addType() reads one column of this table per call,
# selected by the column name it is given.
mapping = pandas.read_csv(
    '../../data/tsv/orfTypeMapping.tsv',
    sep='\t',
    header=0,
    index_col=['ORF_ID']
)
def addType(row, aType):
    """Return the value of type column ``aType`` for one peptide row.

    Rows whose 'orfType' is one of the canonical labels always yield the
    literal string 'Canonical'. Every other row is looked up by its
    'ORF_ID' in the module-level ``mapping`` table and the entry stored
    under column ``aType`` is returned.

    Parameters
    ----------
    row : object indexable through ``.loc`` (a DataFrame row when used
        with ``DataFrame.apply(axis=1)``); must provide 'orfType' and,
        for non-canonical rows, 'ORF_ID'.
    aType : name of the column of ``mapping`` to read.

    Raises
    ------
    KeyError
        Propagated from the lookups when the ORF_ID or the column is
        not present in ``mapping``.
    """
    canonicalLabels = {'canonical:ucsc', 'canonical:both', 'canonical:variant'}
    if row.loc['orfType'] not in canonicalLabels:
        # Non-canonical: defer to the per-ORF mapping table.
        mappingEntry = mapping.loc[row.loc['ORF_ID']]
        return mappingEntry[aType]
    return 'Canonical'

In [3]:
def getSpectralRecall(row):
    """Return the row's spectral recall as a percentage capped at 100.

    Computed as ``100 * backbone_cleavage_score / (len(sequence) - 1)``
    and then limited to at most 100.0.

    Parameters
    ----------
    row : object indexable through ``.loc``; must provide
        'backbone_cleavage_score' (numeric) and 'sequence' (sized).

    Returns
    -------
    The percentage, never larger than 100.0.
    """
    cleavageScore = row.loc['backbone_cleavage_score']
    # NOTE(review): a one-character sequence makes this divisor zero.
    divisor = len(row.loc['sequence']) - 1
    percentage = 100.0 * (cleavageScore / divisor)
    return min(percentage, 100.0)
def getFDR(row):
    """Return 0 when 'deltaForwardReverseScore' is negative, otherwise 1.

    Parameters
    ----------
    row : object indexable through ``.loc``; must provide
        'deltaForwardReverseScore'.

    Returns
    -------
    int
        0 for a strictly negative delta, 1 for zero or any other value.
    """
    deltaIsNegative = row.loc['deltaForwardReverseScore'] < 0
    return 0 if deltaIsNegative else 1

In [4]:
def _loadThresholds(path):
    """Parse a tab-separated threshold table into a dict.

    Each non-blank line contributes one entry: the first field is the
    key and the next four fields form the value tuple. The row keyed
    'Type' (the header) keeps its four fields as strings; every other
    row has its four fields converted to int.

    Parameters
    ----------
    path : path of the tab-separated file to read.

    Returns
    -------
    dict
        Maps first-column value -> 4-tuple.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than five fields.
    ValueError
        If a non-header field cannot be converted to int.
    """
    thresholds = dict()
    # The context manager closes the file even when a row fails to parse.
    with open(path, 'r') as handle:
        for line in handle:
            # Remove only the line terminator. Slicing off the last
            # character would truncate the final value when the file
            # does not end with a newline.
            line = line.rstrip('\n')
            if not line:
                # Blank lines carry no threshold row.
                continue
            fields = line.split('\t')
            values = (fields[1], fields[2], fields[3], fields[4])
            if fields[0] != 'Type':
                values = tuple(map(int, values))
            thresholds[fields[0]] = values
    return thresholds


# Thresholds keyed by the first column of the file. fdrFilter() compares
# the four values, in order, against 'score',
# 'percent_scored_peak_intensity', 'backbone_cleavage_score' and
# 'SpectralRecall'.
THRESHOLD = _loadThresholds('../../data/tsv/fdrThresholds.tsv')
def fdrFilter(row, threshold):
    """Tell whether a row strictly exceeds every cutoff for its fdrType.

    The row's 'fdrType' selects a sequence of four cutoffs from
    ``threshold``. They are compared, in order, against the row's
    'score', 'percent_scored_peak_intensity', 'backbone_cleavage_score'
    and 'SpectralRecall'.

    Parameters
    ----------
    row : object indexable through ``.loc`` providing 'fdrType' and the
        four columns listed above.
    threshold : mapping from fdrType to an indexable of four cutoffs.

    Returns
    -------
    bool
        True only if every value is strictly greater than its cutoff;
        False as soon as one comparison does not hold.
    """
    cutoffs = threshold[row.loc['fdrType']]
    checkedColumns = (
        'score',
        'percent_scored_peak_intensity',
        'backbone_cleavage_score',
        'SpectralRecall',
    )
    for position, column in enumerate(checkedColumns):
        if row.loc[column] > cutoffs[position]:
            continue
        # Equal to or below the cutoff (or not comparable as greater).
        return False
    return True

In [5]:
def updateTypes(pep, out):
    """Annotate one peptide table with derived columns and write it out.

    Reads the tab-separated file ``pep`` (header row, indexed by
    'Peptide:UID'), then adds these columns in order: 'SpectralRecall',
    'mergeType', 'fdrType', 'plotType', 'condType' and 'fdr'. When the
    string ``pep`` contains 'MHCI' anywhere, an 'fdrFilter' column is
    added as well, computed against the module-level THRESHOLD table.
    The result is written tab-separated to ``out`` with header and index.

    Parameters
    ----------
    pep : path of the input peptide table.
    out : path of the annotated table to write.
    """
    peptides = pandas.read_csv(pep, sep='\t', header=0, index_col=['Peptide:UID'])

    peptides['SpectralRecall'] = peptides.apply(getSpectralRecall, axis=1)
    # fdrFilter reads 'fdrType', so these are filled before it runs.
    for typeColumn in ('mergeType', 'fdrType', 'plotType', 'condType'):
        peptides[typeColumn] = peptides.apply(addType, axis=1, args=(typeColumn, ))
    peptides['fdr'] = peptides.apply(getFDR, axis=1)

    if 'MHCI' in pep:
        peptides['fdrFilter'] = peptides.apply(fdrFilter, axis=1, args=(THRESHOLD, ))

    peptides.to_csv(out, sep='\t', header=True, index=True)

In [6]:
# Names of the conditions to process. Each name is expanded below into an
# input path '../../data/pep/<name>.pep' and an output path
# '../../data/final/<name>.pep'. Entries that are commented out are
# skipped by this run.
# NOTE(review): every active name contains 'MHCI', which is the substring
# updateTypes() looks for in the input path before adding 'fdrFilter'.
conditions = [
#    'B721.MHCI',
#    'B721.Whole',
    'CLL.5283.MHCI',
    'CLL.5328.MHCI',
    'CLL.5341.MHCI',
    'MEL.2.MHCI',
    'MEL.2.10IP.MHCI',
    'MEL.2s.MHCI',
    'MEL.2s.IFN.MHCI',
    'MEL.6.MHCI',
    'MEL.6.IFN.MHCI',
    'MEL.11.MHCI',
    'MEL.11.IFN.MHCI',
    'MEL.15.MHCI',
    'MEL.15.IFN.MHCI',
    'GBM.H4512.MHCI',
    'GBM.H4512.IFN.MHCI',
#    'GBM.H4512.Whole',
    'GBM.H4198.MHCI',
    'GBM.H4198.IFN.MHCI',
    'GBM.7.MHCI',
    'GBM.7.IFN.MHCI',
    'OV.CP-594.MHCI',
    'OV.SLS3-M1.MHCI',
    'RCC.9.MHCI'
]
# Annotate each condition's peptide table in turn; tqdm shows progress.
for cond in tqdm(conditions):
    pep = '../../data/pep/' + cond + '.pep'
    out = '../../data/final/' + cond + '.pep'
    updateTypes(pep, out)

100%|██████████| 22/22 [00:24<00:00,  1.66it/s]
