In [1]:
import pandas
from tqdm import tqdm

In [2]:
def assignPeptides(merge, out):
    """Reduce every group of rows in a tab-separated merge file to one row.

    The first line of `merge` is treated as the header and is copied to `out`
    unchanged. It must contain the columns 'category', 'orfType' and 'header'
    (a missing one raises ValueError); 'mean.tpm' is optional. Every following
    line is split on tabs, and consecutive lines that share the same value in
    the first column form one group. Each group is handed to `filtRows`; when
    that returns a non-empty row it is written to `out` as a tab-joined line.

    Parameters
    ----------
    merge : str
        Path of the input file. The part between the last '/' and the last
        '.' is used as the progress-bar label.
    out : str
        Path of the output file; it is created or overwritten.

    Notes
    -----
    An input file with no lines at all produces an empty output file.
    """
    label = merge[merge.rfind('/') + 1:merge.rfind('.')]
    # The input is opened first so that a missing merge file does not create or
    # truncate the output. Both handles are closed even if an exception is raised.
    with open(merge, 'r') as src, open(out, 'w') as dst:
        columns = None      # (category, orfType, TPM, HEAD) once the header is parsed
        rows = []           # rows of the group currently being collected
        uid = None          # first-column value of that group; None = no group yet
        for raw in tqdm(src, desc=label):
            # Strip only the line terminator; a final line without a newline
            # must not lose its last character.
            fields = raw.rstrip('\n').split('\t')
            if columns is None:
                dst.write(raw)
                category = fields.index('category')
                orfType = fields.index('orfType')
                HEAD = fields.index('header')
                # Without a 'mean.tpm' column, True is passed on so that
                # filtRows ranks rows by the 'header' column instead.
                TPM = fields.index('mean.tpm') if 'mean.tpm' in fields else True
                columns = (category, orfType, TPM, HEAD)
                continue
            # A change in the first column closes the current group.
            if uid is not None and fields[0] != uid:
                _writeBestRow(dst, rows, *columns)
                rows = []
            uid = fields[0]
            rows.append(fields)
        # Flush the last group (nothing to do for empty or header-only input).
        if rows:
            _writeBestRow(dst, rows, *columns)


def _writeBestRow(handle, rows, category, orfType, TPM, HEAD):
    """Run filtRows on one group and write the selected row, if any, to `handle`."""
    best = filtRows(rows, category, orfType, TPM, HEAD)
    if len(best) != 0:
        handle.write('\t'.join(best) + '\n')

In [3]:
def filtRows(rows, category, orfType, TPM, HEAD):
    """Select one representative row from a group of rows.

    Parameters
    ----------
    rows : list
        The rows of one group; each row is indexed by the positions below.
    category, orfType, HEAD : int
        Positions of the category, ORF-type and header fields within a row.
    TPM : int or bool
        Position of the TPM field. When it is the boolean ``True`` the rows
        are ranked by their header field instead of by TPM.

    Returns
    -------
    list
        The selected row, or an empty list when no row survives filtering or
        the group contains a 'Contaminant' row.
    """
    canonical_types = frozenset((
        'canonical:both',
        'canonical:ucsc',
        'canonical:gencode',
        'canonical:variant',
        'CDS',
    ))

    # smORF rows never take part in the selection.
    kept = [rec for rec in rows if rec[category] != 'smORF']

    # A single Contaminant row disqualifies the whole group.
    if any(rec[category] == 'Contaminant' for rec in kept):
        return []

    # If any canonical ORF type is present keep only canonical rows,
    # otherwise keep only the non-canonical ones.
    has_canonical = any(rec[orfType] in canonical_types for rec in kept)
    candidates = [rec for rec in kept
                  if (rec[orfType] in canonical_types) == has_canonical]
    if not candidates:
        return []

    if not isinstance(TPM, bool):
        # Rank by TPM; an empty field counts as 0.0.
        top_tpm = 0.0
        for rec in candidates:
            value = float(rec[TPM]) if rec[TPM] != '' else 0.0
            # top_tpm == 0 doubles as "nothing picked yet", so while the best
            # value is still zero any later row replaces the current pick.
            if top_tpm == 0 or value > top_tpm:
                top_tpm, chosen = value, rec
    elif TPM:
        # No TPM column: rank by the header string; an empty header is
        # treated as equal to the best header seen so far.
        top_head = ''
        for rec in candidates:
            label = rec[HEAD] if rec[HEAD] != '' else top_head
            if top_head == '' or label > top_head:
                top_head, chosen = label, rec
    # NOTE: with TPM=False neither branch runs and `chosen` is unbound here.
    return chosen

In [4]:
# Samples to process. Commented-out entries are skipped.
conditions = [
#    'B721.MHCI',
#    'B721.Whole',
    'CLL.5283.MHCI',
    'CLL.5328.MHCI',
    'CLL.5341.MHCI',
    'MEL.2.MHCI',
    'MEL.2.10IP.MHCI',
    'MEL.2s.MHCI',
    'MEL.2s.IFN.MHCI',
    'MEL.6.MHCI',
    'MEL.6.IFN.MHCI',
    'MEL.11.MHCI',
    'MEL.11.IFN.MHCI',
    'MEL.15.MHCI',
    'MEL.15.IFN.MHCI',
    'GBM.H4512.MHCI',
    'GBM.H4512.IFN.MHCI',
#    'GBM.H4512.Whole',
    'GBM.H4198.MHCI',
    'GBM.H4198.IFN.MHCI',
    'GBM.7.MHCI',
    'GBM.7.IFN.MHCI',
    'OV.CP-594.MHCI',
    'OV.SLS3-M1.MHCI',
    'RCC.9.MHCI'
]

# Input (.merge) and output (.pep) locations, relative to the notebook.
MERGE_DIR = '../../data/merge/'
PEP_DIR = '../../data/pep/'

# One <sample>.merge file in, one <sample>.pep file out.
for cond in conditions:
    merge = MERGE_DIR + cond + '.merge'
    out = PEP_DIR + cond + '.pep'
    assignPeptides(merge, out)

CLL.5283.MHCI: 51225it [00:00, 295918.25it/s]
CLL.5328.MHCI: 3061it [00:00, 272405.94it/s]
CLL.5341.MHCI: 3963it [00:00, 310465.77it/s]
MEL.2.MHCI: 49160it [00:00, 305481.83it/s]
MEL.2.10IP.MHCI: 20555it [00:00, 303067.51it/s]
MEL.2s.MHCI: 52555it [00:00, 308229.62it/s]
MEL.2s.IFN.MHCI: 43232it [00:00, 318212.23it/s]
MEL.6.MHCI: 15615it [00:00, 326511.97it/s]
MEL.6.IFN.MHCI: 14587it [00:00, 361290.11it/s]
MEL.11.MHCI: 24602it [00:00, 288990.96it/s]
MEL.11.IFN.MHCI: 7774it [00:00, 287647.05it/s]
MEL.15.MHCI: 39491it [00:00, 362449.53it/s]
MEL.15.IFN.MHCI: 15902it [00:00, 361333.47it/s]
GBM.H4512.MHCI: 37580it [00:00, 296687.84it/s]
GBM.H4512.IFN.MHCI: 45964it [00:00, 297165.93it/s]
GBM.H4198.MHCI: 45686it [00:00, 373702.81it/s]
GBM.H4198.IFN.MHCI: 44971it [00:00, 359647.27it/s]
GBM.7.MHCI: 2492it [00:00, 122203.71it/s]
GBM.7.IFN.MHCI: 9960it [00:00, 296933.43it/s]
OV.CP-594.MHCI: 22466it [00:00, 363902.33it/s]
OV.SLS3-M1.MHCI: 5882it [00:00, 348843.30it/s]
RCC.9.MHCI: 12106it [00:00, 36