In [1]:
import pandas
from tqdm import tqdm

In [2]:
def assignPeptides(merge, out):
    df = pandas.read_csv(merge, sep='\t', header=0, index_col=False)
    df = df[df['category'] != 'smORF'].copy()
    peptides = set(df['Peptide:UID'])
    matches = []
    for peptide in tqdm(peptides, desc=merge):
        matches.append(refineMatches(
            df[df['Peptide:UID'] == peptide]
        ))
    df = pandas.concat(matches)
    df = df[df['category'] != 'Contaminant']
    df.to_csv(out, sep='\t', header=True, index=False)

In [3]:
def refineMatches(df):
    categories = set(df['category'])
    if 'Contaminant' in categories:
        return(df[df['category'] == 'Contaminant'])
    if 'UCSC' in categories:
        df = df[df['category'] == 'UCSC']
        return(df.iloc[[0]])
    return(df.iloc[[0]])

In [4]:
conditions = [
    'DBC.RNA',
    'DBC.RPF',
    'DBC.B721',
    'DBC.PS',
    'DBC.NULL'
]
for cond in conditions:
    merge = '../../data/merge/' + cond + '.merge'
    out = '../../data/final/' + cond + '.pep'
    assignPeptides(merge, out)

../../data/merge/DBC.RNA.merge: 100%|██████████| 26759/26759 [15:17<00:00, 29.39it/s]
../../data/merge/DBC.RPF.merge: 100%|██████████| 27386/27386 [16:07<00:00, 28.30it/s]
../../data/merge/DBC.B721.merge: 100%|██████████| 28749/28749 [14:32<00:00, 34.31it/s]
../../data/merge/DBC.PS.merge: 100%|██████████| 28789/28789 [17:49<00:00, 27.86it/s]
../../data/merge/DBC.NULL.merge: 100%|██████████| 28455/28455 [06:48<00:00, 69.62it/s]
