# Map Peptides to Proteins

#### Import Libraries & Set Up Environment

In [1]:
import pandas
import re
from tqdm import tqdm
# Directory prefixes for the peptide tables, reference files and output maps.
# Each ends with '/' so a file name can be appended with plain `+`.
tsvBase, refBase, mapBase = (
    '../../data/tsv/',
    '../../data/ref/',
    '../../data/map/',
)

#### Define Functions

In [2]:
def writeMatchesHelper(sequence, peptideUID, reference, out, flank=30):
    """Write one map row for every occurrence of a peptide in the reference.

    The peptide is located with literal substring search, so characters that
    are special in regular expressions (``.``, ``(``, ``[`` ...) are compared
    verbatim, and overlapping occurrences within one protein are all reported.

    Args:
        sequence: Peptide sequence to look for. An empty string is ignored.
        peptideUID: Identifier written in the first column of each row.
        reference: Mapping of protein UID -> protein sequence.
        out: Writable text stream; one tab-separated line is written per hit:
            peptideUID, proteinUID, sequence, upstream flank, downstream flank.
        flank: Number of residues of context reported on each side of the hit
            (non-negative). Flanks running past either end of the protein are
            filled with '-' so they always have exactly this length.
    """
    if not sequence:
        # An empty peptide would "match" at every offset of every protein.
        return
    length = len(sequence)
    padding = '-' * flank
    for proteinUID, refseq in reference.items():
        start = refseq.find(sequence)
        if start == -1:
            continue
        paddedSeq = padding + refseq + padding
        while start != -1:
            # `start` indexes refseq; the same residue sits `flank` positions
            # further right in paddedSeq, so the upstream window begins at
            # paddedSeq[start] and the hit itself at paddedSeq[start + flank].
            hitStart = start + flank
            hitEnd = hitStart + length
            upstream = paddedSeq[start:hitStart]
            downstream = paddedSeq[hitEnd:hitEnd + flank]
            out.write('\t'.join([
                peptideUID,
                proteinUID,
                sequence,
                upstream,
                downstream]) + '\n')
            # Advance by one residue (not by the hit length) so overlapping
            # occurrences are found as well.
            start = refseq.find(sequence, start + 1)

In [3]:
def writeMatches(row, reference, out):
    """Write map rows for every candidate sequence listed in one peptide row.

    Args:
        row: Record exposing ``.loc`` access to a comma-separated
            'sequenceList' string and a 'Peptide:UID' value.
        reference: Mapping of protein UID -> protein sequence, passed through
            to writeMatchesHelper.
        out: Writable text stream, passed through to writeMatchesHelper.
    """
    candidates = row.loc['sequenceList'].split(',')
    uid = row.loc['Peptide:UID']
    for candidate in candidates:
        writeMatchesHelper(candidate, uid, reference, out)

In [4]:
def readReference(ref):
    """Load a tab-separated reference file into a dict.

    Each non-empty line is split on tabs; the first column becomes the key
    and the fourth column the value.

    Args:
        ref: Path of the reference file.

    Returns:
        dict mapping column 1 -> column 4 for every non-empty line. If a key
        occurs more than once, the last line wins.

    Raises:
        IndexError: If a non-empty line has fewer than four columns.
    """
    reference = dict()
    with open(ref, 'r') as handle:
        for line in handle:
            # Strip only the line terminator: slicing off the last character
            # unconditionally would truncate the final record when the file
            # does not end with a newline.
            line = line.rstrip('\n')
            if not line:
                continue
            fields = line.split('\t')
            reference[fields[0]] = fields[3]
    return reference

In [5]:
def generateMap(peptides, ref, out):
    """Map every peptide in a TSV table onto a reference and write the map.

    Args:
        peptides: Path of a tab-separated peptide table with a header row.
            Each row must provide the 'sequenceList' and 'Peptide:UID'
            columns read by writeMatches.
        ref: Path of the reference file, loaded with readReference.
        out: Path of the map file to create (overwritten if it exists). The
            file starts with a header line followed by the rows emitted by
            writeMatches for each peptide row.
    """
    peptideTable = pandas.read_csv(
        peptides,
        sep='\t',
        header=0,
        index_col=False)
    reference = readReference(ref)
    # Registers DataFrame.progress_apply, which shows a progress bar.
    tqdm.pandas(desc='Generate map')
    # The context manager guarantees the map file is flushed and closed even
    # if mapping fails part-way through.
    with open(out, 'w') as mapFile:
        mapFile.write(
            'Peptide:UID\tProtein:UID\tsequence\t30AA upstream\t30AA downstream\n')
        peptideTable.progress_apply(
            writeMatches,
            axis=1,
            args=(reference, mapFile))

# Map Peptides

## CLL

#### CLL 5283 MHC-I

In [6]:
# CLL 5283 MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'CLL.5283.MHCI.tsv',
    refBase + 'CLL.5283.ref',
    mapBase + 'CLL.5283.MHCI.map',
)
generateMap(tsv, ref, out)

Generate map: 100%|██████████| 6058/6058 [07:34<00:00, 14.17it/s]


#### CLL 5328 MHC-I

In [7]:
# CLL 5328 MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'CLL.5328.MHCI.tsv',
    refBase + 'PanSample.ref',
    mapBase + 'CLL.5328.MHCI.map',
)
generateMap(tsv, ref, out)

Generate map: 100%|██████████| 396/396 [00:28<00:00, 13.46it/s]


#### CLL 5341 MHC-I

In [8]:
# CLL 5341 MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'CLL.5341.MHCI.tsv',
    refBase + 'PanSample.ref',
    mapBase + 'CLL.5341.MHCI.map',
)
generateMap(tsv, ref, out)

Generate map: 100%|██████████| 487/487 [00:36<00:00, 13.46it/s]


## GBM

#### GBM H4512 MHC-I

In [9]:
# GBM H4512 MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'GBM.H4512.MHCI.tsv',
    refBase + 'PanSample.ref',
    mapBase + 'GBM.H4512.MHCI.map',
)
generateMap(tsv, ref, out)

Generate map: 100%|██████████| 4486/4486 [05:26<00:00, 13.72it/s]


#### GBM H4512 IFN MHC-I

In [10]:
# GBM H4512 IFN MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'GBM.H4512.IFN.MHCI.tsv',
    refBase + 'PanSample.ref',
    mapBase + 'GBM.H4512.IFN.MHCI.map',
)
generateMap(tsv, ref, out)

Generate map: 100%|██████████| 5559/5559 [06:48<00:00, 13.59it/s]


#### GBM H4512 Whole Proteome

In [None]:
# GBM H4512 whole proteome: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'GBM.H4512.Whole.tsv',
    refBase + 'PanSample.ref',
    mapBase + 'GBM.H4512.Whole.map',
)
generateMap(tsv, ref, out)

Generate map:  27%|██▋       | 24564/91879 [32:14<1:24:33, 13.27it/s]

#### GBM H4198 MHC-I

In [11]:
# GBM H4198 MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'GBM.H4198.MHCI.tsv',
    refBase + 'PanSample.ref',
    mapBase + 'GBM.H4198.MHCI.map',
)
generateMap(tsv, ref, out)

Generate map: 100%|██████████| 5485/5485 [06:29<00:00, 14.08it/s]


#### GBM H4198 IFN MHC-I

In [12]:
# GBM H4198 IFN MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'GBM.H4198.IFN.MHCI.tsv',
    refBase + 'PanSample.ref',
    mapBase + 'GBM.H4198.IFN.MHCI.map',
)
generateMap(tsv, ref, out)

Generate map: 100%|██████████| 4801/4801 [05:39<00:00, 14.13it/s]


#### GBM 7 MHC-I

In [13]:
# GBM 7 MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'GBM.7.MHCI.tsv',
    refBase + 'GBM.7.ref',
    mapBase + 'GBM.7.MHCI.map',
)
generateMap(tsv, ref, out)

Generate map: 100%|██████████| 279/279 [00:19<00:00, 14.20it/s]


#### GBM 7 IFN MHC-I

In [14]:
# GBM 7 IFN MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'GBM.7.IFN.MHCI.tsv',
    refBase + 'GBM.7.ref',
    mapBase + 'GBM.7.IFN.MHCI.map',
)
generateMap(tsv, ref, out)

Generate map: 100%|██████████| 1198/1198 [01:29<00:00, 14.74it/s]


## Melanoma

#### MEL 2 MHC-I

In [6]:
# MEL 2 MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'MEL.2.MHCI.tsv',
    refBase + 'MEL.2.ref',
    mapBase + 'MEL.2.MHCI.map',
)
generateMap(tsv, ref, out)

Generate map: 100%|██████████| 6064/6064 [08:38<00:00, 14.03it/s]


#### MEL 2 10IP MHC-I

In [7]:
# MEL 2 10IP MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'MEL.2.10IP.MHCI.tsv',
    refBase + 'MEL.2.ref',
    mapBase + 'MEL.2.10IP.MHCI.map',
)
generateMap(tsv, ref, out)

Generate map: 100%|██████████| 2561/2561 [03:38<00:00, 12.32it/s]


#### MEL 2 SEL MHC-I

In [8]:
# MEL 2 SEL MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'MEL.2s.MHCI.tsv',
    refBase + 'MEL.2.ref',
    mapBase + 'MEL.2s.MHCI.map',
)
generateMap(tsv, ref, out)

Generate map: 100%|██████████| 6275/6275 [08:45<00:00, 12.96it/s]


#### MEL 2 SEL IFN MHC-I

In [9]:
# MEL 2 SEL IFN MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'MEL.2s.IFN.MHCI.tsv',
    refBase + 'MEL.2.ref',
    mapBase + 'MEL.2s.IFN.MHCI.map',
)
generateMap(tsv, ref, out)

Generate map: 100%|██████████| 5052/5052 [06:54<00:00, 12.18it/s]


#### MEL 6 MHC-I

In [None]:
# MEL 6 MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'MEL.6.MHCI.tsv',
    refBase + 'PanSample.ref',
    mapBase + 'MEL.6.MHCI.map',
)
generateMap(tsv, ref, out)

#### MEL 6 IFN MHC-I

In [None]:
# MEL 6 IFN MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'MEL.6.IFN.MHCI.tsv',
    refBase + 'PanSample.ref',
    mapBase + 'MEL.6.IFN.MHCI.map',
)
generateMap(tsv, ref, out)

#### MEL 11 MHC-I

In [10]:
# MEL 11 MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'MEL.11.MHCI.tsv',
    refBase + 'MEL.11.ref',
    mapBase + 'MEL.11.MHCI.map',
)
generateMap(tsv, ref, out)

Generate map: 100%|██████████| 2688/2688 [04:24<00:00, 10.60it/s]


#### MEL 11 IFN MHC-I

In [11]:
# MEL 11 IFN MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'MEL.11.IFN.MHCI.tsv',
    refBase + 'MEL.11.ref',
    mapBase + 'MEL.11.IFN.MHCI.map',
)
generateMap(tsv, ref, out)

Generate map: 100%|██████████| 872/872 [01:25<00:00, 10.18it/s]


#### MEL 15 MHC-I

In [None]:
# MEL 15 MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'MEL.15.MHCI.tsv',
    refBase + 'PanSample.ref',
    mapBase + 'MEL.15.MHCI.map',
)
generateMap(tsv, ref, out)

#### MEL 15 IFN MHC-I

In [None]:
# MEL 15 IFN MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'MEL.15.IFN.MHCI.tsv',
    refBase + 'PanSample.ref',
    mapBase + 'MEL.15.IFN.MHCI.map',
)
generateMap(tsv, ref, out)

## Ovarian Cancer

#### OV SLS3-M1

In [None]:
# OV SLS3-M1: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'OV.SLS3-M1.MHCI.tsv',
    refBase + 'PanSample.ref',
    mapBase + 'OV.SLS3-M1.MHCI.map',
)
generateMap(tsv, ref, out)

#### OV CP-594

In [None]:
# OV CP-594: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'OV.CP-594.MHCI.tsv',
    refBase + 'PanSample.ref',
    mapBase + 'OV.CP-594.MHCI.map',
)
generateMap(tsv, ref, out)

## Renal Cell Carcinoma

#### RCC 9

In [None]:
# RCC 9: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'RCC.9.MHCI.tsv',
    refBase + 'PanSample.ref',
    mapBase + 'RCC.9.MHCI.map',
)
generateMap(tsv, ref, out)

## B721.221

#### B721 MHC-I

In [None]:
# B721 MHC-I: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'B721.MHCI.tsv',
    refBase + 'PanSample.ref',
    mapBase + 'B721.MHCI.map',
)
generateMap(tsv, ref, out)

#### B721 Whole Proteome

In [None]:
# B721 whole proteome: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'B721.Whole.tsv',
    refBase + 'PanSample.ref',
    mapBase + 'B721.Whole.map',
)
generateMap(tsv, ref, out)

### Database Comparison

#### Null

In [None]:
# Database comparison, Null: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'DBC.NULL.tsv',
    refBase + 'B721.NULL.ref',
    mapBase + 'DBC.NULL.map',
)
generateMap(tsv, ref, out)

#### B721

In [None]:
# Database comparison, B721: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'DBC.B721.tsv',
    refBase + 'B721.ref',
    mapBase + 'DBC.B721.map',
)
generateMap(tsv, ref, out)

#### Pan Sample

In [None]:
# Database comparison, Pan Sample: peptide table, reference file and output
# map paths.
tsv, ref, out = (
    tsvBase + 'DBC.PS.tsv',
    refBase + 'PanSample.ref',
    mapBase + 'DBC.PanSample.map',
)
generateMap(tsv, ref, out)

#### RPF

In [None]:
# Database comparison, RPF: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'DBC.RPF.tsv',
    refBase + 'B721.RPF.ref',
    mapBase + 'DBC.RPF.map',
)
generateMap(tsv, ref, out)

#### RNA

In [None]:
# Database comparison, RNA: peptide table, reference file and output map paths.
tsv, ref, out = (
    tsvBase + 'DBC.RNA.tsv',
    refBase + 'B721.RNA.ref',
    mapBase + 'DBC.RNA.map',
)
generateMap(tsv, ref, out)