# Merge Map files with Reference and MS Search tables

#### Import Libraries & Setup Environment

In [1]:
import pandas
import numpy
# Path to the UCSC map file; the cells below pass it as the `ucscMap` argument.
ucsc = '../../data/map/ucsc.map'
# Directory prefixes; each cell below appends a per-sample file name to them.
pepMapBase = '../../data/map/'   # peptide map files (*.map)
tsvBase = '../../data/tsv/'      # MS search tables (*.tsv)
refBase = '../../data/ref/'      # reference tables (*.ref)
tpmBase = '../../data/tpm/'      # TPM tables (*.tpm)
outBase = '../../data/merge/'    # merged output tables (*.merge)

### Functions to generate ORF_ID
These functions generate the `ORF_ID` column used to merge the Reference and TPM tables. UCSC entries depend on a premade header-to-ORF_ID map. Contaminant and smORF proteins receive no `ORF_ID` (NaN), so they are discarded from future analysis.

In [2]:
def ucsdORFID(header, ucscMap):
    """Look up the ORF_ID of a UCSC reference header.

    header  -- header string to look up.
    ucscMap -- dict mapping headers to ORF_IDs.

    Returns the mapped value, or NaN when the header has no entry.
    """
    return ucscMap.get(header, numpy.nan)

In [3]:
def contamORFID(header):
    """Return NaN: contaminant entries get no ORF_ID (the header is ignored)."""
    noOrfId = numpy.nan
    return noOrfId

In [4]:
def nuORFID(header):
    """Derive the ORF_ID from a nuORF header.

    The ORF_ID is the second ' | '-delimited field of the header, with every
    '~' replaced by '|'. Raises IndexError if the header has no ' | '
    separator.
    """
    secondField = header.split(' | ')[1]
    return secondField.replace('~', '|')

In [5]:
def smORFID(header):
    """Return NaN: smORF entries get no ORF_ID (the header is ignored)."""
    noOrfId = numpy.nan
    return noOrfId

In [6]:
def variantORFID(header):
    """Derive the ORF_ID from a Variant header.

    Keeps the first three '|'-delimited fields of the header (fewer if the
    header has fewer) and joins them back together with '|'.
    """
    return '|'.join(header.split('|')[:3])

In [7]:
def getORFID(row, ucscMap):
    """Compute the ORF_ID for one reference row according to its category.

    row     -- row exposing 'category' and 'header' through ``.loc``.
    ucscMap -- header -> ORF_ID dict, only consulted for 'UCSC' rows.

    Returns the value produced by the category-specific helper, or the string
    '[ERROR]' when the category is not one of the five known ones.
    """
    category = row.loc['category']
    header = row.loc['header']
    # Ordered (category, handler) pairs; handlers are lazy so only the
    # matching helper is ever called.
    handlers = (
        ('UCSC', lambda: ucsdORFID(header, ucscMap)),
        ('Contaminant', lambda: contamORFID(header)),
        ('nuORF', lambda: nuORFID(header)),
        ('smORF', lambda: smORFID(header)),
        ('Variant', lambda: variantORFID(header)),
    )
    for name, handler in handlers:
        if category == name:
            return handler()
    return '[ERROR]'

### Parse & Merge Tables

In [8]:
def parseUcscMap(path):
    """Load the UCSC header -> ORF_ID map from a tab-separated file.

    path -- path to a text file with one record per line; the first
            tab-separated field is the key and the second is the value.

    Returns a dict built from those two fields. If a key occurs on several
    lines, the last occurrence wins. Blank lines are skipped.

    Raises IndexError if a non-blank line has fewer than two fields.
    """
    ucscMap = dict()
    # The context manager closes the file even if a malformed line raises.
    with open(path, 'r') as handle:
        for line in handle:
            # Strip only the newline: slicing off the last character would
            # truncate the final record when the file has no trailing newline.
            line = line.rstrip('\n')
            if not line:
                continue
            fields = line.split('\t')
            ucscMap[fields[0]] = fields[1]
    return ucscMap

In [9]:
def parseTPM(path):
    """Read a tab-separated TPM table, keeping only the columns used in the merge.

    path -- path to the TPM file; its first line is the header row.

    Returns a DataFrame with the columns 'ORF_ID', 'mean.purity', 'mean.tpm',
    'std.purity' and 'std.tpm'.
    """
    wanted = ['ORF_ID', 'mean.purity', 'mean.tpm', 'std.purity', 'std.tpm']
    return pandas.read_csv(
        path, sep='\t', header=0, index_col=False, usecols=wanted)

In [10]:
def multipleMaps(ORF_ID):
    """Return True when the ORF_ID, rendered as a string, contains a comma.

    A comma marks an entry that lists several IDs; non-string values such as
    NaN are converted with str() first.
    """
    return ',' in str(ORF_ID)

In [11]:
def parseRefTPM(reference, ucscMap):
    """Load a reference table and attach one ORF_ID per row.

    reference -- path to a tab-separated reference table with a header row.
                 It must contain a 'sequence' column (dropped) and the
                 'category' and 'header' columns that getORFID reads.
    ucscMap   -- path to the UCSC map file, loaded with parseUcscMap.

    Rows whose ORF_ID lists several comma-separated IDs are expanded into one
    row per ID. When such rows exist, the result holds the single-ID rows
    first, then the expanded rows, under a fresh 0..n-1 index. Otherwise the
    table is returned as loaded.
    """
    table = pandas.read_csv(reference, sep='\t', header=0, index_col=False)
    table = table.drop(['sequence'], axis=1)
    headerToOrf = parseUcscMap(ucscMap)
    table['ORF_ID'] = table.apply(getORFID, axis=1, args=(headerToOrf,))

    # Comparing with True yields a proper boolean mask even for an empty table.
    isMulti = table['ORF_ID'].apply(multipleMaps) == True
    multiMapped = table[isMulti]
    if multiMapped.shape[0] == 0:
        return table

    # itertuples() yields (index, col_1, ..., col_k). The ORF_ID is taken from
    # the last field, which is where the column was just appended.
    expandedRows = [
        tuple(row[1:-1]) + (orfId,)
        for row in multiMapped.itertuples()
        for orfId in str(row[-1]).split(',')
    ]
    expanded = pandas.DataFrame.from_dict(
        dict(enumerate(expandedRows)), orient='index')
    expanded.columns = table.columns
    return pandas.concat(
        [table[~isMulti], expanded], axis=0, ignore_index=True)

In [12]:
def parseRefNoTPM(reference):
    """Load a reference table without deriving an ORF_ID.

    reference -- path to a tab-separated reference table with a header row;
                 it must contain a 'sequence' column.

    Returns the table as a DataFrame with the 'sequence' column removed.
    """
    table = pandas.read_csv(reference, sep='\t', header=0, index_col=False)
    return table.drop(['sequence'], axis=1)

In [13]:
def parseTSV(path):
    """Load an MS search table, discarding its sequence columns.

    path -- path to a tab-separated file with a header row; it must contain
            the columns 'sequence', 'sequenceMulti' and 'sequenceList'.

    Returns the table as a DataFrame without those three columns.
    """
    unwanted = ['sequence', 'sequenceMulti', 'sequenceList']
    table = pandas.read_csv(path, sep='\t', header=0, index_col=False)
    return table.drop(unwanted, axis=1)

In [14]:
def parseMap(path):
    """Load a peptide map file as a DataFrame.

    path -- path to a tab-separated file whose first line is the header row.
    All columns are kept and none is used as the index.
    """
    return pandas.read_csv(path, sep='\t', header=0, index_col=False)

In [15]:
def mergeTables(peptideMap, tsv, ref, tpm, ucscMap, out):
    """Join the peptide map with the search, reference and TPM tables.

    peptideMap -- path to the peptide map file (parseMap).
    tsv        -- path to the MS search table (parseTSV).
    ref        -- path to the reference table (parseRefTPM).
    tpm        -- path to the TPM table (parseTPM).
    ucscMap    -- path to the UCSC map file used to derive ORF_IDs.
    out        -- path of the tab-separated file to write.

    Left joins run in sequence: search table on 'Peptide:UID', reference on
    'Protein:UID', then TPM on 'ORF_ID'. The result is written with a header
    row and no index column. Nothing is returned.
    """
    peptides = parseMap(peptideMap)
    searchTable = parseTSV(tsv)
    reference = parseRefTPM(ref, ucscMap)
    expression = parseTPM(tpm)
    merged = (
        peptides
        .merge(searchTable, how='left', on='Peptide:UID')
        .merge(reference, how='left', on='Protein:UID')
        .merge(expression, how='left', on='ORF_ID')
    )
    merged.to_csv(out, sep='\t', header=True, index=False)

In [16]:
def mergeTablesNoTPM(peptideMap, tsv, ref, out, ucscMap=False):
    """Join the peptide map with the search and reference tables (no TPM).

    peptideMap -- path to the peptide map file (parseMap).
    tsv        -- path to the MS search table (parseTSV).
    ref        -- path to the reference table.
    out        -- path of the tab-separated file to write.
    ucscMap    -- optional path to the UCSC map file. When truthy, the
                  reference is loaded with parseRefTPM so that it gains an
                  ORF_ID column; otherwise parseRefNoTPM is used.

    Left joins run in sequence: search table on 'Peptide:UID', then reference
    on 'Protein:UID'. The result is written with a header row and no index
    column. Nothing is returned.
    """
    peptides = parseMap(peptideMap)
    searchTable = parseTSV(tsv)
    if ucscMap:
        reference = parseRefTPM(ref, ucscMap)
    else:
        reference = parseRefNoTPM(ref)
    merged = (
        peptides
        .merge(searchTable, how='left', on='Peptide:UID')
        .merge(reference, how='left', on='Protein:UID')
    )
    merged.to_csv(out, sep='\t', header=True, index=False)

## B721.221

#### Merge B721 MHC-I

In [17]:
# B721 MHC-I: PanSample reference plus the B721 TPM table.
# mergeTables writes the joined table to `out`.
pepMap = pepMapBase + 'B721.MHCI.map'
tsv = tsvBase + 'B721.MHCI.tsv'
ref = refBase + 'PanSample.ref'
tpm = tpmBase + 'B721.tpm'
out = outBase + 'B721.MHCI.merge'
mergeTables(pepMap, tsv, ref, tpm, ucsc, out)

#### Merge B721 Whole Peptidome

In [18]:
# B721 whole-sample search: PanSample reference plus the B721 TPM table.
# mergeTables writes the joined table to `out`.
pepMap = pepMapBase + 'B721.Whole.map'
tsv = tsvBase + 'B721.Whole.tsv'
ref = refBase + 'PanSample.ref'
tpm = tpmBase + 'B721.tpm'
out = outBase + 'B721.Whole.merge'
mergeTables(pepMap, tsv, ref, tpm, ucsc, out)

## CLL

#### Merge CLL 5283 MHC-I

In [19]:
# CLL 5283 MHC-I: uses the sample-specific reference (CLL.5283.ref) and TPM table.
pepMap = pepMapBase + 'CLL.5283.MHCI.map'
tsv = tsvBase + 'CLL.5283.MHCI.tsv'
ref = refBase + 'CLL.5283.ref'
tpm = tpmBase + 'CLL.5283.tpm'
out = outBase + 'CLL.5283.MHCI.merge'
mergeTables(pepMap, tsv, ref, tpm, ucsc, out)

#### Merge CLL 5328 MHC-I

In [20]:
# CLL 5328 MHC-I: PanSample reference, no TPM table is merged.
# Passing ucscMap still makes mergeTablesNoTPM derive the ORF_ID column.
pepMap = pepMapBase + 'CLL.5328.MHCI.map'
tsv = tsvBase + 'CLL.5328.MHCI.tsv'
ref = refBase + 'PanSample.ref'
out = outBase + 'CLL.5328.MHCI.merge'
mergeTablesNoTPM(pepMap, tsv, ref, out, ucscMap=ucsc)

#### Merge CLL 5341 MHC-I

In [21]:
# CLL 5341 MHC-I: PanSample reference, no TPM table is merged.
# Passing ucscMap still makes mergeTablesNoTPM derive the ORF_ID column.
pepMap = pepMapBase + 'CLL.5341.MHCI.map'
tsv = tsvBase + 'CLL.5341.MHCI.tsv'
ref = refBase + 'PanSample.ref'
out = outBase + 'CLL.5341.MHCI.merge'
mergeTablesNoTPM(pepMap, tsv, ref, out, ucscMap=ucsc)

## MEL

#### Merge MEL 2 MHC-I

In [22]:
# MEL 2 MHC-I: uses the sample-specific reference (MEL.2.ref) and TPM table.
pepMap = pepMapBase + 'MEL.2.MHCI.map'
tsv = tsvBase + 'MEL.2.MHCI.tsv'
ref = refBase + 'MEL.2.ref'
tpm = tpmBase + 'MEL.2.tpm'
out = outBase + 'MEL.2.MHCI.merge'
mergeTables(pepMap, tsv, ref, tpm, ucsc, out)

#### Merge MEL 2 10IP MHC-I

In [23]:
# MEL 2 10IP MHC-I: shares the MEL.2 reference and TPM table.
pepMap = pepMapBase + 'MEL.2.10IP.MHCI.map'
tsv = tsvBase + 'MEL.2.10IP.MHCI.tsv'
ref = refBase + 'MEL.2.ref'
tpm = tpmBase + 'MEL.2.tpm'
out = outBase + 'MEL.2.10IP.MHCI.merge'
mergeTables(pepMap, tsv, ref, tpm, ucsc, out)

#### Merge MEL 2 SEL MHC-I

In [24]:
# MEL 2 SEL MHC-I (files named 'MEL.2s'): shares the MEL.2 reference and TPM table.
pepMap = pepMapBase + 'MEL.2s.MHCI.map'
tsv = tsvBase + 'MEL.2s.MHCI.tsv'
ref = refBase + 'MEL.2.ref'
tpm = tpmBase + 'MEL.2.tpm'
out = outBase + 'MEL.2s.MHCI.merge'
mergeTables(pepMap, tsv, ref, tpm, ucsc, out)

#### Merge MEL 2 SEL IFN MHC-I

In [25]:
# MEL 2 SEL IFN MHC-I (files named 'MEL.2s.IFN'): shares the MEL.2 reference and TPM table.
pepMap = pepMapBase + 'MEL.2s.IFN.MHCI.map'
tsv = tsvBase + 'MEL.2s.IFN.MHCI.tsv'
ref = refBase + 'MEL.2.ref'
tpm = tpmBase + 'MEL.2.tpm'
out = outBase + 'MEL.2s.IFN.MHCI.merge'
mergeTables(pepMap, tsv, ref, tpm, ucsc, out)

#### Merge MEL 6 MHC-I

In [26]:
# MEL 6 MHC-I: PanSample reference, no TPM table is merged.
# Passing ucscMap still makes mergeTablesNoTPM derive the ORF_ID column.
pepMap = pepMapBase + 'MEL.6.MHCI.map'
tsv = tsvBase + 'MEL.6.MHCI.tsv'
ref = refBase + 'PanSample.ref'
out = outBase + 'MEL.6.MHCI.merge'
mergeTablesNoTPM(pepMap, tsv, ref, out, ucscMap=ucsc)

#### Merge MEL 6 IFN MHC-I

In [27]:
# MEL 6 IFN MHC-I: PanSample reference, no TPM table is merged.
# Passing ucscMap still makes mergeTablesNoTPM derive the ORF_ID column.
pepMap = pepMapBase + 'MEL.6.IFN.MHCI.map'
tsv = tsvBase + 'MEL.6.IFN.MHCI.tsv'
ref = refBase + 'PanSample.ref'
out = outBase + 'MEL.6.IFN.MHCI.merge'
mergeTablesNoTPM(pepMap, tsv, ref, out, ucscMap=ucsc)

#### Merge MEL 11 MHC-I

In [28]:
# MEL 11 MHC-I: uses the sample-specific reference (MEL.11.ref) and TPM table.
pepMap = pepMapBase + 'MEL.11.MHCI.map'
tsv = tsvBase + 'MEL.11.MHCI.tsv'
ref = refBase + 'MEL.11.ref'
tpm = tpmBase + 'MEL.11.tpm'
out = outBase + 'MEL.11.MHCI.merge'
mergeTables(pepMap, tsv, ref, tpm, ucsc, out)

#### Merge MEL 11 IFN MHC-I

In [29]:
# MEL 11 IFN MHC-I: shares the MEL.11 reference and TPM table.
pepMap = pepMapBase + 'MEL.11.IFN.MHCI.map'
tsv = tsvBase + 'MEL.11.IFN.MHCI.tsv'
ref = refBase + 'MEL.11.ref'
tpm = tpmBase + 'MEL.11.tpm'
out = outBase + 'MEL.11.IFN.MHCI.merge'
mergeTables(pepMap, tsv, ref, tpm, ucsc, out)

#### Merge MEL 15 MHC-I

In [30]:
# MEL 15 MHC-I: PanSample reference, no TPM table is merged.
# Passing ucscMap still makes mergeTablesNoTPM derive the ORF_ID column.
pepMap = pepMapBase + 'MEL.15.MHCI.map'
tsv = tsvBase + 'MEL.15.MHCI.tsv'
ref = refBase + 'PanSample.ref'
out = outBase + 'MEL.15.MHCI.merge'
mergeTablesNoTPM(pepMap, tsv, ref, out, ucscMap=ucsc)

#### Merge MEL 15 IFN MHC-I

In [31]:
# MEL 15 IFN MHC-I: PanSample reference, no TPM table is merged.
# Passing ucscMap still makes mergeTablesNoTPM derive the ORF_ID column.
pepMap = pepMapBase + 'MEL.15.IFN.MHCI.map'
tsv = tsvBase + 'MEL.15.IFN.MHCI.tsv'
ref = refBase + 'PanSample.ref'
out = outBase + 'MEL.15.IFN.MHCI.merge'
mergeTablesNoTPM(pepMap, tsv, ref, out, ucscMap=ucsc)

## GBM

#### Merge GBM H4512 MHC-I

In [32]:
# GBM H4512 MHC-I: PanSample reference plus the GBM.H4512 TPM table.
pepMap = pepMapBase + 'GBM.H4512.MHCI.map'
tsv = tsvBase + 'GBM.H4512.MHCI.tsv'
ref = refBase + 'PanSample.ref'
tpm = tpmBase + 'GBM.H4512.tpm'
out = outBase + 'GBM.H4512.MHCI.merge'
mergeTables(pepMap, tsv, ref, tpm, ucsc, out)

#### Merge GBM H4512 IFN MHC-I

In [33]:
# GBM H4512 IFN MHC-I: PanSample reference plus the GBM.H4512 TPM table.
pepMap = pepMapBase + 'GBM.H4512.IFN.MHCI.map'
tsv = tsvBase + 'GBM.H4512.IFN.MHCI.tsv'
ref = refBase + 'PanSample.ref'
tpm = tpmBase + 'GBM.H4512.tpm'
out = outBase + 'GBM.H4512.IFN.MHCI.merge'
mergeTables(pepMap, tsv, ref, tpm, ucsc, out)

#### Merge GBM H4512 Whole Proteome

In [34]:
# GBM H4512 whole-sample search: PanSample reference plus the GBM.H4512 TPM table.
pepMap = pepMapBase + 'GBM.H4512.Whole.map'
tsv = tsvBase + 'GBM.H4512.Whole.tsv'
ref = refBase + 'PanSample.ref'
tpm = tpmBase + 'GBM.H4512.tpm'
out = outBase + 'GBM.H4512.Whole.merge'
mergeTables(pepMap, tsv, ref, tpm, ucsc, out)

#### Merge GBM H4198 MHC-I

In [35]:
# GBM H4198 MHC-I: PanSample reference, no TPM table is merged.
# Passing ucscMap still makes mergeTablesNoTPM derive the ORF_ID column.
pepMap = pepMapBase + 'GBM.H4198.MHCI.map'
tsv = tsvBase + 'GBM.H4198.MHCI.tsv'
ref = refBase + 'PanSample.ref'
out = outBase + 'GBM.H4198.MHCI.merge'
mergeTablesNoTPM(pepMap, tsv, ref, out, ucscMap=ucsc)

#### Merge GBM H4198 IFN MHC-I

In [36]:
# GBM H4198 IFN MHC-I: PanSample reference, no TPM table is merged.
# Passing ucscMap still makes mergeTablesNoTPM derive the ORF_ID column.
pepMap = pepMapBase + 'GBM.H4198.IFN.MHCI.map'
tsv = tsvBase + 'GBM.H4198.IFN.MHCI.tsv'
ref = refBase + 'PanSample.ref'
out = outBase + 'GBM.H4198.IFN.MHCI.merge'
mergeTablesNoTPM(pepMap, tsv, ref, out, ucscMap=ucsc)

#### Merge GBM 7 MHC-I

In [37]:
# GBM 7 MHC-I: uses the sample-specific reference (GBM.7.ref) and TPM table.
pepMap = pepMapBase + 'GBM.7.MHCI.map'
tsv = tsvBase + 'GBM.7.MHCI.tsv'
ref = refBase + 'GBM.7.ref'
tpm = tpmBase + 'GBM.7.tpm'
out = outBase + 'GBM.7.MHCI.merge'
mergeTables(pepMap, tsv, ref, tpm, ucsc, out)

#### Merge GBM 7 IFN MHC-I

In [38]:
# GBM 7 IFN MHC-I: shares the GBM.7 reference and TPM table.
pepMap = pepMapBase + 'GBM.7.IFN.MHCI.map'
tsv = tsvBase + 'GBM.7.IFN.MHCI.tsv'
ref = refBase + 'GBM.7.ref'
tpm = tpmBase + 'GBM.7.tpm'
out = outBase + 'GBM.7.IFN.MHCI.merge'
mergeTables(pepMap, tsv, ref, tpm, ucsc, out)

## Ovarian Cancer

#### OV CP-594

In [39]:
# OV CP-594 MHC-I: PanSample reference, no TPM table is merged.
# Passing ucscMap still makes mergeTablesNoTPM derive the ORF_ID column.
pepMap = pepMapBase + 'OV.CP-594.MHCI.map'
tsv = tsvBase + 'OV.CP-594.MHCI.tsv'
ref = refBase + 'PanSample.ref'
out = outBase + 'OV.CP-594.MHCI.merge'
mergeTablesNoTPM(pepMap, tsv, ref, out, ucscMap=ucsc)

#### OV SLS3-M1

In [40]:
# OV SLS3-M1 MHC-I: PanSample reference, no TPM table is merged.
# Passing ucscMap still makes mergeTablesNoTPM derive the ORF_ID column.
pepMap = pepMapBase + 'OV.SLS3-M1.MHCI.map'
tsv = tsvBase + 'OV.SLS3-M1.MHCI.tsv'
ref = refBase + 'PanSample.ref'
out = outBase + 'OV.SLS3-M1.MHCI.merge'
mergeTablesNoTPM(pepMap, tsv, ref, out, ucscMap=ucsc)

## Renal Cell Carcinoma

#### RCC 9

In [41]:
# RCC 9 MHC-I: PanSample reference, no TPM table is merged.
# Passing ucscMap still makes mergeTablesNoTPM derive the ORF_ID column.
pepMap = pepMapBase + 'RCC.9.MHCI.map'
tsv = tsvBase + 'RCC.9.MHCI.tsv'
ref = refBase + 'PanSample.ref'
out = outBase + 'RCC.9.MHCI.merge'
mergeTablesNoTPM(pepMap, tsv, ref, out, ucscMap=ucsc)

# Database Comparison

#### Merge RNA

In [42]:
# Database comparison, RNA search: merged against B721.RNA.ref.
# ucscMap is left at its default, so no ORF_ID is derived for the reference.
pepMap = pepMapBase + 'DBC.RNA.map'
tsv = tsvBase + 'DBC.RNA.tsv'
ref = refBase + 'B721.RNA.ref'
out = outBase + 'DBC.RNA.merge'
mergeTablesNoTPM(pepMap, tsv, ref, out)

#### Merge RPF

In [43]:
# Database comparison, RPF search: merged against B721.RPF.ref.
# ucscMap is left at its default, so no ORF_ID is derived for the reference.
pepMap = pepMapBase + 'DBC.RPF.map'
tsv = tsvBase + 'DBC.RPF.tsv'
ref = refBase + 'B721.RPF.ref'
out = outBase + 'DBC.RPF.merge'
mergeTablesNoTPM(pepMap, tsv, ref, out)

#### Merge B721

In [44]:
# Database comparison, B721 search: merged against B721.ref.
# ucscMap is left at its default, so no ORF_ID is derived for the reference.
pepMap = pepMapBase + 'DBC.B721.map'
tsv = tsvBase + 'DBC.B721.tsv'
ref = refBase + 'B721.ref'
out = outBase + 'DBC.B721.merge'
mergeTablesNoTPM(pepMap, tsv, ref, out)

#### Merge Pan Sample

In [45]:
# Database comparison, Pan Sample search: merged against PanSample.ref.
# ucscMap is left at its default, so no ORF_ID is derived for the reference.
# NOTE(review): the map file is named 'DBC.PanSample' while the TSV and the
# output use 'DBC.PS' -- confirm both names refer to the same search.
pepMap = pepMapBase + 'DBC.PanSample.map'
tsv = tsvBase + 'DBC.PS.tsv'
ref = refBase + 'PanSample.ref'
out = outBase + 'DBC.PS.merge'
mergeTablesNoTPM(pepMap, tsv, ref, out)

#### Merge Null

In [46]:
# Database comparison, Null search: merged against B721.NULL.ref.
# ucscMap is left at its default, so no ORF_ID is derived for the reference.
pepMap = pepMapBase + 'DBC.NULL.map'
tsv = tsvBase + 'DBC.NULL.tsv'
ref = refBase + 'B721.NULL.ref'
out = outBase + 'DBC.NULL.merge'
mergeTablesNoTPM(pepMap, tsv, ref, out)