# Parse TPM & Purity
Convert the per-sample TPM files into unified tables that can be merged with the peptide information.

#### Import Packages & Setup Environment

In [1]:
import pandas
import statistics as stats
pandas.set_option('display.max_columns', 500)

#### Define Functions

In [2]:
def readTPM(path, name):
    """Load one per-sample TPM file and keep only its purity and TPM columns.

    The file is read as a tab-separated table whose first line is the header
    and whose 'ORF_ID' column becomes the index.

    path: location of the tab-separated file (anything pandas.read_csv accepts).
    name: sample label; the returned columns are '<name>.purity' and
          '<name>.tpm', in that order.

    Returns a DataFrame indexed by ORF_ID with exactly those two columns.
    """
    table = pandas.read_csv(
        path, sep='\t', header=0, index_col='ORF_ID', low_memory=False
    )
    # Map the generic source columns to sample-specific names so that tables
    # from several samples can later sit side by side without clashing.
    renamed = {
        'Purity': name + '.purity',
        'TPM': name + '.tpm',
    }
    return table[list(renamed)].rename(columns=renamed)

In [3]:
def meanPurity(row):
    """Return the arithmetic mean of every entry in `row` whose label contains 'purity'.

    row: a pandas Series (one DataFrame row) with string labels.
    Selection is by substring, so any label containing 'purity' is included.
    """
    purity_values = [
        row.loc[label] for label in row.index if 'purity' in label
    ]
    return stats.mean(purity_values)

In [4]:
def meanTPM(row):
    """Return the arithmetic mean of every entry in `row` whose label contains 'tpm'.

    row: a pandas Series (one DataFrame row) with string labels.
    Selection is by substring, so any label containing 'tpm' is included.
    """
    tpm_values = [
        row.loc[label] for label in row.index if 'tpm' in label
    ]
    return stats.mean(tpm_values)

In [5]:
def stdPurity(row):
    """Return the sample standard deviation of the per-sample purity values in `row`.

    row: a pandas Series (one DataFrame row) with string labels.

    Entries are selected by the substring 'purity' in their label, but the
    derived summary columns 'mean.purity' and 'std.purity' are skipped.
    Without that exclusion, a previously computed row mean would be counted
    as an additional sample and bias the result.

    Returns 0.0 when exactly one purity value is present: statistics.stdev
    needs at least two data points, and 0.0 is what single-sample tables
    produced before (the row mean used to stand in as a second, identical
    value). Raises statistics.StatisticsError when no purity value is present.
    """
    derived = ('mean.purity', 'std.purity')
    purity_values = [
        row.loc[label]
        for label in row.index
        if 'purity' in label and label not in derived
    ]
    if len(purity_values) == 1:
        return 0.0
    return stats.stdev(purity_values)

In [6]:
def stdTPM(row):
    """Return the sample standard deviation of the per-sample TPM values in `row`.

    row: a pandas Series (one DataFrame row) with string labels.

    Entries are selected by the substring 'tpm' in their label, but the
    derived summary columns 'mean.tpm' and 'std.tpm' are skipped. Without
    that exclusion, a previously computed row mean would be counted as an
    additional sample and bias the result.

    Returns 0.0 when exactly one TPM value is present: statistics.stdev
    needs at least two data points, and 0.0 is what single-sample tables
    produced before (the row mean used to stand in as a second, identical
    value). Raises statistics.StatisticsError when no TPM value is present.
    """
    derived = ('mean.tpm', 'std.tpm')
    tpm_values = [
        row.loc[label]
        for label in row.index
        if 'tpm' in label and label not in derived
    ]
    if len(tpm_values) == 1:
        return 0.0
    return stats.stdev(tpm_values)

## B721

In [7]:
# Load the four B721 sample tables; each one is indexed by ORF_ID.
A0101 = readTPM(path='../../data/tpm/B721_A0101_1.tpm', name='A0101')
A3303 = readTPM(path='../../data/tpm/B721_A3303_1.tpm', name='A3303')
B1501 = readTPM(path='../../data/tpm/B721_B1501_1.tpm', name='B1501')
B4402 = readTPM(path='../../data/tpm/B721_B4402_1.tpm', name='B4402')

# Join the other samples onto A0101 by index, then turn ORF_ID back into
# an ordinary column so it is written out with the data.
B721 = A0101.join([A3303, B1501, B4402]).reset_index(drop=False)

# Row-wise summary columns, appended in this order.
B721['mean.purity'] = B721.apply(meanPurity, axis=1)
B721['mean.tpm'] = B721.apply(meanTPM, axis=1)
B721['std.purity'] = B721.apply(stdPurity, axis=1)
B721['std.tpm'] = B721.apply(stdTPM, axis=1)

# Write the unified table as a tab-separated file without the row index.
B721.to_csv('../../data/tpm/B721.tpm', sep='\t', header=True, index=False)

## MEL 11

In [8]:
# Load the three Mel11 sample tables; each one is indexed by ORF_ID.
Mel11_1 = readTPM(path='../../data/tpm/Mel11_1.tpm', name='1')
Mel11_3 = readTPM(path='../../data/tpm/Mel11_3.tpm', name='3')
Mel11_4 = readTPM(path='../../data/tpm/Mel11_4.tpm', name='4')

# Join the other samples onto sample 1 by index, then turn ORF_ID back into
# an ordinary column so it is written out with the data.
Mel11 = Mel11_1.join([Mel11_3, Mel11_4]).reset_index(drop=False)

# Row-wise summary columns, appended in this order.
Mel11['mean.purity'] = Mel11.apply(meanPurity, axis=1)
Mel11['mean.tpm'] = Mel11.apply(meanTPM, axis=1)
Mel11['std.purity'] = Mel11.apply(stdPurity, axis=1)
Mel11['std.tpm'] = Mel11.apply(stdTPM, axis=1)

# Write the unified table as a tab-separated file without the row index.
Mel11.to_csv('../../data/tpm/MEL.11.tpm', sep='\t', header=True, index=False)

## GBM

In [9]:
# Load the four GBM H4512 sample tables; each one is indexed by ORF_ID.
GBM_ImHm = readTPM(path='../../data/tpm/GBM_H4512_ImHm.tpm', name='ImHm')
GBM_ImHp = readTPM(path='../../data/tpm/GBM_H4512_ImHp.tpm', name='ImHp')
GBM_IpHm = readTPM(path='../../data/tpm/GBM_H4512_IpHm.tpm', name='IpHm')
GBM_IpHp = readTPM(path='../../data/tpm/GBM_H4512_IpHp.tpm', name='IpHp')

# Join the other samples onto ImHm by index (column order: ImHm, IpHm, ImHp,
# IpHp), then turn ORF_ID back into an ordinary column.
GBM = GBM_ImHm.join([GBM_IpHm, GBM_ImHp, GBM_IpHp]).reset_index(drop=False)

# Row-wise summary columns, appended in this order.
GBM['mean.purity'] = GBM.apply(meanPurity, axis=1)
GBM['mean.tpm'] = GBM.apply(meanTPM, axis=1)
GBM['std.purity'] = GBM.apply(stdPurity, axis=1)
GBM['std.tpm'] = GBM.apply(stdTPM, axis=1)

# Write the unified table as a tab-separated file without the row index.
GBM.to_csv('../../data/tpm/GBM.H4512.tpm', sep='\t', header=True, index=False)

## CLL 5283

In [10]:
# Load the three CLL sample tables; each one is indexed by ORF_ID.
CLL_1 = readTPM(path='../../data/tpm/CLL5_1.tpm', name='1')
CLL_2 = readTPM(path='../../data/tpm/CLL5_2.tpm', name='2')
CLL_3 = readTPM(path='../../data/tpm/CLL5_3_1.tpm', name='3')

# Join the other samples onto sample 1 by index, then turn ORF_ID back into
# an ordinary column so it is written out with the data.
CLL = CLL_1.join([CLL_2, CLL_3]).reset_index(drop=False)

# Row-wise summary columns, appended in this order.
CLL['mean.purity'] = CLL.apply(meanPurity, axis=1)
CLL['mean.tpm'] = CLL.apply(meanTPM, axis=1)
CLL['std.purity'] = CLL.apply(stdPurity, axis=1)
CLL['std.tpm'] = CLL.apply(stdTPM, axis=1)

# Write the unified table as a tab-separated file without the row index.
CLL.to_csv('../../data/tpm/CLL.5283.tpm', sep='\t', header=True, index=False)

## MEL 2

In [11]:
# Load the two Mel2 sample tables; each one is indexed by ORF_ID.
Mel2_2 = readTPM(path='../../data/tpm/Mel2_2.tpm', name='2')
Mel2_3 = readTPM(path='../../data/tpm/Mel2_3.tpm', name='3')

# Join sample 3 onto sample 2 by index, then turn ORF_ID back into an
# ordinary column so it is written out with the data.
Mel2 = Mel2_2.join([Mel2_3]).reset_index(drop=False)

# Row-wise summary columns, appended in this order.
Mel2['mean.purity'] = Mel2.apply(meanPurity, axis=1)
Mel2['mean.tpm'] = Mel2.apply(meanTPM, axis=1)
Mel2['std.purity'] = Mel2.apply(stdPurity, axis=1)
Mel2['std.tpm'] = Mel2.apply(stdTPM, axis=1)

# Write the unified table as a tab-separated file without the row index.
Mel2.to_csv('../../data/tpm/MEL.2.tpm', sep='\t', header=True, index=False)

## MEL 5

In [12]:
# Mel5 has a single sample file, so there is nothing to join: load it and
# turn ORF_ID back into an ordinary column straight away.
Mel5 = readTPM(
    path='../../data/tpm/Mel5_1.tpm',
    name='Mel5',
).reset_index(drop=False)

# Row-wise summary columns, appended in this order.
Mel5['mean.purity'] = Mel5.apply(meanPurity, axis=1)
Mel5['mean.tpm'] = Mel5.apply(meanTPM, axis=1)
Mel5['std.purity'] = Mel5.apply(stdPurity, axis=1)
Mel5['std.tpm'] = Mel5.apply(stdTPM, axis=1)

# Write the table as a tab-separated file without the row index.
Mel5.to_csv('../../data/tpm/MEL.5.tpm', sep='\t', header=True, index=False)

## GBM 7

In [13]:
# Load the two GBM7 sample tables; each one is indexed by ORF_ID.
GBM7_1 = readTPM(path='../../data/tpm/GBM7_1.tpm', name='1')
GBM7_2 = readTPM(path='../../data/tpm/GBM7_2.tpm', name='2')

# Join sample 2 onto sample 1 by index, then turn ORF_ID back into an
# ordinary column so it is written out with the data.
GBM7 = GBM7_1.join([GBM7_2]).reset_index(drop=False)

# Row-wise summary columns, appended in this order.
GBM7['mean.purity'] = GBM7.apply(meanPurity, axis=1)
GBM7['mean.tpm'] = GBM7.apply(meanTPM, axis=1)
GBM7['std.purity'] = GBM7.apply(stdPurity, axis=1)
GBM7['std.tpm'] = GBM7.apply(stdTPM, axis=1)

# Write the unified table as a tab-separated file without the row index.
GBM7.to_csv('../../data/tpm/GBM.7.tpm', sep='\t', header=True, index=False)