# Step 0. Normalization

This script normalizes the peptide-level peak areas for heart and brain samples. It can be run locally.

Input data: DIA-NN output and the meta table.

Output data: Normalized peptide-level MS1 and MS2 areas.

Created by Kirsten Chen

Created on Sept 20, 2022

Last modified on Sept 22, 2022


In [1]:
import pandas as pd
import numpy as np

# Load the long-format DIA-NN report (tab-separated).
report_path = 'Heart/report.tsv'
raw_data = pd.read_csv(report_path, sep='\t')

# Keep only rows whose library-level protein-group q-value is at most 0.01.
# The original note says this is needed because of MBR (presumably DIA-NN's
# match-between-runs -- confirm with the author).
passes_lib_qvalue = raw_data['Lib.PG.Q.Value'] <= 0.01
raw_data = raw_data[passes_lib_qvalue]

# Notebook preview of the first rows.
raw_data.head()


Unnamed: 0,File.Name,Run,Protein.Group,Protein.Ids,Protein.Names,Genes,PG.Quantity,PG.Normalised,PG.MaxLFQ,Genes.Quantity,...,Decoy.Evidence,Decoy.CScore,Fragment.Quant.Raw,Fragment.Quant.Corrected,Fragment.Correlations,MS2.Scan,IM,iIM,Predicted.IM,Predicted.iIM
23,F:\SILAM_Proteomics_Kirsten\SILAM_Heart\090220...,09022022_DIA_SILAM_IHJ01_3_S4-A3_1_778,Q3TW96,Q3TW96,UAP1L_MOUSE,Uap1l1,3421.56,3887.8,3887.8,3421.56,...,1.12641,0.106719,1184.05;379.016;1137.05;535.023;216.009;1739.0...,1147.47;304.386;930.707;535.023;216.009;1739.0...,0.782612;0.644893;0.68738;0.732888;0.731967;0....,22555,1.1001,1.09833,1.09129,1.10667
24,F:\SILAM_Proteomics_Kirsten\SILAM_Heart\090220...,09022022_DIA_SILAM_IHJ01_6_S4-A6_1_781,Q3TW96,Q3TW96,UAP1L_MOUSE,Uap1l1,4084.53,3130.07,3130.07,4084.53,...,0.683583,0.076346,2179.09;342.013;887.039;1352.05;170.007;1612.0...,2179.09;342.013;407.637;613.099;135.111;1292.3...,0.970511;0.824456;0.589296;0.669061;0.630105;0...,22479,1.09364,1.09833,1.1012,1.09032
25,F:\SILAM_Proteomics_Kirsten\SILAM_Heart\090220...,09022022_DIA_SILAM_IHJ01_9_S4-A9_1_784,Q3TW96,Q3TW96,UAP1L_MOUSE,Uap1l1,4106.01,3658.66,3658.66,4106.01,...,0.864389,0.196242,1339.05;265.013;847.031;1598.07;122.005;1764.0...,1339.05;21.4871;847.031;1368.18;80.5332;1398.7...,0.947722;0.425962;0.821575;0.759144;0.651164;0...,22346,1.09716,1.09833,1.09792,1.09714
26,F:\SILAM_Proteomics_Kirsten\SILAM_Heart\090220...,09022022_DIA_SILAM_IHJ01_11_S4-A11_1_786,Q3TW96,Q3TW96,UAP1L_MOUSE,Uap1l1,2897.15,2913.0,2913.01,2897.15,...,0.892085,0.182242,1228.05;242.011;1253.05;1339.06;46.0018;1635.0...,774.747;242.011;172.923;712.002;46.0018;1410.4...,0.711774;0.828315;0.224136;0.576091;0.546801;0...,22365,1.08781,1.09833,1.09483,1.09051
27,F:\SILAM_Proteomics_Kirsten\SILAM_Heart\090220...,09022022_DIA_SILAM_IHJ01_12_S4-A12_1_787,Q3TW96,Q3TW96,UAP1L_MOUSE,Uap1l1,3848.59,3857.63,3857.63,3848.59,...,1.29987,0.282116,1710.06;234.01;556.022;1186.05;60.0027;1096.05...,1710.06;177.058;358.118;1186.05;58.79;952.481;...,0.973339;0.642635;0.598728;0.848378;0.574693;0...,22384,1.08938,1.09833,1.09433,1.09276


In [2]:
# meta: read the MS plate sheet and keep only the sample-annotation columns
# that are used downstream (the sample key plus mouse / oxygen / day).
meta_columns = ['Sample ID Short', 'Mouse ID', 'Oxygen', 'Day']
mrg = pd.read_excel('Meta tables/IHJ01 SILAM Heart MS plate.xlsx')
mrg = mrg.loc[:, meta_columns]
# Notebook preview of the resulting table.
mrg

Unnamed: 0,Sample ID Short,Mouse ID,Oxygen,Day
0,1,1,21,0
1,2,6,60,0
2,3,11,8,0
3,4,16,21,2
4,5,21,60,2
...,...,...,...,...
96,97,75,8,16
97,98,80,21,32
98,99,85,60,32
99,100,90,8,32


In [6]:
# Default reference protein groups used for normalization. The original code
# called this list `lhalf_life` -- presumably long-half-life proteins; confirm
# with the author before reusing it for another tissue.
_REFERENCE_PROTEINS = ('P21619', 'P68433', 'P43277',
                       'P56382', 'Q3U5Q7', 'Q8C6K9', 'P68134', 'Q8C5H8',
                       'P10922', 'Q60675')

# Columns that together identify one peptide row in the wide table.
_PEPTIDE_KEYS = ['Protein.Group', 'Genes', 'Modified.Sequence', 'Stripped.Sequence']


def process_data(raw, mrg, level='MS1', norm=True,
                 reference_proteins=None, run_id_field=4):
    """Build a long-format peptide table for one quantification level.

    The report is reduced to the peptide identifier columns plus one
    quantity column, optionally normalized per run, and then annotated with
    the sample meta data.

    Normalization (``norm=True``): the report is pivoted to a peptide x run
    table (``pivot_table`` averages duplicate peptide/run rows), every run is
    divided by the median of its reference-protein rows, and the result is
    rescaled by the mean of those per-run medians. The table is then melted
    back to long format, so peptide/run combinations that were missing in the
    report appear as NaN rows. With ``norm=False`` the selected rows are
    returned as they are (no averaging, no NaN filling).

    Parameters
    ----------
    raw : pandas.DataFrame
        Long-format report. Must contain 'Run', 'Protein.Group', 'Genes',
        'Modified.Sequence', 'Stripped.Sequence' and the quantity column
        selected by ``level``. It is not modified.
    mrg : pandas.DataFrame
        Meta table; must contain 'Sample ID Short', which is matched against
        the integer parsed from the run name.
    level : str
        'MS1' selects 'Ms1.Area'; any other value selects
        'Precursor.Quantity'.
    norm : bool
        Whether to apply the reference-protein normalization.
    reference_proteins : iterable of str, optional
        'Protein.Group' values that form the normalization reference.
        Defaults to the ten accessions hard-coded in the original script.
    run_id_field : int
        Zero-based index of the '_'-separated field of 'Run' that holds the
        integer sample id (4 for the naming scheme this script was written
        for).

    Returns
    -------
    pandas.DataFrame
        One row per peptide and run with the quantity in 'value', the source
        column name in 'quant', upper-cased 'Genes', and the remaining
        columns of ``mrg``. Runs without a match in ``mrg`` are dropped by
        the inner merge.

    Raises
    ------
    ValueError
        If ``norm`` is true and none of the reference proteins occurs in the
        data (the normalization would otherwise silently yield all-NaN).
    """
    lev = 'Ms1.Area' if level == 'MS1' else 'Precursor.Quantity'

    # Explicit copy: the columns added below must never be written onto a
    # slice of the caller's frame (avoids SettingWithCopyWarning).
    raw = raw[['Run', 'Protein.Group',
               'Genes', lev, 'Modified.Sequence', 'Stripped.Sequence']].copy()

    if norm:
        if reference_proteins is None:
            reference_proteins = _REFERENCE_PROTEINS

        # Peptides as rows, runs as columns.
        wide = pd.pivot_table(data=raw, values=lev,
                              index=_PEPTIDE_KEYS, columns='Run')

        is_reference = wide.index.get_level_values('Protein.Group').isin(
            list(reference_proteins))
        if not is_reference.any():
            raise ValueError(
                'None of the reference proteins are present in the data; '
                'cannot normalize.')

        # One median per run (column), ignoring missing values.
        run_medians = np.nanmedian(wide[is_reference].values, axis=0)
        # Divide each run by its own median, then rescale by the mean median
        # so values stay on the original intensity scale.
        scaled = wide / run_medians * np.nanmean(run_medians)

        raw = scaled.reset_index().melt(id_vars=_PEPTIDE_KEYS,
                                        var_name='Run', value_name='value')
    else:
        raw = raw.rename(columns={lev: 'value'})

    # The sample id is encoded in the run name, e.g. field 4 of
    # '09022022_DIA_SILAM_IHJ01_3_S4-A3_1_778' is '3'.
    raw['Run_ID'] = np.array(
        [run.split('_')[run_id_field] for run in raw['Run']]).astype(int)

    raw = pd.merge(raw, mrg, left_on='Run_ID', right_on='Sample ID Short')
    raw = raw.drop(['Sample ID Short', 'Run_ID'], axis=1)
    raw['quant'] = lev
    raw['Genes'] = raw['Genes'].str.upper()

    return raw


# Build the peptide-level table twice -- once normalized, once not -- for both
# quantification levels, and stack everything into one long table.
frames = []
for x in [True, False]:
    # Forward the flag: without `norm=x` both passes were normalized and the
    # rows labelled normalized=False were mislabelled.
    raw_ms1 = process_data(raw_data, mrg, 'MS1', norm=x)
    raw_ms2 = process_data(raw_data, mrg, 'MS2', norm=x)
    raw_concat = pd.concat([raw_ms1, raw_ms2])
    # Report the MS2-level quantity under the label 'MS2.Area'.
    raw_concat['quant'] = raw_concat['quant'].replace({'Precursor.Quantity':'MS2.Area'})
    raw_concat['normalized'] = x
    frames.append(raw_concat)
# Optional aggregation, left disabled as in the original:
#raw_concat = raw_concat.groupby(['Protein.Group', 'Modified.Sequence',
#'Genes', 'Oxygen', 'quant', 'Day']).mean().reset_index()
full = pd.concat(frames)
full.to_csv('peptide_level.csv')
