In [None]:
###################################################################################
## Name: mRNA_normalization
## Goal: Normalize TCGA (bulk) mRNA data by
##       1) Subset to corrected data and drop rows with NA values
##       2) Apply upper-quartile normalization (log1p-scaled) and per-gene mean centering
## Output: Normalized mRNA data for predictor variables of model
##         Dataset dimension: (16335, 11069)
## Author: Claire Su
## Date Last Modified: 2/28/2020
## Notes:
###################################################################################

In [2]:
## import packages
import pandas as pd
import numpy as np

In [3]:
## load the raw mRNA expression table (tab-separated):
## the first line supplies the column labels and the first column becomes the row index
raw_mrna_path = '/home/isu/miRNA_project/raw_data/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv'
pan_mrna = pd.read_csv(
    raw_mrna_path,
    sep='\t',
    header=[0],
    index_col=0,
)

In [4]:
## dimensions of the raw table as (number of rows, number of columns)
pan_mrna.shape #(20531, 11069)

(20531, 11069)

In [62]:
## check that data is read in correctly by previewing the first rows
## (rows are indexed by gene_id, columns are TCGA sample barcodes)
pan_mrna.head()

Unnamed: 0_level_0,TCGA-OR-A5J1-01A-11R-A29S-07,TCGA-OR-A5J2-01A-11R-A29S-07,TCGA-OR-A5J3-01A-11R-A29S-07,TCGA-OR-A5J5-01A-11R-A29S-07,TCGA-OR-A5J6-01A-31R-A29S-07,TCGA-OR-A5J7-01A-11R-A29S-07,TCGA-OR-A5J8-01A-11R-A29S-07,TCGA-OR-A5J9-01A-11R-A29S-07,TCGA-OR-A5JA-01A-11R-A29S-07,TCGA-OR-A5JB-01A-11R-A29S-07,...,TCGA-CG-4449-01A-01R-1157-13,TCGA-CG-4462-01A-01R-1157-13,TCGA-CG-4465-01A-01R-1157-13,TCGA-CG-4466-01A-01R-1157-13,TCGA-CG-4469-01A-01R-1157-13,TCGA-CG-4472-01A-01R-1157-13,TCGA-CG-4474-01A-02R-1157-13,TCGA-CG-4475-01A-01R-1157-13,TCGA-CG-4476-01A-01R-1157-13,TCGA-CG-4477-01A-01R-1157-13
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
?|100130426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
?|100133144,3.2661,2.6815,1.7301,0.0,0.0,1.1673,1.4422,0.0,4.4556,7.1293,...,4.358154,5.676995,5.21935,14.846708,20.115492,6.997533,18.311906,12.057112,18.62874,17.874417
?|100134869,3.9385,8.9948,6.565,1.5492,4.4709,6.0529,2.2876,1.3599,5.0581,0.0,...,2.65636,3.342794,2.423442,5.055287,11.626054,13.654193,7.417109,11.585177,11.482418,14.919338
?|10357,149.135,81.0777,86.4879,53.9117,66.9063,103.506,94.9316,78.1955,69.2389,155.709,...,633.299781,294.018042,686.569179,563.573453,1039.307597,639.238135,742.479964,506.336449,712.452165,703.713324
?|10431,2034.1,1304.93,1054.66,2350.89,1257.99,1866.43,995.027,1762.12,1213.53,2005.57,...,1202.538277,644.002317,1181.884532,663.885074,647.530395,1297.152549,1152.909807,1375.495774,971.893874,1736.988111


In [5]:
## remove every row (gene) that has an NA value in at least one sample;
## the raw table is (20531, 11069) before filtering.
## Only the last expression of a cell is displayed, so the filtered shape is
## the single bare expression kept here.
pan_mrna_nomissing = pan_mrna.dropna(axis=0, how='any')
pan_mrna_nomissing.shape #(16335, 11069)

(16335, 11069)

In [6]:
## upper-quartile normalization: divide each sample (column) by the 75th
## percentile of its non-zero values, then take log(1 + x)
def _upper_quartile_log1p(sample_values):
    """Scale one column by the 0.75 quantile of its non-zero entries and return log1p of the result."""
    nonzero_mask = sample_values.astype(bool)  # zeros map to False and are excluded from the quantile
    scale = sample_values[nonzero_mask].quantile(0.75)
    return np.log1p(sample_values / scale)

pan_mrna_qnormalized = pan_mrna_nomissing.apply(_upper_quartile_log1p)

In [7]:
### center each row (gene) on its mean expression across all samples (columns).
### Vectorized: all row means are computed in one pass and subtracted along the
### row axis, instead of invoking a Python lambda once per row via apply(axis=1).
gene_means = pan_mrna_qnormalized.mean(axis=1)
pan_mrna_qcnormalized = pan_mrna_qnormalized.sub(gene_means, axis=0)

In [10]:
## dimensions of the normalized and centered table as (rows, columns)
pan_mrna_qcnormalized.shape

(16335, 11069)

In [9]:
# write the normalized and centered data frame to disk as CSV
normalized_mrna_path = '/home/isu/miRNA_project/normalized_data/mRNA_normalized.csv'
pan_mrna_qcnormalized.to_csv(normalized_mrna_path)