In [None]:
###################################################################################
## Name: miRNA_normalization
## Goal: Normalize TCGA (bulk) microRNA data by
##       1) Subsetting to corrected data and dropping rows with NA values
##       2) Applying upper-quartile normalization (with a log1p transform) and centering
## Output: Normalized miRNA data for outcome variable of model
##         Dataset dimension: (662, 10824)
## Author: Claire Su
## Date Last Modified: 2/28/2020
## Notes:
###################################################################################

In [1]:
## import packages
import pandas as pd
import numpy as np

In [3]:
## load the raw miRNA table from CSV; the first row supplies the column names
## and the first column becomes the row index
RAW_MIRNA_CSV = '/home/isu/miRNA_project/raw_data/pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.csv'
pan_microrna = pd.read_csv(
    RAW_MIRNA_CSV,
    sep=',',
    header=[0],
    index_col=0,
)

In [8]:
## keep only the rows whose Correction value equals 'Corrected'
is_corrected = pan_microrna['Correction'] == 'Corrected'
pan_microrna_corrected = pan_microrna[is_corrected]

## dimensions of the subset (the Correction column is still included here);
## the recorded run gave (662, 10825)
pan_microrna_corrected.shape

(662, 10825)

In [11]:
## remove every row that contains an NA value, then drop the Correction column
## that was only needed for the subset step
pan_microrna_nomissing = (
    pan_microrna_corrected
    .dropna(axis=0)
    .drop(columns=["Correction"])
)

## a bare expression is only displayed when it is the last statement of a cell,
## so the dimension check sits here; the column count no longer includes Correction
pan_microrna_nomissing.shape

In [13]:
## upper-quartile normalization
def _upper_quartile_log1p(column):
    """Scale one column by the 75th percentile of its non-zero entries, then apply log1p."""
    # astype(bool) is False exactly where the value is 0, so the mask keeps the non-zero entries
    nonzero_values = column[column.astype(bool)]
    scale = nonzero_values.quantile(0.75)
    return np.log1p(column / scale)


# DataFrame.apply defaults to axis=0, so the helper receives one column at a time
# (presumably one sample per column -- confirm against the raw file layout)
pan_microrna_qnormalized = pan_microrna_nomissing.apply(_upper_quartile_log1p)


## centering
def _subtract_row_mean(row):
    """Shift one row so that its mean becomes zero."""
    return row - row.mean()


# axis=1 hands the helper one row at a time, so each row is centered on its own mean
pan_microrna_qcnormalized = pan_microrna_qnormalized.apply(_subtract_row_mean, axis=1)

In [14]:
## confirm the dimensions of the normalized and centered table;
## the recorded run gave (662, 10824)
n_rows, n_cols = pan_microrna_qcnormalized.shape
(n_rows, n_cols)

(662, 10824)

In [15]:
## write the normalized and centered table to CSV (row index included by default)
NORMALIZED_MIRNA_CSV = '/home/isu/miRNA_project/normalized_data/miRNA_normalized.csv'
pan_microrna_qcnormalized.to_csv(NORMALIZED_MIRNA_CSV)