In [None]:
###################################################################################
## Name: mRNA_miRNA_match
## Goal: Match the mRNA and miRNA data on TCGA sample barcode
## Output: Subsets of normalized mRNA & miRNA data restricted to tumor samples that are present in both datasets
##         Dataset dimension: mRNA-(16335, 9858); miRNA-(662, 9858)
## Author: Claire Su
## Date Last Modified: 2/28/2020
## Notes:
###################################################################################

In [1]:
## import packages 
import pandas as pd
import numpy as np

In [2]:
## Load the normalized mRNA table from CSV.
## The first CSV column becomes the row index and the first CSV row
## supplies the column labels.
mrna_normalized = pd.read_csv(
    '/home/isu/miRNA_project/normalized_data/mRNA_normalized.csv',
    sep=',',
    header=[0],
    index_col=0,
)

In [8]:
## Preview the first five rows of the loaded mRNA table
mrna_normalized.head(n=5)

Unnamed: 0_level_0,TCGA-OR-A5J1-01A-11R-A29S-07,TCGA-OR-A5J2-01A-11R-A29S-07,TCGA-OR-A5J3-01A-11R-A29S-07,TCGA-OR-A5J5-01A-11R-A29S-07,TCGA-OR-A5J6-01A-31R-A29S-07,TCGA-OR-A5J7-01A-11R-A29S-07,TCGA-OR-A5J8-01A-11R-A29S-07,TCGA-OR-A5J9-01A-11R-A29S-07,TCGA-OR-A5JA-01A-11R-A29S-07,TCGA-OR-A5JB-01A-11R-A29S-07,...,TCGA-CG-4449-01A-01R-1157-13,TCGA-CG-4462-01A-01R-1157-13,TCGA-CG-4465-01A-01R-1157-13,TCGA-CG-4466-01A-01R-1157-13,TCGA-CG-4469-01A-01R-1157-13,TCGA-CG-4472-01A-01R-1157-13,TCGA-CG-4474-01A-02R-1157-13,TCGA-CG-4475-01A-01R-1157-13,TCGA-CG-4476-01A-01R-1157-13,TCGA-CG-4477-01A-01R-1157-13
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
?|100133144,-0.007189,-0.007797,-0.008649,-0.010226,-0.010226,-0.009147,-0.008913,-0.010226,-0.006268,-0.003612,...,-0.006214,-0.004908,-0.005281,0.003122,0.008417,-0.003618,0.007003,0.000847,0.007044,0.006044
?|100134869,-0.006951,-0.002488,-0.004641,-0.009207,-0.006434,-0.005027,-0.00853,-0.009368,-0.006119,-0.010612,...,-0.008164,-0.007477,-0.008312,-0.006046,0.000206,0.002243,-0.003597,3.1e-05,6.8e-05,0.002987
?|10357,-0.004116,-0.063198,-0.058212,-0.086378,-0.073379,-0.042702,-0.051224,-0.065056,-0.074339,0.001206,...,0.325923,0.109707,0.367897,0.278021,0.545048,0.339391,0.399205,0.249485,0.376406,0.364063
?|10431,0.434053,0.152294,0.045396,0.513585,0.149819,0.374501,0.01653,0.331527,0.103885,0.424631,...,0.117758,-0.155537,0.124026,-0.158034,-0.15498,0.172977,0.11056,0.191315,0.017895,0.324603
?|155060,0.036224,-0.025062,0.084757,0.144264,-0.060414,-0.133174,0.104608,0.032732,0.004709,-0.107688,...,-0.113659,-0.067661,-0.092819,-0.09005,0.058732,-0.137774,-0.04424,-0.089597,-0.028702,-0.105938


In [3]:
## Truncate every column label to its first 19 characters
## (e.g. 'TCGA-OR-A5J1-01A-11R-A29S-07' -> 'TCGA-OR-A5J1-01A-11') so the
## mRNA columns can be matched against the miRNA sample columns.
##  check https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/
mrna_normalized_renamed = mrna_normalized.rename(columns=lambda barcode: barcode[:19])

In [15]:
## Preview the mRNA table after the column labels were truncated
mrna_normalized_renamed.head(n=5)

Unnamed: 0_level_0,TCGA-OR-A5J1-01A-11,TCGA-OR-A5J2-01A-11,TCGA-OR-A5J3-01A-11,TCGA-OR-A5J5-01A-11,TCGA-OR-A5J6-01A-31,TCGA-OR-A5J7-01A-11,TCGA-OR-A5J8-01A-11,TCGA-OR-A5J9-01A-11,TCGA-OR-A5JA-01A-11,TCGA-OR-A5JB-01A-11,...,TCGA-CG-4449-01A-01,TCGA-CG-4462-01A-01,TCGA-CG-4465-01A-01,TCGA-CG-4466-01A-01,TCGA-CG-4469-01A-01,TCGA-CG-4472-01A-01,TCGA-CG-4474-01A-02,TCGA-CG-4475-01A-01,TCGA-CG-4476-01A-01,TCGA-CG-4477-01A-01
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
?|100133144,-0.007189,-0.007797,-0.008649,-0.010226,-0.010226,-0.009147,-0.008913,-0.010226,-0.006268,-0.003612,...,-0.006214,-0.004908,-0.005281,0.003122,0.008417,-0.003618,0.007003,0.000847,0.007044,0.006044
?|100134869,-0.006951,-0.002488,-0.004641,-0.009207,-0.006434,-0.005027,-0.00853,-0.009368,-0.006119,-0.010612,...,-0.008164,-0.007477,-0.008312,-0.006046,0.000206,0.002243,-0.003597,3.1e-05,6.8e-05,0.002987
?|10357,-0.004116,-0.063198,-0.058212,-0.086378,-0.073379,-0.042702,-0.051224,-0.065056,-0.074339,0.001206,...,0.325923,0.109707,0.367897,0.278021,0.545048,0.339391,0.399205,0.249485,0.376406,0.364063
?|10431,0.434053,0.152294,0.045396,0.513585,0.149819,0.374501,0.01653,0.331527,0.103885,0.424631,...,0.117758,-0.155537,0.124026,-0.158034,-0.15498,0.172977,0.11056,0.191315,0.017895,0.324603
?|155060,0.036224,-0.025062,0.084757,0.144264,-0.060414,-0.133174,0.104608,0.032732,0.004709,-0.107688,...,-0.113659,-0.067661,-0.092819,-0.09005,0.058732,-0.137774,-0.04424,-0.089597,-0.028702,-0.105938


In [None]:
## Persist the mRNA table with truncated column labels as a CSV file
mrna_normalized_renamed.to_csv(
    path_or_buf='/home/isu/miRNA_project/normalized_data/mrna_normalized_renamed.csv'
)

In [4]:
## Load the normalized miRNA table from CSV (first CSV column -> row index,
## first CSV row -> column labels), then preview its first five rows.
## The column labels are shortened in a later cell.
microrna_normalized = pd.read_csv(
    '/home/isu/miRNA_project/normalized_data/miRNA_normalized.csv',
    sep=',',
    header=[0],
    index_col=0,
)
microrna_normalized.head(n=5)

Unnamed: 0_level_0,TCGA-C4-A0F6-01A-11R-A10V-13,TCGA-CU-A0YO-01A-11R-A10V-13,TCGA-BT-A0S7-01A-11R-A10V-13,TCGA-CU-A0YR-01A-12R-A10V-13,TCGA-BL-A0C8-01A-11R-A10V-13,TCGA-C4-A0F0-01A-12R-A10V-13,TCGA-BL-A13J-01A-11R-A10V-13,TCGA-BT-A0YX-01A-11R-A10V-13,TCGA-CU-A0YN-01A-21R-A10V-13,TCGA-CU-A0YR-11A-13R-A10V-13,...,TCGA-AG-A020-01A-21R-A082-13,TCGA-AG-A01Y-01A-41R-A082-13,TCGA-AG-A01W-01A-21R-A082-13,TCGA-AG-3726-01A-02T-0906-13,TCGA-AG-3605-01A-01T-0827-13,TCGA-AG-3584-01A-01T-0822-13,TCGA-AG-3599-01A-02T-0827-13,TCGA-AG-3583-01A-01T-0822-13,TCGA-AG-3598-01A-01T-0827-13,TCGA-AG-3586-01A-02T-0822-13
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
hsa-let-7a-2-3p,-0.096518,-0.059309,-0.017156,-0.100776,-0.096693,-0.034009,-0.034,-0.083818,-0.074024,-0.027468,...,-0.119897,-0.06957,-0.079228,-0.087972,-0.103827,-0.02011,-0.046386,-0.084806,-0.048916,-0.064222
hsa-let-7a-3p,-0.074348,0.366087,0.23445,-0.249528,-0.025498,-0.018863,0.140189,-0.028565,0.065308,-0.108119,...,0.555727,0.575986,0.209247,-0.051397,0.143354,0.097874,0.26447,0.126195,0.202228,0.163525
hsa-let-7a-5p,-0.807536,-0.178267,0.650867,-0.529122,0.097222,0.247807,0.150286,0.043417,-0.088568,0.393956,...,0.297262,0.688233,0.162399,-0.084191,0.595498,0.370962,0.231508,0.240959,0.472725,0.039017
hsa-let-7b-3p,-0.31152,-0.191564,0.054194,-0.273958,-0.165673,-0.14112,-0.111053,-0.22357,0.014016,-0.150772,...,0.074421,-0.154328,-0.111632,0.012789,-0.101535,0.071444,0.257463,0.180531,0.443909,-0.035733
hsa-let-7b-5p,-0.979621,-0.64275,0.805858,-1.173967,-0.93755,0.11368,-0.265482,-0.45267,-0.445293,0.268768,...,-0.862435,-0.647105,-0.543768,-0.900613,-0.258135,-0.433047,-0.144974,0.217156,-0.082366,-0.066606


In [5]:
## Truncate every miRNA column label to its first 19 characters
## (e.g. 'TCGA-C4-A0F6-01A-11R-A10V-13' -> 'TCGA-C4-A0F6-01A-11')
microrna_normalized_renamed = microrna_normalized.rename(columns=lambda barcode: barcode[:19])

In [19]:
## Preview the miRNA table after the column labels were truncated
microrna_normalized_renamed.head(n=5)

Unnamed: 0_level_0,TCGA-C4-A0F6-01A-11,TCGA-CU-A0YO-01A-11,TCGA-BT-A0S7-01A-11,TCGA-CU-A0YR-01A-12,TCGA-BL-A0C8-01A-11,TCGA-C4-A0F0-01A-12,TCGA-BL-A13J-01A-11,TCGA-BT-A0YX-01A-11,TCGA-CU-A0YN-01A-21,TCGA-CU-A0YR-11A-13,...,TCGA-AG-A020-01A-21,TCGA-AG-A01Y-01A-41,TCGA-AG-A01W-01A-21,TCGA-AG-3726-01A-02,TCGA-AG-3605-01A-01,TCGA-AG-3584-01A-01,TCGA-AG-3599-01A-02,TCGA-AG-3583-01A-01,TCGA-AG-3598-01A-01,TCGA-AG-3586-01A-02
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
hsa-let-7a-2-3p,-0.096518,-0.059309,-0.017156,-0.100776,-0.096693,-0.034009,-0.034,-0.083818,-0.074024,-0.027468,...,-0.119897,-0.06957,-0.079228,-0.087972,-0.103827,-0.02011,-0.046386,-0.084806,-0.048916,-0.064222
hsa-let-7a-3p,-0.074348,0.366087,0.23445,-0.249528,-0.025498,-0.018863,0.140189,-0.028565,0.065308,-0.108119,...,0.555727,0.575986,0.209247,-0.051397,0.143354,0.097874,0.26447,0.126195,0.202228,0.163525
hsa-let-7a-5p,-0.807536,-0.178267,0.650867,-0.529122,0.097222,0.247807,0.150286,0.043417,-0.088568,0.393956,...,0.297262,0.688233,0.162399,-0.084191,0.595498,0.370962,0.231508,0.240959,0.472725,0.039017
hsa-let-7b-3p,-0.31152,-0.191564,0.054194,-0.273958,-0.165673,-0.14112,-0.111053,-0.22357,0.014016,-0.150772,...,0.074421,-0.154328,-0.111632,0.012789,-0.101535,0.071444,0.257463,0.180531,0.443909,-0.035733
hsa-let-7b-5p,-0.979621,-0.64275,0.805858,-1.173967,-0.93755,0.11368,-0.265482,-0.45267,-0.445293,0.268768,...,-0.862435,-0.647105,-0.543768,-0.900613,-0.258135,-0.433047,-0.144974,0.217156,-0.082366,-0.066606


In [6]:
## Sample barcodes that occur as column labels in both the mRNA and the
## miRNA tables (set intersection of the two column sets)
matched_sample_barcodes = set(mrna_normalized_renamed.columns) & set(microrna_normalized_renamed.columns)
len(matched_sample_barcodes)  ## 10468 appear in both datasets

10468

In [None]:
## Keep only cancer (tumor) samples.
## barcode[13:15] is the two-digit sample-type code of the TCGA barcode;
## per https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/ tumor
## types are 01-09, so every barcode whose code is < 10 is kept.
## The result is sorted because matched_sample_barcodes is a set: iterating
## it directly yields an order that can change between kernel sessions,
## which would make the column order of everything filtered by this list
## (and of the CSV files written from it) non-reproducible.
cancer_subjects = sorted(
    barcode for barcode in matched_sample_barcodes if int(barcode[13:15]) < 10
)
## Display the count rather than the full list of barcodes
len(cancer_subjects)  ## 9858 cancer subjects

In [8]:
## Restrict the miRNA table to the columns named in cancer_subjects
## (the matched cancer-sample barcodes), then report the resulting shape
microrna_selected = microrna_normalized_renamed.filter(items=cancer_subjects, axis='columns')
microrna_selected.shape

(662, 9858)

In [12]:
## Restrict the mRNA table to the same cancer_subjects columns used for
## the miRNA table, then report the resulting shape
mrna_selected = mrna_normalized_renamed.filter(items=cancer_subjects, axis='columns')
mrna_selected.shape  # (16335, 9858)

In [14]:
## Write both filtered tables (mRNA first, then miRNA) to CSV files
mrna_selected.to_csv(
    path_or_buf='/home/isu/miRNA_project/normalized_data/mrna_selected.csv'
)
microrna_selected.to_csv(
    path_or_buf='/home/isu/miRNA_project/normalized_data/microrna_selected.csv'
)

In [15]:
## Re-display the (rows, columns) shape of the filtered mRNA table
mrna_selected.shape

(16335, 9858)