In [2]:
import pandas as pd
import numpy as np
import gzip
import os
from sklearn.preprocessing import Imputer,normalize

## Get the case ID and the stage / tumor stage information for each case

In [3]:
def find_dir(dir_name):
    """Recursively list every file found below ``dir_name``.

    Parameters
    ----------
    dir_name : str
        Root directory passed to ``os.walk``.

    Returns
    -------
    list of str
        One path per file, formed by joining the directory that contains
        the file with the file name, in ``os.walk`` traversal order.
    """
    return [
        os.path.join(parent, entry)
        for parent, _subdirs, entries in os.walk(dir_name)
        for entry in entries
    ]

def load_zip(file_list,index):
    """Read one gzip-compressed CSV file into a DataFrame.

    Parameters
    ----------
    file_list : sequence of str
        Paths of gzip-compressed CSV files.
    index : int
        Position in ``file_list`` of the file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents, parsed without a header row.
    """
    archive_path = file_list[index]
    with gzip.open(archive_path) as handle:
        return pd.read_csv(handle, header=None)

In [4]:
# Data directories, expressed relative to the current working directory.
miRNA = '{}/miRNA'.format(os.getcwd())
DNA_methylation = '{}/DNA methylation'.format(os.getcwd())

In [5]:
# Load the tab-separated clinical table from the working directory.
# The variable name keeps its original spelling because later cells refer to it.
clincial_list = pd.read_csv('clinical.tsv',sep = '\t')

In [6]:
# Pull the clinical columns used by the following cells out of the table.
case_id_temp, stage, tumor_stage, file_id = (
    clincial_list[column_name]
    for column_name in (
        'case_submitter_id',
        'ajcc_pathologic_stage',
        'tumor_stage',
        'case_id',
    )
)

In [7]:
# Table that relates each gene-expression file name to its case ID.
Brca_case_file_relation = pd.read_csv('BRCA_Gene_expression.tsv', sep='\t')
ge_file_name, ge_cae_id = (
    Brca_case_file_relation[column_name]
    for column_name in ('File Name', 'Case ID')
)

## Filter out the case IDs that are missing either the stage or the tumor stage

In [8]:
# Case IDs of patients that have both stage and tumor stage information.
# Only even-numbered rows of the clinical table are inspected.
# NOTE(review): presumably each case occupies two consecutive rows in
# clinical.tsv, so every other row is skipped — confirm against the file.
case_id = np.array([
    submitter_id
    for row, (submitter_id, pathologic_stage, reported_tumor_stage)
    in enumerate(zip(case_id_temp, stage, tumor_stage))
    if row % 2 == 0
    and pathologic_stage != "'--"
    and reported_tumor_stage != 'not reported'
])

# Preprocessing Gene expression data

In [9]:
# Collect every file below the gene-expression download directory.
gene_expression_dict_string = str(os.getcwd())+'/gene expression'
gene_expression_dict_info = find_dir(gene_expression_dict_string)
gene_expression_data = []       # second column of each kept file
gene_expression_case_ID = []    # case ID of each kept file, in the same order
gene_expression_empty = []      # first column of each kept file

# Hash-based membership test instead of scanning the case_id array for
# every manifest row.
selected_case_ids = set(case_id)

# NOTE(review): the first two entries returned by find_dir are skipped, as
# in the original code. os.walk does not promise a particular ordering, so
# confirm which files this is meant to exclude.
for file_path in gene_expression_dict_info[2:]:
    # os.path.basename honours the platform's path separator, whereas
    # splitting on "/" never isolates the file name on Windows.
    file_id = os.path.basename(file_path)

    for manifest_file, manifest_case in zip(ge_file_name, ge_cae_id):
        if manifest_file != file_id or manifest_case not in selected_case_ids:
            continue

        with gzip.open(file_path) as f:
            features_train = pd.read_csv(f,sep = '\t',header = None)

        # Convert the frame once and slice both columns from the same array.
        # The case ID is recorded only after a successful read so the three
        # lists always stay the same length.
        table = np.array(features_train)
        gene_expression_case_ID.append(manifest_case)
        gene_expression_empty.append(table[:,0])
        gene_expression_data.append(table[:,1])
                

In [10]:
# Stack the per-file value columns into one frame: one row per loaded file.
gene_expression_data_matrix = pd.DataFrame(np.array(gene_expression_data))
# Use the first column of the first loaded file as the gene labels.
# NOTE(review): this presumably relies on every file listing the same genes
# in the same order — confirm.
gene_index = gene_expression_empty[0]

In [11]:
# Column labels of the matrix before relabelling (not used again in this section).
rename_column = np.array([column for column in gene_expression_data_matrix])
# Label the columns with the gene identifiers, then transpose so that rows
# are genes and columns are samples, and label the samples with their case IDs.
# NOTE(review): this cell reassigns gene_expression_data_matrix to its own
# transpose, so it is only meant to run once after the matrix is built.
gene_expression_data_matrix.columns = list(gene_index)
gene_expression_data_matrix = gene_expression_data_matrix.T
gene_expression_data_matrix.columns = gene_expression_case_ID

## Handle zero values as follows:
1. a sample is removed when it is missing 20% of the genes
2. a gene point is removed when more than 20% of its values are zero

In [12]:
# Fraction of samples in which each gene has a zero expression value.
# gene_point_rate_temp has one row per sample and one column per gene.
gene_point_rate_temp = np.array(gene_expression_data)
sample_count = len(gene_point_rate_temp)
if sample_count == 0:
    raise ValueError('no gene expression samples were loaded')

# Divide by the real number of samples rather than a hard-coded total, so the
# rate stays correct whenever the set of loaded files changes.
zero_counts = sample_count - np.count_nonzero(gene_point_rate_temp, axis=0)
gene_point_rate_list = (zero_counts / sample_count).tolist()

# Genes whose zero fraction reaches the threshold are dropped later.
zero_rate_threshold = 0.2
drop_gene_point_list = [
    gene
    for gene, rate in zip(gene_index, gene_point_rate_list)
    if rate >= zero_rate_threshold
]

In [13]:
# copy_data is another name for the same frame, not an independent copy;
# drop() returns a new frame and leaves the original untouched.
copy_data = gene_expression_data_matrix
gene_expression_data_with_gene_point_delete = copy_data.drop(
    labels=drop_gene_point_list,
    axis=0,
)
# Row labels that remain after the drop.
gene_left = gene_expression_data_with_gene_point_delete.index

In [14]:
# Treat zeros as missing values and replace them with a mean.
# NOTE(review): Imputer is not available in newer scikit-learn releases
# (SimpleImputer replaced it) — confirm the pinned scikit-learn version.
# NOTE(review): with axis = 0 the mean is presumably taken per column, which
# here means per sample, not per gene — confirm this is intended.
imputer = Imputer(missing_values = 0,strategy = "mean",axis = 0)

# Fit on the filtered matrix and impute it in one step.
imputed_data_temp = imputer.fit_transform(gene_expression_data_with_gene_point_delete)

# L2-normalise the imputed values; normalize's axis argument is left at its default.
imputed_data = normalize(imputed_data_temp,norm='l2')



In [15]:
# Wrap the imputed, normalised array in a DataFrame (default integer labels).
gene_imputation_matrix = pd.DataFrame(np.array(imputed_data))
# Same value that an earlier cell already assigned to gene_index.
gene_index = gene_expression_empty[0]

In [16]:
# Column labels as they are when this cell starts.
rename_column_impute = np.array(list(gene_imputation_matrix.columns))
# Rows take the labels of the genes that survived filtering, columns the case IDs.
gene_imputation_matrix.index = gene_left
gene_imputation_matrix.columns = gene_expression_case_ID

In [17]:
# Save the preprocessed gene-expression matrix; the row labels are written as the first column.
gene_imputation_matrix.to_csv('Breast_Gene_expression.csv')

In [18]:
# Display the imputed matrix (rows: genes kept after filtering, columns: case IDs).
gene_imputation_matrix

Unnamed: 0,TCGA-AR-A1AP,TCGA-A2-A0CS,11BR003,TCGA-BH-A0HK,TCGA-BH-A1FM,11BR014,TCGA-LL-A6FQ,TCGA-E9-A1R6,TCGA-LL-A73Y,TCGA-BH-A28O,...,TCGA-B6-A0IK,TCGA-A2-A1G4,TCGA-AR-A2LQ,TCGA-AC-A62Y,TCGA-A8-A0A4,TCGA-D8-A27G,TCGA-A2-A1FX,TCGA-WT-AB44,TCGA-BH-A0HB,TCGA-AC-A3TN
ENSG00000167578.15,0.006705,0.013605,0.067247,0.009030,0.003361,0.060010,0.025642,0.008649,0.016296,0.006330,...,0.006513,0.002170,0.004965,0.011212,0.004087,0.004934,0.007927,0.013738,0.009650,0.025321
ENSG00000078237.5,0.027102,0.030692,0.051421,0.018811,0.021246,0.012746,0.022559,0.015100,0.021448,0.011036,...,0.036194,0.026055,0.019431,0.044875,0.011749,0.029752,0.020081,0.025219,0.021252,0.013480
ENSG00000146083.10,0.028583,0.024017,0.013661,0.030151,0.016606,0.006023,0.012314,0.037588,0.033382,0.030918,...,0.032121,0.029348,0.018201,0.015818,0.022110,0.028060,0.026561,0.013772,0.029844,0.016466
ENSG00000158486.12,0.001127,0.000394,0.068247,0.000187,0.000137,0.075657,0.000007,0.000032,0.000295,0.000084,...,0.000330,0.000012,0.000058,0.000040,0.000168,0.000030,0.000022,0.000012,0.000159,0.000016
ENSG00000198242.12,0.012768,0.024633,0.000858,0.039648,0.019159,0.000476,0.063058,0.020041,0.013669,0.021569,...,0.012719,0.029458,0.016705,0.011510,0.020981,0.016780,0.020744,0.024381,0.104795,0.028172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000135094.9,0.032403,0.007385,0.041262,0.006358,0.002850,0.034194,0.018109,0.012020,0.002737,0.001745,...,0.005428,0.010249,0.007275,0.049104,0.002729,0.020786,0.025842,0.039771,0.024168,0.006249
ENSG00000009694.12,0.000261,0.000569,0.095505,0.000490,0.001061,0.079146,0.000053,0.000051,0.001641,0.002121,...,0.000328,0.000851,0.001495,0.000541,0.001430,0.001333,0.000218,0.000149,0.157066,0.000026
ENSG00000105063.17,0.036217,0.029195,0.018152,0.024450,0.014186,0.015042,0.031663,0.032152,0.023897,0.020440,...,0.037829,0.026983,0.022404,0.018013,0.026310,0.028775,0.023423,0.009507,0.030372,0.021847
ENSG00000231119.2,0.002477,0.001803,0.090163,0.001099,0.001441,0.074719,0.002239,0.002415,0.000552,0.001444,...,0.003308,0.005415,0.001677,0.001867,0.002076,0.001121,0.000996,0.001001,0.002259,0.004014


# Finish the preprocessing of gene expression data

In [19]:
# test
# Sanity check: reload the exported CSV and restore the gene identifiers as the index.
df = pd.read_csv('Breast_Gene_expression.csv')
# The row labels come back as an ordinary column named 'Unnamed: 0'.
gene_site = list(df['Unnamed: 0'])
# Set the index directly; the original chained assignment also bound the
# result name to this plain list for one statement.
df.index = gene_site
gene_expression_after_preprocessing = df.drop(columns = ['Unnamed: 0'])

In [20]:
# Display the matrix reloaded from 'Breast_Gene_expression.csv'.
gene_expression_after_preprocessing

Unnamed: 0,TCGA-AR-A1AP,TCGA-A2-A0CS,11BR003,TCGA-BH-A0HK,TCGA-BH-A1FM,11BR014,TCGA-LL-A6FQ,TCGA-E9-A1R6,TCGA-LL-A73Y,TCGA-BH-A28O,...,TCGA-B6-A0IK,TCGA-A2-A1G4,TCGA-AR-A2LQ,TCGA-AC-A62Y,TCGA-A8-A0A4,TCGA-D8-A27G,TCGA-A2-A1FX,TCGA-WT-AB44,TCGA-BH-A0HB,TCGA-AC-A3TN
ENSG00000167578.15,0.006705,0.013605,0.067247,0.009030,0.003361,0.060010,0.025642,0.008649,0.016296,0.006330,...,0.006513,0.002170,0.004965,0.011212,0.004087,0.004934,0.007927,0.013738,0.009650,0.025321
ENSG00000078237.5,0.027102,0.030692,0.051421,0.018811,0.021246,0.012746,0.022559,0.015100,0.021448,0.011036,...,0.036194,0.026055,0.019431,0.044875,0.011749,0.029752,0.020081,0.025219,0.021252,0.013480
ENSG00000146083.10,0.028583,0.024017,0.013661,0.030151,0.016606,0.006023,0.012314,0.037588,0.033382,0.030918,...,0.032121,0.029348,0.018201,0.015818,0.022110,0.028060,0.026561,0.013772,0.029844,0.016466
ENSG00000158486.12,0.001127,0.000394,0.068247,0.000187,0.000137,0.075657,0.000007,0.000032,0.000295,0.000084,...,0.000330,0.000012,0.000058,0.000040,0.000168,0.000030,0.000022,0.000012,0.000159,0.000016
ENSG00000198242.12,0.012768,0.024633,0.000858,0.039648,0.019159,0.000476,0.063058,0.020041,0.013669,0.021569,...,0.012719,0.029458,0.016705,0.011510,0.020981,0.016780,0.020744,0.024381,0.104795,0.028172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000135094.9,0.032403,0.007385,0.041262,0.006358,0.002850,0.034194,0.018109,0.012020,0.002737,0.001745,...,0.005428,0.010249,0.007275,0.049104,0.002729,0.020786,0.025842,0.039771,0.024168,0.006249
ENSG00000009694.12,0.000261,0.000569,0.095505,0.000490,0.001061,0.079146,0.000053,0.000051,0.001641,0.002121,...,0.000328,0.000851,0.001495,0.000541,0.001430,0.001333,0.000218,0.000149,0.157066,0.000026
ENSG00000105063.17,0.036217,0.029195,0.018152,0.024450,0.014186,0.015042,0.031663,0.032152,0.023897,0.020440,...,0.037829,0.026983,0.022404,0.018013,0.026310,0.028775,0.023423,0.009507,0.030372,0.021847
ENSG00000231119.2,0.002477,0.001803,0.090163,0.001099,0.001441,0.074719,0.002239,0.002415,0.000552,0.001444,...,0.003308,0.005415,0.001677,0.001867,0.002076,0.001121,0.000996,0.001001,0.002259,0.004014


# Preprocessing DNA methylation data

In [21]:
# Disabled draft: get the case IDs from the methylation file names (the cell is entirely commented out)
#methylation_dict_string = str(os.getcwd())+'/DNA methylation'
#methylation_dict_info = find_dir(methylation_dict_string)
#methylation_data_temp = []

#for i in range(2,len(methylation_dict_info)):
#    methylation_case_id_temp = methylation_dict_info[i]
#    methylation_case_id_temp_old = methylation_case_id_temp.split('/')
#    check = methylation_case_id_temp_old[-1]
    
#    if len(check) > 30:
#        check_txt = check.split('.')
#        if check_txt[-1] == 'txt':
#            methylation_data_temp.append(check_txt[-3][0:12])

In [44]:
import time

# Collect every file below the methylation download directory.
methylation_dict_string = str(os.getcwd())+'/DNA methylation'
methylation_dict_info = find_dir(methylation_dict_string)
methylation_data = []      # one Beta_value column per kept file
methylation_case_ID = []   # sample identifier of each kept file, in the same order

# Keep everything except files whose parent directory is named 'logs'.
# os.path is used instead of splitting on '/' so that the check also works
# with the platform's native path separator.
methylation_file_temp = [
    path
    for path in methylation_dict_info
    if os.path.basename(os.path.dirname(path)) != 'logs'
]

# Hash-based membership test instead of scanning the case_id array per file.
selected_case_ids = set(case_id)

time_start=time.time()
for file_path in methylation_file_temp:
    file_name = os.path.basename(file_path)

    # Only file names longer than 50 characters are considered.
    if len(file_name) <= 50:
        continue

    # Sample identifier: the first 12 characters of the third dot-separated
    # field counted from the end of the file name.
    methylation_sample_id_temp = file_name.split('.')[-3][0:12]
    if methylation_sample_id_temp not in selected_case_ids:
        continue

    features_train = pd.read_csv(file_path,sep = '\t')
    beta_values = features_train['Beta_value']
    # Keep only files with more than 450000 rows.
    if len(beta_values) > 450000:
        methylation_data.append(np.array(beta_values,dtype='object'))
        methylation_case_ID.append(methylation_sample_id_temp)

time_end=time.time()
print('time cost',(time_end-time_start)/60,'min')

time cost 34.863402616977694 min


In [45]:
# Stack the per-file Beta_value arrays into one frame: one row per loaded file.
methylation_data_matrix = pd.DataFrame(np.array(methylation_data))

In [46]:
# Read the probe identifiers from the 'Composite Element REF' column of one file.
# NOTE(review): the file index 3 is hard-coded; this presumably assumes that
# file lists the same probes, in the same order, as every loaded sample — confirm.
methylation_index_temp = pd.read_csv(methylation_file_temp[3],sep = '\t')
methylation_index = np.array(methylation_index_temp['Composite Element REF'])

In [52]:
# Transpose so that each loaded file becomes a column.
methylation_data_table = methylation_data_matrix.T

In [60]:
# Label rows with the probe identifiers and columns with the sample identifiers.
methylation_data_table.index = list(methylation_index)
methylation_data_table.columns = methylation_case_ID
# Discard every row that holds at least one missing value.
methylation_data_without_NaN_table = methylation_data_table.dropna(axis=0, how='any')

In [63]:
# Display the labelled methylation table, which still contains rows with missing values.
methylation_data_table

Unnamed: 0,TCGA-E9-A1N6,TCGA-AR-A0TR,TCGA-A7-A13E,TCGA-D8-A3Z6,TCGA-BH-A0BJ,TCGA-E2-A1BC,TCGA-JL-A3YX,TCGA-AN-A0XW,TCGA-D8-A1JJ,TCGA-AO-A128,...,TCGA-V7-A7HQ,TCGA-BH-A0AU,TCGA-C8-A3M8,TCGA-AO-A0JD,TCGA-A2-A1FX,TCGA-AO-A0JF,TCGA-BH-A1EN,TCGA-LL-A50Y,TCGA-A2-A0YI,TCGA-C8-A1HN
cg00000029,0.0974702,0.0902712,0.133297,0.162573,0.150082,0.285877,0.137249,0.172926,0.0567535,0.506672,...,0.119453,0.115619,0.136442,0.156686,0.148039,0.26707,0.194329,0.260403,0.331892,0.106938
cg00000108,,,,,,,,,,,...,,,,,,,,,,
cg00000109,,,,,,,,,,,...,,,,,,,,,,
cg00000165,0.568963,0.624962,0.233786,0.463615,0.187375,0.146606,0.346276,0.224147,0.340406,0.193579,...,0.139335,0.1889,0.747882,0.150829,0.176492,0.163322,0.147221,0.110181,0.656204,0.781729
cg00000236,0.923342,0.924496,0.83802,0.924401,0.911773,0.803059,0.940538,0.910029,0.882604,0.92428,...,0.904255,0.851658,0.931252,0.927573,0.908795,0.905158,0.885305,0.837084,0.933008,0.887352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rs9363764,0.0312758,0.0983058,0.954433,0.961257,0.582206,0.956914,0.961128,0.386153,0.532215,0.563152,...,0.0609025,0.553691,0.9586,0.60373,0.276243,0.0615775,0.547516,0.891497,0.557438,0.934054
rs939290,0.972134,0.964097,0.536283,0.0270323,0.540835,0.530344,0.642707,0.0222345,0.979866,0.350112,...,0.556301,0.0269432,0.525623,0.971941,0.520719,0.556676,0.967848,0.561092,0.539758,0.474704
rs951295,0.479726,0.976124,0.0509582,0.0512361,0.542903,0.550087,0.696237,0.0283013,0.0327521,0.474218,...,0.524953,0.525841,0.537211,0.58857,0.0382917,0.0425764,0.969587,0.515872,0.568855,0.0399705
rs966367,0.489239,0.528066,0.528919,0.938073,0.0636314,0.942212,0.0376583,0.504337,0.441601,0.917273,...,0.947476,0.946737,0.505206,0.81056,0.927144,0.495318,0.476972,0.853295,0.0494109,0.386131


In [64]:
# Display the methylation table after rows with missing values were dropped.
methylation_data_without_NaN_table

Unnamed: 0,TCGA-E9-A1N6,TCGA-AR-A0TR,TCGA-A7-A13E,TCGA-D8-A3Z6,TCGA-BH-A0BJ,TCGA-E2-A1BC,TCGA-JL-A3YX,TCGA-AN-A0XW,TCGA-D8-A1JJ,TCGA-AO-A128,...,TCGA-V7-A7HQ,TCGA-BH-A0AU,TCGA-C8-A3M8,TCGA-AO-A0JD,TCGA-A2-A1FX,TCGA-AO-A0JF,TCGA-BH-A1EN,TCGA-LL-A50Y,TCGA-A2-A0YI,TCGA-C8-A1HN
cg00000029,0.0974702,0.0902712,0.133297,0.162573,0.150082,0.285877,0.137249,0.172926,0.0567535,0.506672,...,0.119453,0.115619,0.136442,0.156686,0.148039,0.26707,0.194329,0.260403,0.331892,0.106938
cg00000165,0.568963,0.624962,0.233786,0.463615,0.187375,0.146606,0.346276,0.224147,0.340406,0.193579,...,0.139335,0.1889,0.747882,0.150829,0.176492,0.163322,0.147221,0.110181,0.656204,0.781729
cg00000236,0.923342,0.924496,0.83802,0.924401,0.911773,0.803059,0.940538,0.910029,0.882604,0.92428,...,0.904255,0.851658,0.931252,0.927573,0.908795,0.905158,0.885305,0.837084,0.933008,0.887352
cg00000289,0.727423,0.51554,0.678618,0.754645,0.547061,0.656906,0.612436,0.642086,0.661218,0.675941,...,0.707438,0.710583,0.802442,0.617008,0.78397,0.563152,0.722705,0.674279,0.620495,0.507343
cg00000292,0.739543,0.599793,0.718535,0.567382,0.720612,0.555534,0.672357,0.812197,0.480822,0.707976,...,0.692709,0.700905,0.579886,0.878588,0.66808,0.708174,0.634605,0.648683,0.858341,0.587258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rs9363764,0.0312758,0.0983058,0.954433,0.961257,0.582206,0.956914,0.961128,0.386153,0.532215,0.563152,...,0.0609025,0.553691,0.9586,0.60373,0.276243,0.0615775,0.547516,0.891497,0.557438,0.934054
rs939290,0.972134,0.964097,0.536283,0.0270323,0.540835,0.530344,0.642707,0.0222345,0.979866,0.350112,...,0.556301,0.0269432,0.525623,0.971941,0.520719,0.556676,0.967848,0.561092,0.539758,0.474704
rs951295,0.479726,0.976124,0.0509582,0.0512361,0.542903,0.550087,0.696237,0.0283013,0.0327521,0.474218,...,0.524953,0.525841,0.537211,0.58857,0.0382917,0.0425764,0.969587,0.515872,0.568855,0.0399705
rs966367,0.489239,0.528066,0.528919,0.938073,0.0636314,0.942212,0.0376583,0.504337,0.441601,0.917273,...,0.947476,0.946737,0.505206,0.81056,0.927144,0.495318,0.476972,0.853295,0.0494109,0.386131


In [62]:
# Save the NaN-free methylation table; the row labels are written as the first column.
methylation_data_without_NaN_table.to_csv('Breast_DNA_methylation.csv')

# miRNA data preprocessing

# Protein data preprocessing

In [None]:
# Case IDs kept by the clinical filter, and the protein data directory.
x = np.array(case_id)
y = '{}/protein'.format(os.getcwd())

# Read the first file found under the protein directory.
dic = find_dir(y)
data = pd.read_csv(dic[0])
sample_id_temp = np.array(data['Sample_ID'])

# Shorten each sample ID to its first three dash-separated fields.
sample_id = np.array([
    '-'.join((fields[0], fields[1], fields[2]))
    for fields in (raw_id.split('-') for raw_id in sample_id_temp)
])

In [None]:
# Count, and print, the matches between case IDs and protein sample IDs.
# Only even positions of x are compared, exactly as in the original loop.
# NOTE(review): x presumably already went through an every-other-row filter
# when case_id was built — confirm that skipping odd positions here is intended.
flag = 0
for i in range(0, len(x), 2):
    for candidate in sample_id:
        if x[i] == candidate:
            print(i,x[i])
            flag += 1
flag