In [None]:
import json, os
import shutil

def preprocess_tcga_data(json_file, source_dirctory, target_dirctory):
    # json_file : 代表下载文件的描述信息,用于清洗数据信息
    # source_dirctory: 代表文件目录
    # target_dirctory: 目标目录
    # 本函数的目的是将下载下来的数据清洗处理成 submit_id.count.gz类型的文件,后续进一步处理
    if not json_file or json_file == '':
        return False
    if not os.path.exists(source_dirctory):
        return False
    if not os.path.exists(target_dirctory):
        os.makedirs(target_dirctory)
    
    # read json_file
    with open(json_file, 'r') as jf:
        json_data = jf.read()
    
    json_info = json.loads(json_data)
    
    # 遍历json中的file，开始修改并且copy文件到target_dirctory
    for file_info in json_info:
        file_id = file_info['file_id']
        file_name = file_info['file_name']
        submit_id = file_info['associated_entities'][0]['entity_submitter_id']
        source_file = source_dirctory + '/' + file_id + '/' + file_name
        target_file = target_dirctory + '/' + submit_id + '.gz'
        shutil.copy(source_file, target_file)
        print('file_name :{} copy done'.format(file_name))
        
        
        

In [None]:
preprocess_tcga_data('./metadata.cart.2021-02-25 -正常.json', './tcga_luad_normal', './tcga_data')

In [None]:
preprocess_tcga_data('./metadata.cart.2021-02-25-患病.json', './tcga_luad', './tcga_data')

## 解压gz文件

```
gunzip ./*
```

In [None]:
import pandas as pd

GFF3 = pd.read_csv(
    filepath_or_buffer='Homo_sapiens.GRCh38.103.gtf', 
    sep='\t', 
    header=None,
    names=['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'],
    skiprows=[i for i in range(5)])

GFF3 = GFF3[GFF3['source'].notnull()]

GFF3['attributes'].head()


In [None]:
#a_list = GFF3[GFF3['type']=='gene']['attributes'].to_list()

GFF3 = GFF3[GFF3['type']=='gene']

GFF3['gene_id'] = GFF3.apply(lambda x : x.attributes.split(';')[0].strip().split(' ')[1].strip('"'), axis=1)  
GFF3['gene_name'] = GFF3.apply(lambda x : x.attributes.split(';')[2].strip().split(' ')[1].strip('"'), axis=1)
GFF3['gene_biotype'] = GFF3.apply(lambda x : x.attributes.split(';')[4].strip().split(' ')[1].strip('"'), axis=1)



GFF3 = pd.DataFrame(GFF3, columns = ['gene_id', 'gene_name', 'gene_biotype'])


In [None]:
df = pd.read_csv('./tcga_data/TCGA-05-4244-01A-01R-1107-07', sep='\t', header=None, names=['gene_id', 'TCGA-05-4244-01A-01R-1107-07'])

df['gene_id'] = df.apply(lambda x : x.gene_id.split('.')[0], axis = 1)

x = pd.merge(GFF3, df, on = 'gene_id')

x1 = x[x['gene_biotype'] == 'protein_coding']
x2 = x1.reset_index()
x3 = x2.set_index('gene_name', drop=True)



In [None]:
# 这个函数读取所有的数据，放在一起按照上面的数据进行merge
import os
def get_all_info_matrix(target_df):
    # target_df 是GFF文件，用来做merge df使用
    info_files = os.listdir('./tcga_data')
    if len(info_files) == 0:
        return None
    for file in info_files:
        df = pd.read_csv('./tcga_data' + '/' + file, sep='\t', header=None, names=['gene_id', file])
        df['gene_id'] = df.apply(lambda x : x.gene_id.split('.')[0], axis = 1)
        target_df = pd.merge(target_df, df, on = 'gene_id')
        print('file:{} merge done'.format(file))
    return target_df

        

In [None]:
merge_info = get_all_info_matrix(GFF3)

In [9]:
merge_info.to_csv('tcga_matrix.csv')

In [12]:
final_df = pd.read_csv('tcga_matrix.csv')
final_df1 = final_df[final_df['gene_biotype'] == 'protein_coding']


Unnamed: 0.1,Unnamed: 0,gene_id,gene_name,gene_biotype,TCGA-44-7667-01A-31R-2066-07,TCGA-97-7941-01A-11R-2187-07,TCGA-86-8074-01A-11R-2241-07,TCGA-55-7576-01A-11R-2066-07,TCGA-86-8674-01A-21R-2403-07,TCGA-49-AARO-01A-12R-A41B-07,...,TCGA-44-2662-11A-01R-1758-07,TCGA-55-A4DF-01A-11R-A24H-07,TCGA-86-A4P8-01A-11R-A24X-07,TCGA-J2-A4AD-01A-11R-A24H-07,TCGA-69-8255-01A-11R-2287-07,TCGA-69-7760-01A-11R-2170-07,TCGA-73-4676-11A-01R-1755-07,TCGA-55-6983-11A-01R-1949-07,TCGA-44-6775-01C-02R-A277-07,TCGA-44-6147-01B-06R-A277-07
7,7,ENSG00000186092,OR4F5,protein_coding,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
54,54,ENSG00000187634,SAMD11,protein_coding,35,625,179,196,301,218,...,446,41,156,876,437,47,22,24,15,27
55,55,ENSG00000188976,NOC2L,protein_coding,16084,5881,9911,11996,5357,3088,...,3174,1806,3404,6499,14527,2573,1206,1375,280,130
56,56,ENSG00000187961,KLHL17,protein_coding,313,438,892,686,469,891,...,214,241,740,1338,1407,142,59,86,16,42
57,57,ENSG00000187583,PLEKHN1,protein_coding,53,416,1114,1305,440,1088,...,44,67,113,866,668,123,12,23,15,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56450,56450,ENSG00000212907,MT-ND4L,protein_coding,29971,44730,16282,84962,10274,45378,...,18476,23349,19634,54905,91286,78509,10099,11900,6032,12349
56451,56451,ENSG00000198886,MT-ND4,protein_coding,360079,663978,185281,969439,214707,476635,...,350121,401141,425740,938673,1111936,1097030,146480,184230,51803,157128
56455,56455,ENSG00000198786,MT-ND5,protein_coding,37765,347122,40732,214575,55271,209613,...,179892,202938,205506,216208,331056,357396,49412,107375,35478,84119
56456,56456,ENSG00000198695,MT-ND6,protein_coding,8494,132093,13172,66358,20450,72779,...,65500,88791,79448,87039,110402,152036,27192,46499,3437,6834


In [23]:
final_df1['Unnamed: 0']

f2 = final_df1.drop(['Unnamed: 0'], axis=1, inplace=False)

f3 = f2.reset_index()
f4 = f3.drop(['index'], axis=1, inplace=False)

Unnamed: 0,gene_id,gene_name,gene_biotype,TCGA-44-7667-01A-31R-2066-07,TCGA-97-7941-01A-11R-2187-07,TCGA-86-8074-01A-11R-2241-07,TCGA-55-7576-01A-11R-2066-07,TCGA-86-8674-01A-21R-2403-07,TCGA-49-AARO-01A-12R-A41B-07,TCGA-75-6205-01A-11R-1755-07,...,TCGA-44-2662-11A-01R-1758-07,TCGA-55-A4DF-01A-11R-A24H-07,TCGA-86-A4P8-01A-11R-A24X-07,TCGA-J2-A4AD-01A-11R-A24H-07,TCGA-69-8255-01A-11R-2287-07,TCGA-69-7760-01A-11R-2170-07,TCGA-73-4676-11A-01R-1755-07,TCGA-55-6983-11A-01R-1949-07,TCGA-44-6775-01C-02R-A277-07,TCGA-44-6147-01B-06R-A277-07
0,ENSG00000186092,OR4F5,protein_coding,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ENSG00000187634,SAMD11,protein_coding,35,625,179,196,301,218,26,...,446,41,156,876,437,47,22,24,15,27
2,ENSG00000188976,NOC2L,protein_coding,16084,5881,9911,11996,5357,3088,2710,...,3174,1806,3404,6499,14527,2573,1206,1375,280,130
3,ENSG00000187961,KLHL17,protein_coding,313,438,892,686,469,891,232,...,214,241,740,1338,1407,142,59,86,16,42
4,ENSG00000187583,PLEKHN1,protein_coding,53,416,1114,1305,440,1088,200,...,44,67,113,866,668,123,12,23,15,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19558,ENSG00000212907,MT-ND4L,protein_coding,29971,44730,16282,84962,10274,45378,5228,...,18476,23349,19634,54905,91286,78509,10099,11900,6032,12349
19559,ENSG00000198886,MT-ND4,protein_coding,360079,663978,185281,969439,214707,476635,69624,...,350121,401141,425740,938673,1111936,1097030,146480,184230,51803,157128
19560,ENSG00000198786,MT-ND5,protein_coding,37765,347122,40732,214575,55271,209613,16856,...,179892,202938,205506,216208,331056,357396,49412,107375,35478,84119
19561,ENSG00000198695,MT-ND6,protein_coding,8494,132093,13172,66358,20450,72779,5540,...,65500,88791,79448,87039,110402,152036,27192,46499,3437,6834


In [24]:
f4.to_csv('tcga_protein.csv')

In [27]:
f5 = f4.drop(['gene_id', 'gene_biotype'], axis = 1, inplace = False)
f5.to_csv('tcga_pure.csv')

In [55]:
f6 = f5.set_index('gene_name')
f7 = f6.T

f7.to_csv('tcga_train.csv')

f7.drop(['OR4F5'], axis=1, inplace = True)

f7['name'] = f7.index

f7['label'] = f7.apply(lambda x : int('11' in x.name.split('-')[3]), axis=1)

f7


gene_name,SAMD11,NOC2L,KLHL17,PLEKHN1,PERM1,HES4,ISG15,AGRN,RNF223,C1orf159,...,MT-ATP6,MT-CO3,MT-ND3,MT-ND4L,MT-ND4,MT-ND5,MT-ND6,MT-CYB,name,label
TCGA-44-7667-01A-31R-2066-07,35,16084,313,53,20,255,6844,6234,63,621,...,101403,208608,108321,29971,360079,37765,8494,175939,TCGA-44-7667-01A-31R-2066-07,0
TCGA-97-7941-01A-11R-2187-07,625,5881,438,416,216,521,5658,27770,82,764,...,299710,423461,56691,44730,663978,347122,132093,318261,TCGA-97-7941-01A-11R-2187-07,0
TCGA-86-8074-01A-11R-2241-07,179,9911,892,1114,328,982,12036,74321,61,1257,...,103129,169534,43398,16282,185281,40732,13172,138475,TCGA-86-8074-01A-11R-2241-07,0
TCGA-55-7576-01A-11R-2066-07,196,11996,686,1305,536,374,19128,49490,525,838,...,327333,401299,318401,84962,969439,214575,66358,583865,TCGA-55-7576-01A-11R-2066-07,0
TCGA-86-8674-01A-21R-2403-07,301,5357,469,440,241,275,1072,21999,157,617,...,73469,127060,26566,10274,214707,55271,20450,102491,TCGA-86-8674-01A-21R-2403-07,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-69-7760-01A-11R-2170-07,47,2573,142,123,67,15,1165,3574,1,252,...,574107,658434,81687,78509,1097030,357396,152036,712497,TCGA-69-7760-01A-11R-2170-07,0
TCGA-73-4676-11A-01R-1755-07,22,1206,59,12,11,86,1456,4394,26,123,...,59102,122024,38753,10099,146480,49412,27192,85277,TCGA-73-4676-11A-01R-1755-07,1
TCGA-55-6983-11A-01R-1949-07,24,1375,86,23,6,28,455,6403,32,187,...,63581,115012,34336,11900,184230,107375,46499,104199,TCGA-55-6983-11A-01R-1949-07,1
TCGA-44-6775-01C-02R-A277-07,15,280,16,15,7,2,12,469,10,60,...,16892,57067,4392,6032,51803,35478,3437,28407,TCGA-44-6775-01C-02R-A277-07,0


In [56]:
f7.to_csv('final_train.csv')

In [4]:
import pandas as pd

df = pd.read_csv('final_train.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,SAMD11,NOC2L,KLHL17,PLEKHN1,PERM1,HES4,ISG15,AGRN,RNF223,...,MT-ATP6,MT-CO3,MT-ND3,MT-ND4L,MT-ND4,MT-ND5,MT-ND6,MT-CYB,name,label
0,TCGA-44-7667-01A-31R-2066-07,35,16084,313,53,20,255,6844,6234,63,...,101403,208608,108321,29971,360079,37765,8494,175939,TCGA-44-7667-01A-31R-2066-07,0
1,TCGA-97-7941-01A-11R-2187-07,625,5881,438,416,216,521,5658,27770,82,...,299710,423461,56691,44730,663978,347122,132093,318261,TCGA-97-7941-01A-11R-2187-07,0
2,TCGA-86-8074-01A-11R-2241-07,179,9911,892,1114,328,982,12036,74321,61,...,103129,169534,43398,16282,185281,40732,13172,138475,TCGA-86-8074-01A-11R-2241-07,0
3,TCGA-55-7576-01A-11R-2066-07,196,11996,686,1305,536,374,19128,49490,525,...,327333,401299,318401,84962,969439,214575,66358,583865,TCGA-55-7576-01A-11R-2066-07,0
4,TCGA-86-8674-01A-21R-2403-07,301,5357,469,440,241,275,1072,21999,157,...,73469,127060,26566,10274,214707,55271,20450,102491,TCGA-86-8674-01A-21R-2403-07,0


In [8]:
df['label'].size

592

In [11]:
df[df['label'] == 0]['label'].size

533