<a href="https://colab.research.google.com/github/hiraksarkar/BioBombe/blob/master/Process_Data_As_Biobombe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Pan Cancer Data Processing

[Source Notebook](https://github.com/greenelab/BioBombe/blob/76ec84f72f8c40f55871f90d5d7f12bc8c647b94/0.expression-download/2A.process-pancanatlas-data.ipynb) 

In [0]:
import os
from urllib.request import urlretrieve

In [2]:
!wget http://api.gdc.cancer.gov/data/9a4679c3-855d-4055-8be9-3577ce10f66e -O EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2-v2.geneExp.tsv

--2020-05-22 21:35:12--  http://api.gdc.cancer.gov/data/9a4679c3-855d-4055-8be9-3577ce10f66e
Resolving api.gdc.cancer.gov (api.gdc.cancer.gov)... 192.170.230.246, 2605:9a00:10:4008::102
Connecting to api.gdc.cancer.gov (api.gdc.cancer.gov)|192.170.230.246|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://api.gdc.cancer.gov/data/9a4679c3-855d-4055-8be9-3577ce10f66e [following]
--2020-05-22 21:35:12--  https://api.gdc.cancer.gov/data/9a4679c3-855d-4055-8be9-3577ce10f66e
Connecting to api.gdc.cancer.gov (api.gdc.cancer.gov)|192.170.230.246|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1879492443 (1.8G) [application/octet-stream]
Saving to: ‘EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2-v2.geneExp.tsv’


2020-05-22 21:37:11 (15.1 MB/s) - ‘EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2-v2.geneExp.tsv’ saved [1879492443/1879492443]



In [0]:
import os
import random
import pandas as pd

In [0]:
# Commit from https://github.com/cognoma/cancer-data/
sample_commit = 'da832c5edc1ca4d3f665b038d15b19fced724f4c'

url = 'https://raw.githubusercontent.com/cognoma/cancer-data/{}/mapping/tcga_cancertype_codes.csv'.format(sample_commit)
cancer_types_df = pd.read_csv(url,
                              dtype='str',
                              keep_default_na=False)

cancertype_codes_dict = dict(zip(cancer_types_df['TSS Code'],
                                 cancer_types_df.acronym))

In [9]:
cancer_types_df.head(2)

Unnamed: 0,TSS Code,Source Site,Study Name,BCR,acronym
0,1,International Genomics Consortium,ovarian serous cystadenocarcinoma,IGC,OV
1,2,MD Anderson Cancer Center,glioblastoma multiforme,IGC,GBM


In [0]:
url = 'https://raw.githubusercontent.com/cognoma/cancer-data/{}/mapping/tcga_sampletype_codes.csv'.format(sample_commit)
sample_types_df = pd.read_csv(url, dtype='str')

In [0]:
sampletype_codes_dict = dict(zip(sample_types_df.Code,
                                 sample_types_df.Definition))

In [14]:
sample_types_df.head(2)

Unnamed: 0,Code,Definition,Short Letter Code
0,1,Primary Solid Tumor,TP
1,2,Recurrent Solid Tumor,TR


In [0]:
genes_commit = 'ad9631bb4e77e2cdc5413b0d77cb8f7e93fc5bee'
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(genes_commit)
gene_df = pd.read_table(url)

# Only consider protein-coding genes
gene_df = (
    gene_df.query("gene_type == 'protein-coding'")
)

In [17]:
gene_df.head(2)

Unnamed: 0,entrez_gene_id,symbol,description,chromosome,gene_type,synonyms,aliases
0,1,A1BG,alpha-1-B glycoprotein,19,protein-coding,A1B|ABG|GAB|HYST2477,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...
1,2,A2M,alpha-2-macroglobulin,12,protein-coding,A2MD|CPAMD5|FWP007|S863-7,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...


In [0]:
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(genes_commit)
updater_df = pd.read_table(url)

old_to_new_entrez = dict(zip(updater_df.old_entrez_gene_id,
                             updater_df.new_entrez_gene_id))

In [0]:
file = os.path.join('EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2-v2.geneExp.tsv')
tcga_expr_df = pd.read_table(file, index_col=0)

In [23]:
tcga_expr_df.shape

(20531, 11069)

In [25]:
tcga_expr_df[tcga_expr_df.columns[:2]].head(2)

Unnamed: 0_level_0,TCGA-OR-A5J1-01A-11R-A29S-07,TCGA-OR-A5J2-01A-11R-A29S-07
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
?|100130426,0.0,0.0
?|100133144,3.2661,2.6815


In [0]:
tcga_expr_df.index = tcga_expr_df.index.map(lambda x: x.split('|')[1])

In [0]:
tcga_expr_df = (tcga_expr_df
    .dropna(axis='rows')
    .rename(index=old_to_new_entrez)
    .groupby(level=0).mean()
    .transpose()
    .sort_index(axis='rows')
    .sort_index(axis='columns')
)

tcga_expr_df.index.rename('sample_id', inplace=True)

In [0]:
# Update sample IDs to remove multiple samples measured on the same tumor
# and to map with the clinical information
tcga_expr_df.index = tcga_expr_df.index.str.slice(start=0, stop=15)
tcga_expr_df = tcga_expr_df.loc[~tcga_expr_df.index.duplicated(), :]

In [0]:
# Filter for valid Entrez gene identifiers
tcga_expr_df = tcga_expr_df.loc[:, tcga_expr_df.columns.isin(gene_df.entrez_gene_id.astype(str))]

In [30]:
print(tcga_expr_df.shape)
tcga_expr_df.head(5)

(11060, 16148)


gene_id,1,10,100,1000,10000,10001,10002,10003,100037417,10004,100049587,10005,10006,10007,10008,10009,1001,10010,100101267,100101467,10011,100113407,100125288,100128553,100128569,100128731,100128927,100129396,100129583,100129792,100129842,10013,100130086,100130311,100130449,100130733,100130742,100130771,100130933,100130958,...,995,9950,9953,9955,9956,9957,9958,996,9960,9961,9962,9963,9965,9966,9967,9968,9969,997,9970,9972,9973,9975,9976,9978,998,9980,9982,9984,9985,9986,9987,9988,9989,999,9990,9991,9992,9993,9994,9997
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
TCGA-02-0047-01,125.007,10.449,136.452,2302.47,1297.52,271.674,1.2293,8.6051,187.492,15.3662,118.627,333.876,2339.21,1056.58,399.521,805.188,232.337,660.746,1236.06,494.348,685.331,939.795,26.4237,4.3025,9.8344,2038.15,232.951,3.6879,104.441,48.8153,195.458,1405.7,45.4839,19.2999,7.3758,9.8344,1.8439,32.6255,4.3025,14.7515,...,51.6304,820.554,27.6591,62.694,271.674,336.827,735.732,1250.81,402.594,2510.22,3786.23,1.8439,0.0,2.4586,2363.93,1103.91,2431.54,797.197,9.6623,1013.55,330.065,777.529,112.48,926.71,5494.94,536.587,0.0,314.085,1062.72,162.881,4033.31,791.278,1810.14,264.913,684.225,1097.76,11.6783,4815.14,288.269,299.948
TCGA-02-0055-01,391.804,1.1212,222.004,1819.76,903.154,321.233,0.0,38.1219,426.853,44.8493,287.036,596.922,1321.02,1211.49,200.14,498.949,375.613,1216.54,970.428,255.983,1492.36,359.355,37.5221,4.4849,12.8942,3723.6,167.624,2.2425,50.8984,322.624,63.3497,1425.65,17.3791,45.9089,1.6819,20.1822,1.1212,3.8514,0.0,17.9397,...,71.7589,1071.9,82.4107,485.494,115.487,164.261,1113.95,1385.28,447.933,3918.15,944.639,1.6819,0.0,6.1668,1998.6,665.452,1233.36,859.986,16.5158,1304.56,584.163,773.651,807.288,2076.36,8510.72,577.435,0.0,392.432,132.866,366.643,4431.67,921.424,2038.96,2.2425,466.534,1399.86,16.8185,2228.45,309.46,787.106
TCGA-02-2483-01,271.852,4.6438,255.831,2888.87,1319.68,458.048,4.6438,5.9103,260.002,29.1293,104.697,406.012,1997.35,1423.11,453.404,1167.7,382.48,509.129,1165.17,566.514,752.295,783.536,41.448,0.4222,5.9103,3048.71,200.528,0.8443,70.552,45.9145,352.084,904.697,19.4195,9.3298,12.2427,29.5937,2.1108,4.5805,0.4222,8.8654,...,178.153,919.894,0.4222,8.8654,64.1689,312.401,784.38,1505.44,937.119,804.644,1584.38,0.8443,2.533,1.2665,2732.24,939.314,1358.1,1759.58,19.3098,1244.96,453.404,997.15,179.419,1926.86,6246.75,448.338,0.0,530.237,521.794,339.842,7839.58,846.708,2229.87,18.9974,565.184,1339.95,8.8654,2737.73,306.491,457.203
TCGA-02-2485-01,83.9429,20.0,129.048,6965.71,10136.2,418.571,5.2381,37.619,539.333,19.5238,35.7143,346.419,984.424,1209.05,102.857,1241.43,11.9048,731.905,1477.62,699.457,769.048,1216.67,98.6905,2.381,5.7143,1779.96,137.143,10.4762,60.6952,54.4524,333.333,1604.29,8.0952,33.1048,4.2857,32.8571,5.7143,10.9619,2.8571,12.381,...,147.619,989.048,20.4762,36.6667,103.809,213.333,754.286,1269.05,448.962,685.238,1548.57,1.9048,105.238,0.0,2146.19,1177.62,1608.1,2370.0,9.8952,1150.48,378.571,639.524,176.667,1029.4,5491.9,626.667,0.0,484.762,207.619,276.19,6560.0,1592.74,1840.95,112.381,484.881,726.667,11.4286,2979.05,439.524,426.667
TCGA-02-2486-01,108.256,3.6585,205.488,2250.61,873.171,441.463,1.8293,83.5366,265.061,29.878,1004.88,594.317,1907.56,2364.63,270.122,843.902,19.5122,1017.68,1510.37,424.61,1402.44,353.049,37.8598,0.0,6.0976,4162.11,334.146,0.6098,290.049,341.287,146.341,1199.39,17.6829,13.5854,1.8293,8.5488,1.8293,181.689,10.3659,0.6098,...,3.6585,1074.39,3.0488,7.3171,84.7561,320.122,929.268,1313.41,399.89,2569.51,1121.34,2.439,0.0,0.6098,2317.68,667.073,854.878,1181.71,14.2195,931.098,638.415,373.171,665.854,1606.18,7618.9,431.707,0.0,451.829,212.195,270.122,4330.49,816.335,1134.76,9.1463,397.677,1098.17,12.8049,1340.24,251.22,541.463


In [0]:
# Extract sample type in the order of the gene expression matrix
tcga_id = pd.DataFrame(tcga_expr_df.index)

# Extract the last two digits of the barcode and recode sample-type
tcga_id = tcga_id.assign(sample_type = tcga_id.sample_id.str[-2:])
tcga_id.sample_type = tcga_id.sample_type.replace(sampletype_codes_dict)

# Extract the first two ID numbers after `TCGA-` and recode cancer-type
tcga_id = tcga_id.assign(cancer_type = tcga_id.sample_id.str[5:7])
tcga_id.cancer_type = tcga_id.cancer_type.replace(cancertype_codes_dict)

# Append cancer-type with sample-type to generate stratification variable
tcga_id = tcga_id.assign(stratify_samples = tcga_id.cancer_type.str.cat(tcga_id.sample_type))

# Get stratification counts - function cannot work with singleton strats
stratify_counts = tcga_id.stratify_samples.value_counts().to_dict()

# Recode stratification variables if they are singletons
tcga_id = tcga_id.assign(stratify_samples_count = tcga_id.stratify_samples)
tcga_id.stratify_samples_count = tcga_id.stratify_samples_count.replace(stratify_counts)
tcga_id.loc[tcga_id.stratify_samples_count == 1, "stratify_samples"] = "other"

In [0]:
!mkdir -p /content/drive/My\ Drive/BioBombe/pancandata

In [45]:
!ls /content/drive/My\ Drive/BioBombe/

pancandata


In [0]:
!cp EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2-v2.geneExp.tsv /content/drive/My\ Drive/BioBombe/pancandata/

In [48]:
!ls /content/drive/My\ Drive/BioBombe/pancandata/

EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2-v2.geneExp.tsv


In [0]:
data_folder = '/content/drive/My Drive/BioBombe/pancandata/'

In [0]:
file = os.path.join(data_folder, 'tcga_sample_identifiers.tsv')

(
    tcga_id.drop(['stratify_samples', 'stratify_samples_count'], axis='columns')
    .to_csv(file, sep='\t', index=False)
)


In [51]:
!ls /content/drive/My\ Drive/BioBombe/pancandata/

EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2-v2.geneExp.tsv
tcga_sample_identifiers.tsv


In [52]:
tcga_id.head(2)

Unnamed: 0,sample_id,sample_type,cancer_type,stratify_samples,stratify_samples_count
0,TCGA-02-0047-01,Primary Solid Tumor,GBM,GBMPrimary Solid Tumor,154
1,TCGA-02-0055-01,Primary Solid Tumor,GBM,GBMPrimary Solid Tumor,154


In [0]:
cancertype_count_df = (
    pd.DataFrame(tcga_id.cancer_type.value_counts())
    .reset_index()
    .rename({'index': 'cancertype', 'cancer_type': 'n ='}, axis='columns')
)

file = os.path.join(data_folder, 'tcga_sample_counts.tsv')
cancertype_count_df.to_csv(file, sep='\t', index=False)

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
train_df, test_df = train_test_split(tcga_expr_df,
                                     test_size=0.1,
                                     random_state=123,
                                     stratify=tcga_id.stratify_samples_count)

In [0]:
train_file = os.path.join(data_folder, 'train_tcga_expression_matrix_processed.tsv.gz')
train_df.to_csv(train_file, sep='\t', compression='gzip', float_format='%.3g')

In [0]:
test_file = os.path.join(data_folder, 'test_tcga_expression_matrix_processed.tsv.gz')
test_df.to_csv(test_file, sep='\t', compression='gzip', float_format='%.3g')

In [61]:
cancertype_count_df.head()

Unnamed: 0,cancertype,n =
0,BRCA,1218
1,KIRC,606
2,LUAD,576
3,THCA,572
4,UCEC,567


-------------------------------------

## Variational Auto Encoder