In [150]:
import os, fnmatch
import matplotlib.pyplot as plt
import csv
import numpy as np
import pandas as pd

In [151]:
import sys
import json
import glob


In [657]:
# Make sure the output directory for the HDF5 export exists. exist_ok=True keeps
# the cell idempotent and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs("data", exist_ok=True)

In [152]:

BASE_PATH = '/data/archive/downstream/'
SOURCES = ['TARGET', 'TCGA', 'THR', 'TH']


def donor_to_samples(donor_id, base_path=None):
    """Return the names of the entries under the downstream root that start with ``donor_id``.

    Parameters
    ----------
    donor_id : str
        Prefix to match; a ``*`` wildcard is appended before globbing.
    base_path : str, optional
        Directory to search. Defaults to the module-level ``BASE_PATH``
        (resolved at call time).

    Returns
    -------
    list of str
        The final path component of every match, in the order returned by
        ``glob.glob`` (not sorted). Empty when nothing matches.
    """
    root = BASE_PATH if base_path is None else base_path
    # os.path.join tolerates a root with or without a trailing separator.
    matches = glob.glob(os.path.join(root, donor_id + '*'))
    return [os.path.basename(match) for match in matches]

In [153]:
# Collect the sample IDs for each source prefix. The generator is consumed left
# to right by the tuple unpacking, so the lookups run in the order listed.
# NOTE(review): 'TH0' is not an entry of SOURCES ('TH' is) and `tgcaIds` looks
# like a typo for "tcga"; both are kept unchanged — TODO confirm.
targetIds, thoIds, thrIds, tgcaIds = (
    donor_to_samples(prefix) for prefix in ('TARGET', 'TH0', 'THR', 'TCGA')
)


In [154]:
def disease_file_paths(downstream_root):
    """Return sorted ``diagnosed_disease.txt`` paths from ``treehouse-8*`` tertiary runs.

    Runs whose directory name ends in ``_compv5`` are skipped. The previous glob
    used ``[!_compv5]``, which is a single-character class: it rejected any run
    directory whose *last character* is one of ``_ c o m p v 5`` (for example
    ``treehouse-8.0.5``) instead of rejecting the ``_compv5`` suffix.

    Parameters
    ----------
    downstream_root : str
        Directory that contains the ``TH0*`` sample directories.

    Returns
    -------
    list of str
        Matching file paths in sorted order.
    """
    pattern = os.path.join(
        downstream_root, 'TH0*', 'tertiary', 'treehouse-8*', 'diagnosed_disease.txt'
    )
    return sorted(
        path
        for path in glob.glob(pattern)
        if not os.path.basename(os.path.dirname(path)).endswith('_compv5')
    )


# Per-sample RSEM gene-level result files (Hugo gene names) for TH0* samples.
listOfPathsToUMEND = sorted(glob.glob("/data/archive/downstream/" + 'TH0*/secondary/ucsc_cgl*/RSEM/Hugo' + '/rsem_genes.hugo.results'))
listOfPathsToDisease = disease_file_paths('/data/archive/downstream/')

In [170]:
# Clinical metadata shipped with compendium v5 (tab-separated).
clinical = pd.read_csv("/data/archive/compendium/v5/clinical.tsv", sep='\t')

# Diagnoses for ribo-depleted samples; the id and disease columns are renamed so
# they line up with the clinical table's column names.
ribodDiagnosis = pd.read_csv(
    "riboDepleted_samples_that_passedQC_and_have_known_diagnosis.tsv", sep='\t'
).rename(
    columns={'Treehouse SAMPLE identifier': 'th_sampleid', 'Diagnosis/Disease': 'disease'}
)

# Per-sample transcript method table, keyed by the same sample id column.
methods = pd.read_csv("TranscriptMethod_THPEDv1.csv").rename(
    columns={'Treehouse SAMPLE identifier': 'th_sampleid'}
)

# Only the columns needed to build labels.
clinicalIdTissue = clinical.loc[:, ['th_sampleid', 'anat_sample', 'disease']]

In [544]:
# Preview the th_sampleid / anat_sample / disease subset of the clinical table.
clinicalIdTissue.head()

Unnamed: 0,th_sampleid,anat_sample,disease
0,TH03_0010_S01,,acute leukemia of ambiguous lineage
1,TH03_0010_S02,,acute leukemia of ambiguous lineage
2,TH03_0103_S01,,rhabdomyosarcoma
3,TH03_0104_S01,,hepatoblastoma
4,TH03_0105_S01,,rhabdomyosarcoma


### Data set analysis
- Glioblastoma Multiforme has the most RiboDepleted labels

In [718]:
# df.loc[df['TR_method']=='PolyA'].groupby('disease').size()
# lengthy list of all diseases PolyA

In [717]:
# Number of samples per disease among rows whose TR_method is 'RiboMinus'.
# NOTE(review): `df` is assembled in the "Create Labels" section further down;
# on a fresh kernel that cell has to run before this one.
df.query("TR_method == 'RiboMinus'").groupby('disease').size()

disease
acute lymphoblastic leukemia              13
acute myeloid leukemia                     2
fibrolamellar hepatocellular carcinoma    19
glioblastoma multiforme                   41
leukemia/lymphoma                          1
lymphoma                                   1
dtype: int64

# Create Labels

Training sets for glioblastoma multiforme             

In [707]:
# Combine the clinical metadata with the ribo-depleted diagnoses (the outer join
# keeps samples that appear in only one of the two tables), then attach the
# transcript method for every sample that has one.
df = (
    pd.merge(clinicalIdTissue, ribodDiagnosis, how='outer', on='th_sampleid')
    .merge(methods, how='left', on='th_sampleid')
)

# Prefer the ribo-depleted diagnosis (disease_y) and fall back to the clinical
# one (disease_x). The result is assigned back explicitly: calling
# fillna(inplace=True) on an attribute-accessed column is chained assignment and
# does not update the parent frame under pandas Copy-on-Write.
df['disease_y'] = df['disease_y'].fillna(df['disease_x'])
df = (
    df.drop(columns=['disease_x'])
    .rename(columns={'disease_y': 'disease', 'anat_sample': 'tissue'})
)
df['tissue'] = df['tissue'].fillna('NA')

glioblastomaLabels = df.loc[df['disease'] == 'glioblastoma multiforme']

# These two label sets were commented out, but later cells read them
# (feature selection, previews, HDF5 export), so they are restored here.
ribodGlioblastomaLabels = df.loc[
    (df['TR_method'] == 'RiboMinus') & (df['disease'] == 'glioblastoma multiforme')
]

polyGliomaBrainLabels = df.loc[(df['disease'] == 'glioma') & (df['tissue'] == 'brain')]

Test set with RiboD samples

In [720]:
# Held-out label set: RiboMinus samples diagnosed with fibrolamellar
# hepatocellular carcinoma. The variable name still says "Glioblastoma"; it is
# kept because later cells refer to it.
ribodGlioblastomaLabelsTest = df.loc[
    (df['TR_method'] == 'RiboMinus')
    & (df['disease'] == 'fibrolamellar hepatocellular carcinoma')
]
ribodGlioblastomaLabelsTest.head(77)

Unnamed: 0,th_sampleid,tissue,disease,TR_method
11357,THR12_0277_S01,,fibrolamellar hepatocellular carcinoma,RiboMinus
11358,THR12_0278_S01,,fibrolamellar hepatocellular carcinoma,RiboMinus
11360,THR12_0278_S03,,fibrolamellar hepatocellular carcinoma,RiboMinus
11361,THR12_0279_S01,,fibrolamellar hepatocellular carcinoma,RiboMinus
11363,THR12_0280_S01,,fibrolamellar hepatocellular carcinoma,RiboMinus
11365,THR12_0280_S03,,fibrolamellar hepatocellular carcinoma,RiboMinus
11367,THR12_0281_S01,,fibrolamellar hepatocellular carcinoma,RiboMinus
11369,THR12_0282_S01,,fibrolamellar hepatocellular carcinoma,RiboMinus
11370,THR12_0365_S01,,fibrolamellar hepatocellular carcinoma,RiboMinus
11371,THR12_0365_S02,,fibrolamellar hepatocellular carcinoma,RiboMinus


# Create Features 

In [None]:
# Load the v5 compendium expression matrix from HDF5.
# NOTE(review): the file name suggests log2(TPM+1) values keyed by Hugo gene for
# 11340 samples x 58581 genes — confirm against the data provider.
compendium = pd.read_hdf("/data/archive/compendium/v5/v5_hugo_log2tpm.11340x58581.2018-02-03.hd5")

In [721]:
# Display the loaded compendium (rows are genes, columns are sample ids).
# NOTE(review): consider compendium.head() to keep the rendered output small.
compendium

Unnamed: 0,THR15_0330_S01,THR29_0776_S01,THR19_0418_S01,THR11_0247_S01,THR08_0162_S01,THR22_0588_S01,TARGET-40-0A4I65-01A-01R,THR19_0471_S01,THR09_0563_S01,THR30_0827_S01,...,TCGA-18-3414-01,TCGA-13-1481-01,TCGA-BP-4803-01,TCGA-A8-A09K-01,TCGA-61-1907-01,TCGA-IB-7885-01,TCGA-95-7947-01,TCGA-VQ-AA6F-01,TCGA-BR-8588-01,TCGA-DD-A115-01
5S_rRNA,0.000000,0.000000,0.000000,0.000000,0.000000,0.969012,0.000000,0.000000,1.251719,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5_8S_rRNA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7SK,0.000000,0.014355,0.088594,0.404177,0.000000,0.000000,0.033276,0.035624,0.000000,0.000000,...,0.000000,0.021551,0.000000,0.000000,0.000000,0.038016,0.042669,0.016758,0.000000,0.000000
A1BG,2.618239,5.016140,1.150560,3.214125,4.200850,4.244887,4.935460,0.555816,3.920293,4.967630,...,3.323396,4.452866,2.424979,4.679187,5.000447,3.129315,2.666772,1.526119,1.646188,9.799418
A1BG-AS1,1.150560,2.330558,0.536053,3.260026,4.517276,3.014355,2.464668,0.163499,2.967169,3.388190,...,1.014435,1.682619,1.070450,3.106042,1.659971,1.310415,1.400608,0.739941,0.918464,0.739941
A1CF,0.000000,0.000000,0.014355,0.189034,0.000000,0.014355,0.000000,0.000000,0.042644,0.000000,...,0.000000,0.000000,2.053139,0.000000,0.000000,0.475177,0.042784,1.056670,2.039190,4.890918
A2M,7.211499,5.324811,5.035184,8.866383,0.790772,5.159064,7.760021,4.468583,4.026800,5.426936,...,5.807862,5.067828,8.840647,6.531269,6.084696,8.018768,7.903234,7.372823,7.635666,9.240585
A2M-AS1,0.321928,0.948601,0.422233,1.035624,0.163499,1.516015,1.201634,0.124328,0.226509,0.333424,...,0.948698,1.169999,1.511009,1.560755,1.281002,0.807444,0.641648,1.214170,0.887589,0.713785
A2ML1,0.000000,0.056584,0.344828,0.298658,0.189034,0.014355,0.000000,0.014355,0.084064,3.147307,...,3.790796,0.137632,0.000000,0.097748,0.097748,1.201685,4.122709,0.084203,0.000000,0.000000
A2ML1-AS1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.084203,0.000000,0.000000


In [626]:
basepath="/data/archive/downstream/"
postidpath="/secondary/ucsc_cgl-rnaseq-cgl-pipeline-0.0.0-0000000/RSEM/Hugo/"
filename="rsem_genes.hugo.results"
# Sorted so the column order (and the file used as element [0]) is deterministic.
# NOTE(review): despite the variable name, the TPM column read below is used as
# raw TPM; the log2(TPM + 1) conversion happens in a later cell.
hugoLogTpmPlusOneFilePathList = sorted(glob.glob(basepath + "THR21*" + postidpath + filename))


def load_tpm_by_sample(paths, root):
    """Read the ``TPM`` column of each RSEM result file into one frame.

    Parameters
    ----------
    paths : iterable of str
        Tab-separated result files located somewhere below ``root``.
    root : str
        Directory that holds the per-sample directories. The first path
        component below it is used as the column name, so sample ids of any
        length work (the previous fixed slice ``filepath[25:24+15]`` only
        handled 14-character ids under one specific base path).

    Returns
    -------
    pandas.DataFrame
        One column per input file, rows in file order. Empty when ``paths``
        is empty.
    """
    tpm_by_sample = {}
    for path in paths:
        sample_id = os.path.relpath(path, root).split(os.sep, 1)[0]
        tpm_by_sample[sample_id] = pd.read_csv(path, sep='\t')['TPM']
    # Build the frame once instead of inserting columns one at a time.
    return pd.DataFrame(tpm_by_sample)


riboGliomablastomaExpression = load_tpm_by_sample(hugoLogTpmPlusOneFilePathList, basepath)


In [648]:
basepath="/data/archive/downstream/"
postidpath="/secondary/ucsc_cgl-rnaseq-cgl-pipeline-0.0.0-0000000/RSEM/Hugo/"
filename="rsem_genes.hugo.results"
# Sorted so the column order (and the file used as element [0]) is deterministic.
hugoLogTpmPlusOneFilePathList = sorted(glob.glob(basepath + "THR12*" + postidpath + filename))

# One TPM column per sample. The column name is the first path component below
# basepath, which replaces the fixed slice filepath[25:24+15] that only worked
# for 14-character sample ids under this exact base path. The frame is built in
# one step rather than by inserting columns one at a time.
riboGliomablastomaExpressionTest = pd.DataFrame({
    os.path.relpath(filepath, basepath).split(os.sep, 1)[0]:
        pd.read_csv(filepath, sep='\t')['TPM']
    for filepath in hugoLogTpmPlusOneFilePathList
})

In [629]:
# Collapse duplicated gene symbols by summing their TPM, then convert to
# log2(TPM + 1).
# This needs the 'gene' column that the "Convert Ribo Depleted Expression"
# section adds, so that cell has to run first. After this cell 'gene' is the
# index, so the guard also stops an accidental second run from silently applying
# the log transform twice (groupby('gene') would otherwise match the index name).
if 'gene' not in riboGliomablastomaExpression.columns:
    raise RuntimeError(
        "riboGliomablastomaExpression has no 'gene' column: either the gene names "
        "have not been attached yet or this transform has already been applied"
    )
# .sum() replaces aggregate(np.sum), which newer pandas releases deprecate.
riboGliomablastomaExpression = np.log2(
    riboGliomablastomaExpression.groupby('gene').sum() + 1
)

In [726]:
# Preview the training expression matrix (rows are genes, columns are sample ids).
riboGliomablastomaExpression.head()

Unnamed: 0_level_0,THR21_0520_S01,THR21_0521_S01,THR21_0522_S01,THR21_0523_S01,THR21_0524_S01,THR21_0525_S01,THR21_0526_S01,THR21_0527_S01,THR21_0528_S01,THR21_0530_S01,...,THR21_0552_S01,THR21_0553_S01,THR21_0554_S01,THR21_0554_S02,THR21_0555_S01,THR21_0556_S01,THR21_0557_S01,THR21_0558_S01,THR21_0559_S01,THR21_0560_S01
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5S_rRNA,0.0,0.0,0.765535,2.232661,0.903038,0.903038,0.0,0.0,1.207893,0.0,...,0.0,0.0,0.0,0.0,1.22033,0.0,0.0,0.0,0.0,0.0
5_8S_rRNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7SK,1.269033,2.169925,0.378512,1.469886,1.321928,0.584963,0.678072,0.485427,0.594549,0.641546,...,0.526069,0.575312,0.389567,1.169925,0.536053,0.422233,0.823749,0.411426,0.214125,1.182692
A1BG,2.053111,0.782409,1.070389,0.137504,2.226509,0.632268,2.238787,0.516015,0.773996,1.765535,...,2.104337,2.260026,1.257011,2.353323,0.485427,1.176323,1.536053,0.895303,3.49057,3.620586
A1BG-AS1,0.910733,0.286881,0.575312,0.137504,1.356144,0.536053,1.367371,0.263034,0.275007,0.887525,...,0.555816,0.669027,0.678072,1.500802,0.15056,0.807355,0.807355,0.389567,1.799087,3.193772


In [None]:
# Outer-join the compendium with the ribo-depleted matrix on their gene indexes.
# The previous key, compendium.iloc[:, 0], is the first sample's expression
# column rather than the gene names, so it matched rows on expression values.
# NOTE(review): the result is only displayed, not stored — assign it if it is
# needed downstream.
pd.merge(
    compendium,
    riboGliomablastomaExpression,
    how='outer',
    left_index=True,
    right_index=True,
)

# Convert Ribo Depleted Expression to log2(TPM+1)

In [628]:
# Attach gene names (taken from the first result file in the current path list)
# and keep only the columns for the labelled training samples, ordered by gene.
# NOTE(review): this assumes every result file lists genes in the same row order
# and that hugoLogTpmPlusOneFilePathList still refers to the intended sample
# batch — confirm. It also requires `ribodGlioblastomaLabels` from the
# "Create Labels" section; verify it is actually assigned there.
curDf = pd.read_csv(hugoLogTpmPlusOneFilePathList[0], sep='\t')
riboGliomablastomaExpression['gene'] = curDf['gene_name']

thid = ['gene'] + list(ribodGlioblastomaLabels['th_sampleid'])
riboGliomablastomaExpression = riboGliomablastomaExpression[thid].sort_values('gene')

In [650]:
# Same preparation for the held-out set: attach gene names from the first result
# file in the current path list, keep the labelled test samples, order by gene.
# NOTE(review): assumes every result file lists genes in the same row order —
# confirm.
curDf = pd.read_csv(hugoLogTpmPlusOneFilePathList[0], sep='\t')
riboGliomablastomaExpressionTest['gene'] = curDf['gene_name']

thid = ['gene'] + list(ribodGlioblastomaLabelsTest['th_sampleid'])
riboGliomablastomaExpressionTest = riboGliomablastomaExpressionTest[thid].sort_values('gene')


In [651]:
# Collapse duplicated gene symbols by summing their TPM, then convert to
# log2(TPM + 1).
# After this cell 'gene' is the index, so the guard stops an accidental second
# run from silently applying the log transform twice (groupby('gene') would
# otherwise match the index name).
if 'gene' not in riboGliomablastomaExpressionTest.columns:
    raise RuntimeError(
        "riboGliomablastomaExpressionTest has no 'gene' column: either the gene names "
        "have not been attached yet or this transform has already been applied"
    )
# .sum() replaces aggregate(np.sum), which newer pandas releases deprecate.
riboGliomablastomaExpressionTest = np.log2(
    riboGliomablastomaExpressionTest.groupby('gene').sum() + 1
)

In [630]:
# Preview the training expression matrix after the log2(TPM + 1) conversion.
riboGliomablastomaExpression.head()

Unnamed: 0_level_0,THR21_0520_S01,THR21_0521_S01,THR21_0522_S01,THR21_0523_S01,THR21_0524_S01,THR21_0525_S01,THR21_0526_S01,THR21_0527_S01,THR21_0528_S01,THR21_0530_S01,...,THR21_0552_S01,THR21_0553_S01,THR21_0554_S01,THR21_0554_S02,THR21_0555_S01,THR21_0556_S01,THR21_0557_S01,THR21_0558_S01,THR21_0559_S01,THR21_0560_S01
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5S_rRNA,0.0,0.0,0.765535,2.232661,0.903038,0.903038,0.0,0.0,1.207893,0.0,...,0.0,0.0,0.0,0.0,1.22033,0.0,0.0,0.0,0.0,0.0
5_8S_rRNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7SK,1.269033,2.169925,0.378512,1.469886,1.321928,0.584963,0.678072,0.485427,0.594549,0.641546,...,0.526069,0.575312,0.389567,1.169925,0.536053,0.422233,0.823749,0.411426,0.214125,1.182692
A1BG,2.053111,0.782409,1.070389,0.137504,2.226509,0.632268,2.238787,0.516015,0.773996,1.765535,...,2.104337,2.260026,1.257011,2.353323,0.485427,1.176323,1.536053,0.895303,3.49057,3.620586
A1BG-AS1,0.910733,0.286881,0.575312,0.137504,1.356144,0.536053,1.367371,0.263034,0.275007,0.887525,...,0.555816,0.669027,0.678072,1.500802,0.15056,0.807355,0.807355,0.389567,1.799087,3.193772


# Verify that each sample's expression sums to ~1 million in TPM space

In [631]:
# Invert log2(TPM + 1) for three poly-A samples and total each column.
# NOTE(review): `polyaGliomaBrainExpression` is not assigned anywhere in the
# visible part of this notebook — TODO confirm where it is built.
(
    np.exp2(
        polyaGliomaBrainExpression[['TCGA-CS-4938-01', 'TCGA-CS-4942-01', 'TCGA-WY-A85A-01']]
    )
    - 1
).sum()

TCGA-CS-4938-01    994535.4375
TCGA-CS-4942-01    993472.6250
TCGA-WY-A85A-01    976147.8125
dtype: float32

In [632]:
# Invert log2(TPM + 1) for a handful of training samples and total each column.
(
    np.exp2(
        riboGliomablastomaExpression[[
            'THR21_0553_S01', 'THR21_0554_S01', 'THR21_0554_S02', 'THR21_0555_S01',
            'THR21_0556_S01', 'THR21_0557_S01', 'THR21_0558_S01', 'THR21_0559_S01',
        ]]
    )
    - 1
).sum()

THR21_0553_S01    999997.64
THR21_0554_S01    999996.03
THR21_0554_S02    999999.66
THR21_0555_S01    999994.67
THR21_0556_S01    999992.60
THR21_0557_S01    999994.56
THR21_0558_S01    999993.97
THR21_0559_S01    999999.26
dtype: float64

In [654]:
# Invert log2(TPM + 1) for three held-out samples and total each column.
(
    np.exp2(
        riboGliomablastomaExpressionTest[['THR12_0277_S01', 'THR12_0278_S01', 'THR12_0278_S03']]
    )
    - 1
).sum()

THR12_0277_S01    1000000.59
THR12_0278_S01    1000000.40
THR12_0278_S03    1000000.25
dtype: float64

In [633]:
# Every cell of both expression matrices must hold a value (no null/NaN).
assert riboGliomablastomaExpression.notnull().values.all()
assert polyaGliomaBrainExpression.notnull().values.all()

# Both matrices must share the same Hugo gene index, element for element.
assert np.array_equal(polyaGliomaBrainExpression.index, riboGliomablastomaExpression.index)

# Training Set 
This is a Ribo Depletion label and feature set for training

In [634]:
# Preview the training labels.
# NOTE(review): requires `ribodGlioblastomaLabels` from the "Create Labels"
# section — verify it is actually assigned there before relying on Run All.
ribodGlioblastomaLabels.head()

Unnamed: 0,th_sampleid,tissue,disease,TR_method
11386,THR21_0520_S01,,glioblastoma multiforme,RiboMinus
11387,THR21_0521_S01,,glioblastoma multiforme,RiboMinus
11388,THR21_0522_S01,,glioblastoma multiforme,RiboMinus
11389,THR21_0523_S01,,glioblastoma multiforme,RiboMinus
11390,THR21_0524_S01,,glioblastoma multiforme,RiboMinus


In [635]:
# Preview the training features (rows are genes, columns are sample ids).
riboGliomablastomaExpression.head()


Unnamed: 0_level_0,THR21_0520_S01,THR21_0521_S01,THR21_0522_S01,THR21_0523_S01,THR21_0524_S01,THR21_0525_S01,THR21_0526_S01,THR21_0527_S01,THR21_0528_S01,THR21_0530_S01,...,THR21_0552_S01,THR21_0553_S01,THR21_0554_S01,THR21_0554_S02,THR21_0555_S01,THR21_0556_S01,THR21_0557_S01,THR21_0558_S01,THR21_0559_S01,THR21_0560_S01
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5S_rRNA,0.0,0.0,0.765535,2.232661,0.903038,0.903038,0.0,0.0,1.207893,0.0,...,0.0,0.0,0.0,0.0,1.22033,0.0,0.0,0.0,0.0,0.0
5_8S_rRNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7SK,1.269033,2.169925,0.378512,1.469886,1.321928,0.584963,0.678072,0.485427,0.594549,0.641546,...,0.526069,0.575312,0.389567,1.169925,0.536053,0.422233,0.823749,0.411426,0.214125,1.182692
A1BG,2.053111,0.782409,1.070389,0.137504,2.226509,0.632268,2.238787,0.516015,0.773996,1.765535,...,2.104337,2.260026,1.257011,2.353323,0.485427,1.176323,1.536053,0.895303,3.49057,3.620586
A1BG-AS1,0.910733,0.286881,0.575312,0.137504,1.356144,0.536053,1.367371,0.263034,0.275007,0.887525,...,0.555816,0.669027,0.678072,1.500802,0.15056,0.807355,0.807355,0.389567,1.799087,3.193772


In [636]:
# (n_genes, n_samples) of the training feature matrix.
print(riboGliomablastomaExpression.shape)

(58581, 41)


# Test set on fibrolamellar hepatocellular carcinoma for RiboD
This will be used to test the accuracy of the trained model

In [660]:
# Preview the held-out labels (fibrolamellar hepatocellular carcinoma, RiboMinus).
ribodGlioblastomaLabelsTest.head()

Unnamed: 0,th_sampleid,tissue,disease,TR_method
11357,THR12_0277_S01,,fibrolamellar hepatocellular carcinoma,RiboMinus
11358,THR12_0278_S01,,fibrolamellar hepatocellular carcinoma,RiboMinus
11360,THR12_0278_S03,,fibrolamellar hepatocellular carcinoma,RiboMinus
11361,THR12_0279_S01,,fibrolamellar hepatocellular carcinoma,RiboMinus
11363,THR12_0280_S01,,fibrolamellar hepatocellular carcinoma,RiboMinus


In [659]:
# Preview the held-out features (rows are genes, columns are sample ids).
riboGliomablastomaExpressionTest.head()

Unnamed: 0_level_0,THR12_0277_S01,THR12_0278_S01,THR12_0278_S03,THR12_0279_S01,THR12_0280_S01,THR12_0280_S03,THR12_0281_S01,THR12_0282_S01,THR12_0365_S01,THR12_0365_S02,THR12_0365_S03,THR12_0365_S04,THR12_0365_S06,THR12_0366_S02,THR12_0366_S03,THR12_0366_S04,THR12_0368_S01,THR12_0369_S01,THR12_0370_S01
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
5S_rRNA,2.195348,0.0,3.269033,0.0,0.0,0.0,0.0,0.0,1.555816,0.0,1.316146,0.0,3.776104,0.0,0.0,0.0,2.269033,0.0,1.367371
5_8S_rRNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7SK,1.372952,2.500802,2.38405,2.378512,0.613532,2.247928,1.757023,1.269033,2.565597,2.987321,2.827819,2.606442,3.439623,2.427606,3.109361,2.195348,2.467279,1.599318,2.851999
A1BG,4.873813,0.201634,1.014355,3.122673,3.684819,3.456806,3.41819,2.482848,2.063503,1.02148,2.671293,1.709291,2.22033,5.291309,6.3572,5.615887,4.269033,5.723286,3.372952
A1BG-AS1,0.565597,0.367371,0.485427,0.111031,0.464668,0.565597,0.367371,0.275007,0.485427,0.799087,0.713696,0.389567,0.62293,0.475085,1.207893,0.695994,0.432959,0.555816,0.432959


# This is another training set for Poly-Adenylation Selection

In [637]:
# Preview the poly-A glioma/brain labels.
# NOTE(review): requires `polyGliomaBrainLabels` from the "Create Labels"
# section — verify it is actually assigned there before relying on Run All.
polyGliomaBrainLabels.head()

Unnamed: 0,th_sampleid,tissue,disease,TR_method
737,THR13_0963_S01,brain,glioma,
740,THR13_0968_S01,brain,glioma,
741,THR13_0969_S01,brain,glioma,
743,THR13_0971_S01,brain,glioma,
745,THR13_0973_S01,brain,glioma,


In [638]:
# Preview the poly-A expression matrix.
# NOTE(review): `polyaGliomaBrainExpression` is not assigned anywhere in the
# visible part of this notebook — TODO confirm where it is built.
polyaGliomaBrainExpression.head()

Unnamed: 0,THR13_0963_S01,THR13_0968_S01,THR13_0969_S01,THR13_0971_S01,THR13_0973_S01,TCGA-CS-4938-01,TCGA-CS-4941-01,TCGA-CS-4942-01,TCGA-CS-4943-01,TCGA-CS-4944-01,...,TCGA-VW-A8FI-01,TCGA-W9-A837-01,TCGA-WH-A86K-01,TCGA-WY-A858-01,TCGA-WY-A859-01,TCGA-WY-A85A-01,TCGA-WY-A85B-01,TCGA-WY-A85C-01,TCGA-WY-A85D-01,TCGA-WY-A85E-01
5S_rRNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.632854,0.0,0.0,0.0,0.0,0.0
5_8S_rRNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7SK,0.0,0.0,0.0,0.0,0.0,0.0,0.077266,0.0,0.0,0.115525,...,0.0,0.0,0.0,0.0,0.0,0.0,0.047351,0.035647,0.0,0.0
A1BG,5.050937,4.198494,4.548437,4.813525,5.132165,4.161106,3.731235,4.156197,2.55586,2.257066,...,4.395753,2.066978,5.6253,4.671308,3.778182,3.552132,3.77401,3.832873,5.730408,4.531703
A1BG-AS1,3.292782,1.851999,1.321928,2.488001,2.908813,2.485414,1.316216,1.863989,1.08414,0.782505,...,2.31615,1.914598,2.03919,1.765552,2.84001,1.996404,2.176366,2.283947,2.73558,2.134245


In [639]:
# (n_genes, n_samples) of the poly-A expression matrix.
print(polyaGliomaBrainExpression.shape)

(58581, 528)


# Export (from rcurrie tumornormal)
Write the expression and labels for both datasets out to HDF5 files, wrangled into machine-learning format (rows = samples).

In [658]:
# Persist the train and test sets as HDF5. Expression is transposed so that rows
# are samples, with the gene columns sorted; labels are stored as strings.
with pd.HDFStore("data/riboGliomablastomaTrain.h5", mode="w") as store:
    store.put("expression", riboGliomablastomaExpression.transpose().sort_index(axis=1))
    store.put("labels", ribodGlioblastomaLabels.astype(str))

with pd.HDFStore("data/riboGliomablastomaTest.h5", mode="w") as store:
    store.put("expression", riboGliomablastomaExpressionTest.transpose().sort_index(axis=1))
    store.put("labels", ribodGlioblastomaLabelsTest.astype(str))