In [6]:
import os, fnmatch
import matplotlib.pyplot as plt
import csv
import numpy as np
import pandas as pd
import sys
import json
import glob

# Labels
Sources:
- clinical.tsv
- riboDepleted_samples_that_passedQC_and_have_known_diagnosis
- TranscriptMethod_THPEDv1

# Features
Sources:
- /data/archive/compendium/v5/v5_hugo_log2tpm.11340x58581.2018-02-03.hd5
- /data/archive/downstream/*  [all sampleids possible]
    - /secondary/ucsc_cgl-rnaseq-cgl-pipeline-0.0.0-0000000/RSEM/Hugo/
        - rsem_genes.hugo.results

# Gather Features
1. read in compendium (log2(TPM+1))
2. find more in downstream source (TPM) 
3. merge the two using the same gene name both in log2(TPM+1)

In [7]:
%%time
# Compendium expression matrix; per the notes above its values are log2(TPM+1).
compendium = pd.read_hdf("/data/archive/compendium/v5/v5_hugo_log2tpm.11340x58581.2018-02-03.hd5")

basepath = "/data/archive/downstream/"
postidpath = "/secondary/ucsc_cgl-rnaseq-cgl-pipeline-0.0.0-0000000/RSEM/Hugo/"
filename = "rsem_genes.hugo.results"
# NOTE: despite the variable name, these files are read for their raw 'TPM'
# column; the +1 / log2 transform is applied in a later cell.
hugoLogTpmPlusOneFilePathList = glob.glob(basepath + "*" + postidpath + filename)

# Collect one raw-TPM column per sample, then build the frame in a single step
# instead of inserting columns one at a time.
tpmBySampleId = {}
for filepath in hugoLogTpmPlusOneFilePathList:
    curDf = pd.read_csv(filepath, sep='\t')
    # The sample id is the first directory below `basepath`; derive the offset
    # from the prefix itself so changing `basepath` cannot silently break it.
    sampleId = filepath[len(basepath):].partition('/')[0]
    tpmBySampleId[sampleId] = curDf['TPM']
# Assumes every result file lists its genes in the same row order (columns are
# combined by row position) - TODO confirm.
rawTPMExpression = pd.DataFrame(tpmBySampleId)

### Multiple Ensembl IDs can map to one HUGO gene name

In [10]:
# Gene symbols are taken from `curDf`, the last RSEM file read by the loading loop.
# They are assigned positionally (as a plain list) rather than as a Series:
# a Series is aligned on the index, so re-running this cell after the index has
# already been replaced by gene names would fill 'gene_id' with NaN.
geneNames = list(curDf['gene_name'])
rawTPMExpression['gene_id'] = geneNames


# Index rows by gene symbol so duplicated symbols can be grouped on the index.
rawTPMExpression.index = geneNames


### Remove repeat genes and apply +1 and log2 to rawTPM

In [13]:
# Rows sharing a gene symbol (index level 0) are averaged, then log2(x + 1) is applied.
# numeric_only=True keeps the string 'gene_id' helper column out of the mean;
# recent pandas raises on it instead of silently dropping it.
rawTPMExpressionShorter = (
    rawTPMExpression
    .groupby(level=0)
    .mean(numeric_only=True)
    .add(1)
    .apply(np.log2)
)

CPU times: user 4.63 s, sys: 2.44 s, total: 7.06 s
Wall time: 6.51 s


### Merge compendium with downstream data

In [14]:
%%time
# Only bring in downstream samples that the compendium does not already contain.
cols_to_use = rawTPMExpressionShorter.columns.difference(compendium.columns)
newSamples = rawTPMExpressionShorter.loc[:, cols_to_use]

# Index-on-index merge (default inner join): keeps genes present in both tables.
expressionTpmCompendium = newSamples.merge(compendium, left_index=True, right_index=True)

Index(['TARGET-40-0A4HLD-01A-01R', 'TARGET-40-PAKUZU-01A-01R',
       'TARGET-40-PAKXLD-01A-01R', 'TARGET-40-PALFYN-01A-01R',
       'TARGET-40-PAMEKS-01A-01R', 'TARGET-40-PAPNVD-01A-01R',
       'TARGET-40-PAPWWC-01A-01R', 'TARGET-40-PASKZZ-01A-01R',
       'TARGET-40-PAUVUL-01A-01R', 'TARGET-40-PAUXPZ-01A-01R',
       ...
       'THR22_0597_S01', 'THR22_0598_S01', 'THR22_0605_S01', 'THR23_0606_S01',
       'THR23_0607_S01', 'THR23_0608_S01', 'THR23_0609_S01', 'THR25_0636_S01',
       'THR25_0641_S01', 'THR25_0642_S01'],
      dtype='object', length=283)

# Gather Labels
1. read clinical.tsv (no riboD, so all polyA)
2. read riboDepleted_samples_that_passedQC_and_have_known_diagnosis (all riboD)
3. read TranscriptMethod_THPEDv1 (compendium both polyA and riboD)
4. merge all three and remove repeats

In [17]:
# clinical.tsv: every row is tagged PolyA (see the notes above: no riboD samples in it).
clinicalLabels = pd.read_csv("/data/archive/compendium/v5/clinical.tsv", sep='\t').assign(tr_method='PolyA')

# Ribo-depleted samples: harmonise the column names with clinical.tsv and tag RiboMinus.
ribodDiagnosis = (
    pd.read_csv("riboDepleted_samples_that_passedQC_and_have_known_diagnosis.tsv", sep='\t')
    .rename(columns={'Treehouse SAMPLE identifier': 'th_sampleid', 'Diagnosis/Disease': 'disease'})
    .assign(tr_method='RiboMinus')
)

# Per-sample transcript method table, renamed to the same column vocabulary.
methods = pd.read_csv("TranscriptMethod_THPEDv1.csv").rename(
    columns={'Treehouse SAMPLE identifier': 'th_sampleid', 'TR_method': 'tr_method'}
)

clinicalIdTissue = clinicalLabels.loc[:, ['th_sampleid', 'anat_sample', 'disease', 'tr_method']]
# No `on=` is given, so each outer merge joins on every column name the two frames share.
mergedLabels = clinicalIdTissue.merge(ribodDiagnosis, how='outer')
mergedLabels = mergedLabels.merge(methods, how='outer')
df = mergedLabels[['th_sampleid', 'tr_method', 'disease']]
# PolyA                11350
# RiboMinus              179
# suspect RiboMinus        9

# Drop rows missing an id, method or disease.
compendium_id_method_disease_labels = df.dropna()
# PolyA        11340
# RiboMinus      165

# 11454 features intersect labels total
- **160 features intersect labels that are RiboMinus**
- 11340 features intersect labels that are PolyA

In [19]:
# Number of sample ids that have both a label row and an expression column.
len(set(compendium_id_method_disease_labels['th_sampleid']).intersection(expressionTpmCompendium.keys()))

11454

In [20]:
# Sample ids labelled RiboMinus, and how many of them have expression data.
isRiboMinus = compendium_id_method_disease_labels['tr_method'] == 'RiboMinus'
labelsRiboD = compendium_id_method_disease_labels.loc[isRiboMinus, 'th_sampleid']
len(set(labelsRiboD).intersection(expressionTpmCompendium.keys()))

160

In [21]:
# Sample ids labelled PolyA, and how many of them have expression data.
# Stored under its own name so the RiboMinus id series (`labelsRiboD`) is not
# overwritten with PolyA ids.
labelsPolyA = compendium_id_method_disease_labels[compendium_id_method_disease_labels['tr_method'] == 'PolyA']['th_sampleid']
len(set(labelsPolyA) & set(expressionTpmCompendium.keys()))

11340

# Make labels and features intersect ids
1. find all columns that are intersecting in both
2. create features from compatible columns
3. remove duplicates from labels 
4. transpose label list to access ids as columns
5. create labels from compatible columns

In [22]:
labelIds = set(compendium_id_method_disease_labels['th_sampleid'])
featureIds = set(expressionTpmCompendium.keys())

# Every id seen on either side.
allColumns = labelIds | featureIds
# Ids missing from one side or the other.
columnsNotInLabels = allColumns - labelIds
columnsNotInFeatures = allColumns - featureIds
allNonCompatibleColumns = columnsNotInFeatures | columnsNotInLabels
# What remains is exactly the ids present in both labels and features.
allCompatibleColumns = allColumns - allNonCompatibleColumns

In [23]:
# Keep only the expression columns whose sample id also has a label.
features = expressionTpmCompendium.loc[:, list(allCompatibleColumns)]
#  have 11454 columns

<class 'pandas.core.frame.DataFrame'>
Index: 58581 entries, 5S_rRNA to yR211F11.2
Columns: 11454 entries, TCGA-D3-A2J7-06 to TCGA-CR-7388-01
dtypes: float32(11340), float64(114)
memory usage: 2.5+ GB


In [24]:
# One row per sample id (first occurrence wins), indexed by that id while
# keeping 'th_sampleid' as a regular column too.
compendium_id_method_disease_labels = (
    compendium_id_method_disease_labels
    .drop_duplicates(subset='th_sampleid')
    .set_index('th_sampleid', drop=False)
)
# Transposed so sample ids become columns, matching the layout of `features`.
transposeCompendium = compendium_id_method_disease_labels.transpose()
labels = transposeCompendium.loc[:, list(allCompatibleColumns)]
#  also have 11454 columns

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, th_sampleid to disease
Columns: 11454 entries, TCGA-D3-A2J7-06 to TCGA-CR-7388-01
dtypes: object(11454)
memory usage: 268.5+ KB


### Test whether the sets truly have the same ids
(If all four checks print True, the two id sets are identical)

In [25]:
featureIds = set(features.keys())
labelIds = set(labels.keys())

# Subset and superset in both directions; the last two repeat the first two
# from the labels' point of view, just to double check.
for sameIdsCheck in (
    featureIds <= labelIds,
    featureIds >= labelIds,
    labelIds <= featureIds,
    labelIds >= featureIds,
):
    print(sameIdsCheck)

True
True
True
True


# Feature and Label tables 
- 114 RiboMinus samples
- 11340 PolyA samples

In [26]:
# Back to one row per sample, keeping only the method and disease columns.
labelsTall = labels.transpose().loc[:, ['tr_method', 'disease']]

In [27]:
# Preview the first five label rows.
labelsTall.head(n=5)

Unnamed: 0_level_0,tr_method,disease
th_sampleid,Unnamed: 1_level_1,Unnamed: 2_level_1
TCGA-D3-A2J7-06,PolyA,skin cutaneous melanoma
TCGA-KC-A7FD-01,PolyA,prostate adenocarcinoma
TCGA-85-8071-01,PolyA,lung squamous cell carcinoma
TCGA-EM-A3AN-01,PolyA,thyroid carcinoma
TCGA-55-7284-01,PolyA,lung adenocarcinoma


In [28]:
# Preview the first five genes of the expression matrix.
features.head(n=5)

Unnamed: 0,TCGA-D3-A2J7-06,TCGA-KC-A7FD-01,TCGA-85-8071-01,TCGA-EM-A3AN-01,TCGA-55-7284-01,TCGA-Z6-A9VB-01,TCGA-CM-4747-01,TCGA-DE-A0XZ-01,TCGA-V4-A9EM-01,TCGA-D7-6815-01,...,TARGET-40-0A4I5B-01A-01R,TARGET-20-PANGJY-09,TCGA-GM-A3NW-01,TCGA-DV-A4W0-05,TCGA-25-2400-01,TCGA-A2-A0CU-01,TCGA-B6-A0RS-01,TARGET-50-PALFRD-01,TCGA-P5-A781-01,TCGA-CR-7388-01
5S_rRNA,0.0,0.0,0.0,0.0,0.0,0.71754,0.0,0.0,0.0,0.0,...,1.298292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5_8S_rRNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7SK,0.0,0.0,0.0,0.0,0.035647,0.02387,0.0,0.0,0.065823,0.0,...,0.45593,0.0,0.0,0.0,0.04733,0.0,0.0,0.0,0.0,0.0
A1BG,4.993182,2.053139,2.594564,3.702682,3.320516,1.232706,1.53613,4.602871,5.199276,1.803251,...,5.505573,1.395144,7.046446,3.454197,4.087481,4.249446,4.502056,2.601745,2.877778,4.679764
A1BG-AS1,3.513484,1.05667,1.169999,2.035634,1.669079,0.864038,0.443721,3.044429,2.742043,0.731261,...,1.803227,0.275124,3.320516,1.992807,1.650821,2.485414,2.114435,0.516128,1.131011,2.107747


### Export to csv and hdf5 files

In [32]:
# Tab-separated label table (sample id index, tr_method, disease).
labelsTall.to_csv(path_or_buf='compendiumLabels.tsv', sep='\t')

In [None]:
%%time
# Tab-separated expression matrix (genes as rows, samples as columns).
features.to_csv(path_or_buf='data/compendiumExpression.tsv', sep='\t')

In [672]:
# Write both tables into one HDF5 file, overwriting any existing file.
with pd.HDFStore("data/compendiumTrain.h5", mode="w") as store:
    # Samples as rows, gene columns sorted by name.
    store.put("expression", features.T.sort_index(axis="columns"))
    store.put("labels", labelsTall.astype(str))

### The above data is for another machine learning project taken on by Keller Jordan at the end of this quarter. <br>
Below, I am going to be finding disease specific data and also use more variable genes to create my feature set. 

# Find Disease with at least one RiboMinus and one PolyA

In [39]:
allDiseases = list(labelsTall['disease'].unique())

# Count samples per disease once for each method, instead of re-filtering
# labelsTall twice for every disease.
polyACounts = labelsTall.loc[labelsTall['tr_method'] == 'PolyA', 'disease'].value_counts()
riboDCounts = labelsTall.loc[labelsTall['tr_method'] == 'RiboMinus', 'disease'].value_counts()

# Diseases that have at least one RiboMinus and at least one PolyA sample,
# in order of first appearance in labelsTall.
multiMethodDiseases = []
for disease in allDiseases:
    polyA_in_this_disease = int(polyACounts.get(disease, 0))
    riboD_in_this_disease = int(riboDCounts.get(disease, 0))
    if riboD_in_this_disease >= 1 and polyA_in_this_disease >= 1:
        multiMethodDiseases.append(disease)
        print(disease, riboD_in_this_disease, polyA_in_this_disease)
print(multiMethodDiseases)


acute lymphoblastic leukemia 16 265
acute myeloid leukemia 4 421
sarcoma 1 357
neuroblastoma 4 178
diffuse large B-cell lymphoma 2 47
glioblastoma multiforme 41 192
wilms tumor 3 131
fibrolamellar hepatocellular carcinoma 29 7
atypical teratoid/rhabdoid tumor 1 3
lymphoma 1 50
ependymoma 2 3
['acute lymphoblastic leukemia', 'acute myeloid leukemia', 'sarcoma', 'neuroblastoma', 'diffuse large B-cell lymphoma', 'glioblastoma multiforme', 'wilms tumor', 'fibrolamellar hepatocellular carcinoma', 'atypical teratoid/rhabdoid tumor', 'lymphoma', 'ependymoma']


# Find Disease with the most RiboMinus

In [None]:
allDiseases = list(labelsTall['disease'].unique())

# RiboMinus sample count per disease, computed in one pass.
riboDCounts = labelsTall.loc[labelsTall['tr_method'] == 'RiboMinus', 'disease'].value_counts()

# Pick the disease with the most RiboMinus samples. The strict '>' means the
# first disease in allDiseases order wins a tie; '' if no RiboMinus at all.
maxRiboMinusDisease = ''
maxRiboD = 0
for disease in allDiseases:
    riboD_in_this_disease = int(riboDCounts.get(disease, 0))
    if riboD_in_this_disease > maxRiboD:
        maxRiboMinusDisease = disease
        maxRiboD = riboD_in_this_disease
print(maxRiboMinusDisease)

# Find Disease with closest 1/1 RiboMinus to PolyA
- looks for highest riboD count and
- highest riboD/polyA ratio

In [None]:
# Per-disease sample counts for each method.
polyAPerDisease = labelsTall.loc[labelsTall['tr_method'] == 'PolyA', 'disease'].value_counts()
riboDPerDisease = labelsTall.loc[labelsTall['tr_method'] == 'RiboMinus', 'disease'].value_counts()

mostProportionalDisease = ''
bestRatio = 0.0
highestRiboD = 0
for disease in allDiseases:
    # Zero counts are clamped to 1 so the ratio is always defined.
    # NOTE(review): this also lets a disease with no RiboMinus samples compete
    # as if it had one - confirm that is intended.
    polyA_in_this_disease = max(int(polyAPerDisease.get(disease, 0)), 1)
    riboD_in_this_disease = max(int(riboDPerDisease.get(disease, 0)), 1)
    ratio = riboD_in_this_disease / polyA_in_this_disease
    if ratio >= 1:
        continue
    # A candidate replaces the current best only if it improves BOTH the ratio
    # and the RiboMinus count.
    # NOTE(review): the outcome therefore depends on the order of allDiseases.
    if bestRatio < ratio and riboD_in_this_disease > highestRiboD:
        bestRatio, highestRiboD, mostProportionalDisease = ratio, riboD_in_this_disease, disease
print(mostProportionalDisease, bestRatio, highestRiboD)
print('polya:', int(polyAPerDisease.get(mostProportionalDisease, 0)))
print('ribominus:', int(riboDPerDisease.get(mostProportionalDisease, 0)))


# Select the most highly variable genes (not DESeq)
- Get the genes whose variance is above the 75th percentile
- Restrict the features to those genes

This takes a long time, you can just read in the variation calculation from a csv file

In [None]:
%%time
# Variance of each gene (row) across all sample columns.
variationFeatures = features.var(axis="columns", numeric_only=True)

Read in from csv file

In [None]:
# Read the cached per-gene variance if it exists; otherwise turn the freshly
# computed `variationFeatures` into a one-column frame and cache it. The two
# paths never run together, so the cell works top-to-bottom on a fresh kernel.
if os.path.exists("variationFeatures.csv"):
    variationFeatures = pd.read_csv("variationFeatures.csv", header=0, index_col=0)
else:
    # `.to_frame` only exists on a Series; skip it if the conversion already happened.
    if isinstance(variationFeatures, pd.Series):
        variationFeatures = variationFeatures.to_frame(name='var')
    # Only non-default, still-supported options are passed: `tupleize_cols` and
    # `line_terminator` are not accepted by current pandas.
    variationFeatures.to_csv("variationFeatures.csv", sep=',', header=True, index=True)

Get the genes whose variance is above the 75th percentile

In [None]:
# 75th-percentile variance threshold; genes at or below it are masked and dropped.
varianceCutoff = variationFeatures.quantile(q=0.75)
aboveCutoff = variationFeatures > varianceCutoff
highVarGenes = variationFeatures[aboveCutoff].dropna()

# Number of genes that survive the cut.
len(highVarGenes.index)

Make features have only these genes

In [None]:
%%time
# Rows of `features` whose gene is in the high-variance set.
isHighVarGene = features.index.isin(highVarGenes.index)
highVarFeatures = features[isHighVarGene]

# Glioblastoma Multiforme Features and Labels
- one feature csv with high variance genes (14,000 genes)
- one feature csv with all genes (58,000 genes)
- one label csv with identical identifiers (polyA riboD method)
- 233 samples
- 192 polya
- 41 ribod

In [712]:
# Label rows and sample ids for glioblastoma multiforme.
isGlioblastoma = labelsTall['disease'] == "glioblastoma multiforme"
gliomaLabels = labelsTall[isGlioblastoma]
th_ids_glioma = list(gliomaLabels.index)

# Matching expression columns: high-variance genes only, and all genes.
gliomaFeaturesVar = highVarFeatures[th_ids_glioma]
gliomaFeatures = features[th_ids_glioma]

# Export csv format files
for gliomaFrame, csvPath in (
    (gliomaFeaturesVar, "data/glioblastomaExpression14kgenes.csv"),
    (gliomaFeatures, "data/glioblastomaExpression.csv"),
    (gliomaLabels, "data/glioblastomaLabels.csv"),
):
    gliomaFrame.to_csv(csvPath)

# Export h5 format files (samples as rows, gene columns sorted by name)
with pd.HDFStore("data/glioblastomaTrain.h5", mode="w") as store:
    store.put("expression", gliomaFeatures.T.sort_index(axis="columns"))
    store.put("labels", gliomaLabels.astype(str))

# Create Small Test Set of Another Disease
- Look for the next best 1/1 riboD polyA with more than 10 riboD samples
- get the features and labels to test on model

In [700]:
# Per-disease sample counts for each method.
polyAPerDisease = labelsTall.loc[labelsTall['tr_method'] == 'PolyA', 'disease'].value_counts()
riboDPerDisease = labelsTall.loc[labelsTall['tr_method'] == 'RiboMinus', 'disease'].value_counts()

mostProportionalDisease = ''
bestRatio = 0.0
# Starting at 10 means only diseases with more than 10 RiboMinus samples can qualify.
highestRiboD = 10
for disease in allDiseases:
    # Zero counts are clamped to 1 so the ratio is always defined.
    polyA_in_this_disease = max(int(polyAPerDisease.get(disease, 0)), 1)
    riboD_in_this_disease = max(int(riboDPerDisease.get(disease, 0)), 1)
    ratio = riboD_in_this_disease / polyA_in_this_disease
    if ratio >= 1:
        continue
    # Each candidate that improves BOTH ratio and RiboMinus count is printed as
    # it becomes the running best.
    # NOTE(review): which candidates get printed depends on the order of allDiseases.
    if bestRatio < ratio and riboD_in_this_disease > highestRiboD:
        bestRatio, highestRiboD, mostProportionalDisease = ratio, riboD_in_this_disease, disease
        print(mostProportionalDisease, bestRatio, highestRiboD)
        print('polya:', int(polyAPerDisease.get(mostProportionalDisease, 0)))
        print('ribominus:', int(riboDPerDisease.get(mostProportionalDisease, 0)))


acute lymphoblastic leukemia 0.06037735849056604 16
polya: 265
ribominus: 16
glioblastoma multiforme 0.21354166666666666 41
polya: 192
ribominus: 41


Acute lymphoblastic leukemia is the next-best candidate after glioblastoma multiforme: it has the next-highest RiboMinus count (16), though a lower riboD/polyA ratio (0.06 vs 0.21).

# Create Labels for Test Set

Create Features and Labels for Test Set and export to csv files and hdf5

In [739]:
%%time
# Sample ids labelled acute lymphoblastic leukemia (ALL).
th_ids_ALL = list(labelsTall[labelsTall.disease=='acute lymphoblastic leukemia'].index)
ALL_FeaturesVar = highVarFeatures[th_ids_ALL]
ALL_Features = features[th_ids_ALL]
# Labels are taken from labelsTall (one row per sample; tr_method and disease
# columns), the same layout as gliomaLabels, rather than from the wide `labels`
# table, so the test-set labels match the training-set label format.
ALL_Labels = labelsTall.loc[th_ids_ALL]

# Export csv format files
ALL_FeaturesVar.to_csv("data/ALLeukemiaExpressionVar.csv")
ALL_Features.to_csv("data/ALLeukemiaExpression.csv")
ALL_Labels.to_csv("data/ALLeukemiaLabels.csv")

CPU times: user 46.3 s, sys: 4.86 s, total: 51.1 s
Wall time: 51.2 s


In [730]:
# Export h5 format files (samples as rows, gene columns sorted by name)
with pd.HDFStore("data/ALLeukemiaTrain.h5", mode="w") as store:
    store.put("expression", ALL_Features.T.sort_index(axis="columns"))
    store.put("labels", ALL_Labels.astype(str))

# Reference

In [None]:
# Toy example of the "average rows that share an index label" step used above.
# A seeded generator makes the two frames identical on every run.
rng = np.random.RandomState(0)
df2 = pd.DataFrame(rng.randint(low=0, high=10, size=(5, 5)), columns=['a', 'b', 'c', 'd', 'e'])
df1 = pd.DataFrame(rng.randint(low=10, high=20, size=(5, 5)), columns=['a', 'b', 'c', 'd', 'e'])

# Stacking the frames gives every index label 0..4 twice.
pd.concat([df2, df1]).sort_index()

# Grouping on the index collapses each duplicated label to the mean of its rows.
pd.concat([df2, df1]).sort_index().groupby(level=0).agg('mean')