In [193]:
import os, fnmatch
import matplotlib.pyplot as plt
import csv
import numpy as np
import pandas as pd
import sys
import json
import glob


# Labels
Sources:
- clinical.tsv
- riboDepleted_samples_that_passedQC_and_have_known_diagnosis
- TranscriptMethod_THPEDv1

# Features
Sources:
- /data/archive/compendium/v5/v5_hugo_log2tpm.11340x58581.2018-02-03.hd5
- /data/archive/downstream/*  [all sampleids possible]
    - /secondary/ucsc_cgl-rnaseq-cgl-pipeline-0.0.0-0000000/RSEM/Hugo/
        - rsem_genes.hugo.results

# Gather Features
1. read in compendium (log2(TPM+1))
2. find more in downstream source (TPM) 
3. merge the two using the same gene name both in log2(TPM+1)

In [7]:
# Load the v5 compendium expression matrix; the notes above describe its values as log2(TPM+1).
compendiumPath = "/data/archive/compendium/v5/v5_hugo_log2tpm.11340x58581.2018-02-03.hd5"
compendium = pd.read_hdf(compendiumPath)

In [144]:
# Locate every per-sample RSEM result below the downstream archive:
#   <basepath><sample id><postidpath><filename>
basepath="/data/archive/downstream/"
postidpath="/secondary/ucsc_cgl-rnaseq-cgl-pipeline-0.0.0-0000000/RSEM/Hugo/"
filename="rsem_genes.hugo.results"
hugoLogTpmPlusOneFilePathList = glob.glob(basepath + "*" + postidpath + filename)


def sampleIdFromPath(filepath, root=basepath):
    """Return the sample id, i.e. the first path component of `filepath` below `root`.

    Parameters
    ----------
    filepath : str
        Path produced by the glob above; expected to start with `root`.
    root : str
        Directory prefix to strip. Derived from `basepath` instead of a hard-coded
        character offset, so changing `basepath` cannot silently corrupt the ids.

    Returns
    -------
    str
        Everything between `root` and the next '/'.
    """
    return filepath[len(root):].lstrip('/').partition('/')[0]


# Read the raw TPM column of every sample, then assemble the frame in a single step
# rather than inserting one column at a time into an initially empty DataFrame.
sampleIds = []
tpmColumns = []
for filepath in hugoLogTpmPlusOneFilePathList:
    # curDf intentionally stays bound after the loop: a later cell reads curDf['gene_name'].
    curDf = pd.read_csv(filepath,sep='\t')
    sampleIds.append(sampleIdFromPath(filepath))
    tpmColumns.append(curDf['TPM'])

# Columns are aligned on the row index of each file, so this assumes every results file
# lists the same genes in the same order.
# pd.concat rejects an empty list, so fall back to an empty frame when nothing was found.
rawTPMExpression = (
    pd.concat(tpmColumns, axis=1, keys=sampleIds) if tpmColumns else pd.DataFrame()
)


In [175]:
# Print a summary of the compendium frame: index, columns, dtypes and memory usage.
compendium.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58581 entries, 5S_rRNA to yR211F11.2
Columns: 11340 entries, THR15_0330_S01 to TCGA-DD-A115-01
dtypes: float32(11340)
memory usage: 2.5+ GB


### Multiple Ensembl IDs map to one HUGO gene name

In [181]:
# Attach each row's gene name as a 'gene_id' column. curDf is whichever results file the
# loading loop read last; this assumes every file lists genes in the same row order.
geneNames = curDf['gene_name']
rawTPMExpression['gene_id'] = geneNames


In [185]:
# Re-index the rows by gene name (passed as a plain list, so the index carries no name).
rawTPMExpression.index = rawTPMExpression['gene_id'].tolist()


In [177]:
# Print a summary of the downstream raw-TPM frame: index, columns, dtypes and memory usage.
rawTPMExpression.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60448 entries, 0 to 60447
Columns: 816 entries, TH03_0159_S01 to TH03_0118_S01
dtypes: float64(816)
memory usage: 376.3 MB


### Average rows that share a gene name, then apply log2(TPM + 1) to the raw TPM values

In [229]:
%%time
rawTPMExpressionShorter=rawTPMExpression.groupby(level=0).agg('mean').add(1).apply(np.log2)
# rawTPMExpressionShorter.info()
# <class 'pandas.core.frame.DataFrame'>
# Index: 58581 entries, 5S_rRNA to yR211F11.2
# Columns: 816 entries, TH03_0159_S01 to TH03_0118_S01
# dtypes: float64(816)
# memory usage: 365.1+ MB

# compendium.info()
# <class 'pandas.core.frame.DataFrame'>
# Index: 58581 entries, 5S_rRNA to yR211F11.2
# Columns: 11340 entries, THR15_0330_S01 to TCGA-DD-A115-01
# dtypes: float32(11340)
# memory usage: 2.5+ GB

CPU times: user 6.22 s, sys: 5.02 s, total: 11.2 s
Wall time: 10.3 s


### Merge compendium with downstream data

In [235]:
# Sample ids present in the downstream data but not already in the compendium.
cols_to_use = rawTPMExpressionShorter.columns.difference(compendium.columns)
# Show how many there are. Only the last expression of a cell is displayed, so the
# count has to come last; a bare len(...) in the middle of the cell is discarded.
len(cols_to_use)

283

In [237]:
%%time
expressionTpmCompendium = pd.merge(rawTPMExpressionShorter[list(cols_to_use)], compendium, left_index = True, right_index = True)

CPU times: user 1min 4s, sys: 30.3 s, total: 1min 34s
Wall time: 1min 34s


In [238]:
# Preview the first rows of the merged expression frame.
expressionTpmCompendium.head()

Unnamed: 0,TARGET-40-0A4HLD-01A-01R,TARGET-40-PAKUZU-01A-01R,TARGET-40-PAKXLD-01A-01R,TARGET-40-PALFYN-01A-01R,TARGET-40-PAMEKS-01A-01R,TARGET-40-PAPNVD-01A-01R,TARGET-40-PAPWWC-01A-01R,TARGET-40-PASKZZ-01A-01R,TARGET-40-PAUVUL-01A-01R,TARGET-40-PAUXPZ-01A-01R,...,TCGA-18-3414-01,TCGA-13-1481-01,TCGA-BP-4803-01,TCGA-A8-A09K-01,TCGA-61-1907-01,TCGA-IB-7885-01,TCGA-95-7947-01,TCGA-VQ-AA6F-01,TCGA-BR-8588-01,TCGA-DD-A115-01
5S_rRNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5_8S_rRNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7SK,0.0,0.0,0.0,0.135316,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.021551,0.0,0.0,0.0,0.038016,0.042669,0.016758,0.0,0.0
A1BG,5.782671,6.661778,4.725741,2.641546,4.68594,4.319762,5.801159,1.304511,3.534809,5.135863,...,3.323396,4.452866,2.424979,4.679187,5.000447,3.129315,2.666772,1.526119,1.646188,9.799418
A1BG-AS1,1.104337,2.693766,2.827819,0.978196,2.835924,2.548437,3.824768,0.941106,0.250962,0.713696,...,1.014435,1.682619,1.07045,3.106042,1.659971,1.310415,1.400608,0.739941,0.918464,0.739941


# Gather Labels
Sources:
- read clinical.tsv (no riboD, so all polyA)
- read riboDepleted_samples_that_passedQC_and_have_known_diagnosis (all riboD)
- read TranscriptMethod_THPEDv1 (compendium both polyA and riboD)
- merge all three and remove repeats

In [404]:
# clinical.tsv: every sample in it is tagged as PolyA.
clinicalLabels = (
    pd.read_csv("/data/archive/compendium/v5/clinical.tsv", sep='\t')
    .assign(tr_method='PolyA')
)

# Ribo-depleted samples: normalise the column names, then tag every sample as RiboMinus.
ribodDiagnosis = (
    pd.read_csv("riboDepleted_samples_that_passedQC_and_have_known_diagnosis.tsv", sep='\t')
    .rename(columns={'Treehouse SAMPLE identifier': 'th_sampleid',
                     'Diagnosis/Disease': 'disease'})
    .assign(tr_method='RiboMinus')
)

# Transcript-method table: same column-name normalisation.
methods = (
    pd.read_csv("TranscriptMethod_THPEDv1.csv")
    .rename(columns={'Treehouse SAMPLE identifier': 'th_sampleid',
                     'TR_method': 'tr_method'})
)

# Outer-merge the three sources; with no explicit key, pandas joins on the columns the
# two frames have in common. Then keep only id, method and disease.
clinicalIdTissue = clinicalLabels.loc[:, ['th_sampleid', 'anat_sample', 'disease', 'tr_method']]
df = clinicalIdTissue.merge(ribodDiagnosis, how='outer')
df = df.merge(methods, how='outer')
df = df.loc[:, ['th_sampleid', 'tr_method', 'disease']]
# tr_method counts recorded at this point:
# PolyA                11350
# RiboMinus              179
# suspect RiboMinus        9

# Discard every row that lacks an id, a method or a disease.
compendium_id_method_disease_labels = df.dropna()
# tr_method counts recorded after dropna:
# PolyA        11340
# RiboMinus      165

In [390]:
# Preview the first rows of the combined label table.
compendium_id_method_disease_labels.head()

Unnamed: 0,th_sampleid,tr_method,disease
0,TH03_0010_S01,PolyA,acute leukemia of ambiguous lineage
1,TH03_0010_S02,PolyA,acute leukemia of ambiguous lineage
2,TH03_0103_S01,PolyA,rhabdomyosarcoma
3,TH03_0104_S01,PolyA,hepatoblastoma
4,TH03_0105_S01,PolyA,rhabdomyosarcoma


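# 11454 features intersect labels total
- **160 features intersect labels that are RiboMinus**
- 11340 features intersect labels that are PolyA
- Note: 160 + 11340 = 11500 > 11454, so the two counts overlap: at least 46 sample ids carry both a RiboMinus and a PolyA label.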

In [286]:
# Number of distinct sample ids that have both a label row and an expression column.
len(
    set(compendium_id_method_disease_labels['th_sampleid'])
    .intersection(expressionTpmCompendium.columns)
)

11454

In [292]:
# Sample ids labelled RiboMinus, and how many of them have an expression column.
isRiboMinus = compendium_id_method_disease_labels['tr_method'] == 'RiboMinus'
labelsRiboD = compendium_id_method_disease_labels.loc[isRiboMinus, 'th_sampleid']
len(set(labelsRiboD).intersection(expressionTpmCompendium.columns))

160

In [293]:
# Sample ids labelled PolyA, and how many of them have an expression column.
# Stored under its own name: writing the PolyA ids into labelsRiboD would overwrite the
# RiboMinus selection with data that contradicts the variable's name.
isPolyA = compendium_id_method_disease_labels['tr_method'] == 'PolyA'
labelsPolyA = compendium_id_method_disease_labels.loc[isPolyA, 'th_sampleid']
len(set(labelsPolyA) & set(expressionTpmCompendium.keys()))

11340

# Make labels and features intersect ids

In [432]:
# Work out which sample ids appear in both the labels and the expression columns.
labelIds = set(compendium_id_method_disease_labels['th_sampleid'])
featureIds = set(expressionTpmCompendium.keys())

allColumns = labelIds | featureIds                 # every id seen on either side
columnsNotInLabels = allColumns - labelIds         # ids that only have expression data
columnsNotInFeatures = allColumns - featureIds     # ids that only have a label
allNonCompatibleColumns = columnsNotInFeatures | columnsNotInLabels
allCompatibleColumns = allColumns - allNonCompatibleColumns   # ids present on both sides

In [434]:
# Restrict the expression frame to the sample ids that also have a label.
# Recorded sizes before filtering:
#   expressionTpmCompendium              11623 ids
#   compendium_id_method_disease_labels  11505 ids
sharedIds = list(allCompatibleColumns)
features = expressionTpmCompendium.loc[:, sharedIds]
features.info()
# Expected 11454 columns - confirmed by the recorded summary:

# <class 'pandas.core.frame.DataFrame'>
# Index: 58581 entries, 5S_rRNA to yR211F11.2
# Columns: 11454 entries, TCGA-KU-A6H7-06 to TCGA-F7-8298-01
# dtypes: float32(11340), float64(114)
# memory usage: 2.5+ GB

<class 'pandas.core.frame.DataFrame'>
Index: 58581 entries, 5S_rRNA to yR211F11.2
Columns: 11454 entries, TCGA-KU-A6H7-06 to TCGA-F7-8298-01
dtypes: float32(11340), float64(114)
memory usage: 2.5+ GB


In [446]:
# Keep one row per sample id (the first occurrence), index the table by that id while
# leaving the id column in place, then flip it so sample ids become the columns.
dedupedLabels = compendium_id_method_disease_labels.drop_duplicates('th_sampleid')
compendium_id_method_disease_labels = dedupedLabels.set_index('th_sampleid', drop=False)
transpose = compendium_id_method_disease_labels.transpose()
transpose.loc[:, list(allCompatibleColumns)].info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, th_sampleid to disease
Columns: 11454 entries, TCGA-KU-A6H7-06 to TCGA-F7-8298-01
dtypes: object(11454)
memory usage: 268.5+ KB


In [449]:
# Label table restricted to the shared sample ids, one column per sample.
labels = transpose.loc[:, list(allCompatibleColumns)]


Test whether the two sets truly contain the same ids
<br> (if both the `<=` and the `>=` comparison are True, the sets are equal)

In [470]:
# Mutual-subset check: the two id sets are equal when each contains the other.
featureIdSet = set(features.keys())
labelIdSet = set(labels.keys())
print(featureIdSet.issubset(labelIdSet))
print(featureIdSet.issuperset(labelIdSet))
# The same comparison from the labels' side, as a double check.
print(labelIdSet.issubset(featureIdSet))
print(labelIdSet.issuperset(featureIdSet))

True
True
True
True


In [460]:
# Back to one row per sample, keeping only the method and disease columns.
labelsTall = labels.transpose().loc[:, ['tr_method', 'disease']]

In [465]:
# Preview the first rows of the per-sample label table.
labelsTall.head()

Unnamed: 0_level_0,tr_method,disease
th_sampleid,Unnamed: 1_level_1,Unnamed: 2_level_1
TCGA-KU-A6H7-06,PolyA,head & neck squamous cell carcinoma
TCGA-A8-A086-01,PolyA,breast invasive carcinoma
TCGA-HT-7687-01,PolyA,glioma
TARGET-50-PAJMXF-01,PolyA,wilms tumor
TARGET-50-PAJMFY-01,PolyA,wilms tumor


In [463]:
# Preview the first rows of the filtered expression frame.
features.head()

Unnamed: 0,TCGA-KU-A6H7-06,TCGA-A8-A086-01,TCGA-HT-7687-01,TARGET-50-PAJMXF-01,TARGET-50-PAJMFY-01,TCGA-EJ-7327-01,TCGA-CW-6090-01,TCGA-55-A48Y-01,TCGA-2A-A8W3-01,TARGET-50-PAJLWT-01,...,TCGA-A2-A04U-01,TCGA-CV-5440-01,TCGA-AB-2935-03,TCGA-41-2572-01,TCGA-FY-A3I4-01,TCGA-A2-A0T5-01,TCGA-V4-A9EC-01,TCGA-AB-2837-03,TCGA-Q1-A5R1-01,TCGA-F7-8298-01
5S_rRNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5_8S_rRNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7SK,0.0,0.042716,0.0,0.042691,0.0,0.0,0.0,0.054294,0.130956,0.0,...,0.0,0.0,0.292841,0.0,0.0,0.081814,0.0,0.0,0.0,0.0
A1BG,5.745814,5.325208,3.063519,3.644984,4.025913,2.080706,2.367375,2.952385,2.204799,1.157091,...,5.516047,2.563169,3.130909,6.587368,3.426323,4.0618,5.88146,3.132593,1.848061,2.62065
A1BG-AS1,2.430272,3.158674,1.286958,1.844018,2.127684,1.063598,1.655393,1.028655,0.604182,1.709316,...,1.863989,0.298782,2.739831,3.244872,1.632331,2.327753,1.974571,4.256269,0.879794,0.731261


# 114 RiboD samples with labels

In [468]:
# Summarize the labelled samples whose transcript method is RiboMinus.
riboMinusRows = labelsTall['tr_method'] == 'RiboMinus'
labelsTall.loc[riboMinusRows].info()

<class 'pandas.core.frame.DataFrame'>
Index: 114 entries, TH01_0125_S01 to TH01_0135_S01
Data columns (total 2 columns):
tr_method    114 non-null object
disease      114 non-null object
dtypes: object(2)
memory usage: 2.7+ KB


# Reference

In [216]:
# Two small integer frames for the concat/groupby reference example below.
# Drawn from a seeded generator so the example tables are identical on every run.
REFERENCE_SEED = 0
referenceRng = np.random.RandomState(REFERENCE_SEED)
df2 = pd.DataFrame(referenceRng.randint(low=0, high=10, size=(5, 5)),columns=['a', 'b', 'c', 'd', 'e'])
df1 = pd.DataFrame(referenceRng.randint(low=10, high=20, size=(5, 5)),columns=['a', 'b', 'c', 'd', 'e'])

In [228]:
# Stack the two frames and sort by index, so rows sharing an index label sit next to each other.
pd.concat([df2,df1]).sort_index()

Unnamed: 0,a,b,c,d,e
0,3,9,2,9,2
0,12,17,15,16,11
1,3,2,3,8,9
1,17,11,12,19,10
2,3,6,8,8,2
2,11,10,15,17,11
3,8,0,5,1,6
3,17,17,16,17,12
4,4,5,9,5,7
4,10,19,16,13,19


In [225]:
# Average the rows that share an index label - the same pattern used above to collapse
# repeated gene names.
stackedReference = pd.concat([df2, df1]).sort_index()
stackedReference.groupby(level=0).mean()

Unnamed: 0,a,b,c,d,e
0,7.5,13.0,8.5,12.5,6.5
1,10.0,6.5,7.5,13.5,9.5
2,7.0,8.0,11.5,12.5,6.5
3,12.5,8.5,10.5,9.0,9.0
4,7.0,12.0,12.5,9.0,13.0
