# Treehouse Sample Processing

Create a `sample` by `gene` matrix of CKCC Treehouse samples for model processing

# Inputs

In [1]:
import pandas as pd
import os

## Metadata

In [2]:
# Read the tab-delimited CKCC sample metadata table into `meta`.
# NOTE(review): the file's first column has no header and is loaded as
# "Unnamed: 0" -- presumably a saved row index; confirm before relying on it.
path = '/mnt/treehouse-data-for-outlier-calling/ckccSampleMetaData_2017_11_20_plusDxGroup.txt'
meta = pd.read_csv(filepath_or_buffer=path, delimiter='\t')
# Preview the first rows to sanity-check the parsed columns.
meta.head()

Unnamed: 0,th_sampleid,disease,source,dx_group,anat_sample,sampletype,age_at_dx,pedaya,gender,race,...,ucs_hist,hnsc_hist,chol_hist,prad_hist,esca_hist,ucec_hist,paad_hist,stad_hist,brcapam,caseID
0,TH01_0053_S01,acute myeloid leukemia,,Hematopoietic,,,,,,,...,,,,,,,,,,TH01_0053
1,TH01_0054_S01,acute lymphoblastic leukemia,,Hematopoietic,,,,,,,...,,,,,,,,,,TH01_0054
2,TH01_0055_S01,glioma,,CNS,,,,,,,...,,,,,,,,,,TH01_0055
3,TH01_0061_S01,germ cell tumor,,Germ cell tumor or teratoma,,,,,,,...,,,,,,,,,,TH01_0061
4,TH01_0062_S01,acute lymphoblastic leukemia,,Hematopoietic,,,,,,,...,,,,,,,,,,TH01_0062


## Expression data
Taken from: https://xenabrowser.net/datapages/?dataset=TreehousePEDv9_unique_hugo_log2_tpm_plus_1.2019-03-15.tsv&host=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443

In [3]:
# Load the Treehouse expression table (tab-delimited); the file's first column
# becomes the row index, leaving one column per sample ID.
path = '/mnt/treehouse-data-for-outlier-calling/TreehousePEDv9_unique_hugo_log2_tpm_plus_1.2019-03-15.tsv'
df = pd.read_csv(filepath_or_buffer=path, delimiter='\t', index_col=0)

Subset for CKCC samples

In [11]:
# Select the expression columns whose sample ID is listed in the metadata
# (keeping metadata order), then transpose so rows are samples and columns
# are genes.
ckcc = df.loc[
    :,
    [sample_id for sample_id in meta['th_sampleid'] if sample_id in df.columns],
].transpose()
# Show the first two samples as a quick check of the orientation.
ckcc.head(n=2)

Gene,5S_rRNA,5_8S_rRNA,7SK,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2ML1-AS1,...,snoU2-30,snoU2_19,snoU83B,snoZ196,snoZ278,snoZ40,snoZ6,snosnR66,uc_338,yR211F11.2
TH03_0010_S01,0.0,0.0,0.238787,4.545968,2.565597,0.028569,1.028569,0.014355,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.611172,0.0
TH03_0010_S02,0.0,0.0,0.0,3.926948,1.799087,0.028569,1.584963,0.014355,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.049631,0.0


Subset for genes in the GTEx dataset

In [14]:
# Load the GTEx expression frame; only its column labels are used in this cell.
gtex = pd.read_hdf('/mnt/data/expression/gtex.hd5')
# NOTE(review): the first five GTEx columns are skipped -- presumably non-gene
# metadata columns; confirm against the gtex.hd5 schema.
genes = gtex.columns[5:]
# Restrict and reorder the CKCC matrix to the GTEx gene columns. This rebinds
# `ckcc`, so later cells see the gene-subset matrix.
ckcc = ckcc[genes]
# Preview the first two samples after subsetting.
ckcc.head(2)

Gene,RP11-40C6.2,IGHG1,IGKC,KRT5,IGLC2,DES,KRT19,AL162151.3,FXYD3,IGLC3,...,IGHEP1,RP11-44N21.4,GAPDHP61,P2RX5-TAX1BP3,RP11-408H20.3,RP13-726E6.2,CTB-43E15.1,ZNF878,OR2H2,RP4-530I15.9
TH03_0010_S01,3.221877,3.962549,4.760221,0.124328,3.761285,0.056584,0.070389,9.928755,0.137504,1.678072,...,0.042644,0.111031,0.0,0.807355,0.0,0.0,0.0,0.189034,0.0,0.014355
TH03_0010_S02,3.014355,3.68818,5.42055,0.0,3.784504,0.584963,0.070389,7.833523,0.176323,1.996389,...,0.0,0.389567,0.0,0.632268,0.0,0.0,0.0,0.097611,0.0,0.0


# Save Output

In [18]:
# Write the CKCC sample-by-gene matrix to HDF5 under the key 'exp'.
ckcc.to_hdf(path_or_buf='/mnt/data/expression/ckcc.hd5', key='exp')

In [19]:
# Write a second HDF5 copy (key 'exp') into the Treehouse data directory.
ckcc.to_hdf(path_or_buf='/mnt/treehouse-data-for-outlier-calling/ckcc.hd5', key='exp')

In [20]:
# Also export the matrix as a tab-separated text file in the Treehouse data directory.
ckcc.to_csv(path_or_buf='/mnt/treehouse-data-for-outlier-calling/ckcc.tsv', sep='\t')

# Identify Candidates for Analysis
Compare to model findings

In [23]:
# Metadata rows whose sample ID is present in the CKCC expression matrix.
sub = meta.loc[meta['th_sampleid'].isin(ckcc.index)]

In [36]:
# Display the metadata record for a single sample of interest.
meta.loc[meta['th_sampleid'].eq('TH03_0004_S04')]

Unnamed: 0,th_sampleid,disease,source,dx_group,anat_sample,sampletype,age_at_dx,pedaya,gender,race,...,ucs_hist,hnsc_hist,chol_hist,prad_hist,esca_hist,ucec_hist,paad_hist,stad_hist,brcapam,caseID
87,TH03_0004_S04,hepatoblastoma,Stanford,Liver,lung metastasis,Metastasis,2.0,"Yes, age < 30 years",male,Not reported or Unknown,...,,,,,,,,,,TH03_0004


## Candidates for Analysis

- Liver
    - TH03_0104_S01
        - 1 outlier found where Treehouse found none.
            - NTRK2 - 0.02
        - Liver matched at 0.97
    - TH03_0107_S01
        - CDK6 - 0.054
        - None in Treehouse
    - TH03_0113_S01
       - Matched weight mostly to Bladder* — Important for sample contamination?
       - Found same outlier as Treehouse (FGFR1) but also 3 others.
           - FGFR2, AURKA, CDK4
    - TH03_0113_S02
       - Matched to colon.
       - DOES NOT identify same gene FGFR1 as outlier (0.11)
    - TH03_0114_S01
       - Agrees on no outliers
    - TH03_0017_S01
       - Agree on MDM2
    - TH03_0019_S01
       - Agrees on KDR
       - Disagrees on FLT1 (0.08)
    - TH03_0286_S01
       - Disagrees on PDGFRB (0.08)
    - TH03_0004_S02
       - Disagrees on AURKB (0.04, Treehouse finds nothing)
       - Liver match of 0.95
    - **TH03_0004_S04**
       - Lung metastasis, matches mostly to lung 0.68
       - Great example of what to use as appropriate match? Lung
       - Agrees on HSP90B1 (0.049), disagrees on IL6 (0.07), disagrees on AURKB (0.02), AURKA (0.046)

- Wilms
    - **TH06_0632_S01**
        - Female patient, matches to Cervix and Fallopian.
        - Anat sample: Vascular-Vein-Inferior Vena Cava-Hepatic
        - Almost no agreement amongst outliers
        - Treehouse: BCL6, MLST8, PDGFRB, PIK3R1, STAT3, TSC2
        - Model: CDK6, AURKB, CCND1, AURKA
        
- Adrenal
    - **TH03_0024_S01**
         - Matches 0.83 to Adrenal
         - Treehouse no outliers
         - AURKA (0.02), KIT (0.04), AURKB (0.05)
         
- Thyroid
    - **TH06_0612_S01**
        - Matches 0.83 to Thyroid
        - Treehouse no outliers
        - MET (0.005), STAT1 (0.03), CCND2 (0.03), NTRK2 (0.03), NTRK1 (0.04)
    - **TH06_0631_S01**
        - Taken from lymph node, but 0.50 match to Thyroid
        - Treehouse: CSF1R, STAT2, TSC2
        - Model: PIK3R6 (0.03), MET (0.03), ALK (0.04), CSF1R (0.04)
        - STAT2 in model is 0.27! TSC2 is 0.31!

- Medulloblastoma
    - **TH03_0020_S01**
        - Brain: 0.78 match
        - Treehouse: PTCH1
        - Model: AURKB (0.005), CDK6 (0.008), CDK4 (0.02), NTRK3 (0.03), CDK2 (0.03), AURKA (0.04)
        - Disagrees on PTCH1 (0.08)

        
- Neuroblastoma
    - TH06_0628_S01
        - Several more outliers than what Treehouse found
        - Nerve highest match at 0.47 which seems alright
        