In [1]:
from helpers.utilities import *
%run helpers/notebook_setup.ipynb

In [2]:
# --- inputs -----------------------------------------------------------------
# clinical table (includes the derived variables)
clinical_path = 'data/clean/clinical/data_with_derived_variables.csv'

# RNA expression tables: counts and the DESeq2-processed values
rna_counts_path = 'data/clean/rna/all_samples_counts.csv'
rna_deseq2_path = 'data/clean/rna/all_samples.csv'

# --- outputs ----------------------------------------------------------------
# clinical data re-ordered to match the columns of each RNA table
out_counts_path = 'data/clean/rna/clinical_data_ordered_to_match_rna_counts.csv'
out_deseq2_path = 'data/clean/rna/clinical_data_ordered_to_match_rna_deseq2.csv'

In [3]:
# RNA tables: the DESeq2 file is indexed by its first two columns,
# the counts file by its first column only.
rna_deseq2 = read_csv(rna_deseq2_path, index_col=[0, 1])
raw_counts = read_csv(rna_counts_path, index_col=[0])

# Clinical table, indexed by its first column.
clinical_data = read_csv(clinical_path, index_col=0)

In [4]:
# Preview the first five rows of the DESeq2-processed table.
rna_deseq2.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,001.TMD,006.CM,012.BM,016.CM,017.TMD,...,174.CM,011.TMR,043.TMS,078.CM,261.CM
ensembl_id,ensembl_gene_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSG00000000003,TSPAN6,11.596119,7.806308,1.391555,39.347366,24.765008,...,4.431071,51.81047,35.628629,0.0,20.705553
ENSG00000000005,TNMD,0.0,0.0,0.0,2.45921,0.0,...,0.0,11.102244,0.0,0.0,4.141111
ENSG00000000419,DPM1,0.0,31.22523,36.876198,0.0,72.313823,...,25.478656,11.102244,0.0,161.520987,39.754662
ENSG00000000457,SCYL3,127.557309,145.71774,86.276387,95.909204,123.82504,...,121.85444,96.219444,3.958737,92.39548,125.889763
ENSG00000000460,C1orf112,46.384476,0.0,50.791744,51.643418,14.859005,...,33.233029,49.960096,38.267787,0.0,30.644219


In [5]:
# Preview the first five rows of the counts table.
raw_counts.head(n=5)

Unnamed: 0,001.TMD,006.CM,012.BM,016.CM,017.TMD,...,158.TMD,167.TMR,175.TMD,233.CM,261.CM
ENSG00000000003,1.0,3.0,2.0,16.0,25.0,...,0.0,0.0,0.0,0.0,25.0
ENSG00000000005,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,5.0
ENSG00000000419,0.0,12.0,53.0,0.0,73.0,...,0.0,0.0,0.0,0.0,48.0
ENSG00000000457,11.0,56.0,123.84,38.72,124.95,...,0.0,0.0,0.0,0.0,151.94
ENSG00000000460,4.0,0.0,73.16,21.28,15.05,...,0.0,0.0,0.0,0.0,37.06


Note: the "raw counts" are not really raw counts. They are the output of RSEM, which reports estimated (expected) counts and therefore produces floats rather than integers.

How to approach that?
- for use with DESeq2, importing the data with the "tximport" function [was recommended](https://support.bioconductor.org/p/94003/#94028)
   - [tximport](https://bioconductor.org/packages/devel/bioc/vignettes/tximport/inst/doc/tximport.html) is a package on Bioconductor, and its docs also describe how to use it with limma-voom
   - it has a related publication: https://f1000research.com/articles/4-1521/v1

Take only rows for patients with RNASeq data, ordered to match the corresponding data frames:

In [6]:
# Fail early, naming the offending IDs, if any DESeq2 sample has no clinical
# record: pandas versions before 1.0 reindexed missing labels to all-NaN rows
# instead of raising, which would silently end up in the saved CSV.
deseq2_samples_without_clinical_data = rna_deseq2.columns.difference(clinical_data.index)
if len(deseq2_samples_without_clinical_data) > 0:
    raise KeyError(
        'No clinical data for DESeq2 samples: {}'.format(
            list(deseq2_samples_without_clinical_data)
        )
    )

# Select the clinical rows in exactly the column order of the DESeq2 table.
clinical_data_ordered_for_deseq2 = clinical_data.loc[rna_deseq2.columns]

# One clinical row per sample; duplicated IDs on either side would break this.
assert len(clinical_data_ordered_for_deseq2) == len(rna_deseq2.columns)

clinical_data_ordered_for_deseq2.to_csv(out_deseq2_path)
clinical_data_ordered_for_deseq2.head()

Unnamed: 0,AdmissionDate,Birthday,Sex,PrevTB,PrevTBForm,...,survival,censored_survival,Meningitis,Tuberculosis,Meningitis_with_tuberculosis_status
001.TMD,2015-02-06,1980-01-04,M,False,,...,,182,Tuberculosis,Definite,Definite tuberculosis
006.CM,2015-02-11,1966-05-05,M,True,Pulmonary,...,,182,Cryptococcal,-,Cryptococcal
012.BM,2015-02-19,1961-01-09,M,False,,...,,182,Bacterial,-,Bacterial
016.CM,2015-02-20,1958-11-11,M,True,Pulmonary,...,24.0,24,Cryptococcal,-,Cryptococcal
017.TMD,2015-02-22,1957-07-03,F,False,,...,161.0,161,Tuberculosis,Definite,Definite tuberculosis


In [7]:
# Fail early, naming the offending IDs, if any sample in the counts table has
# no clinical record: pandas versions before 1.0 reindexed missing labels to
# all-NaN rows instead of raising, which would silently end up in the saved CSV.
count_samples_without_clinical_data = raw_counts.columns.difference(clinical_data.index)
if len(count_samples_without_clinical_data) > 0:
    raise KeyError(
        'No clinical data for raw count samples: {}'.format(
            list(count_samples_without_clinical_data)
        )
    )

# Select the clinical rows in exactly the column order of the counts table.
clinical_data_ordered_for_counts = clinical_data.loc[raw_counts.columns]

# One clinical row per sample; duplicated IDs on either side would break this.
assert len(clinical_data_ordered_for_counts) == len(raw_counts.columns)

clinical_data_ordered_for_counts.to_csv(out_counts_path)
clinical_data_ordered_for_counts.head()

Unnamed: 0,AdmissionDate,Birthday,Sex,PrevTB,PrevTBForm,...,survival,censored_survival,Meningitis,Tuberculosis,Meningitis_with_tuberculosis_status
001.TMD,2015-02-06,1980-01-04,M,False,,...,,182,Tuberculosis,Definite,Definite tuberculosis
006.CM,2015-02-11,1966-05-05,M,True,Pulmonary,...,,182,Cryptococcal,-,Cryptococcal
012.BM,2015-02-19,1961-01-09,M,False,,...,,182,Bacterial,-,Bacterial
016.CM,2015-02-20,1958-11-11,M,True,Pulmonary,...,24.0,24,Cryptococcal,-,Cryptococcal
017.TMD,2015-02-22,1957-07-03,F,False,,...,161.0,161,Tuberculosis,Definite,Definite tuberculosis


Please note the difference in the number of patients between the raw counts and the DESeq2-processed data:

In [8]:
# Number of patients (rows) in each ordered clinical table: (counts, DESeq2)
clinical_data_ordered_for_counts.shape[0], clinical_data_ordered_for_deseq2.shape[0]

(56, 49)