In [1]:
from helpers.utilities import *
%run helpers/notebook_setup.ipynb

In [2]:
from numpy import nan

In [3]:
definite_tbm_rna_path = 'data/raw/RNA-Seq/DefiniteTBM_CM_VM/DefTBM_CM_VM.xlsx'
all_samples_path = 'data/raw/RNA-Seq/AllSamples/NormalisedCounts_AllSamples_ConditionFiltered.txt'
metadata_path = 'data/raw/RNA-Seq/AllSamples/ColData.txt'

# outputs
tbm_subset_clean_path = 'data/clean/rna/definite_tbm_against_all.csv'
duplicates_path = 'data/other/duplicates_rna_definite_tbm_subset.csv'

definite_tbm_cm_deg_path = 'data/preliminary_analyses/deg/definite_tbm-cm.csv'
definite_tbm_vm_deg_path = 'data/preliminary_analyses/deg/definite_tbm-vm.csv'
cm_vm_deg_path = 'data/preliminary_analyses/deg/cm-vm.csv'

all_samples_clean_path = 'data/clean/rna/all_samples.csv'
all_samples_duplicates_path = 'data/other/duplicates_rna_all_samples.csv'

## Definite TBM subset

In [4]:
definite_tbm_rna = read_excel(definite_tbm_rna_path, sheet_name=None)

In [5]:
list(definite_tbm_rna.keys())

['DEG_P75_TBM_CM', 'DEG_P75_TBM_VM', 'DEG_P75_CM_VM']

I assume that there are two distinct datasets in the spreedsheets: differentially expressed genes (DEGs) and the actual expression values.

### Definite TBM vs viral (DEG_P75_TBM_VM)

Side note to a reader coming from R: NA (not available/missing value) are represented by NaN (not a number) in Python/pandas, thus you can see NaN where you might expect NA. In some cases None is used to represent NA, though it has a negative impact on data storage efficiency / performance.

In [6]:
tbm_vm = definite_tbm_rna['DEG_P75_TBM_VM']
tbm_vm.head()

Unnamed: 0,Ensembl_ID,Feature,Description,baseMean,log2FoldChange,...,154.VM,156.VM,165.VM,255.VM,205.VM
0,ENSG00000000003,TSPAN6,tetraspanin 6,14.919135,1.499861,...,12.062173,4.743492,2.080339,15.79095,4.808728
1,ENSG00000000419,DPM1,dolichyl-phosphate mannosyltransferase subunit...,32.962551,-0.188709,...,43.423823,27.145923,15.602543,45.267391,32.859642
2,ENSG00000000457,SCYL3,SCY1 like pseudokinase 3,120.711337,0.633077,...,107.35334,84.067823,79.052885,77.902022,88.160015
3,ENSG00000000460,C1orf112,chromosome 1 open reading frame 112,28.23142,-0.095839,...,32.567867,45.133619,30.164917,12.63276,36.065461
4,ENSG00000000938,FGR,"FGR proto-oncogene, Src family tyrosine kinase",664.920461,0.735533,...,85.641429,292.687526,104.016955,326.34631,164.298209


In [7]:
tbm_vm.tail()

Unnamed: 0,Ensembl_ID,Feature,Description,baseMean,log2FoldChange,...,154.VM,156.VM,165.VM,255.VM,205.VM
21483,ENSG00000284512,AC092718.8,0,15.697566,-0.2845,...,10.855956,12.39883,20.803391,20.001871,9.617456
21484,ENSG00000284513,AC006063.2,0,13.256127,0.08816,...,9.649738,7.984095,20.803391,24.212791,16.029094
21485,ENSG00000284523,AC004834.1,0,27.773974,0.674403,...,21.711911,21.604021,23.9239,33.687361,18.433458
21486,ENSG00000284526,AC015802.6,0,54.314726,0.686323,...,56.692213,27.897367,52.008477,63.163802,33.661097
21487,ENSG00000284543,LINC01226,long intergenic non-protein coding RNA 1226,76.657661,0.30707,...,54.279779,45.697202,119.619498,100.009353,42.477098


Zeros in the description are assumed to indicate missing values:

In [8]:
tbm_vm['Description'] = tbm_vm['Description'].replace('0', nan)

There are 47 columns:

In [9]:
tbm_vm.columns

Index(['Ensembl_ID', 'Feature', 'Description', 'baseMean', 'log2FoldChange',
       'lfcSE', 'stat', 'pvalue', 'padj', 'Feature.1', '001.TMD', '017.TMD',
       '083.TMD', '101.TMD', '136.TMD', '149.TMD', '151.TMD', '168.TMD',
       '170.TMD', '241.TMD', '242.TMD', '185.TMD', '006.CM', '016.CM',
       '018.CM', '058.CM', '103.CM', '120.CM', '123.CM', '145.CM', '220.CM',
       '224.CM', '240.CM', '244.CM', '256.CM', '206.CM', '174.CM', '078.CM',
       '261.CM', '048.VM', '089.VM', '095.VM', '154.VM', '156.VM', '165.VM',
       '255.VM', '205.VM'],
      dtype='object')

which include data for both definite TB, VM (viral) and CM (cryptococcal).

`Feature.1` is just the same as `Feature`:

In [10]:
(tbm_vm['Feature'] == tbm_vm['Feature.1']).all()

True

I will drop the redundant one:

In [11]:
tbm_vm = tbm_vm.drop('Feature.1', axis='columns')

`Feature` appears to represent Ensembl Gene Name (AC116366.3 is a hint - this is an EBI, versioned format)

In [12]:
tbm_vm = tbm_vm.rename(columns={'Feature': 'ensembl_gene_name'})

Standardize index column names:

In [13]:
from helpers.data_frame import to_lowercase

rna_raw_index = ['Ensembl_ID', 'ensembl_gene_name', 'Description']
tbm_vm.columns = to_lowercase(tbm_vm.columns, limit_to=rna_raw_index)
rna_index = to_lowercase(rna_raw_index)

and separate the datasets to keep those in a format closer to second normal form (2NF):

In [14]:
deg_columns = ['baseMean', 'log2FoldChange', 'lfcSE', 'stat', 'pvalue', 'padj']
patient_columns = list(tbm_vm.columns.difference(deg_columns + rna_index))

tbm_vm_deg = tbm_vm[rna_index + deg_columns].set_index(rna_index)
tbm_vm_data = tbm_vm[rna_index + patient_columns].set_index(rna_index)

#### Data

In [15]:
tbm_vm_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,001.TMD,006.CM,016.CM,017.TMD,018.CM,...,242.TMD,244.CM,255.VM,256.CM,261.CM
ensembl_id,ensembl_gene_name,description,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ENSG00000000003,TSPAN6,tetraspanin 6,11.335838,7.111529,37.483544,25.116886,0.0,...,44.547274,11.084592,15.79095,10.457265,20.323391
ENSG00000000419,DPM1,"dolichyl-phosphate mannosyltransferase subunit 1, catalytic",0.0,28.446116,0.0,73.341307,33.845488,...,46.738124,0.0,45.267391,16.034472,39.02091
ENSG00000000457,SCYL3,SCY1 like pseudokinase 3,124.694213,132.748542,91.366139,125.584429,105.162765,...,64.995203,151.225503,77.902022,92.721079,123.566216
ENSG00000000460,C1orf112,chromosome 1 open reading frame 112,45.34335,0.0,49.197152,15.070131,19.340279,...,9.493681,0.0,12.63276,25.097435,30.078618
ENSG00000000938,FGR,"FGR proto-oncogene, Src family tyrosine kinase",498.776851,668.483729,693.445565,2189.187769,1375.577318,...,749.270547,235.151698,326.34631,123.395721,263.391144


In [16]:
tbm_vm_data.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,001.TMD,006.CM,016.CM,017.TMD,018.CM,...,242.TMD,244.CM,255.VM,256.CM,261.CM
ensembl_id,ensembl_gene_name,description,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ENSG00000284512,AC092718.8,0,0.0,61.633252,35.140823,0.0,0.0,...,0.0,0.0,20.001871,11.851566,12.194034
ENSG00000284513,AC006063.2,0,0.0,7.111529,4.685443,8.037403,2.417535,...,16.06623,37.212558,24.212791,15.337321,3.251743
ENSG00000284523,AC004834.1,0,11.335838,47.410194,39.826266,10.046754,21.757813,...,0.0,0.0,33.687361,11.154415,21.136326
ENSG00000284526,AC015802.6,0,22.671675,49.780703,77.30981,38.177666,80.987417,...,21.178212,1.583513,63.163802,57.86353,19.510455
ENSG00000284543,LINC01226,long intergenic non-protein coding RNA 1226,90.6867,182.529245,168.675948,31.144938,71.317277,...,39.435292,94.219031,100.009353,94.115381,51.214945


First finding, ahead of proper quality checks:

In [17]:
tbm_vm_data.duplicated().any()

True

In [18]:
sum(tbm_vm_data.duplicated(keep=False))

39

39 genes have identical values across all patients, thus are correlated in 100%. Here are a few examples:

In [19]:
tbm_vm_data[tbm_vm_data.duplicated(keep=False)].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,001.TMD,006.CM,016.CM,017.TMD,018.CM,...,242.TMD,244.CM,255.VM,256.CM,261.CM
ensembl_id,ensembl_gene_name,description,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ENSG00000179028,AC245177.1,0,34.007513,23.705097,46.85443,0.0,21.757813,...,10.954248,0.0,8.42184,4.880057,13.00697
ENSG00000182776,AC239585.1,0,34.007513,23.705097,46.85443,0.0,21.757813,...,10.954248,0.0,8.42184,4.880057,13.00697
ENSG00000206588,RNU1-28P,"RNA, U1 small nuclear 28, pseudogene",0.0,14.223058,2.342722,19.088833,3.626302,...,11.684531,0.0,0.0,11.154415,21.949262
ENSG00000206596,RNU1-27P,"RNA, U1 small nuclear 27, pseudogene",0.0,14.223058,2.342722,19.088833,3.626302,...,11.684531,33.253775,21.054601,11.154415,0.0
ENSG00000206652,RNU1-1,"RNA, U1 small nuclear 1",0.0,14.223058,2.342722,19.088833,3.626302,...,11.684531,0.0,0.0,11.154415,21.949262


There are 12 groups of such genes and these are:

In [20]:
from helpers.data_frame import extract_duplicates

data_duplicates = extract_duplicates(tbm_vm_data.reset_index(), patient_columns, rna_index)
full_table(data_duplicates)

Unnamed: 0_level_0,Unnamed: 1_level_0,description,ensembl_gene_name,ensembl_id
group,index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,"RNA, U1 small nuclear 28, pseudogene",RNU1-28P,ENSG00000206588
0,2,"RNA, U1 small nuclear 1",RNU1-1,ENSG00000206652
0,3,"RNA, variant U1 small nuclear 18",RNVU1-18,ENSG00000206737
0,4,"RNA, U1 small nuclear 3",RNU1-3,ENSG00000207513
0,5,U1 spliceosomal RNA,U1,ENSG00000275405
1,6,"RNA, U1 small nuclear 27, pseudogene",RNU1-27P,ENSG00000206596
1,7,"RNA, U1 small nuclear 2",RNU1-2,ENSG00000207005
1,8,"RNA, U1 small nuclear 4",RNU1-4,ENSG00000207389
1,9,U1 spliceosomal RNA,U1,ENSG00000273768
2,10,activating transcription factor 4 pseudogene 1,ATF4P1,ENSG00000213338


As the duplicated groups present genes having similar functions this appears to be a genuine artifact from the mapping procedure.

Interestingly, there are many pseudogenes and RNA genes. I will keep the full data for now and remove the duplicates later on.

Just to have it quickly accessible:

In [21]:
data_duplicates.to_csv(duplicates_path)

#### Differentially Expressed Genes (DEGs)

In [22]:
tbm_vm_deg.sort_values('padj').head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
ensembl_id,ensembl_gene_name,description,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSG00000100336,APOL4,apolipoprotein L4,80.607489,2.529792,0.479032,5.28105,1.284456e-07,0.000941
ENSG00000125538,IL1B,interleukin 1 beta,926.175471,4.576987,0.899595,5.08783,3.621846e-07,0.001054
ENSG00000179388,EGR3,early growth response 3,334.404894,3.485038,0.689505,5.054402,4.317408e-07,0.001054
ENSG00000165997,ARL5B,ADP ribosylation factor like GTPase 5B,222.402492,2.320539,0.475704,4.878117,1.071032e-06,0.001961
ENSG00000127946,HIP1,huntingtin interacting protein 1,249.836938,1.642896,0.354008,4.64085,3.469786e-06,0.005083


In [23]:
tbm_vm_deg.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
ensembl_id,ensembl_gene_name,description,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSG00000284512,AC092718.8,0,15.697566,-0.2845,0.810605,-0.350972,0.725609,
ENSG00000284513,AC006063.2,0,13.256127,0.08816,0.892035,0.09883,0.921274,
ENSG00000284523,AC004834.1,0,27.773974,0.674403,0.791553,0.852,0.394214,
ENSG00000284526,AC015802.6,0,54.314726,0.686323,0.585895,1.171409,0.241434,
ENSG00000284543,LINC01226,long intergenic non-protein coding RNA 1226,76.657661,0.30707,0.510789,0.601167,0.547729,0.829314


##### Are there any duplicates?

In [24]:
tbm_vm_deg.duplicated().any()

True

This was expected as there are dupllicates in the data. Are those the same?

In [25]:
tv_deg_duplicates = extract_duplicates(tbm_vm_deg.reset_index(), deg_columns, rna_index)
set(tv_deg_duplicates.ensembl_id) == set(data_duplicates.ensembl_id)

True

##### Can I quickly reproduce the log2FoldChange in Python?

In [26]:
from helpers.data_frame import select_columns

In [27]:
tmd = select_columns(tbm_vm_data, '.*TMD').T
vir = select_columns(tbm_vm_data, '.*VM').T

In [28]:
from numpy import log2

def log2_fold_change(a, b):
    fold_change = a.mean() / b.mean()
    return log2(fold_change)

In [29]:
DataFrame({'python': log2_fold_change(tmd, vir), 'deseq2': tbm_vm_deg['log2FoldChange']}).corr()   # Pearson ρ

Unnamed: 0,python,deseq2
python,1.0,0.986152
deseq2,0.986152,1.0


Good enough. The small differences may arise from the differences in DESeq2 procedure as explained by the author at [bioconductor forum](https://support.bioconductor.org/p/88813/).

In [30]:
tbm_vm_deg.to_csv(definite_tbm_vm_deg_path)

### Definite TBM vs cryptococal (DEG_P75_TBM_CM)

In [31]:
tbm_cm = definite_tbm_rna['DEG_P75_TBM_CM']
tbm_cm.head()

Unnamed: 0,Ensembl_ID,Feature,Description,baseMean,log2FoldChange,...,154.VM,156.VM,165.VM,255.VM,205.VM
0,ENSG00000100226,GTPBP1,GTP binding protein 1,580.453924,1.19219,...,324.472455,422.875238,361.979002,464.253944,512.930995
1,ENSG00000165997,ARL5B,ADP ribosylation factor like GTPase 5B,222.402492,1.96333,...,92.878733,91.957987,52.008477,147.382204,38.469825
2,ENSG00000164691,TAGAP,T-cell activation RhoGTPase activating protein,1023.319959,1.964425,...,272.605111,423.485786,170.587805,392.668301,621.928831
3,ENSG00000108771,DHX58,DExH-box helicase 58,125.298558,1.785542,...,41.011388,80.357567,53.048647,225.284227,76.939649
4,ENSG00000128016,ZFP36,ZFP36 ring finger protein,2070.881325,1.683438,...,539.179135,827.809745,778.04682,1328.545299,925.680155


In [32]:
tbm_cm.tail()

Unnamed: 0,Ensembl_ID,Feature,Description,baseMean,log2FoldChange,...,154.VM,156.VM,165.VM,255.VM,205.VM
21483,ENSG00000179041,RRS1,ribosome biogenesis regulator homolog,54.538767,0.000362,...,110.971992,42.315703,98.816107,50.531041,69.726557
21484,ENSG00000188582,PAQR9,progestin and adipoQ receptor family member 9,21.484805,-0.000857,...,18.09326,13.572961,28.084578,33.687361,45.682917
21485,ENSG00000092470,WDR76,WD repeat domain 76,50.967584,0.000159,...,80.816559,47.904569,74.892207,47.372851,56.903282
21486,ENSG00000243508,AC108688.1,0,12.289421,0.000205,...,14.474608,11.130768,26.004239,9.47457,15.227639
21487,ENSG00000160973,FOXH1,forkhead box H1,29.928712,-1.5e-05,...,16.887042,28.601846,43.687121,18.949141,6.411637


In [33]:
assert (tbm_cm['Feature'] == tbm_cm['Feature.1']).all()

In [34]:
tbm_cm = tbm_cm.drop('Feature.1', axis='columns')
tbm_cm = tbm_cm.rename(columns={'Feature': 'ensembl_gene_name'})
tbm_cm['Description'] = tbm_cm['Description'].replace('0', nan)

In [35]:
tbm_cm.columns = to_lowercase(tbm_cm.columns, limit_to=rna_raw_index)

In [36]:
patient_columns = list(tbm_cm.columns.difference(deg_columns + rna_index))

tbm_cm_deg = tbm_cm[rna_index + deg_columns].set_index(rna_index)
tbm_cm_data = tbm_cm[rna_index + patient_columns].set_index(rna_index)

#### Data

So far the data looks similar, including the number of columns. What are the differences?

In [37]:
set(tbm_vm_data.columns) == set(tbm_cm_data.columns)

True

The column names are the same

In [38]:
(tbm_cm_data[tbm_cm_data.columns] == tbm_cm_data).all().all()

True

And values are also the same. So the data is just duplicated across the spread sheets.

As there are no measurments in the third spreadsheet I will just save the data now:

In [39]:
rna_data_definite_tbm = tbm_cm_data = tbm_cm_data

In [40]:
rna_data_definite_tbm.to_csv(tbm_subset_clean_path)

### DEGs

In [41]:
tbm_cm_deg.sort_values('padj').head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
ensembl_id,ensembl_gene_name,description,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSG00000100226,GTPBP1,GTP binding protein 1,580.453924,1.19219,0.219708,5.426243,5.755269e-08,0.000517
ENSG00000165997,ARL5B,ADP ribosylation factor like GTPase 5B,222.402492,1.96333,0.393233,4.992789,5.951353e-07,0.001903
ENSG00000164691,TAGAP,T-cell activation RhoGTPase activating protein,1023.319959,1.964425,0.394441,4.980279,6.349261e-07,0.001903
ENSG00000108771,DHX58,DExH-box helicase 58,125.298558,1.785542,0.36715,4.863249,1.154747e-06,0.002481
ENSG00000128016,ZFP36,ZFP36 ring finger protein,2070.881325,1.683438,0.348691,4.82788,1.37994e-06,0.002481


##### Duplicates
Again, are the same as the data duplicates?

In [42]:
tbm_cm_duplicates = extract_duplicates(tbm_cm_deg.reset_index(), deg_columns, rna_index)
tbm_cm_duplicates.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,description,ensembl_gene_name,ensembl_id
group,index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,activating transcription factor 4 pseudogene 1,ATF4P1,ENSG00000213338
0,2,activating transcription factor 4 pseudogene 2,ATF4P2,ENSG00000273041
1,3,chromosome X open reading frame 49B,CXorf49B,ENSG00000215113
1,4,chromosome X open reading frame 49,CXorf49,ENSG00000215115
2,5,family with sequence similarity 231 member A,FAM231A,ENSG00000237847


In [43]:
set(data_duplicates.ensembl_id) == set(tbm_cm_duplicates.ensembl_id)

True

##### Quick log2FoldChange check

In [44]:
cm = select_columns(tbm_cm_data, '.*CM').T
DataFrame({'python': log2_fold_change(tmd, cm), 'deseq2': tbm_cm_deg['log2FoldChange']}).corr()   # Pearson ρ

Unnamed: 0,python,deseq2
python,1.0,0.985204
deseq2,0.985204,1.0


Everything is fine here!

In [45]:
tbm_cm_deg.to_csv(definite_tbm_cm_deg_path)

### Cryptococcal vs viral (DEG_P75_CM_VM)

So this time there are no data, only DEGs:

In [46]:
cm_vm = definite_tbm_rna['DEG_P75_CM_VM']
cm_vm

Unnamed: 0,Ensembl_ID,Feature,Description,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,ENSG00000147457,CHMP7,charged multivesicular body protein 7,146.620981,-0.950047,0.215228,-4.414134,0.000010,0.217920
1,ENSG00000136872,ALDOB,"aldolase, fructose-bisphosphate B",8.163739,3.487265,0.897156,3.887021,0.000101,0.432580
2,ENSG00000164300,SERINC5,serine incorporator 5,209.646143,-1.151876,0.286659,-4.018282,0.000059,0.432580
3,ENSG00000178075,GRAMD1C,GRAM domain containing 1C,33.411785,1.825325,0.474786,3.844523,0.000121,0.432580
4,ENSG00000179388,EGR3,early growth response 3,334.404894,2.543094,0.647976,3.924671,0.000087,0.432580
...,...,...,...,...,...,...,...,...,...
21483,ENSG00000272588,AC139887.4,0,16.588560,0.000253,0.706493,0.000359,0.999714,0.999917
21484,ENSG00000085117,CD82,CD82 molecule,156.867101,0.000042,0.486934,0.000086,0.999932,0.999997
21485,ENSG00000102309,PIN4,"peptidylprolyl cis/trans isomerase, NIMA-inter...",31.521707,-0.000020,0.509869,-0.000039,0.999969,0.999997
21486,ENSG00000188305,PEAK3,PEAK family member 3,81.830585,0.000002,0.501223,0.000004,0.999997,0.999997


#### DEGs

In [47]:
cm_vm['Description'] = cm_vm['Description'].replace('0', nan)
cm_vm = cm_vm.rename(columns={'Feature': 'ensembl_gene_name'})
cm_vm.columns = to_lowercase(cm_vm.columns, limit_to=rna_raw_index)
cm_vm = cm_vm.set_index(rna_index)

##### Duplicates

In [48]:
cm_vm_duplicates = extract_duplicates(cm_vm.reset_index(), deg_columns, rna_index)
set(cm_vm_duplicates.ensembl_id) == set(tbm_cm_duplicates.ensembl_id)

True

##### Quick log2FoldChange check

In [49]:
DataFrame({'python': log2_fold_change(cm, vir), 'deseq2': cm_vm['log2FoldChange']}).corr()   # Pearson ρ

Unnamed: 0,python,deseq2
python,1.0,0.995362
deseq2,0.995362,1.0


Great!

In [50]:
cm_vm.to_csv(cm_vm_deg_path)

## All Samples

In [51]:
rna_seq = read_table(all_samples_path)

In [52]:
rna_seq

Unnamed: 0,Ensembl_ID,Feature,001.TMD,006.CM,012.BM,...,174.CM,011.TMR,043.TMS,078.CM,261.CM
0,ENSG00000000003,TSPAN6,11.596119,7.806308,1.391555,...,4.431071,51.810470,35.628629,0.000000,20.705553
1,ENSG00000000005,TNMD,0.000000,0.000000,0.000000,...,0.000000,11.102244,0.000000,0.000000,4.141111
2,ENSG00000000419,DPM1,0.000000,31.225230,36.876198,...,25.478656,11.102244,0.000000,161.520987,39.754662
3,ENSG00000000457,SCYL3,127.557308,145.717740,86.276387,...,121.854440,96.219444,3.958737,92.395480,125.889763
4,ENSG00000000460,C1orf112,46.384476,0.000000,50.791744,...,33.233029,49.960096,38.267787,0.000000,30.644219
...,...,...,...,...,...,...,...,...,...,...,...
38659,ENSG00000284543,LINC01226,92.768952,200.361893,27.135315,...,175.027287,170.234402,318.018506,131.406905,52.177994
38660,ENSG00000284546,SSU72P3,0.000000,0.000000,0.000000,...,0.000000,14.802991,0.000000,0.000000,0.000000
38661,ENSG00000284552,AC106774.10,0.000000,0.000000,0.000000,...,11.077676,9.251870,0.000000,27.376439,0.828222
38662,ENSG00000284572,AC099654.15,0.000000,28.623128,0.000000,...,0.000000,1.850374,0.000000,0.000000,0.000000


Note to self: this is on the gene level. Are there transcript-level data available?

In [53]:
assert not rna_seq.duplicated().any()

In [54]:
rna_seq = rna_seq.rename(columns={'Feature': 'ensembl_gene_id'})
rna_seq_index = ['Ensembl_ID', 'ensembl_gene_id']
rna_seq.columns = to_lowercase(rna_seq.columns, limit_to=rna_seq_index)
rna_seq_index = to_lowercase(rna_seq_index)

In [55]:
rna_seq = rna_seq.set_index(rna_seq_index)

In [56]:
rna_seq.duplicated().any()

True

In [57]:
rna_duplicates= extract_duplicates(rna_seq.reset_index(), list(rna_seq.columns.difference(rna_seq_index)), rna_seq_index)
rna_duplicates

Unnamed: 0_level_0,Unnamed: 1_level_0,ensembl_gene_id,ensembl_id
group,index,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,AC009238.1,ENSG00000227120
0,2,AC009237.6,ENSG00000235959
1,3,AC141586.2,ENSG00000260176
1,4,AC093525.5,ENSG00000261288
2,5,AC008175.1,ENSG00000235059
...,...,...,...
31,76,FP671120.1,ENSG00000278996
31,77,FP236383.1,ENSG00000280441
32,78,FP236383.2,ENSG00000280614
32,79,FP671120.3,ENSG00000280800


More duplicates - likely due to having more genes.

In [58]:
rna_duplicates.to_csv(all_samples_duplicates_path)

In [59]:
rna_seq.to_csv(all_samples_clean_path)

### Additional patients

should be tbm but not definite:

In [60]:
set(rna_seq.columns) - set(rna_data_definite_tbm.columns)

{'011.TMR',
 '012.BM',
 '028.TMR',
 '043.TMS',
 '056.TMS',
 '079.TMR',
 '100.TMS',
 '155.TMR',
 '169.TMR',
 '172.TMR',
 '234.BM',
 '239.TMS'}

R - probable, S - possible, BM - bacterial (non-TB or TB even less likely than in the "possible" group)

### Additional genes

38664 genes, much more than for the definite subset (21488). Is it due to the completeness filtering?

In [61]:
ratio_empty = ((rna_seq == 0).apply(sum, axis=1) / len(rna_seq.columns))

In [62]:
len(rna_seq.loc[~(ratio_empty > 0.25)])

20688

Almost. I got fewer genes: 20688, not 21488. Maybe it was 1/5 and not 1/4?

In [63]:
len(rna_seq.loc[~(ratio_empty > 0.20)])

18424

Or maybe the filtering criterion was applied after taking the subset of the data - that would make sense...

In [64]:
subset = rna_seq[rna_data_definite_tbm.columns]
ratio_empty_subset = ((subset == 0).apply(sum, axis=1) / len(subset.columns))
filtered = subset[ratio_empty_subset <= 0.25]
len(filtered)

21511

Now it is a little bit more. Close enough?

In [65]:
len(filtered) - len(rna_data_definite_tbm)

23

So what is the difference?

In [66]:
reproduction = set(filtered.reset_index().ensembl_id)
original_genes = set(rna_data_definite_tbm.reset_index().ensembl_id)
original_genes - reproduction

set()

I got some additional genes:

In [67]:
difference = Series(list(reproduction - original_genes))

In [68]:
ratio_empty_subset.loc[difference].to_frame().T

ensembl_id,ENSG00000141748,ENSG00000156574,ENSG00000177553,ENSG00000189253,ENSG00000197744,...,ENSG00000259856,ENSG00000271361,ENSG00000279730,ENSG00000283093,ENSG00000284294
ensembl_gene_id,ARL5C,NODAL,AL953897.1,TRIM64B,PTMAP2,...,RAB43P1,HTATSF1P2,SETD8P1,CENPVL2,AC007326.5
0,0.243243,0.243243,0.243243,0.243243,0.243243,...,0.243243,0.243243,0.243243,0.243243,0.243243


Okay, it is close enough. Maybe R or Excel does some funny rounding errors and thus the 23 were excluded too. Or maybe a different set of samples was excluded initially?

Also, it is barely 0.1% of the data:

In [69]:
f'{23 / 21511 * 100:.2f}%'

'0.11%'

### Metadata

In [70]:
metadata = read_table(metadata_path, index_col=0)

In [71]:
metadata

Unnamed: 0,Condition,Group,Batch
001.TMD,TMD,TBM,FC2
006.CM,CM,CM,FC2
012.BM,BM,BM,FC2
016.CM,CM,CM,FC2
017.TMD,TMD,TBM,FC2
...,...,...,...
174.CM,CM,CM,FC1
011.TMR,TMR,TBM,FC1
043.TMS,TMS,TMS,FC1
078.CM,CM,CM,FC1


In [72]:
metadata.Batch.unique()

array(['FC2', 'FC1'], dtype=object)

Are al samples present?

In [73]:
assert set(rna_seq.columns) == set(metadata.index)

Are ids, conditions, and groups matching?

In [74]:
from_id = metadata.index.str.split('.').str[1]
assert (from_id == metadata.Condition).all()

What I expected:

In [75]:
condition_to_group = {
    'TMD': 'TBM',
    'CM': 'CM',
    'TMR': 'TBM',
    'TMS': 'TBM',
    'BM': 'BM',
    'VM': 'VM'
}

In [76]:
mapped = from_id.map(condition_to_group.get)

In [77]:
(mapped == metadata.Group).all()

False

It seems that the "probable TB" condition stands on its own.

In [78]:
metadata[~(mapped == metadata.Group)]

Unnamed: 0,Condition,Group,Batch
056.TMS,TMS,TMS,FC2
100.TMS,TMS,TMS,FC2
239.TMS,TMS,TMS,FC1
043.TMS,TMS,TMS,FC1


But is this really relevant? Is this used anywhere?