# Data Validation and Sanity Checks

In this notebook, we'll compare the outputs of our GeneExpressionData class with the original expression matrices and labels

In [1]:
import pandas as pd 
import os
import sys
import pandas as pd
import numpy as np
from torch.utils.data import *
from tqdm import tqdm
import linecache 
from pytorch_tabnet.tab_model import TabNetClassifier
sys.path.append('../src/')
sys.path.append('..')

from src.models.lib.neural import GeneClassifier

In [2]:
from src.models.lib.data import *
from src.helper import *

## Intersection between reference columns with current dataset columns 

Here, we'll write test code for the methods used in mapping an arbitrary dataset sample to a list of given reference columns, since this method being correct is extremely important 

In [3]:
def clean_sample(sample, refgenes, currgenes):
    intersection = np.intersect1d(currgenes, refgenes, return_indices=True)
    indices = intersection[1] # List of indices in currgenes that equal refgenes 
    
    axis = (1 if sample.ndim == 2 else 0)
    sample = np.sort(sample, axis=axis)
    sample = np.take(sample, indices, axis=axis)

    return torch.from_numpy(sample)

### Unit Test Example 01

In [4]:
def test1():
    ref = ['a', 'b', 'c']
    curr = ['b', 'a', 'c', 'd'] 
    sample = np.array([1,2,3,4]) # Want --> [2,1,3]

    result = clean_sample(sample, ref, curr)
    desired = torch.from_numpy(np.array([2,1,3]))
    
    assert torch.equal(result, desired)
    
def test2():
    ref = ['a', 'b', 'c']
    curr = ['c', 'd', 'b', 'a']

    sample = np.array(
        [[1,2,3,4],
         [5,6,7,8]]
    ) 
    # --> want [[4, 3, 1],
    #           [8, 7, 5]]

    res = clean_sample(sample, ref, curr)
    desired = torch.from_numpy(np.array([
        [4,3,1],
        [8,7,5]
    ]))
    
    assert torch.equal(res, desired)
    
test1()
test2()

From initial tests, `clean_sample` seems to be working correctly.

## Validation of GeneExpressionData class with the original expression matrices and label files 

In this section, we'll confirm that the GeneExpressionData method returns the correct (sample, label) pairs relative to the original expression matrices and raw label files.

In [5]:
datafiles, labelfiles = list(INTERIM_DATA_AND_LABEL_FILES_LIST.keys()), list(INTERIM_DATA_AND_LABEL_FILES_LIST.values())

datafiles = [os.path.join('..', 'data', 'interim', f) for f in datafiles]
labelfiles = [os.path.join('..', 'data', 'processed/labels', f) for f in labelfiles]

for labelfile in labelfiles:
    display(pd.read_csv(labelfile)['cell'])

0              0
1              1
2              2
3              3
4              4
           ...  
186471    189404
186472    189405
186473    189406
186474    189407
186475    189408
Name: cell, Length: 186476, dtype: int64

0            1
1            2
2            3
3            4
4            5
         ...  
47504    49489
47505    49490
47506    49491
47507    49492
47508    49493
Name: cell, Length: 47509, dtype: int64

0            0
1            1
2            2
3            3
4            4
         ...  
76528    76528
76529    76529
76530    76530
76531    76531
76532    76532
Name: cell, Length: 76533, dtype: int64

0              0
1              1
2              2
3              3
4              4
           ...  
659546    691923
659547    691924
659548    691925
659549    691926
659550    691927
Name: cell, Length: 659551, dtype: int64

Next, we define a function that takes the first `N` samples from the GeneExpressionData object and from the raw expression matrix and compares the samples to make sure they are equal.

In [45]:
N = 5

def test_first_n(n, datafile, labelfile):
    data = GeneExpressionData(datafile, labelfile, 'Type', skip=3, index_col='cell')
    cols = data.columns
    
    # Generate dict with half precision values to read this into my 16gb memory
    data_df = pd.read_csv(datafile, nrows=2*n, header=1, dtype=np.float32) # Might need some extras since numerical index drops some values
    label_df = pd.read_csv(labelfile, nrows=n)

    similar = []
    for i in range(n):
        datasample = data[i][0]

        idx = label_df.loc[i, 'cell']
        dfsample = torch.from_numpy(data_df.loc[idx, :].values).float()
        
        isclose = all(torch.isclose(datasample, dfsample))
        
        similar.append(isclose)
    
    print(f"First {n=} columns of expression matrix is equal to GeneExpressionData: {all(p for p in similar)}")

def test_train_test_split(datafile, labelfile):
    from sklearn.model_selection import train_test_split 
    
    label_df = pd.read_csv(labelfile)
    labels = label_df.loc[:, 'Type']
    
    train, test = train_test_split(labels, stratify=labels, random_state=42)
    display(label_df.head(50))
    display(label_df.loc[train.index, :])

    train, test = (
        GeneExpressionData(
            datafile,
            labelfile,
            index_col='cell',
            class_label='Type',
            indices=index
        )
        for index in [train.index, test.index]
    )
    
    return train, test
# for datafile, labelfile in zip(datafiles, labelfiles):
#     print(f'{datafile=}')
#     test_first_n(N, datafile, labelfile)

In [46]:
train, test = test_train_test_split(datafiles[0], labelfiles[0])

Unnamed: 0,cell,Type
0,0,16
1,1,16
2,2,4
3,3,4
4,4,4
5,5,4
6,6,16
7,7,4
8,8,4
9,9,4


Unnamed: 0,cell,Type
151099,151162,7
99871,99911,4
122763,122812,16
64141,64170,4
146925,146988,4
...,...,...
123936,123985,4
90214,90249,4
167054,167125,4
21468,21489,4


In [37]:
train._labeldf

Unnamed: 0,cell,Type
0,151162,7
1,99911,4
2,122812,16
3,64170,4
4,146988,4
...,...,...
139852,123985,4
139853,90249,4
139854,167125,4
139855,21489,4


In [8]:
train._labeldf

In [9]:
data._labeldf

Unnamed: 0,cell,Type
0,1,7
1,2,7
2,3,7
3,4,7
4,5,7
...,...,...
47504,49489,0
47505,49490,7
47506,49491,7
47507,49492,12


In [14]:
train, val, test = generate_single_dataset(
    datafile=datafiles[1],
    labelfile=labelfiles[1],
    class_label='Type',
    index_col='cell',
    skip=3,
)



In [15]:
train, val, test

(GeneExpressionData(filename=../data/interim/allen_cortex_T.csv, labelname=../data/processed/labels/allen_cortex_labels.csv),
 GeneExpressionData(filename=../data/interim/allen_cortex_T.csv, labelname=../data/processed/labels/allen_cortex_labels.csv),
 GeneExpressionData(filename=../data/interim/allen_cortex_T.csv, labelname=../data/processed/labels/allen_cortex_labels.csv))

In [16]:
val._labeldf

Unnamed: 0,cell,Type
0,18811,7
1,116,7
2,32348,7
3,5325,8
4,33223,7
...,...,...
9497,38525,7
9498,36427,7
9499,5670,7
9500,41876,7


## TabNet Classifier validation

Since the TabNet package is designed to be used with the `sklearn` API, we'll write a custom `pl.LightningModule` with the TabNet classifier as the base class, and make sure that the correct `forward` method is returned. Essentially, validating that our wrapper doesn't change any internals.

In [4]:
from pytorch_tabnet.tab_model import TabNetClassifier
from models.lib.neural import TabNetGeneClassifier

train, val, test = generate_dataloaders(datafiles=datafiles, labelfiles=labelfiles, class_label='Type', skip=3, batch_size=4, num_workers=0)
sample = next(iter(train))

model = TabNetGeneClassifier(input_dim=19765, output_dim=17)

Model initialized. input_dim = 19765, output_dim = 17. Metrics are dict_keys(['accuracy', 'precision', 'recall']) and weighted_metrics = False


In [5]:
model(sample)

AttributeError: 'tuple' object has no attribute 'dim'