# Tests for all data code 

The most important thing for model training is that the data is bug free. Here, we'll generate some synethic data and labels, and make sure the train/test/val splits are working correctly. 

In [1]:
import sys
sys.path.append('../src')
import pandas as pd 
import numpy as np
import random
import anndata as an
import scanpy as sc 

from models.lib.data import *
from models.lib.lightning_train import *
from models.lib.neural import *

random.seed(42)

train, val, test = generate_dataloaders(
    datafiles=['../data/tests/test_matrix.csv'],
    labelfiles=['../data/tests/test_labels.csv'],
    class_label='label',
    index_col='index_col',
    skip=2,
    stratify=False,
    batch_size=5,
)

tensor([[14., 14., 14., 14., 14., 14., 14., 14., 14., 14.],
        [11., 11., 11., 11., 11., 11., 11., 11., 11., 11.],
        [ 9.,  9.,  9.,  9.,  9.,  9.,  9.,  9.,  9.,  9.],
        [23., 23., 23., 23., 23., 23., 23., 23., 23., 23.]])

Let's generate some synethic data and labels, where each row of the DataFrame contains numbers equal to the row index (for easy error checking). Additionally, we'll create a label DataFrame with an index_col and make sure the label splits are done correctly. 

In [2]:
df = pd.DataFrame(index=range(25), columns=[f'col_{i}' for i in range(10)])

for i in range(25):
    df.loc[i, :] = [i]*10

cols = df.columns 

df = an.AnnData(df.values)
df.var.index = cols
df.var.index

Index(['col_0', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6', 'col_7',
       'col_8', 'col_9'],
      dtype='object')

Now let's generate some synethic labels with basically the same properties, as well as an index_col with a random subset of the indices

In [3]:
labels = pd.DataFrame(index=range(25), columns=['index_col', 'label'])

labels['index_col'] = [int(x) for x in random.sample(set(df.obs.index), k=len(df))]
labels['label'] = labels['index_col']

since Python 3.9 and will be removed in a subsequent version.
  labels['index_col'] = [int(x) for x in random.sample(set(df.obs.index), k=len(df))]


Now, we can write these to file and test our dataset generation methods

In [4]:
labels.to_csv('../data/tests/test_labels.csv', index=False)
df.write_h5ad('../data/tests/test_matrix.h5ad')

In [5]:
labels

Unnamed: 0,index_col,label
0,0,0
1,10,10
2,22,22
3,1,1
4,14,14
5,13,13
6,12,12
7,21,21
8,3,3
9,17,17


And the same for the `.csv` file, for testing

In [6]:
df = pd.DataFrame(df.X, columns=cols)
df.to_csv('../data/tests/test_matrix.csv', index=False)

In [7]:
pd.read_csv('../data/tests/test_labels.csv')

Unnamed: 0,index_col,label
0,0,0
1,10,10
2,22,22
3,1,1
4,14,14
5,13,13
6,12,12
7,21,21
8,3,3
9,17,17


Let's first try generating an AnnData Dataset using the same code as in generate_single_dataset

In [8]:
current_labels = labels.loc[:, 'label']
df

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
3,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
4,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
5,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
6,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
7,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
8,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
9,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0


In [9]:
train, val, test = generate_single_dataset(
    datafile='../d'
)

In [10]:
train.labels

array([22, 10, 24,  2,  6, 15, 12, 19,  4, 18,  1, 23, 14])

In [11]:
train[0: len(train.labels)]

[(tensor([22., 22., 22., 22., 22., 22., 22., 22., 22., 22.]), 22),
 (tensor([10., 10., 10., 10., 10., 10., 10., 10., 10., 10.]), 10),
 (tensor([24., 24., 24., 24., 24., 24., 24., 24., 24., 24.]), 24),
 (tensor([2., 2., 2., 2., 2., 2., 2., 2., 2., 2.]), 2),
 (tensor([6., 6., 6., 6., 6., 6., 6., 6., 6., 6.]), 6),
 (tensor([15., 15., 15., 15., 15., 15., 15., 15., 15., 15.]), 15),
 (tensor([12., 12., 12., 12., 12., 12., 12., 12., 12., 12.]), 12),
 (tensor([19., 19., 19., 19., 19., 19., 19., 19., 19., 19.]), 19),
 (tensor([4., 4., 4., 4., 4., 4., 4., 4., 4., 4.]), 4),
 (tensor([18., 18., 18., 18., 18., 18., 18., 18., 18., 18.]), 18),
 (tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]), 1),
 (tensor([23., 23., 23., 23., 23., 23., 23., 23., 23., 23.]), 23),
 (tensor([14., 14., 14., 14., 14., 14., 14., 14., 14., 14.]), 14)]

In [32]:
train, val, test = generate_dataloaders(
    datafiles=['../data/tests/test_matrix.csv'],
    labelfiles=['../data/tests/test_labels.csv'],
    class_label='label',
    index_col='index_col',
    skip=2,
    stratify=False,
    batch_size=5,
)

In [None]:
train, val, test = generate_single_dataset(
    datafile='../data/'
)

In [36]:
for X, y in zip(train.data, train.labels):
    assert X[0] - y == 0

Now let's try the same testing for `.csv` files

In [29]:
# Make stratified split on labels
trainsplit, valsplit = train_test_split(current_labels)
trainsplit, testsplit = train_test_split(trainsplit)

train, val, test = (
    GeneExpressionData(
        filename='../data/tests/test_matrix.csv',
        labelname='../data/tests/test_labels.csv',
        class_label='label',
        index_col='index_col',
        indices=indices,
        skip=2,
    )
    for indices in [trainsplit.index, valsplit.index, testsplit.index]  
)

In [30]:
train[0:10]

[(tensor([9., 9., 9., 9., 9., 9., 9., 9., 9., 9.]), 9),
 (tensor([23., 23., 23., 23., 23., 23., 23., 23., 23., 23.]), 23),
 (tensor([16., 16., 16., 16., 16., 16., 16., 16., 16., 16.]), 16),
 (tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]), 1),
 (tensor([13., 13., 13., 13., 13., 13., 13., 13., 13., 13.]), 13),
 (tensor([19., 19., 19., 19., 19., 19., 19., 19., 19., 19.]), 19),
 (tensor([10., 10., 10., 10., 10., 10., 10., 10., 10., 10.]), 10),
 (tensor([3., 3., 3., 3., 3., 3., 3., 3., 3., 3.]), 3),
 (tensor([8., 8., 8., 8., 8., 8., 8., 8., 8., 8.]), 8),
 (tensor([17., 17., 17., 17., 17., 17., 17., 17., 17., 17.]), 17)]

In [24]:
pd.read_csv('../data/tests/test_matrix.csv')

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
3,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
4,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
5,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
6,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
7,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
8,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
9,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0


In [39]:
t = GeneExpressionData(
    filename='../data/tests/test_matrix.csv',
    labelname='../data/tests/test_labels.csv',
    class_label='label',
#     index_col='index_col',
    skip=2,
)

t[0:5]

[(tensor([17., 17., 17., 17., 17., 17., 17., 17., 17., 17.]), 17),
 (tensor([3., 3., 3., 3., 3., 3., 3., 3., 3., 3.]), 3),
 (tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 0),
 (tensor([9., 9., 9., 9., 9., 9., 9., 9., 9., 9.]), 9),
 (tensor([8., 8., 8., 8., 8., 8., 8., 8., 8., 8.]), 8)]