# Sample intersection between genomes, function test code 

In this notebook, we'll test out the sample intersection code to make sure it's functioning properly, since it's essential for training/testing with multiple datasets.

In [1]:
import numpy as np
import anndata as an
import pandas as pd 
import torch
from typing import *
import matplotlib.pyplot as plt 



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def test_clean_sample(
    sample: torch.Tensor,
    refgenes: List[str],
    currgenes: List[str],
) -> torch.Tensor:
    
    intersection = np.intersect1d(currgenes, refgenes, return_indices=True)
    indices = intersection[1] # List of indices in sorted(currgenes) that equal sorted(refgenes)
    
    axis = (1 if sample.ndim == 2 else 0)
    sample = np.sort(sample, axis=axis)
    sample = np.take(sample, indices, axis=axis)

    return sample

Let's write a bunch of tests!

In [3]:
# Case 1: every current gene is present in the (larger) reference gene list,
# so nothing should be dropped.
refgenes = ['a', 'b', 'c', 'd']
currgenes = ['a', 'b', 'c']

sample = np.array([1,2,3])

res = test_clean_sample(sample, refgenes, currgenes)

# assert_array_equal also fails on a shape mismatch, which a broadcasting
# `all(res == expected)` comparison can hide.
np.testing.assert_array_equal(res, np.array([1,2,3]))

In [4]:
# Case 2: the reference list is a strict subset of the current genes, so the
# values for 'b' and 'e' should be dropped.
refgenes = ['a', 'c', 'd']
currgenes = ['a', 'b', 'c', 'd', 'e']

sample = np.array([1,2,3,4,5])

res = test_clean_sample(sample, refgenes, currgenes)

# assert_array_equal also fails on a shape mismatch, which a broadcasting
# `all(res == expected)` comparison can hide.
np.testing.assert_array_equal(res, np.array([1,3,4]))

In [5]:
# Case 3: the reference list is given out of order; the result should not
# depend on the order of `refgenes`, and only 'e' should be dropped.
refgenes = ['c', 'a', 'd', 'b']
currgenes = ['a', 'b', 'c', 'd', 'e']

sample = np.array([1,2,3,4,5])

res = test_clean_sample(sample, refgenes, currgenes)

# assert_array_equal also fails on a shape mismatch, which a broadcasting
# `all(res == expected)` comparison can hide.
np.testing.assert_array_equal(res, np.array([1,2,3,4]))

## Testing the entire pipeline

Now, let's generate some synthetic data and make sure the refgenes and currgenes are recapitulated through a DataLoader with our custom DataModule.

In [6]:
import pandas as pd 
import sys

sys.path.append('../src')
from models.lib.neural import *
from models.lib.lightning_train import *
from models.lib.train import *

We'll create two dummy CSV files, compute the intersection of their columns, then run them through the DataLoader pipeline and make sure the output is as expected.

In [53]:
def create_dummy_data(columns, n=25, name=None):
    """Build an ``n``-row frame in which every value equals its column's position.

    Column ``columns[j]`` is filled with the integer ``j``, so a value read back
    later identifies which column it came from.

    Parameters
    ----------
    columns:
        Column (gene) names, in the order they should appear.
    n:
        Number of rows; the index is ``0 .. n-1``.
    name:
        If given, the frame is also written to this path as CSV without the
        index column.

    Returns
    -------
    pd.DataFrame
        The generated frame, with an integer dtype in every column.
    """
    columns = list(columns)

    # One row [0, 1, ..., len(columns)-1] repeated n times. Building the frame
    # in one step gives integer columns instead of filling an all-NaN
    # object-dtype frame column by column.
    values = np.tile(np.arange(len(columns)), (n, 1))
    df = pd.DataFrame(values, index=list(range(n)), columns=columns)

    if name is not None:
        df.to_csv(name, index=False)

    return df

def create_dummy_label(k=1, n=25, name=None):
    """Build an ``n``-row, single-column label frame filled with ``k``.

    Parameters
    ----------
    k:
        The label value assigned to every row.
    n:
        Number of rows; the index is ``0 .. n-1``.
    name:
        If given, the frame is also written to this path as CSV without the
        index column.

    Returns
    -------
    pd.DataFrame
        A frame with one column named ``'label'``.
    """
    # Construct the column in one step rather than assigning row by row, so the
    # dtype is inferred from `k` instead of staying object.
    df = pd.DataFrame({'label': [k] * n}, index=list(range(n)))

    if name is not None:
        df.to_csv(name, index=False)

    return df

# Two dummy expression tables with overlapping but differently ordered columns.
curr_df = create_dummy_data(['a', 'b', 'c', 'd'], name='curr_test.csv')
ref_df = create_dummy_data(['a', 'd', 'b'], name='ref_test.csv')

curr_labels = create_dummy_label(k=0, name='label_curr_test.csv')
# The reference labels get their own file; writing them to
# 'label_curr_test.csv' would overwrite the current-dataset labels above.
ref_labels = create_dummy_label(k=1, name='label_ref_test.csv')

# Genes shared by both tables, kept in ref_df's column order so the result is
# the same on every run (iterating a set of strings has no stable order
# across kernels).
curr_gene_set = set(curr_df.columns)
refgenes = [gene for gene in ref_df.columns if gene in curr_gene_set]
refgenes

['a', 'd', 'b']

In [54]:
# Paths of the dummy data / label CSVs, then a peek at the first data file.
datafiles=['curr_test.csv', 'ref_test.csv']
# NOTE(review): both entries name the same label file although `datafiles`
# lists two different tables -- confirm whether the second entry should be a
# separate reference-label file.
labelfiles=['label_curr_test.csv', 'label_curr_test.csv']

pd.read_csv(datafiles[0]).head(5)

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,0,1,2,3
2,0,1,2,3
3,0,1,2,3
4,0,1,2,3


In [55]:
# Show the gene list that is passed as `refgenes` to the DataModule.
refgenes

['a', 'd', 'b']

In [56]:
# NOTE(review): generate_single_dataloader is not referenced in the visible
# cells -- confirm whether this import is still needed.
from models.lib.data import generate_single_dataloader

# Build the DataModule on the current dummy table only, handing it both the
# table's own columns (`currgenes`) and the shared gene list (`refgenes`).
# DataModule is not defined in this notebook; presumably it comes from the
# star imports of models.lib above -- verify.
module = DataModule(
    datafiles=['curr_test.csv'],
    labelfiles=['label_curr_test.csv'],
    class_label='label',  # the column name written by create_dummy_label
    currgenes=curr_df.columns,  # the columns a, b, c, d of the current table
    refgenes=refgenes,
    index_col=None,
    test_prop=0.2,
    sep=',',
    subset=None,
    stratify=False,
    batch_size=4,
    num_workers=0,
)

In [57]:
# Run the module's setup step; its log output reports that the train/val/test
# DataLoaders are created here.
module.setup()

Creating train/val/test DataLoaders...
Done, continuing to training.
Calculating weights


In [59]:
%%timeit 

# Time building a fresh iterator over the train loader and pulling one item.
next(iter(module.trainloader))

113 µs ± 740 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [62]:
# The train/val/test datasets together must account for every row of the dummy
# table: nothing dropped, nothing duplicated.
n_split_rows = sum(
    loader.dataset.shape[0]
    for loader in (module.trainloader, module.valloader, module.testloader)
)
assert n_split_rows == len(curr_df), (
    f"splits cover {n_split_rows} rows, expected {len(curr_df)}"
)
n_split_rows

25

In [15]:
# Display the current dummy table for reference: every value equals its
# column's position (a=0, b=1, c=2, d=3).
curr_df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,0,1,2,3
2,0,1,2,3
3,0,1,2,3
4,0,1,2,3
5,0,1,2,3
6,0,1,2,3
7,0,1,2,3
8,0,1,2,3
9,0,1,2,3
