In [3]:
import linecache
import os
import sys

import numpy as np
import pandas as pd
import torch
from torch.utils.data import *
from tqdm import tqdm

sys.path.append('../src/')
sys.path.append('..')

from src.models.lib.neural import GeneClassifier

In [4]:
from src.models.lib.data import *
from src.helper import *

In [5]:
# Build a dataset from the Allen cortex expression file, using the 'Type' column
# of the matching label file as the class label.
# NOTE(review): the meaning of `cast` and `skip` is defined by GeneExpressionData,
# which is not shown in this notebook -- presumably `skip` is the number of leading
# CSV lines to skip before the first sample; confirm against src/models/lib/data.py.
test = GeneExpressionData(
    filename='../data/interim/allen_cortex_T.csv',
    labelname='../data/processed/labels/allen_cortex_labels.csv',
    class_label='Type',
    cast=True,
    skip=3,
)


In [6]:
%%timeit

test[0:20]

88.6 ms ± 1.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%time

refgenes = gene_intersection()

CPU times: user 2.28 s, sys: 57.6 ms, total: 2.33 s
Wall time: 2.34 s


In [8]:
from torch.utils.data import DataLoader, ConcatDataset

loader = DataLoader(test, batch_size=4)
sample = next(iter(loader))
sample = sample[0].numpy()

In [9]:
def clean_sample(sample, refgenes, currgenes):
    """
    Restrict a sample (or a batch of samples) to the genes it shares with a reference list.

    Parameters:
    sample: 1-D array/tensor of expression values, or a 2-D batch with one sample per row.
        Position j along the gene axis must correspond to currgenes[j].
    refgenes: Sequence of reference gene names.
    currgenes: Sequence of gene names describing the columns of `sample`.

    Returns:
    torch.Tensor holding only the values of genes present in both lists, ordered by
    sorted gene name (the order np.intersect1d reports the common genes in).
    """
    # np.intersect1d returns (sorted common values, indices into arg 1, indices into arg 2);
    # we only need the positions of the shared genes inside currgenes.
    _, indices, _ = np.intersect1d(currgenes, refgenes, return_indices=True)

    # Accept both numpy arrays and (CPU) torch tensors.
    sample = np.asarray(sample)
    axis = 1 if sample.ndim == 2 else 0

    # Select the shared genes directly. The values must NOT be sorted first: sorting
    # reorders the expression values and breaks their alignment with currgenes.
    return torch.from_numpy(np.take(sample, indices, axis=axis))

In [10]:
# INTERIM_DATA_AND_LABEL_FILES_LIST maps each data file name to its label file name;
# resolve both sides to paths relative to this notebook.
datafiles = [
    os.path.join('..', 'data', 'interim', data_name)
    for data_name in INTERIM_DATA_AND_LABEL_FILES_LIST.keys()
]
labelfiles = [
    os.path.join('..', 'data', 'processed/labels', label_name)
    for label_name in INTERIM_DATA_AND_LABEL_FILES_LIST.values()
]
datafiles, labelfiles

(['../data/interim/primary_bhaduri_T.csv',
  '../data/interim/allen_cortex_T.csv',
  '../data/interim/allen_m1_region_T.csv',
  '../data/interim/whole_brain_bhaduri_T.csv'],
 ['../data/processed/labels/primary_bhaduri_labels.csv',
  '../data/processed/labels/allen_cortex_labels.csv',
  '../data/processed/labels/allen_m1_region_labels.csv',
  '../data/processed/labels/whole_brain_bhaduri_labels.csv'])

In [11]:
train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
loader = DataLoader(train, batch_size=4)
currgenes = train.columns

In [12]:
onedsample = train[0][0]
len(onedsample)

19765

In [13]:
t = (clean_sample(onedsample, refgenes, currgenes))
t

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [14]:
len(t)

16604

In [15]:
twodsample = next(iter(loader))[0]
twodsample.shape

torch.Size([4, 19765])

In [16]:
# %%timeit

sample = clean_sample(twodsample, refgenes, currgenes)

In [17]:
len(sample[0])

16604

In [18]:
# for X, y in tqdm(loader):
#     X = clean_sample(X, refgenes, currgenes)

In [19]:
sample.ndim

2

In [20]:
temp = pd.read_csv(datafiles[0], nrows=1, header=1).columns 

In [21]:
# cols = []
# for file in datafiles:
#     # Read in columns, split by | (since some are PVALB|PVALB), and make sure all are uppercase
#     temp = pd.read_csv(file, nrows=1, header=1).columns 
#     temp = [x.split('|')[0].upper().strip() for x in temp]
    
#     print(f'Temp is {temp[0:5]}...')
#     cols.append(set(temp))

# unique = list(set.intersection(*cols))
# unique = sorted(unique)

In [22]:
# len(unique)

In [23]:
# temp = pd.read_csv(datafiles[0], nrows=1, header=1).columns 
# temp = [x.strip().upper() for x in temp]
# l = train.features

In [24]:
# l == temp

In [25]:
# len(set(unique).intersection(l))

In [26]:
# len(set(unique))

In [27]:
# len(set(unique).intersection([x.upper().strip() for x in l]))

In [28]:
train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
loader = DataLoader(train, batch_size=4)

model = GeneClassifier(
    N_features=len(train.columns),
    N_labels=len(train.labels)
)

Model initialized. N_features = 19765, N_labels = 9. Metrics are {'accuracy': <function accuracy at 0x7fba93c33040>, 'precision': <function precision at 0x7fba93c45b80>, 'recall': <function recall at 0x7fba93c45ca0>} and weighted_metrics = False


In [29]:
sample = next(iter(loader))[0]
sample

tensor([[2.8467, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 1.8507, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [1.6067, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

In [30]:
# %%timeit

# model(sample)

Now let's time iterating over our DataLoader with and without the extra data cleaning.

In [31]:
# for X, y in tqdm(loader):
#     X
#     model(X)

In [32]:
train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
loader = DataLoader(train, batch_size=4)

model = GeneClassifier(
    N_features=len(refgenes),
    N_labels=len(train.labels)
)

# for X, y in tqdm(loader):
#     X = clean_sample(X, refgenes, train.columns)
#     model(X)

Model initialized. N_features = 16604, N_labels = 9. Metrics are {'accuracy': <function accuracy at 0x7fba93c33040>, 'precision': <function precision at 0x7fba93c45b80>, 'recall': <function recall at 0x7fba93c45ca0>} and weighted_metrics = False


In [33]:
# One dataset per (data file, label file) pair, all labelled by the 'Type' column.
df1_data, df2_data, df3_data, df4_data = [
    GeneExpressionData(datafiles[file_idx], labelfiles[file_idx], 'Type', skip=3)
    for file_idx in range(4)
]

In [34]:
df3_data[0]

(tensor([0., 0., 0.,  ..., 0., 0., 0.]), 7)

In [None]:
df4_data[0]

In [None]:
from src.models.lib.data import _generate_stratified_dataset

train, test = _generate_stratified_dataset(
        dataset_files=datafiles,
        label_files=labelfiles,
        class_label='Type',
    )


In [None]:
# train[0]

We can see that it's much faster to clean the samples once per minibatch, since numpy clearly scales well under the hood. Therefore, we'll have to write a manual training loop, as we can no longer use PyTorch Lightning.

In [None]:
from pytorch_lightning import Trainer

# train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
# loader = DataLoader(train, batch_size=4)

# model = GeneClassifier(
#     N_features=len(train.columns),
#     load
# )

In [None]:
combined, test, insize, numlabels, weights = generate_datasets(datafiles, labelfiles, 'Type', skip=3)
# numlabels

In [None]:
train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
trainloader = DataLoader(train, batch_size=4)

# Size the output layer by the number of labels, as the other GeneClassifier cells in
# this notebook do (len(train.labels) printed N_labels = 9 for this dataset).
# max(train.labels) is the largest label value, not the label count, so it would
# mis-size the output layer.
net = GeneClassifier(
    N_features=len(train.columns),
    N_labels=len(train.labels)
)

In [None]:
loaders = []
refgenes = gene_intersection()

In [None]:
# Build one GeneExpressionData per (data file, label file) pair and collect them.
# Despite its name, `loaders` holds raw datasets at this point; a later cell wraps
# each entry in a DataLoader.
# NOTE(review): unlike the other cells, no `skip` argument is passed here and
# `cast=False` is used -- confirm the GeneExpressionData defaults are what is wanted.
for datafile, labelfile in zip(datafiles, labelfiles):
    data = GeneExpressionData(
            datafile,
            labelfile,
            'Type',
            cast=False,
    )
    
#     print(data[0][0][0:5])
    loaders.append(data)

In [None]:
# for data in loaders:
#     print(data.name)
#     print(data[0][0][0:5])

In [None]:
loaders = [DataLoader(data, batch_size=4) for data in loaders]

In [None]:
# df1 = pd.read_csv(datafiles[0], nrows=75, header=1)
# df2 = pd.read_csv(datafiles[1], nrows=75, header=1)
# df3 = pd.read_csv(datafiles[2], nrows=75, header=1)
# df4 = pd.read_csv(datafiles[3], nrows=75, header=1)

# df1_labels = pd.read_csv(labelfiles[0])
# df2_labels = pd.read_csv(labelfiles[1])
# df3_labels = pd.read_csv(labelfiles[2])
# df4_labels = pd.read_csv(labelfiles[3])

In [None]:
# df1_labels.loc[186471, 'cell']

In [None]:
# df1_labels['Type'].value_counts()

In [None]:
# df3

In [None]:
# df4

In [None]:
df1_data = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
df2_data = GeneExpressionData(datafiles[1], labelfiles[1], 'Type', skip=3)
df3_data = GeneExpressionData(datafiles[2], labelfiles[2], 'Type', skip=3)
df4_data = GeneExpressionData(datafiles[3], labelfiles[3], 'Type', skip=3)

In [None]:
datasets = [df1_data, df2_data, df3_data, df4_data]

In [None]:
for dataset in datasets:
    print(dataset.columns[0:10])

In [None]:
df2_data = GeneExpressionData(datafiles[1], labelfiles[1], 'Type', skip=2)
len(df2_data[0][0])

In [None]:
df2_data[0]

The `False` results make sense, since the indices aren't changed when we read in the plain DataFrames.

In [None]:
# k1 = df2_data[0][0]
# k1

In [None]:
# np.linalg.norm(df2.iloc[1, :] - k1.numpy())

In [None]:
# df2_labels

In [None]:
# df2_labels_raw = pd.read_csv('../data/interim/labels/allen_cortex_labels.csv')
# df2_labels_raw

In [None]:
# df2_data[0]

In [None]:
# df2_labels.head(10)

In [None]:
# all(np.isclose(df2.loc[0, :], df2_data[0][0]))

In [None]:
# df2_data = GeneExpressionData(datafiles[1], labelfiles[1], 'Type', skip=3)
# all(np.isclose(df2.loc[1, :], df2_data[0][0]))

In [None]:
# df2_data[0]

In [None]:
# df4_labels.head(10)

In [None]:
# df4_labels.loc[5, 'cell']

In [None]:
# df4_data[5]

In [None]:
# np.linalg.norm(df4.loc[7, :].values - df4_data[5][0].numpy())

OK, so this mismatch only seems to affect the second dataset, `allen_cortex_T.csv`, which also requires a different skip number and shows some odd behavior. TODO: investigate this dataset further.

In [None]:
# for i in range(50):
#     k = df1.loc[df1_labels.loc[i, 'cell']]
#     s = df1_data[i][0]
    
#     print(all(np.isclose(k, s)))

In [None]:
# for i in range(50):
#     k = df2.loc[df2_labels.loc[i, 'cell']]
#     s = df2_data[i][0]
    
#     print(all(np.isclose(k, s)))

In [None]:
# df2_data = GeneExpressionData(datafiles[1], labelfiles[1], 'Type', skip=3)
# df2_data[0]

In [None]:
# df2_data[1]

In [None]:
# for i in range(50):
#     k = df2.loc[df2_labels.loc[i, 'cell']]
#     s = df2_data[i][0]
    
#     print(all(np.isclose(k, s)))

In [None]:
# for i in range(50):
#     k = df3.loc[df3_labels.loc[i, 'cell']]
#     s = df3_data[i][0]
    
#     print(all(np.isclose(k, s)))

In [None]:
# for i in range(50):
#     k = df4.loc[df4_labels.loc[i, 'cell']]
#     s = df4_data[i][0]
    
#     print(all(np.isclose(k, s)))

Now, let's write our training loop using all four datasets.

In [None]:
# loaders = [df1_data, df2_data, df3_data, df4_data]
# loaders = [DataLoader(data, batch_size=2) for data in loaders]

In [None]:
# next(iter(loaders[0]))

Now, let's time the DataLoader vs the custom DataLoader from Pytorch Tabular found here: https://github.com/hcarlens/pytorch-tabular/blob/master/fast_tensor_data_loader.py

In [None]:
# Dataset for the first (data file, label file) pair.
train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)

# Two loaders over the same dataset object, used by the timing loops below.
trainloader = DataLoader(train, batch_size=4)
fastloader = FastTensorDataLoader(train, batch_size=4)

# NOTE: this rebinds `train` (and `test`) to the splits returned by
# generate_single_dataset; the two loaders above were built from the earlier object.
train, val, test = generate_single_dataset(datafiles[0], labelfiles[0], 'Type', skip=3, index_col='cell', cast=True)

In [None]:
trainloaders, valloaders, testloaders = [], [], []

In [None]:
def gen_loaders(batch_size, num_workers):
    """
    Build train/validation/test DataLoaders for every (data file, label file) pair.

    Reads the notebook-level `datafiles` and `labelfiles` lists and splits each pair
    with `generate_single_dataset`.

    Parameters:
    batch_size: Batch size passed to every DataLoader.
    num_workers: Number of worker processes passed to every DataLoader.

    Returns:
    Tuple (trainloaders, valloaders, testloaders) of three lists, each with one
    DataLoader per dataset, in the order of `datafiles`.
    """
    # Fresh local lists: re-running the cell must not keep appending to (and
    # duplicating entries in) module-level lists.
    train_loaders, val_loaders, test_loaders = [], [], []

    for datafile, labelfile in zip(datafiles, labelfiles):
        train, val, test = generate_single_dataset(
            datafile,
            labelfile,
            'Type',
            skip=3,
            index_col='cell',
            cast=True,
        )

        for split, bucket in (
            (train, train_loaders),
            (val, val_loaders),
            (test, test_loaders),
        ):
            bucket.append(
                DataLoader(split, batch_size=batch_size, num_workers=num_workers)
            )

    # Return only after every dataset has been processed. The return used to sit
    # inside the loop, so only the first dataset ever got loaders.
    return train_loaders, val_loaders, test_loaders
        
trainloaders, valloaders, testloaders = gen_loaders(4, 0)

In [None]:
trainloaders[0].dataset.dataset.columns

In [None]:
# Time batch loading plus a forward pass of `net` with the standard DataLoader.
# The break fires at index 200, so 201 batches are processed.
for i, sample in enumerate(tqdm(trainloader)):
    X, y = sample 
    net(X)
    if i == 200:
        break

In [None]:
# Time batch loading alone with the FastTensorDataLoader (201 batches).
# NOTE(review): unlike the DataLoader loop above, no forward pass is run here, so
# the two timings are not directly comparable.
for i, sample in enumerate(tqdm(fastloader)):
    t = sample
    
    if i == 200:
        break

In [None]:
X = next(iter(trainloaders[0]))[0]
X.shape

In [None]:
# clean_sample(X, refgenes, traindata[0].columns)

In [None]:
from functools import partial 
from torchmetrics.functional import accuracy, f1_score, precision, recall

def calculate_metrics(
    outputs, 
    labels,
    num_classes,
    append_str='',
) -> Dict[str, float]:
    """
    Evaluate a fixed set of classification metrics on one batch of predictions.

    Accuracy is computed with micro, macro and weighted averaging (each given
    `num_classes`); f1, precision and recall are called with only preds/target.

    Parameters:
    outputs: Model predictions, passed to every metric as `preds`.
    labels: Ground-truth labels, passed to every metric as `target`.
    num_classes: Number of classes, forwarded to the three accuracy variants.
    append_str: Optional suffix; when non-empty each key becomes '<metric>_<append_str>'.

    Returns:
    Dict mapping each (optionally suffixed) metric name to whatever the metric
    function returned for this batch.
    """
    suffix = f'_{append_str}' if append_str else ''

    def _accuracy_with(average):
        # Accuracy variant bound to one averaging strategy.
        return partial(accuracy, average=average, num_classes=num_classes)

    metric_fns = {
        'micro_accuracy': _accuracy_with('micro'),
        'macro_accuracy': _accuracy_with('macro'),
        'weighted_accuracy': _accuracy_with('weighted'),
        'f1': f1_score,
        'precision': precision,
        'recall': recall,
    }

    return {
        f"{metric_name}{suffix}": metric_fn(preds=outputs, target=labels)
        for metric_name, metric_fn in metric_fns.items()
    }

In [None]:
import torch.optim as optim
import torch.nn as nn
import wandb

def _inner_computation(
    data,
    model, 
    optim,
    loader,
    wandb, 
    i, 
    running_loss,
    mode='train',
):
    """
    Run one batch through the model for the train, val or test phase.

    Relies on the notebook-level `criterion`, `refgenes`, `clean_sample` and
    `calculate_metrics`.

    Parameters:
    data: (inputs, labels) batch as yielded by `loader`.
    model: The network to run; must expose `N_labels`.
    optim: Optimizer instance, stepped only when mode == 'train'. (This parameter
        shadows the `torch.optim` module alias inside the function.)
    loader: DataLoader the batch came from; `loader.dataset.dataset.columns`
        supplies the gene names used to clean the inputs.
    wandb: Object with a `log` method (the wandb module or a run).
    i: Index of the batch within the loader; metrics are logged every 100 batches.
    running_loss: Loss accumulated so far.
    mode: One of 'train', 'val' or 'test'.

    Returns:
    The updated running loss (reset to 0.0 on batches where metrics were logged).

    Raises:
    ValueError: If `mode` is not one of the three supported phases.
    """
    # The previous default was the list of allowed values, so `mode == 'train'`
    # was never true unless the caller passed a string explicitly.
    if mode not in ('train', 'val', 'test'):
        raise ValueError(f"mode must be 'train', 'val' or 'test', got {mode!r}")

    inputs, labels = data
    # Use the columns of the loader that produced this batch, not a global loader.
    inputs = clean_sample(inputs, refgenes, loader.dataset.dataset.columns)

    is_training = mode == 'train'
    # Only track gradients when we are going to backpropagate.
    with torch.set_grad_enabled(is_training):
        outputs = model(inputs)
        loss = criterion(outputs, labels)

    if is_training:
        # Backward pass and optimizer step, using the optimizer that was passed in.
        optim.zero_grad()
        loss.backward()
        optim.step()

    running_loss += loss.item()

    if i % 100 == 0:
        running_loss = 0.0
        metric_results = calculate_metrics(
            outputs=outputs,
            labels=labels,
            append_str=mode,
            num_classes=model.N_labels
        )

        wandb.log({f"{mode}_loss": loss.item()})
        wandb.log(metric_results)

    return running_loss

# NOTE(review): N_labels is hard-coded to 18, while the single-dataset cells above
# report 9 labels -- confirm 18 is the label count across all four datasets.
model = GeneClassifier(
    N_features=len(refgenes),
    N_labels=18,
)

wandb.init()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

wandb.watch(model, log_freq=100)


def _log_step(split, outputs, labels, loss):
    """Log the current batch loss and classification metrics to wandb under `split`."""
    metric_results = calculate_metrics(
        outputs=outputs,
        labels=labels,
        append_str=split,
        num_classes=model.N_labels
    )
    wandb.log({f"{split}_loss": loss.item()})
    wandb.log(metric_results)


for epoch in range(100):  # loop over the dataset multiple times
    running_loss = 0.0

    # Train loop: one pass over every dataset's training loader
    for trainidx, trainloader in enumerate(trainloaders):
        model.train()
        print(f'Training on {trainidx}')

        for i, data in enumerate(tqdm(trainloader)):
            inputs, labels = data

            # Reduce the batch to the genes shared by all datasets
            inputs = clean_sample(inputs, refgenes, trainloader.dataset.dataset.columns)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimizer step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % 100 == 0:
                running_loss = 0.0
                _log_step('train', outputs, labels, loss)

    # Validation loop: no gradients are needed, so skip autograd bookkeeping
    for validx, valloader in enumerate(valloaders):
        model.eval()

        with torch.no_grad():
            for i, data in enumerate(tqdm(valloader)):
                inputs, labels = data
                inputs = clean_sample(inputs, refgenes, valloader.dataset.dataset.columns)

                # Was model(batch): `batch` is undefined in this scope
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                running_loss += loss.item()

                if i % 100 == 0:
                    running_loss = 0.0
                    _log_step('val', outputs, labels, loss)

print('Finished train/validation, calculating test error')

for testidx, testloader in enumerate(testloaders):
    model.eval()

    with torch.no_grad():
        for i, data in enumerate(tqdm(testloader)):
            inputs, labels = data
            # Use the test loader's own columns (previously read from valloader,
            # i.e. whichever validation dataset happened to be iterated last).
            inputs = clean_sample(inputs, refgenes, testloader.dataset.dataset.columns)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if i % 100 == 0:
                _log_step('test', outputs, labels, loss)


In [107]:
preds = torch.randn(10, 5).softmax(dim=-1)
target = torch.randint(5, (10,))

calculate_metrics(preds, target, 5, 'val')

<class 'dict'>


{'micro_accuracy_val': tensor(0.2000),
 'macro_accuracy_val': tensor(0.2000),
 'weighted_accuracy_val': tensor(0.2000),
 'f1_val': tensor(0.2000),
 'precision_val': tensor(0.2000),
 'recall_val': tensor(0.2000)}

In [99]:
import torch
# import our library
import torchmetrics

# simulate a classification problem
preds = torch.randn(10, 5).softmax(dim=-1)
target = torch.randint(5, (10,))

acc = torchmetrics.functional.accuracy(preds, target)
acc

tensor(0.3000)

In [None]:
def train_batch(batch, labels, model, optimizer, criterion):
    """
    Run a single optimization step on one batch.

    Parameters:
    batch: Input tensor for the model.
    labels: Target tensor passed to `criterion`.
    model: torch.nn.Module being trained.
    optimizer: Optimizer holding the model's parameters.
    criterion: Callable computing the loss from (outputs, labels).

    Returns:
    The loss tensor for this batch (computed before the optimizer step).
    """
    # Move the data to wherever the model lives. The original referenced an
    # unassigned local `images` and an undefined global `device`.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        batch, labels = batch.to(first_param.device), labels.to(first_param.device)

    # Forward pass
    outputs = model(batch)
    loss = criterion(outputs, labels)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()

    # Step with optimizer
    optimizer.step()

    return loss

def validate_model(model, test_loader):
    """
    Measure accuracy on `test_loader`, log it to wandb, and export the model to ONNX.

    Parameters:
    model: torch.nn.Module to evaluate; put into eval mode by this function.
    test_loader: Iterable yielding (inputs, labels) batches.

    Returns:
    None. Prints the accuracy, logs "test_accuracy" to wandb and writes/saves
    "model.onnx". If the loader yields no examples, nothing is logged or exported.
    """
    model.eval()

    # Evaluate on the device the model lives on (there is no global `device` here).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    correct, total = 0, 0
    images = None

    # Run the model on the test examples without tracking gradients
    with torch.no_grad():
        for images, labels in test_loader:
            if device is not None:
                images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        # Avoid dividing by zero and exporting with no example input.
        print("No test examples were provided; skipping accuracy logging and ONNX export.")
        return

    print(f"Accuracy of the model on the {total} " +
          f"test images: {100 * correct / total}%")

    wandb.log({"test_accuracy": correct / total})

    # Save the model in the exchangeable ONNX format, traced with the last batch
    torch.onnx.export(model, images, "model.onnx")
    wandb.save("model.onnx")
    

In [None]:
# tell wandb to get started
def model_pipeline(hyperparameters):
    """
    Train and evaluate a model inside a single wandb run.

    A bare `return` at cell level is a SyntaxError, so the steps are wrapped in a
    function.

    NOTE(review): `make`, `train` and `test` must be callables. They are not defined
    in this notebook, and `train`/`test` are bound to datasets in earlier cells --
    define or rename them before calling this function.

    Parameters:
    hyperparameters: Configuration passed to `wandb.init(config=...)`.

    Returns:
    The trained model.
    """
    # tell wandb to get started
    with wandb.init(project="pytorch-demo", config=hyperparameters):
        # access all HPs through wandb.config, so logging matches execution!
        config = wandb.config

        # make the model, data, and optimization problem
        model, train_loader, test_loader, criterion, optimizer = make(config)

        # and use them to train the model
        train(model, train_loader, criterion, optimizer, config)

        # and test its final performance
        test(model, test_loader)

    return model

In [None]:
print(torch.randn(10, 5).softmax(dim=-1))
torch.randint(5, (10,))

## Linecache speed testing 

If we can improve the speed of our `__getitem__` method, we can train our model a lot faster. Since it currently requires two list comprehensions, let's see if we can reduce the time it takes to fetch a single item.

In [None]:
import linecache 

line = linecache.getline('../data/interim/primary_bhaduri_T.csv', 5)
line = np.array(line.split(','), dtype=np.float32)

In [None]:
%%timeit 

line = linecache.getline('../data/interim/primary_bhaduri_T.csv', 5)
line = np.array(line.split(','), dtype=np.float32)

In [None]:
df1_data.name

In [None]:
%%timeit 

line = df1_data[5]