In [1]:
import pandas as pd 
import os
import sys
import pandas as pd
import numpy as np
from torch.utils.data import *
from tqdm import tqdm
import linecache 

sys.path.append('../src/')
sys.path.append('..')

from src.models.lib.neural import GeneClassifier

In [2]:
from src.models.lib.data import *
from src.helper import *

In [3]:
test = GeneExpressionData(
    filename='../data/interim/allen_cortex_T.csv',
    labelname='../data/processed/labels/allen_cortex_labels.csv',
    class_label='Type',
    cast=True,
    skip=3,
)


In [4]:
%%timeit

test[0:20]

90.1 ms ± 2.49 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
%%time

refgenes = gene_intersection()

CPU times: user 2.25 s, sys: 87.7 ms, total: 2.34 s
Wall time: 2.36 s


In [6]:
from torch.utils.data import DataLoader, ConcatDataset

loader = DataLoader(test, batch_size=4)
sample = next(iter(loader))
sample = sample[0].numpy()

In [7]:
def clean_sample(sample, refgenes, currgenes):
    intersection = np.intersect1d(currgenes, refgenes, return_indices=True)
    indices = intersection[1] # List of indices in currgenes that equal refgenes 
    
    axis = (1 if sample.ndim == 2 else 0)
    sample = np.sort(sample, axis=axis)
    sample = np.take(sample, indices, axis=axis)

    return torch.from_numpy(sample)

In [8]:
datafiles, labelfiles = list(INTERIM_DATA_AND_LABEL_FILES_LIST.keys()), list(INTERIM_DATA_AND_LABEL_FILES_LIST.values())

datafiles = [os.path.join('..', 'data', 'interim', f) for f in datafiles]
labelfiles = [os.path.join('..', 'data', 'processed/labels', f) for f in labelfiles]
datafiles, labelfiles

(['../data/interim/primary_bhaduri_T.csv',
  '../data/interim/allen_cortex_T.csv',
  '../data/interim/allen_m1_region_T.csv',
  '../data/interim/whole_brain_bhaduri_T.csv'],
 ['../data/processed/labels/primary_bhaduri_labels.csv',
  '../data/processed/labels/allen_cortex_labels.csv',
  '../data/processed/labels/allen_m1_region_labels.csv',
  '../data/processed/labels/whole_brain_bhaduri_labels.csv'])

In [9]:
train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
loader = DataLoader(train, batch_size=4)
currgenes = train.columns

In [10]:
onedsample = train[0][0]
len(onedsample)

19765

In [11]:
t = (clean_sample(onedsample, refgenes, currgenes))
t

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [12]:
len(t)

16604

In [13]:
twodsample = next(iter(loader))[0]
twodsample.shape

torch.Size([4, 19765])

In [14]:
# %%timeit

sample = clean_sample(twodsample, refgenes, currgenes)

In [15]:
len(sample[0])

16604

In [16]:
# for X, y in tqdm(loader):
#     X = clean_sample(X, refgenes, currgenes)

In [17]:
sample.ndim

2

In [18]:
temp = pd.read_csv(datafiles[0], nrows=1, header=1).columns 

In [19]:
# cols = []
# for file in datafiles:
#     # Read in columns, split by | (since some are PVALB|PVALB), and make sure all are uppercase
#     temp = pd.read_csv(file, nrows=1, header=1).columns 
#     temp = [x.split('|')[0].upper().strip() for x in temp]
    
#     print(f'Temp is {temp[0:5]}...')
#     cols.append(set(temp))

# unique = list(set.intersection(*cols))
# unique = sorted(unique)

In [20]:
# len(unique)

In [21]:
# temp = pd.read_csv(datafiles[0], nrows=1, header=1).columns 
# temp = [x.strip().upper() for x in temp]
# l = train.features

In [22]:
# l == temp

In [23]:
# len(set(unique).intersection(l))

In [24]:
# len(set(unique))

In [25]:
# len(set(unique).intersection([x.upper().strip() for x in l]))

In [26]:
train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
loader = DataLoader(train, batch_size=4)

model = GeneClassifier(
    N_features=len(train.columns),
    N_labels=len(train.labels)
)

Model initialized. N_features = 19765, N_labels = 9. Metrics are {'accuracy': <function accuracy at 0x7fe6432a2f70>, 'precision': <function precision at 0x7fe6432c65e0>, 'recall': <function recall at 0x7fe6432c6700>} and weighted_metrics = False


In [27]:
sample = next(iter(loader))[0]
sample

tensor([[2.8467, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 1.8507, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [1.6067, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

In [28]:
# %%timeit

# model(sample)

Now let's time iterating over our dataloader with and without the extra data cleaning

In [29]:
# for X, y in tqdm(loader):
#     X
#     model(X)

In [30]:
train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
loader = DataLoader(train, batch_size=4)

model = GeneClassifier(
    N_features=len(refgenes),
    N_labels=len(train.labels)
)

# for X, y in tqdm(loader):
#     X = clean_sample(X, refgenes, train.columns)
#     model(X)

Model initialized. N_features = 16604, N_labels = 9. Metrics are {'accuracy': <function accuracy at 0x7fe6432a2f70>, 'precision': <function precision at 0x7fe6432c65e0>, 'recall': <function recall at 0x7fe6432c6700>} and weighted_metrics = False


In [31]:
df1_data = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
df2_data = GeneExpressionData(datafiles[1], labelfiles[1], 'Type', skip=3)
df3_data = GeneExpressionData(datafiles[2], labelfiles[2], 'Type', skip=3)
df4_data = GeneExpressionData(datafiles[3], labelfiles[3], 'Type', skip=3)

In [32]:
df3_data[0]

(tensor([0., 0., 0.,  ..., 0., 0., 0.]), 7)

In [33]:
df4_data[0]

(tensor([0., 0., 0.,  ..., 0., 0., 0.]), 16)

In [34]:
from src.models.lib.data import _generate_stratified_dataset

train, test = _generate_stratified_dataset(
        dataset_files=datafiles,
        label_files=labelfiles,
        class_label='Type',
    )


In [35]:
# train[0]

We can see that it's much faster to clean the sample on each minibatch, since numpy clearly scales well under-the-hood. Therefore, we'll have to write a manual training loop as we can no longer use pytorch lightning.

In [36]:
from pytorch_lightning import Trainer

# train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
# loader = DataLoader(train, batch_size=4)

# model = GeneClassifier(
#     N_features=len(train.columns),
#     load
# )

In [37]:
combined, test, insize, numlabels, weights = generate_datasets(datafiles, labelfiles, 'Type', skip=3)
# numlabels

In [38]:
train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
trainloader = DataLoader(train, batch_size=4)

net = GeneClassifier(
    N_features=len(train.columns),
    N_labels=max(train.labels)
)

Model initialized. N_features = 19765, N_labels = 17. Metrics are {'accuracy': <function accuracy at 0x7fe6432a2f70>, 'precision': <function precision at 0x7fe6432c65e0>, 'recall': <function recall at 0x7fe6432c6700>} and weighted_metrics = False


In [39]:
loaders = []
refgenes = gene_intersection()

In [40]:
for datafile, labelfile in zip(datafiles, labelfiles):
    data = GeneExpressionData(
            datafile,
            labelfile,
            'Type',
            cast=False,
    )
    
#     print(data[0][0][0:5])
    loaders.append(data)

In [41]:
# for data in loaders:
#     print(data.name)
#     print(data[0][0][0:5])

In [42]:
loaders = [DataLoader(data, batch_size=4) for data in loaders]

In [43]:
# df1 = pd.read_csv(datafiles[0], nrows=75, header=1)
# df2 = pd.read_csv(datafiles[1], nrows=75, header=1)
# df3 = pd.read_csv(datafiles[2], nrows=75, header=1)
# df4 = pd.read_csv(datafiles[3], nrows=75, header=1)

# df1_labels = pd.read_csv(labelfiles[0])
# df2_labels = pd.read_csv(labelfiles[1])
# df3_labels = pd.read_csv(labelfiles[2])
# df4_labels = pd.read_csv(labelfiles[3])

In [44]:
# df1_labels.loc[186471, 'cell']

In [45]:
# df1_labels['Type'].value_counts()

In [46]:
# df3

In [47]:
# df4

In [48]:
df1_data = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
df2_data = GeneExpressionData(datafiles[1], labelfiles[1], 'Type', skip=3)
df3_data = GeneExpressionData(datafiles[2], labelfiles[2], 'Type', skip=3)
df4_data = GeneExpressionData(datafiles[3], labelfiles[3], 'Type', skip=3)

In [49]:
datasets = [df1_data, df2_data, df3_data, df4_data]

In [50]:
for dataset in datasets:
    print(dataset.columns[0:10])

['FO538757.2', 'AP006222.2', 'RP11-206L10.9', 'FAM41C', 'NOC2L', 'AGRN', 'C1ORF159', 'ACAP3', 'CPSF3L', 'AURKAIP1']
['3.8-1.2', '3.8-1.3', '3.8-1.4', '3.8-1.5', '5-HT3C2', 'A1BG', 'A1BG-AS1', 'A1CF', 'A2M', 'A2M-AS1']
['DDX11L1', 'WASH7P', 'MIR6859-1', 'MIR1302-2', 'FAM138A', 'LOC105379212', 'OR4G4P', 'OR4G11P', 'OR4F5', 'LOC105379213']
['RP11-34P13.7', 'FO538757.2', 'AP006222.2', 'RP4-669L17.10', 'RP11-206L10.9', 'LINC00115', 'FAM41C', 'RP11-54O7.16', 'RP11-54O7.1', 'RP11-54O7.2']


In [51]:
df2_data = GeneExpressionData(datafiles[1], labelfiles[1], 'Type', skip=2)
len(df2_data[0][0])

50281

In [52]:
df2_data[0]

(tensor([0., 0., 0.,  ..., 0., 0., 0.]), 7)

The falses makes sense since the indices arent being changed when we're reading in the pure dataframes

In [None]:
# k1 = df2_data[0][0]
# k1

In [None]:
# np.linalg.norm(df2.iloc[1, :] - k1.numpy())

In [None]:
# df2_labels

In [None]:
# df2_labels_raw = pd.read_csv('../data/interim/labels/allen_cortex_labels.csv')
# df2_labels_raw

In [None]:
# df2_data[0]

In [None]:
# df2_labels.head(10)

In [None]:
# all(np.isclose(df2.loc[0, :], df2_data[0][0]))

In [None]:
# df2_data = GeneExpressionData(datafiles[1], labelfiles[1], 'Type', skip=3)
# all(np.isclose(df2.loc[1, :], df2_data[0][0]))

In [None]:
# df2_data[0]

In [None]:
# df4_labels.head(10)

In [None]:
# df4_labels.loc[5, 'cell']

In [None]:
# df4_data[5]

In [None]:
# np.linalg.norm(df4.loc[7, :].values - df4_data[5][0].numpy())

Ok, so this issue of non-matching just seems to be with the second dataset, which also requires a different skip number and seems to have some weird behavior. Investigate this one more, which is `allen_cortex_T.csv`.

In [None]:
# for i in range(50):
#     k = df1.loc[df1_labels.loc[i, 'cell']]
#     s = df1_data[i][0]
    
#     print(all(np.isclose(k, s)))

In [None]:
# for i in range(50):
#     k = df2.loc[df2_labels.loc[i, 'cell']]
#     s = df2_data[i][0]
    
#     print(all(np.isclose(k, s)))

In [None]:
# df2_data = GeneExpressionData(datafiles[1], labelfiles[1], 'Type', skip=3)
# df2_data[0]

In [None]:
# df2_data[1]

In [None]:
# for i in range(50):
#     k = df2.loc[df2_labels.loc[i, 'cell']]
#     s = df2_data[i][0]
    
#     print(all(np.isclose(k, s)))

In [None]:
# for i in range(50):
#     k = df3.loc[df3_labels.loc[i, 'cell']]
#     s = df3_data[i][0]
    
#     print(all(np.isclose(k, s)))

In [None]:
# for i in range(50):
#     k = df4.loc[df4_labels.loc[i, 'cell']]
#     s = df4_data[i][0]
    
#     print(all(np.isclose(k, s)))

Now, let's write our training loop using all four datasets.

In [None]:
# loaders = [df1_data, df2_data, df3_data, df4_data]
# loaders = [DataLoader(data, batch_size=2) for data in loaders]

In [None]:
# next(iter(loaders[0]))

Now, let's time the DataLoader vs the custom DataLoader from Pytorch Tabular found here: https://github.com/hcarlens/pytorch-tabular/blob/master/fast_tensor_data_loader.py

In [55]:
train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)

trainloader = DataLoader(train, batch_size=4)
fastloader = FastTensorDataLoader(train, batch_size=4)

train, val, test = generate_single_dataset(datafiles[0], labelfiles[0], 'Type', skip=3, index_col='cell', cast=True)

In [56]:
trainloaders, valloaders, testloaders = [], [], []

In [60]:
def gen_loaders(batch_size, num_workers):
    traindata = []
    for datafile, labelfile in zip(datafiles, labelfiles):
        train, val, test = generate_single_dataset(
            datafile,
            labelfile,
            'Type', 
            skip=3, 
            index_col='cell', 
            cast=True
        )

        traindata.append(
            GeneExpressionData(
                datafile,
                labelfile,
                'Type',
                skip=3,
            )
        )

        trainloaders.append(
            DataLoader(train, batch_size=batch_size, num_workers=num_workers)
        )

        valloaders.append(
            DataLoader(val, batch_size=batch_size, num_workers=num_workers)
        )

        testloaders.append(
            DataLoader(test, batch_size=batch_size, num_workers=num_workers)
        )
        
        return trainloaders, valloaders, testloaders
        
trainloaders, valloaders, testloaders = gen_loaders(4, 0)

In [69]:
trainloaders[0].dataset.dataset.columns

['FO538757.2',
 'AP006222.2',
 'RP11-206L10.9',
 'FAM41C',
 'NOC2L',
 'AGRN',
 'C1ORF159',
 'ACAP3',
 'CPSF3L',
 'AURKAIP1',
 'CCNL2',
 'MRPL20',
 'SSU72',
 'RP5-832C2.5',
 'CDK11B',
 'SLC35E2B',
 'CDK11A',
 'NADK',
 'GNB1',
 'PRKCZ',
 'FAAP20',
 'SKI',
 'RER1',
 'FAM213B',
 'PRDM16',
 'WRAP73',
 'TP73-AS1',
 'LRRC47',
 'DFFB',
 'C1ORF174',
 'CHD5',
 'RPL22',
 'GPR153',
 'ACOT7',
 'NOL9',
 'ZBTB48',
 'THAP3',
 'DNAJC11',
 'CAMTA1',
 'VAMP3',
 'PARK7',
 'SLC45A1',
 'RERE',
 'ENO1',
 'CLSTN1',
 'CTNNBIP1',
 'LZIC',
 'NMNAT1',
 'UBE4B',
 'KIF1B',
 'PGD',
 'APITD1-CORT',
 'APITD1',
 'DFFA',
 'PEX14',
 'TARDBP',
 'RP4-635E18.8',
 'SRM',
 'EXOSC10',
 'MTOR',
 'UBIAD1',
 'PTCHD2',
 'FBXO44',
 'MAD2L2',
 'DRAXIN',
 'AGTRAP',
 'MTHFR',
 'CLCN6',
 'KIAA2013',
 'MFN2',
 'MIIP',
 'VPS13D',
 'PDPN',
 'PRDM2',
 'KAZN',
 'DNAJC16',
 'DDI2',
 'PLEKHM2',
 'FBLIM1',
 'UQCRHL',
 'FLJ37453',
 'SPEN',
 'ZBTB17',
 'FBXO42',
 'SZRD1',
 'NECAP2',
 'RP4-798A10.7',
 'NBPF1',
 'RP11-108M9.4',
 'RP11-108M9.6',
 '

In [61]:
for i, sample in enumerate(tqdm(trainloader)):
    X, y = sample 
    net(X)
    if i == 200:
        break

  0%|▋                                                                                                                                                 | 200/46619 [00:04<16:35, 46.64it/s]


In [62]:
for i, sample in enumerate(tqdm(fastloader)):
    t = sample
    
    if i == 200:
        break

  0%|▌                                                                                                                                                | 200/46619 [00:01<05:55, 130.57it/s]


In [63]:
X = next(iter(trainloaders[0]))[0]
X.shape

torch.Size([4, 19765])

In [65]:
# clean_sample(X, refgenes, traindata[0].columns)

In [67]:
# traindata[0].labels

In [71]:
import torch.optim as optim
import torch.nn as nn
import wandb

model = GeneClassifier(
    N_features=len(refgenes),
    N_labels=18,
)

wandb.init()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

wandb.watch(model, log_freq=100)

for epoch in range(100):  # loop over the dataset multiple times
    running_loss = 0.0
    
    # Train loop
    for trainidx, trainloader in enumerate(trainloaders):
        model.train()
        print(f'Training on {trainidx}')
        
        for i, data in enumerate(tqdm(trainloader)):
            inputs, labels = data
            # CLEAN INPUTS
            inputs = clean_sample(inputs, refgenes, trainloader.dataset.dataset.columns)
            # Forward pass ➡
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass ⬅
            optimizer.zero_grad()
            loss.backward()

            # Step with optimizer
            optimizer.step()
            
            # print statistics
            running_loss += loss.item()
            if i % 10 == 0:    # print every 2000 mini-batches
                running_loss = 0.0
                wandb.log({"train_loss": loss})
                
    # Validation loops 
    for validx, valloader in enumerate(valloaders):
        model.eval()
        
        for i, data in enumerate(tqdm(valloader)):
            inputs, labels = data
            inputs = clean_sample(inputs, refgenes, valloader.dataset.dataset.columns)
            
            outputs = model(batch)
            loss = criterion(outputs, labels)
            
            running_loss += loss.item()
            
            if i % 10 == 0:
                wandb.log({"val_loss": loss})
    
print('Finished train/validation, calculating test error')

Model initialized. N_features = 16604, N_labels = 18. Metrics are {'accuracy': <function accuracy at 0x7fe6432a2f70>, 'precision': <function precision at 0x7fe6432c65e0>, 'recall': <function recall at 0x7fe6432c6700>} and weighted_metrics = False


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Training on 0


  1%|█▏                                                                                                                                                | 291/37295 [00:15<31:50, 19.37it/s]


KeyboardInterrupt: 

In [None]:
def train_batch(batch, labels, model, optimizer, criterion):
    images, labels = images.to(device), labels.to(device)
    
    # Forward pass ➡
    outputs = model(batch)
    loss = criterion(outputs, labels)
    
    # Backward pass ⬅
    optimizer.zero_grad()
    loss.backward()

    # Step with optimizer
    optimizer.step()

    return loss

def validate_model(model, test_loader):
    model.eval()

    # Run the model on some test examples
    with torch.no_grad():
        correct, total = 0, 0
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        print(f"Accuracy of the model on the {total} " +
              f"test images: {100 * correct / total}%")
        
        wandb.log({"test_accuracy": correct / total})

    # Save the model in the exchangeable ONNX format
    torch.onnx.export(model, images, "model.onnx")
    wandb.save("model.onnx")
    

In [None]:
# tell wandb to get started
with wandb.init(project="pytorch-demo", config=hyperparameters):
  # access all HPs through wandb.config, so logging matches execution!
  config = wandb.config

  # make the model, data, and optimization problem
  model, train_loader, test_loader, criterion, optimizer = make(config)

  # and use them to train the model
  train(model, train_loader, criterion, optimizer, config)

  # and test its final performance
  test(model, test_loader)

return model

In [72]:
print(torch.randn(10, 5).softmax(dim=-1))
torch.randint(5, (10,))

tensor([[0.4371, 0.0333, 0.1688, 0.2000, 0.1607],
        [0.1782, 0.2526, 0.3274, 0.1640, 0.0778],
        [0.0805, 0.0224, 0.6530, 0.1200, 0.1240],
        [0.2361, 0.3397, 0.2549, 0.1413, 0.0280],
        [0.1407, 0.0528, 0.4000, 0.1040, 0.3026],
        [0.2487, 0.4209, 0.1792, 0.0857, 0.0654],
        [0.1900, 0.0882, 0.3352, 0.2989, 0.0878],
        [0.0211, 0.3861, 0.2937, 0.1939, 0.1052],
        [0.0460, 0.5406, 0.0403, 0.0521, 0.3210],
        [0.1224, 0.2627, 0.4455, 0.0433, 0.1261]])


tensor([3, 0, 1, 1, 2, 4, 2, 0, 3, 3])

## Linecache speed testing 

If we can improve the speed of our __getitem__ method, we can train our model a lot faster. Since currently it requires two list comprehensions, let's see if we can increase the time 

In [None]:
import linecache 

line = linecache.getline('../data/interim/primary_bhaduri_T.csv', 5)
line = np.array(line.split(','), dtype=np.float32)

In [None]:
%%timeit 

line = linecache.getline('../data/interim/primary_bhaduri_T.csv', 5)
line = np.array(line.split(','), dtype=np.float32)

In [None]:
df1_data.name

In [None]:
%%timeit 

line = df1_data[5]