In [1]:
import pandas as pd 
import os
import sys
import pandas as pd
import numpy as np
from torch.utils.data import *
from tqdm import tqdm
import linecache 

sys.path.append('../src/')
sys.path.append('..')

from src.models.lib.neural import GeneClassifier

In [2]:
from src.models.lib.data import *
from src.helper import *

In [3]:
test = GeneExpressionData(
    filename='../data/interim/allen_cortex_T.csv',
    labelname='../data/processed/labels/allen_cortex_labels.csv',
    class_label='Type',
    cast=True,
    skip=3,
)


In [4]:
%%timeit

test[0:20]

88.5 ms ± 1.53 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
%%time

refgenes = gene_intersection()

CPU times: user 2.28 s, sys: 85.2 ms, total: 2.36 s
Wall time: 2.4 s


In [6]:
from torch.utils.data import DataLoader, ConcatDataset

loader = DataLoader(test, batch_size=4)
sample = next(iter(loader))
sample = sample[0].numpy()

In [7]:
def clean_sample(sample, refgenes, currgenes):
    intersection = np.intersect1d(currgenes, refgenes, return_indices=True)
    indices = intersection[1] # List of indices in currgenes that equal refgenes 
    
    axis = (1 if sample.ndim == 2 else 0)
    sample = np.sort(sample, axis=axis)
    sample = np.take(sample, indices, axis=axis)

    return torch.from_numpy(sample)

In [8]:
datafiles, labelfiles = list(INTERIM_DATA_AND_LABEL_FILES_LIST.keys()), list(INTERIM_DATA_AND_LABEL_FILES_LIST.values())

datafiles = [os.path.join('..', 'data', 'interim', f) for f in datafiles]
labelfiles = [os.path.join('..', 'data', 'processed/labels', f) for f in labelfiles]
datafiles, labelfiles

(['../data/interim/primary_bhaduri_T.csv',
  '../data/interim/allen_cortex_T.csv',
  '../data/interim/allen_m1_region_T.csv',
  '../data/interim/whole_brain_bhaduri_T.csv'],
 ['../data/processed/labels/primary_bhaduri_labels.csv',
  '../data/processed/labels/allen_cortex_labels.csv',
  '../data/processed/labels/allen_m1_region_labels.csv',
  '../data/processed/labels/whole_brain_bhaduri_labels.csv'])

In [9]:
train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
loader = DataLoader(train, batch_size=4)
currgenes = train.columns

In [10]:
onedsample = train[0][0]
len(onedsample)

19765

In [11]:
t = (clean_sample(onedsample, refgenes, currgenes))
t

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [12]:
len(t)

16604

In [13]:
# twodsample = next(iter(loader))[0]
# twodsample

In [14]:
# %%timeit

# clean_sample(twodsample, refgenes, currgenes)

In [15]:
# for X, y in tqdm(loader):
#     X = clean_sample(X, refgenes, currgenes)

In [16]:
sample.ndim

2

In [17]:
temp = pd.read_csv(datafiles[0], nrows=1, header=1).columns 

In [18]:
# cols = []
# for file in datafiles:
#     # Read in columns, split by | (since some are PVALB|PVALB), and make sure all are uppercase
#     temp = pd.read_csv(file, nrows=1, header=1).columns 
#     temp = [x.split('|')[0].upper().strip() for x in temp]
    
#     print(f'Temp is {temp[0:5]}...')
#     cols.append(set(temp))

# unique = list(set.intersection(*cols))
# unique = sorted(unique)

In [19]:
# len(unique)

In [20]:
# temp = pd.read_csv(datafiles[0], nrows=1, header=1).columns 
# temp = [x.strip().upper() for x in temp]
# l = train.features

In [21]:
# l == temp

In [22]:
# len(set(unique).intersection(l))

In [23]:
# len(set(unique))

In [24]:
# len(set(unique).intersection([x.upper().strip() for x in l]))

In [25]:
train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
loader = DataLoader(train, batch_size=4)

model = GeneClassifier(
    N_features=len(train.columns),
    N_labels=len(train.labels)
)

Model initialized. N_features = 19765, N_labels = 9. Metrics are {'accuracy': <function accuracy at 0x7f9234573f70>, 'precision': <function precision at 0x7f92345975e0>, 'recall': <function recall at 0x7f9234597700>} and weighted_metrics = False


In [26]:
sample = next(iter(loader))[0]
sample

tensor([[2.8467, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 1.8507, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [1.6067, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

In [27]:
# %%timeit

# model(sample)

Now let's time iterating over our dataloader with and without the extra data cleaning

In [28]:
# for X, y in tqdm(loader):
#     X
#     model(X)

In [29]:
train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
loader = DataLoader(train, batch_size=4)

model = GeneClassifier(
    N_features=len(refgenes),
    N_labels=len(train.labels)
)

# for X, y in tqdm(loader):
#     X = clean_sample(X, refgenes, train.columns)
#     model(X)

Model initialized. N_features = 16604, N_labels = 9. Metrics are {'accuracy': <function accuracy at 0x7f9234573f70>, 'precision': <function precision at 0x7f92345975e0>, 'recall': <function recall at 0x7f9234597700>} and weighted_metrics = False


In [30]:
df1_data = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
df2_data = GeneExpressionData(datafiles[1], labelfiles[1], 'Type', skip=2)
df3_data = GeneExpressionData(datafiles[2], labelfiles[2], 'Type', skip=3)
df4_data = GeneExpressionData(datafiles[3], labelfiles[3], 'Type', skip=3)

In [31]:
df3_data[0]

(tensor([0., 0., 0.,  ..., 0., 0., 0.]), 7)

In [32]:
df4_data[0]

(tensor([0., 0., 0.,  ..., 0., 0., 0.]), 16)

In [33]:
from src.models.lib.data import _generate_stratified_dataset

train, test = _generate_stratified_dataset(
        dataset_files=datafiles,
        label_files=labelfiles,
        class_label='Type',
    )


In [34]:
# train[0]

We can see that it's much faster to clean the sample on each minibatch, since numpy clearly scales well under-the-hood. Therefore, we'll have to write a manual training loop as we can no longer use pytorch lightning.

In [35]:
from pytorch_lightning import Trainer

# train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
# loader = DataLoader(train, batch_size=4)

# model = GeneClassifier(
#     N_features=len(train.columns),
#     load
# )

In [36]:
# combined, test, insize, numlabels, weights = generate_datasets(datafiles, labelfiles, 'Type')
# numlabels

In [37]:
train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
trainloader = DataLoader(train, batch_size=4)

net = GeneClassifier(
    N_features=len(train.columns),
    N_labels=max(train.labels)
)

Model initialized. N_features = 19765, N_labels = 17. Metrics are {'accuracy': <function accuracy at 0x7f9234573f70>, 'precision': <function precision at 0x7f92345975e0>, 'recall': <function recall at 0x7f9234597700>} and weighted_metrics = False


In [38]:
loaders = []
refgenes = gene_intersection()

In [39]:
for datafile, labelfile in zip(datafiles, labelfiles):
    data = GeneExpressionData(
            datafile,
            labelfile,
            'Type',
            cast=False,
    )
    
#     print(data[0][0][0:5])
    loaders.append(data)

In [40]:
# for data in loaders:
#     print(data.name)
#     print(data[0][0][0:5])

In [41]:
loaders = [DataLoader(data, batch_size=4) for data in loaders]

In [42]:
df1 = pd.read_csv(datafiles[0], nrows=75, header=1)
df2 = pd.read_csv(datafiles[1], nrows=75, header=1)
df3 = pd.read_csv(datafiles[2], nrows=75, header=1)
df4 = pd.read_csv(datafiles[3], nrows=75, header=1)

df1_labels = pd.read_csv(labelfiles[0])
df2_labels = pd.read_csv(labelfiles[1])
df3_labels = pd.read_csv(labelfiles[2])
df4_labels = pd.read_csv(labelfiles[3])

In [43]:
df1_labels.loc[186471, 'cell']

189404

In [44]:
df1_labels['Type'].value_counts()

4     122958
16     29563
7      20609
8       4510
6       3863
17      2451
11      1888
9        363
3        271
Name: Type, dtype: int64

In [45]:
df3

Unnamed: 0,DDX11L1|DDX11L1,WASH7P|WASH7P,MIR6859-1|MIR6859-1,MIR1302-2|MIR1302-2,FAM138A|FAM138A,LOC105379212|LOC105379212,OR4G4P|OR4G4P,OR4G11P|OR4G11P,OR4F5|OR4F5,LOC105379213|LOC105379213,...,LOC105374091|LOC105374091,PLXNA1|PLXNA1,LOC105374092|LOC105374092,C3orf56|C3orf56,LOC101060159|LOC101060159,LOC105374093|LOC105374093,LOC100419008|LOC100419008,LOC101927123|LOC101927123,LOC105374094|LOC105374094,LINC01471|LINC01471
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
71,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
73,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
df4

Unnamed: 0,RP11-34P13.7|RP11-34P13.7,FO538757.2|FO538757.2,AP006222.2|AP006222.2,RP4-669L17.10|RP4-669L17.10,RP11-206L10.9|RP11-206L10.9,LINC00115|LINC00115,FAM41C|FAM41C,RP11-54O7.16|RP11-54O7.16,RP11-54O7.1|RP11-54O7.1,RP11-54O7.2|RP11-54O7.2,...,FADS2|FADS2,FADS1|FADS1,FADS3|FADS3,BEST1|BEST1,FTH1|FTH1,RP11-810P12.5|RP11-810P12.5,INCENP|INCENP,RP11-703H8.7|RP11-703H8.7,ASRGL1|ASRGL1,CTD-2531D15.5|CTD-2531D15.5
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
73,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [47]:
df1_data = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)
df2_data = GeneExpressionData(datafiles[1], labelfiles[1], 'Type', skip=3)
df3_data = GeneExpressionData(datafiles[2], labelfiles[2], 'Type', skip=3)
df4_data = GeneExpressionData(datafiles[3], labelfiles[3], 'Type', skip=3)

In [48]:
datasets = [df1_data, df2_data, df3_data, df4_data]

In [49]:
for dataset in datasets:
    print(dataset.columns[0:10])

['FO538757.2', 'AP006222.2', 'RP11-206L10.9', 'FAM41C', 'NOC2L', 'AGRN', 'C1ORF159', 'ACAP3', 'CPSF3L', 'AURKAIP1']
['3.8-1.2', '3.8-1.3', '3.8-1.4', '3.8-1.5', '5-HT3C2', 'A1BG', 'A1BG-AS1', 'A1CF', 'A2M', 'A2M-AS1']
['DDX11L1', 'WASH7P', 'MIR6859-1', 'MIR1302-2', 'FAM138A', 'LOC105379212', 'OR4G4P', 'OR4G11P', 'OR4F5', 'LOC105379213']
['RP11-34P13.7', 'FO538757.2', 'AP006222.2', 'RP4-669L17.10', 'RP11-206L10.9', 'LINC00115', 'FAM41C', 'RP11-54O7.16', 'RP11-54O7.1', 'RP11-54O7.2']


In [50]:
df2_data = GeneExpressionData(datafiles[1], labelfiles[1], 'Type', skip=2)
len(df2_data[0][0])

50281

In [51]:
df2_data[0]

(tensor([0., 0., 0.,  ..., 0., 0., 0.]), 7)

In [52]:
all(np.isclose(df1_data[0][0], df1.loc[0, :].values))

True

In [53]:
all(np.isclose(df2_data[0][0], df2.loc[0, :].values))

True

In [54]:
all(np.isclose(df3_data[0][0], df3.loc[0, :].values))

True

In [55]:
all(np.isclose(df3_data[0][0], df3.loc[0, :].values))

True

The falses makes sense since the indices arent being changed when we're reading in the pure dataframes

In [56]:
k1 = df2_data[0][0]
k1

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [57]:
np.linalg.norm(df2.iloc[1, :] - k1.numpy())

28177.153316117652

In [58]:
df2_labels

Unnamed: 0,cell,Type
0,1,7
1,2,7
2,3,7
3,4,7
4,5,7
...,...,...
47504,49489,0
47505,49490,7
47506,49491,7
47507,49492,12


In [59]:
df2_labels_raw = pd.read_csv('../data/interim/labels/allen_cortex_labels.csv')
df2_labels_raw

Unnamed: 0.1,Unnamed: 0,Type
0,0,Exclude
1,1,Interneuron
2,2,Interneuron
3,3,Interneuron
4,4,Interneuron
...,...,...
49489,49489,Astrocyte
49490,49490,Interneuron
49491,49491,Interneuron
49492,49492,Oligodendrocyte


In [60]:
df2_data[0]

(tensor([0., 0., 0.,  ..., 0., 0., 0.]), 7)

In [61]:
df2_labels.head(10)

Unnamed: 0,cell,Type
0,1,7
1,2,7
2,3,7
3,4,7
4,5,7
5,6,7
6,7,7
7,8,7
8,9,7
9,11,15


In [62]:
all(np.isclose(df2.loc[0, :], df2_data[0][0]))

True

In [63]:
df2_data = GeneExpressionData(datafiles[1], labelfiles[1], 'Type', skip=3)
all(np.isclose(df2.loc[1, :], df2_data[0][0]))

True

In [64]:
df2_data[0]

(tensor([0., 0., 0.,  ..., 0., 0., 0.]), 7)

In [65]:
df4_labels.head(10)

Unnamed: 0,cell,Type
0,0,16
1,1,2
2,2,2
3,3,4
4,4,2
5,7,16
6,8,2
7,9,2
8,10,4
9,11,2


In [66]:
df4_labels.loc[5, 'cell']

7

In [67]:
df4_data[5]

(tensor([0., 0., 0.,  ..., 0., 0., 0.]), 16)

In [68]:
np.linalg.norm(df4.loc[7, :].values - df4_data[5][0].numpy())

0.0

Ok, so this issue of non-matching just seems to be with the second dataset, which also requires a different skip number and seems to have some weird behavior. Investigate this one more, which is `allen_cortex_T.csv`.

In [69]:
for i in range(50):
    k = df1.loc[df1_labels.loc[i, 'cell']]
    s = df1_data[i][0]
    
    print(all(np.isclose(k, s)))

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [70]:
for i in range(50):
    k = df2.loc[df2_labels.loc[i, 'cell']]
    s = df2_data[i][0]
    
    print(all(np.isclose(k, s)))

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [71]:
df2_data = GeneExpressionData(datafiles[1], labelfiles[1], 'Type', skip=3)
df2_data[0]

(tensor([0., 0., 0.,  ..., 0., 0., 0.]), 7)

In [72]:
df2_data[1]

(tensor([0., 0., 0.,  ..., 0., 0., 0.]), 7)

In [73]:
for i in range(50):
    k = df2.loc[df2_labels.loc[i, 'cell']]
    s = df2_data[i][0]
    
    print(all(np.isclose(k, s)))

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [74]:
for i in range(50):
    k = df3.loc[df3_labels.loc[i, 'cell']]
    s = df3_data[i][0]
    
    print(all(np.isclose(k, s)))

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [75]:
for i in range(50):
    k = df4.loc[df4_labels.loc[i, 'cell']]
    s = df4_data[i][0]
    
    print(all(np.isclose(k, s)))

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


Now, let's write our training loop using all four datasets.

In [76]:
loaders = [df1_data, df2_data, df3_data, df4_data]
loaders = [DataLoader(data, batch_size=2) for data in loaders]

In [77]:
next(iter(loaders[0]))

[tensor([[2.8467, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]]),
 tensor([16, 16])]

Now, let's time the DataLoader vs the custom DataLoader from Pytorch Tabular found here: https://github.com/hcarlens/pytorch-tabular/blob/master/fast_tensor_data_loader.py

In [78]:
train = GeneExpressionData(datafiles[0], labelfiles[0], 'Type', skip=3)

trainloader = DataLoader(train, batch_size=4)
fastloader = FastTensorDataLoader(train, batch_size=4)

In [101]:
for i, sample in enumerate(tqdm(trainloader)):
    X, y = sample 
    net(X)
    if i == 200:
        break

  0%|▉                                                                                                                                                                                                                     | 200/46619 [00:03<15:19, 50.48it/s]


In [102]:
for i, sample in enumerate(tqdm(fastloader)):
    t = sample
    
    if i == 200:
        break

  0%|▉                                                                                                                                                                                                                    | 200/46619 [00:01<05:53, 131.46it/s]


In [103]:
s = next(iter(fastloader))

In [109]:
s

([(tensor([2.8467, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]), 16),
  (tensor([0., 0., 0.,  ..., 0., 0., 0.]), 16),
  (tensor([0.0000, 1.8507, 0.0000,  ..., 0.0000, 0.0000, 0.0000]), 4),
  (tensor([1.6067, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]), 4)],)

In [98]:
x = [df1_data[i] for i in range(5)]
x

[(tensor([2.8467, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]), 16),
 (tensor([0., 0., 0.,  ..., 0., 0., 0.]), 16),
 (tensor([0.0000, 1.8507, 0.0000,  ..., 0.0000, 0.0000, 0.0000]), 4),
 (tensor([1.6067, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]), 4),
 (tensor([0., 0., 0.,  ..., 0., 0., 0.]), 4)]

In [100]:
next(iter(trainloader))

[tensor([[2.8467, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 1.8507, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [1.6067, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]]),
 tensor([16, 16,  4,  4])]

In [110]:
import torch.optim as optim
import torch.nn as nn
import wandb

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

for epoch in range(100):  # loop over the dataset multiple times
    
    running_loss = 0.0
    for trainidx, trainloader in enumerate(loaders):
        print(f'Training on {trainidx}')
        for i, data in enumerate(tqdm(trainloader), 0):
            inputs, labels = data

            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 10 == 0:    # print every 2000 mini-batches
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
                running_loss = 0.0

print('Finished Training')

Training on 0


  0%|                                                                                                                                                                                                                      | 1/93238 [00:00<4:53:41,  5.29it/s]

[1,     1] loss: 0.001


  0%|                                                                                                                                                                                                                     | 15/93238 [00:00<1:02:06, 25.01it/s]

[1,    11] loss: 0.015


  0%|                                                                                                                                                                                                                       | 24/93238 [00:01<57:15, 27.13it/s]

[1,    21] loss: 0.011


  0%|                                                                                                                                                                                                                       | 36/93238 [00:01<55:14, 28.12it/s]

[1,    31] loss: 0.017


  0%|                                                                                                                                                                                                                       | 45/93238 [00:01<54:33, 28.47it/s]

[1,    41] loss: 0.013


  0%|                                                                                                                                                                                                                       | 54/93238 [00:02<54:05, 28.71it/s]

[1,    51] loss: 0.007


  0%|▏                                                                                                                                                                                                                      | 66/93238 [00:02<54:32, 28.47it/s]

[1,    61] loss: 0.014


  0%|▏                                                                                                                                                                                                                      | 74/93238 [00:02<59:40, 26.02it/s]

[1,    71] loss: 0.012





KeyboardInterrupt: 

In [None]:
def train_batch(images, labels, model, optimizer, criterion):
    images, labels = images.to(device), labels.to(device)
    
    # Forward pass ➡
    outputs = model(images)
    loss = criterion(outputs, labels)
    
    # Backward pass ⬅
    optimizer.zero_grad()
    loss.backward()

    # Step with optimizer
    optimizer.step()

    return loss

## Linecache speed testing 

If we can improve the speed of our __getitem__ method, we can train our model a lot faster. Since currently it requires two list comprehensions, let's see if we can increase the time 

In [85]:
import linecache 

line = linecache.getline('../data/interim/primary_bhaduri_T.csv', 5)
line = np.array(line.split(','), dtype=np.float32)

In [86]:
%%timeit 

line = linecache.getline('../data/interim/primary_bhaduri_T.csv', 5)
line = np.array(line.split(','), dtype=np.float32)

1.86 ms ± 5.78 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [87]:
df1_data.name

'../data/interim/primary_bhaduri_T.csv'

In [88]:
%%timeit 

line = df1_data[5]

1.87 ms ± 9.75 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
