# Dataloading 01

In this notebook, we'll figure out how to use PyTorch's `DataLoader` class to load our massive files without reading them entirely into memory.

In [8]:
import comet_ml
import dask.dataframe as dd
import pandas as pd 
import torch
import linecache 
import csv
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
import torch.nn.functional as F
import sys, os
from pathlib import Path
import plotly.express as px 
from sklearn.utils.class_weight import compute_class_weight
import torch

# Remember the directory the notebook was launched from.
here = Path.cwd()

# Make the project's `src` directory importable (path is relative to the notebook).
sys.path.append('../src')

We'll first design a custom dataset to use with PyTorch's `DataLoader` class.

In [2]:
# Pulls every public name from the project module into the notebook namespace;
# `GeneExpressionData`, used in the cells below, comes from here.
# NOTE(review): a star import hides which names the notebook depends on --
# consider importing the needed names explicitly once the full list is known.
from models.train_neural_network import *

In [6]:
# Peek at the organoid label table. The recorded output shows integer-encoded
# Class/State/Type/Subtype columns next to what appear to be two leftover
# index columns ('Unnamed: 0.1' and 'Unnamed: 0').
test = pd.read_csv(
    filepath_or_buffer='../data/processed/meta_organoid_labels.csv',
)
test

Unnamed: 0.1,Unnamed: 0,Class,State,Type,Subtype
0,0,1,0,5,15
1,1,1,0,5,15
2,2,1,0,5,15
3,3,1,1,5,12
4,4,1,0,5,15
...,...,...,...,...,...
235116,235116,1,1,5,13
235117,235117,1,1,2,3
235118,235118,0,3,1,14
235119,235119,1,1,5,8


In [5]:
# Dataset over the primary-tissue expression matrix, paired with its label
# file; 'Subtype' is passed as the class label.
primary = GeneExpressionData(
    class_label='Subtype',
    labelname='../data/processed/meta_primary_labels.csv',
    filename='../data/processed/primary.csv',
)

In [7]:
# Dataset over the organoid expression matrix, paired with its label file;
# 'Subtype' is passed as the class label.
organoid = GeneExpressionData(
    class_label='Subtype',
    labelname='../data/processed/meta_organoid_labels.csv',
    filename='../data/processed/organoid.csv',
)

In [17]:
# Number of samples in each dataset, as a (primary, organoid) tuple.
tuple(map(len, (primary, organoid)))

(189409, 235121)

In [18]:
# Echo the dataset object; the recorded output is a bare
# `<models.train_neural_network.GeneExpressionData at 0x...>` repr.
primary

<models.train_neural_network.GeneExpressionData at 0x7f9f500d1160>

In [19]:
# Chain the two datasets end to end so they can be indexed as one;
# the combined length is the sum of the two parts.
combined = torch.utils.data.ConcatDataset(datasets=[primary, organoid])
len(combined)

424530

In [41]:
# Pick the dataset that the train/test split below operates on. Only `primary`
# is selected here; the `combined` ConcatDataset built earlier is not used.
dataset = primary

In [42]:
# 80/20 train/test split of `dataset`.
train_size = int(0.80 * len(dataset))
# Take the remainder so the two sizes always sum to len(dataset).
test_size = len(dataset) - train_size

# Seed the split: without an explicit generator, random_split draws from the
# global RNG and the partition changes on every kernel restart.
split_rng = torch.Generator().manual_seed(42)

# NOTE: `test` rebinds the name used earlier for the labels DataFrame.
train, test = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
    generator=split_rng,
)


In [44]:
# NOTE(review): this call is recorded as raising AttributeError. `train` comes
# from torch.utils.data.random_split, which returns Subset wrappers; a Subset
# exposes the wrapped object as `.dataset` and the selected rows as `.indices`
# but does not forward other attributes. Presumably `compute_class_weights`
# is defined on GeneExpressionData -- TODO confirm, and decide whether the
# weights should be computed from the training indices only.
train.compute_class_weights()

AttributeError: 

In [27]:
# NOTE(review): `org` is not assigned in any visible cell, so this fails on a
# fresh kernel. Judging by the recorded output it is the organoid metadata
# table with string-valued labels -- TODO add the cell that loads it.
org

Unnamed: 0,V1,Cluster,Sample,Line,Protocol,Age,iPSCorhESC,Class,State,Type,Subtype
0,H1SWeek3_AAACCTGAGACAAAGG,29,H1SWeek3,H1,Less Directed,3,hESC,Nonneuronal,Dividing,RadialGlia,panRG
1,H1SWeek3_AAACCTGAGCACACAG,5,H1SWeek3,H1,Less Directed,3,hESC,Nonneuronal,Dividing,RadialGlia,panRG
2,H1SWeek3_AAACCTGAGGATGGAA,35,H1SWeek3,H1,Less Directed,3,hESC,Nonneuronal,Dividing,RadialGlia,panRG
3,H1SWeek3_AAACCTGCAATTGCTG,36,H1SWeek3,H1,Less Directed,3,hESC,Nonneuronal,Nondividing,RadialGlia,hindbrainRG
4,H1SWeek3_AAACCTGCAGCGTAAG,5,H1SWeek3,H1,Less Directed,3,hESC,Nonneuronal,Dividing,RadialGlia,panRG
...,...,...,...,...,...,...,...,...,...,...,...
235116,WTC10SWeek10_TTTGTCAAGTGCAAGC,15,YH10SWeek10,YH10,Less Directed,10,iPSC,Nonneuronal,Nondividing,RadialGlia,lowquality
235117,WTC10SWeek10_TTTGTCACAAGTTGTC,35,YH10SWeek10,YH10,Less Directed,10,iPSC,Nonneuronal,Nondividing,IPC,MatureIPC
235118,WTC10SWeek10_TTTGTCAGTCTAAACC,2,YH10SWeek10,YH10,Less Directed,10,iPSC,Neuronal,Postmitotic,ExcitatoryNeuron,panNeuron
235119,WTC10SWeek10_TTTGTCATCACAGGCC,4,YH10SWeek10,YH10,Less Directed,10,iPSC,Nonneuronal,Nondividing,RadialGlia,earlyRG


In [30]:
# Number of cells per value of the 'Type' column, most frequent first.
prim.loc[:, 'Type'].value_counts()

Excitatory Neuron     122958
Radial Glia            29563
Inhibitory Neuron      20609
Microglia               4510
IPC                     3863
Outlier                 2933
Red blood cells         2451
OPC                     1888
Mural                    363
Endothelial              271
Name: Type, dtype: int64

In [35]:
# Number of cells per value of the 'Age' column, most frequent first.
prim.loc[:, 'Age'].value_counts()

22    83653
18    78157
14    14435
10     7194
6      5970
Name: Age, dtype: int64

In [37]:
# Keep only the rows whose 'Age' value is 18 or greater.
is_age_18_plus = prim['Age'].ge(18)
prim.loc[is_age_18_plus]

Unnamed: 0,Cell,Area,Individual,Age,Class,State,Type,Subtype,Cluster
14435,AAACCTGAGAACTGTA_9512,hippocampus,GW18,18,Non-neuronal,Dividing,Radial Glia,late,42
14436,AAACCTGAGGACACCA_9513,hippocampus,GW18,18,Neuron,Postmitotic,Excitatory Neuron,Newborn,41
14437,AAACCTGCAAAGTGCG_9514,hippocampus,GW18,18,Neuron,Postmitotic,Inhibitory Neuron,MGE2,10
14438,AAACCTGCAGCCACCA_9515,hippocampus,GW18,18,Neuron,Postmitotic,Excitatory Neuron,Deep Layer,37
14439,AAACCTGCATCCTAGA_9516,hippocampus,GW18,18,Neuron,Postmitotic,Excitatory Neuron,Newborn,41
...,...,...,...,...,...,...,...,...,...
176240,TTTGTCATCCGTAGTA,V1,GW22,22,Neuron,Postmitotic,Excitatory Neuron,Newborn,31
176241,TTTGTCATCGGCTACG,V1,GW22,22,Neuron,Postmitotic,Excitatory Neuron,Newborn,31
176242,TTTGTCATCTAAGCCA,V1,GW22,22,Neuron,Postmitotic,Excitatory Neuron,Upper Layer,21
176243,TTTGTCATCTACCAGA,V1,GW22,22,Non-neuronal,Postmitotic,Microglia,Microglia low quality,12


In [38]:
# Number of cells per value of the 'Class' column, most frequent first.
prim.loc[:, 'Class'].value_counts()

Neuron          143567
Non-neuronal     42909
Outlier           2933
Name: Class, dtype: int64

In [39]:
# Number of cells per value of the 'State' column, most frequent first.
prim.loc[:, 'State'].value_counts()

Postmitotic     151162
Dividing         18587
Non-dividing     16727
Outlier           2933
Name: State, dtype: int64