# Data Exploration 01

In this notebook, we'll try making a CNN for cell line classification

In [1]:
import pandas as pd 
import numpy as np 
import torch
import os 
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

In [2]:
mapping = {
    "A172": "Glioblastoma",
    "BT474": "Ductal Carcinoma",
    "BV2": "Microglial",
    "Huh7": "Tumorigenic",
    "MCF7": "Breast Cancer",
    "SHSY5Y": "Neuroblastoma",
    "SkBr3": "Adenocarcinoma",
    "SKOV3": "Adenocarcinoma"
}

In [3]:
for t in mapping:
    num_in = len([f for f in os.listdir(os.path.join('../images/', t))])
    print(f'Number of images of {mapping[t]} is {num_in}')

Number of images of Glioblastoma is 608
Number of images of Ductal Carcinoma is 672
Number of images of Microglial is 608
Number of images of Tumorigenic is 600
Number of images of Breast Cancer is 735
Number of images of Neuroblastoma is 704
Number of images of Adenocarcinoma is 704
Number of images of Adenocarcinoma is 608


Now, we have to create the labels file. We'll have a column with the true label name, and another with an integer encoded representation since PyTorch doesn't encode strings automatically

In [165]:
import pandas as pd 
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

df = pd.DataFrame(columns=['label'])
df.index.name = 'filename'
for t in mapping:
    for f in os.listdir(os.path.join('../images/', t)):
        df.loc[os.path.join(t, f), :] = mapping[t]

df['class'] = le.fit_transform(df['label'])
df.head()

Unnamed: 0_level_0,label,class
filename,Unnamed: 1_level_1,Unnamed: 2_level_1
A172/A172_Phase_A7_1_00d08h00m_3.tif,Glioblastoma,3
A172/A172_Phase_A7_2_02d04h00m_1.tif,Glioblastoma,3
A172/A172_Phase_B7_1_03d00h00m_3.tif,Glioblastoma,3
A172/A172_Phase_C7_2_01d04h00m_3.tif,Glioblastoma,3
A172/A172_Phase_C7_1_01d00h00m_2.tif,Glioblastoma,3


In [196]:
for k in mapping:
    print(k)

A172
BT474
BV2
Huh7
MCF7
SHSY5Y
SkBr3
SKOV3


In [168]:
df.to_csv('../images/labels.csv', index=True)
pd.read_csv('../images/labels.csv')

Unnamed: 0,filename,label,class
0,A172/A172_Phase_A7_1_00d08h00m_3.tif,Glioblastoma,3
1,A172/A172_Phase_A7_2_02d04h00m_1.tif,Glioblastoma,3
2,A172/A172_Phase_B7_1_03d00h00m_3.tif,Glioblastoma,3
3,A172/A172_Phase_C7_2_01d04h00m_3.tif,Glioblastoma,3
4,A172/A172_Phase_C7_1_01d00h00m_2.tif,Glioblastoma,3
...,...,...,...
5234,SKOV3/SKOV3_Phase_E4_1_01d04h00m_2.tif,Adenocarcinoma,0
5235,SKOV3/SKOV3_Phase_E4_2_01d12h00m_4.tif,Adenocarcinoma,0
5236,SKOV3/SKOV3_Phase_G4_1_00d20h00m_1.tif,Adenocarcinoma,0
5237,SKOV3/SKOV3_Phase_H4_2_01d00h00m_3.tif,Adenocarcinoma,0


Now that we've created a labels file, we can create the PyTorch dataset and generate our train-test split. 

In [4]:
class CellDataset(Dataset):
    def __init__(self, images_path, label_path):
        self.images_path = images_path
        self.labels = pd.read_csv(label_path)
        self.tensor = transforms.ToTensor()
        
    def __len__(self):
        return self.labels.shape[0]
    
    def __getitem__(self, idx):
        img_path, label = self.labels.iloc[idx]['filename'], self.labels.iloc[idx]['class']
        img = Image.open(os.path.join(self.images_path, img_path))
        return self.tensor(img), label
            

In [5]:
dataset = CellDataset('../images/', '../images/labels.csv')
dataset[0]

(tensor([[[0.4980, 0.5020, 0.4980,  ..., 0.4275, 0.4353, 0.4510],
          [0.4941, 0.4980, 0.5137,  ..., 0.4157, 0.3647, 0.3922],
          [0.4980, 0.4980, 0.5059,  ..., 0.3882, 0.4549, 0.4863],
          ...,
          [0.4941, 0.4941, 0.5020,  ..., 0.5020, 0.4980, 0.5059],
          [0.5098, 0.5059, 0.5059,  ..., 0.4980, 0.4980, 0.5020],
          [0.5098, 0.5059, 0.4980,  ..., 0.5059, 0.4980, 0.5020]]]),
 3)

In [6]:
train_size = int(0.80 * len(dataset))
test_size = len(dataset) - train_size
train, test = torch.utils.data.random_split(dataset, [train_size, test_size])

In [7]:
traindata = DataLoader(train, batch_size=8, num_workers=0)
valdata = DataLoader(test, batch_size=8, num_workers=0)

Now that we've defined our data, we can build our CNN classifier and benchmark our results. We'll use PyTorch Lightning so we can run our model on the PRP

In [8]:
import pytorch_lightning as pl
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.stack = nn.Sequential(
            nn.Conv2d(1,32,kernel_size=3,padding=1),
            nn.ReLU(),
            nn.Conv2d(32,64,kernel_size=3,stride=1,padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2,2),
            # nn.Conv2d(64,128,kernel_size=3,stride=1,padding=1),
            # nn.ReLU(),
            # nn.Conv2d(128,128,kernel_size=3,stride=1,padding=1),
            # nn.ReLU(),
            # nn.MaxPool2d(2,2),
            # nn.Conv2d(128,256,kernel_size=3,stride=1,padding=1),
            # nn.ReLU(),
            # nn.Conv2d(256,256,kernel_size=3,stride=1,padding=1),
            # nn.ReLU(),
            # nn.MaxPool2d(2,2),
            nn.Flatten(),
        )

    def forward(self, x):
        return self.stack(x)


In [9]:
model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [10]:
for epoch in range(2):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(traindata, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        print(outputs.shape)
        print(outputs.shape)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')

torch.Size([8, 5857280])
torch.Size([8, 5857280])


KeyboardInterrupt: 