In [1]:
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

import torch
import torchvision
import torch.nn as nn
from torchmetrics import Accuracy
from torchvision.utils import make_grid
from torch.utils.data import random_split
import torchvision.transforms as transforms
from torch.utils.data import Dataset,DataLoader

import pytorch_lightning as pl

# Data preprocessing

In [2]:
df_raw = pd.read_csv('/scratch-shared/martin/003_SPECS1K_ML/001_data/Specs935_ImageMeans_AfterQC_AnnotatedWithMOA.csv', sep=';')
df_raw.rename(columns={'Compound ID':'Compound_ID'}, inplace = True)

In [3]:
df = df_raw.copy()

In [4]:
# reading the "inactive" dataset
df_tox = pd.read_csv('data/train.csv', sep=',')
df_tox.rename(columns={'Compound ID':'Compound_ID'}, inplace = True)
df_heat= df_tox[df_tox["selected_mechanism"]=='heat shock response signalling agonist'].copy()

df_heat_inactive = df_heat[(df_heat["ASSAY_OUTCOME"]=="inactive")].copy()
df_heat_active   = df_heat[(df_heat["ASSAY_OUTCOME"]=="active")].copy()
df_heat_inactive.drop(["ASSAY_OUTCOME"], axis=1, inplace=True)
df_heat_active.drop(["ASSAY_OUTCOME"], axis=1, inplace=True)
df_heat.drop(["selected_mechanism", "Compound_ID"], axis=1, inplace=True)

In [5]:
df.dropna(subset=['Compound_ID'], inplace=True)
df = df[df['selected_mechanism'].str.contains('dmso')==False] # actually not dropping anything, since dropna already drop all dmso

In [6]:
df.drop(["ImageID", "Plate", "Plate_Well", "batch_id", "pertType", "Batch nr", "PlateID", "Well"], axis=1, inplace=True)

# remove all heat
df = df[df.selected_mechanism != "heat shock response signalling agonist"]
# add heat_active
df = pd.concat([df, df_heat_active], axis=0)

In [7]:
df = df.groupby(['selected_mechanism','Compound_ID']).mean() # merge the data based on Compound_ID, avoiding reading "duplicated data"
df.reset_index(inplace=True)
df.drop('Compound_ID', axis=1, inplace=True)

In [8]:
df

Unnamed: 0,selected_mechanism,ImageNumber_nuclei,ObjectNumber_nuclei,Metadata_Site_nuclei,AreaShape_Area_nuclei,AreaShape_BoundingBoxArea_nuclei,AreaShape_BoundingBoxMaximum_X_nuclei,AreaShape_BoundingBoxMaximum_Y_nuclei,AreaShape_BoundingBoxMinimum_X_nuclei,AreaShape_BoundingBoxMinimum_Y_nuclei,...,RadialDistribution_ZernikePhase_illumSYTO_8_8_cytoplasm,RadialDistribution_ZernikePhase_illumSYTO_9_1_cytoplasm,RadialDistribution_ZernikePhase_illumSYTO_9_3_cytoplasm,RadialDistribution_ZernikePhase_illumSYTO_9_5_cytoplasm,RadialDistribution_ZernikePhase_illumSYTO_9_7_cytoplasm,RadialDistribution_ZernikePhase_illumSYTO_9_9_cytoplasm,Site,cmpd_conc,Flag,Count_nuclei
0,ATPase inhibitor,1.0,71.245173,5.239130,2594.052840,3641.637710,1115.659189,1105.359249,1055.397388,1044.748298,...,-0.007249,0.037613,-0.004677,0.029404,-0.004017,-0.006743,5.239130,10.0,0.0,129.934783
1,ATPase inhibitor,1.0,75.987102,4.909091,2575.014391,3595.923493,1103.946315,1113.647177,1044.400607,1053.059617,...,0.020761,0.017658,0.006239,0.016753,0.010671,-0.045862,4.909091,10.0,0.0,137.136364
2,ATPase inhibitor,1.0,76.035368,5.187500,2520.257557,3542.050967,1091.662087,1114.037379,1031.724602,1054.629479,...,-0.056267,-0.062818,-0.000146,0.000251,0.025794,-0.043205,5.187500,10.0,0.0,137.479167
3,ATPase inhibitor,1.0,87.332931,5.022222,2590.518971,3655.140277,1089.390643,1122.687979,1028.800183,1062.081933,...,0.044132,0.030789,-0.005525,0.026131,0.017409,0.006531,5.022222,10.0,0.0,159.022222
4,ATPase inhibitor,1.0,10.026111,4.968750,1715.445263,2698.307079,1042.539294,1133.979970,993.095621,1082.576196,...,0.011072,-0.065412,-0.053279,-0.157138,-0.003663,-0.152837,4.968750,10.0,0.0,20.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
842,tubulin polymerization inhibitor,1.0,20.402987,5.000000,2082.156880,3014.069150,1076.408519,1068.403089,1025.965967,1017.558366,...,-0.099782,-0.138100,-0.064953,0.028538,-0.088900,0.061930,5.000000,10.0,0.0,30.416667
843,tubulin polymerization inhibitor,1.0,11.223901,4.790698,2383.460366,3494.812003,1085.155299,1068.350095,1030.907400,1012.764174,...,-0.048432,-0.170362,0.252728,0.184233,0.153065,0.110785,4.790698,10.0,0.0,19.302326
844,tubulin polymerization inhibitor,1.0,13.496681,5.000000,1996.259680,2951.947594,1027.813396,1103.537110,978.226385,1052.836261,...,0.056227,-0.031864,-0.069233,0.052349,0.003676,-0.093969,5.000000,10.0,0.0,22.541667
845,tubulin polymerization inhibitor,1.0,13.776276,4.903846,2145.525287,3180.830545,1094.460819,1059.192757,1042.688779,1005.678258,...,0.080633,-0.037189,-0.199157,0.027529,-0.168356,-0.187250,4.903846,10.0,0.0,24.596154


# Loading dataset

In [9]:
# Hyperparameter
val_pct = 0.2

learning_rate = 1e-4
n_epochs = 500
n_batch = 16

In [10]:
from sklearn.preprocessing import LabelEncoder

trainset = df.copy()
# testset = pd.read_csv('../input/digit-recognizer/test.csv')

le = LabelEncoder()

trainLabels = le.fit_transform(df.iloc[:,0])
trainSamples = df.iloc[:,1:]

In [11]:
class datasets(Dataset):
    def __init__(self, features=torch.tensor(trainSamples.values), labels=torch.tensor(trainLabels)):
        self.features = features
        self.labels = labels
        
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, i):
        x = self.features[i, :].float()
        y = self.labels[i]
        return x,y

In [12]:
dataset = datasets()

In [13]:
valid_size = int(len(dataset)*val_pct)
train_size = int(len(dataset)-valid_size)

train_ds, valid_ds = random_split(dataset, [train_size,  valid_size])

trainloader = torch.utils.data.DataLoader(train_ds, batch_size=n_batch, shuffle=True, num_workers=0, pin_memory=True)
validloader = torch.utils.data.DataLoader(valid_ds, batch_size=n_batch, shuffle=False, num_workers=0, pin_memory=True)

# Model

In [14]:
# since all loss are the same, it should be defined in the self instead of every training/valid step

class specs_ann(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.train_acc = Accuracy()
        self.valid_acc = Accuracy()
        self.test_acc = Accuracy()
        
        self.fc = nn.Sequential(
            nn.Linear(2130, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 30)
        )
        
    def forward(self, x):
        out = x.view(x.size(0), -1)
        out = self.fc(out)
        return out

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = nn.functional.cross_entropy(logits, y)
        preds = torch.argmax(logits, dim=1)
        self.train_acc.update(preds, y)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def training_epoch_end(self, outs):
        self.log("train_acc", self.train_acc.compute())
        self.train_acc.reset()
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = nn.functional.cross_entropy(logits, y)
        preds = torch.argmax(logits, dim=1)
        self.valid_acc.update(preds, y)
        self.log("valid_loss", loss, prog_bar=True)
        return loss
    
    def validation_epoch_end(self, outs):
        self.log("valid_acc", self.valid_acc.compute(), prog_bar=True)
        self.valid_acc.reset()

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = nn.functional.cross_entropy(logits, y)
        preds = torch.argmax(logits, dim=1)
        self.test_acc.update(preds, y)
        self.log("test_loss", loss, prog_bar=True)
        self.log("test_acc", self.test_acc.compute(), prog_bar=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        return optimizer

In [15]:
from pytorch_lightning.callbacks import ModelCheckpoint

model = specs_ann()

callbacks = [ModelCheckpoint(save_top_k=1, mode='max', monitor="valid_acc")] # save top 1 model

if torch.cuda.is_available(): # if you have GPUs
    trainer = pl.Trainer(max_epochs=n_epochs, callbacks=callbacks, accelerator='gpu', devices=1)
else:
    trainer = pl.Trainer(max_epochs=n_epochs, callbacks=callbacks)

trainer.fit(model, trainloader, validloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type       | Params
-----------------------------------------
0 | train_acc | Accuracy   | 0     
1 | valid_acc | Accuracy   | 0     
2 | test_acc  | Accuracy   | 0     
3 | fc        | Sequential | 150 K 
-----------------------------------------
150 K     Trainable params
0         Non-trainable params
150 K     Total params
0.603     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
trainer.test(model, testloader, ckpt_path='best')

NameError: name 'mnist_dm' is not defined

In [None]:
!tensorboard --logdir .