In [1]:
import os
import sys
# Make the project's ../src directory importable (needed for `models` and `utils`).
module_path = os.path.abspath('../src')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
import torch
import torch.nn as nn
import pandas as pd
import numpy as np

from models.kim_cnn import CNN1DText
from utils.caption_utils import preprocess_training_data, load_embedding_matrix, clean_str
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# --- Text / embedding hyper-parameters ---------------------------------------
# number of words to consider from embeddings vocabulary
MAX_NUMBER_WORDS = 20_000
# sentence maximum length
MAX_WORDS_PER_SENTENCE = 700
# number of features per embedding
WORD_DIMENSION = 300
# 4 microscopy classes
NUM_CLASSES = 4

# --- Input locations ----------------------------------------------------------
DATA_PATH = '/workspace/data/multimodality_classification.csv'
EMBEDDINGS = '/workspace/data/embeddings'

In [3]:
# The dataset file is tab-separated despite its .csv extension.
df = pd.read_csv(DATA_PATH, delimiter='\t')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,ID,MODALITY,PATH,CAPTION,SET
0,201,1423-0127-16-7-1-1.jpg,DMEL,subfigure-classification/2016/train/DMEL/1423-...,Scanning electron microscope images of the TiO...,TRAIN
1,202,1423-0127-16-7-1-2.jpg,DMEL,subfigure-classification/2016/train/DMEL/1423-...,Scanning electron microscope images of the TiO...,TRAIN
2,203,1423-0127-16-7-1-4.jpg,DMEL,subfigure-classification/2016/train/DMEL/1423-...,Scanning electron microscope images of the TiO...,TRAIN
3,204,1423-0127-16-7-1-5.jpg,DMEL,subfigure-classification/2016/train/DMEL/1423-...,Scanning electron microscope images of the TiO...,VAL
4,205,1423-0127-16-7-1-6.jpg,DMEL,subfigure-classification/2016/train/DMEL/1423-...,Scanning electron microscope images of the TiO...,TRAIN


In [4]:
# Partition rows by the predefined SET column.
is_train = df['SET'] == 'TRAIN'
is_val = df['SET'] == 'VAL'
train_df = df.loc[is_train]
val_df = df.loc[is_val]

In [5]:
# Raw caption text (inputs) and modality labels (targets) for each split.
x0_train = train_df['CAPTION'].values
y0_train = train_df['MODALITY'].values
x0_val = val_df['CAPTION'].values
y0_val = val_df['MODALITY'].values

In [6]:
# Turn the raw captions into fixed-length index sequences; the fourth return
# value is not used in this notebook.
train_split, val_split, word_index, _ = preprocess_training_data(
    x0_train, y0_train,
    x0_val, y0_val,
    MAX_NUMBER_WORDS,
    MAX_WORDS_PER_SENTENCE,
)
x_train, y_train = train_split
x_val, y_val = val_split

Training Data Vector:  (1864, 700)
Validation Data Vector:  (466, 700)


In [7]:
# Inspect the first encoded training caption (the recorded output starts with a long run of 0s).
x_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [8]:
# Labels are still raw strings at this point (e.g. 'DMEL'); they are integer-encoded later.
y_train[0]

'DMEL'

In [9]:
# Raw caption text of the first training sample, before any cleaning.
x0_train[0]

'Scanning electron microscope images of the TiO 2  nanoparticles . (A) C150, (B) C200, (C) EDS elemental spectrum of C200, (D)  S. aureus  and UV100, (E)  S. aureus  and C150, and (F)  S. aureus  and C200. Scale bars: 100 nm.'

In [10]:
# Show what clean_str does to the same caption.
# NOTE(review): the recorded output contains backslash-escaped parentheses
# ("\\( a \\)"); presumably unintended -- verify the replacement strings in clean_str.
clean_str(x0_train[0])

'scanning electron microscope images of the tio 2 nanoparticles \\( a \\) c150 , \\( b \\) c200 , \\( c \\) eds elemental spectrum of c200 , \\( d \\) s aureus and uv100 , \\( e \\) s aureus and c150 , and \\( f \\) s aureus and c200 scale bars 100 nm'

In [11]:
# Load the pre-trained word vectors. The dimension comes from the shared
# WORD_DIMENSION constant so that it cannot drift from the size used when the
# embedding matrix is allocated.
embeddings_dict = load_embedding_matrix(EMBEDDINGS, WORD_DIMENSION)

Dimension: 300; found 400000 word vectors.


In [12]:
# Rebind MAX_NUMBER_WORDS to the size of the fitted vocabulary plus one
# (presumably the extra slot is index 0, used for padding -- TODO confirm).
# NOTE(review): this overwrites the config constant that was passed to
# preprocess_training_data earlier; re-running that cell after this one would
# silently use the new value. A separately named variable would avoid that.
MAX_NUMBER_WORDS = len(word_index) + 1
MAX_NUMBER_WORDS

7221

In [13]:
# Build the initial embedding table: one row per vocabulary index.
# Words with a pre-trained vector copy it; the rest get a random normal vector.
# Rows that are never assigned (e.g. index 0) stay all-zero.
#
# A dedicated, seeded generator makes the random rows identical on every run;
# the original used the unseeded global NumPy RNG.
EMBEDDING_SEED = 0
embedding_rng = np.random.RandomState(EMBEDDING_SEED)

embedding_matrix = np.zeros((MAX_NUMBER_WORDS, WORD_DIMENSION))
for word, idx in word_index.items():
    if idx >= MAX_NUMBER_WORDS:
        # Index falls outside the table; skip it.
        continue
    word_embedding = embeddings_dict.get(word)
    if word_embedding is None:
        # Out-of-vocabulary word: reproducible random initialisation.
        word_embedding = embedding_rng.randn(WORD_DIMENSION)
    embedding_matrix[idx] = word_embedding

In [14]:
# Integer-encode the string labels; the encoder is fitted on the training labels only.
le = LabelEncoder()
le.fit(y_train)

# Wrap inputs and encoded targets as int64 tensors.
x_train_tensor, x_val_tensor = (
    torch.LongTensor(features) for features in (x_train, x_val)
)
y_train_tensor, y_val_tensor = (
    torch.LongTensor(le.transform(labels)) for labels in (y_train, y_val)
)

# Worker processes and pinned memory are only requested when a GPU is available.
if torch.cuda.is_available():
    kwargs = dict(num_workers=16, pin_memory=True)
else:
    kwargs = {}

train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)

# Only the training loader shuffles; validation keeps dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, **kwargs)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False, **kwargs)

In [15]:
# Instantiate the text CNN with the pre-built embedding table; the embeddings
# are left trainable.
model = CNN1DText(
    MAX_WORDS_PER_SENTENCE,
    MAX_NUMBER_WORDS,
    WORD_DIMENSION,
    embedding_matrix,
    num_classes=NUM_CLASSES,
    train_embeddings=True,
)

In [16]:
# Use the first GPU when one is present, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
model = model.to(device)

# Loss, optimizer (Adam with its default settings) and training length.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
epochs = 50

In [17]:

# Per-epoch metric history: mean batch loss and accuracy (in percent).
train_losses = []
train_accs = []
val_losses = []
val_accs = []

for epoch in range(epochs):
    # ---- train step ---------------------------------------------------------
    model.train()

    train_loss, n_iter = 0, 0
    total, correct = 0, 0

    for x, y in train_dataloader:
        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()

        outputs = model(x)
        loss = criterion(outputs, y)
        # detach() replaces the legacy `.data` access: the arg-max over the
        # class dimension is taken on a tensor cut off from the autograd graph.
        _, predicted = torch.max(outputs.detach(), 1)

        loss.backward()
        optimizer.step()

        total += y.size(0)
        # Accumulate plain Python ints so `correct` has the same type whether
        # or not any batch was seen.
        correct += torch.sum(predicted == y).item()
        train_loss += loss.item()
        n_iter += 1

    epoch_acc = 100 * correct / total
    average_loss = train_loss / n_iter  # mean of per-batch losses
    train_losses.append(average_loss)
    train_accs.append(epoch_acc)

    print("train - epoch {0}: loss: {1}, acc: {2}".format(str(epoch), str(average_loss), str(epoch_acc)))

    # ---- validation step ----------------------------------------------------
    model.eval()

    valid_loss, valid_n_iter = 0, 0
    total, correct = 0, 0

    # No gradients are needed for evaluation; disabling them avoids building
    # an autograd graph for every validation batch.
    with torch.no_grad():
        for x, y in val_dataloader:
            x = x.to(device)
            y = y.to(device)

            outputs = model(x)
            loss = criterion(outputs, y)

            _, predicted = torch.max(outputs, 1)

            total += y.size(0)
            correct += torch.sum(predicted == y).item()
            valid_loss += loss.item()
            valid_n_iter += 1

    epoch_acc = 100 * correct / total
    average_loss = valid_loss / valid_n_iter  # mean of per-batch losses
    val_losses.append(average_loss)
    val_accs.append(epoch_acc)

    print("val  - epoch {0}: loss: {1}, acc: {2}".format(str(epoch), str(average_loss), str(epoch_acc)))

train - epoch 0: loss: 0.9124659552412518, acc: 64.32403433476395
val  - epoch 0: loss: 0.593061774969101, acc: 80.25751072961373
train - epoch 1: loss: 0.47949724657050635, acc: 84.92489270386267
val  - epoch 1: loss: 0.5000835130612056, acc: 81.54506437768241
train - epoch 2: loss: 0.3943698199118598, acc: 87.33905579399142
val  - epoch 2: loss: 0.49752609580755236, acc: 84.12017167381974
train - epoch 3: loss: 0.3702757445684934, acc: 88.78755364806867
val  - epoch 3: loss: 0.5458249593774478, acc: 83.2618025751073
train - epoch 4: loss: 0.38185361103486204, acc: 88.19742489270386
val  - epoch 4: loss: 0.5548598696788152, acc: 83.2618025751073
train - epoch 5: loss: 0.34259729668245475, acc: 89.48497854077253
val  - epoch 5: loss: 0.493646772702535, acc: 84.97854077253218
train - epoch 6: loss: 0.32509999324457123, acc: 89.16309012875537
val  - epoch 6: loss: 0.50447096824646, acc: 83.47639484978541
train - epoch 7: loss: 0.35167551305839573, acc: 89.8068669527897
val  - epoch 7: lo

In [21]:
# Grab the first validation batch for the interactive checks below.
# next(iter(...)) raises StopIteration on an empty loader, instead of silently
# leaving x and y bound to whatever an earlier cell assigned.
x, y = next(iter(val_dataloader))

In [22]:
import torch.nn.functional as F
# Move the sampled batch onto the same device as the model.
x, y = x.to(device), y.to(device)

In [23]:
# Forward pass on the sampled batch. Gradients are left enabled here, so the
# loss computed from `outputs` later still carries a grad_fn.
outputs = model(x)

In [25]:
# One score per class for each sample in the batch (recorded output: [32, 4]).
outputs.shape

torch.Size([32, 4])

In [27]:
# Functional cross-entropy on the sampled batch, displayed as the cell result.
loss = F.cross_entropy(outputs, y)
loss

tensor(0.9252, device='cuda:0', grad_fn=<NllLossBackward>)

In [40]:
from pytorch_lightning.metrics.classification import Accuracy

# Cross-check the Accuracy metric on the sampled batch: it is called once with
# the raw model outputs and once with the arg-max class indices (both calls
# printed the same value in the recorded run).
# The class count comes from NUM_CLASSES rather than a repeated literal 4.
acc = Accuracy(NUM_CLASSES)
# detach() replaces the legacy `.data` access.
_, predicted = torch.max(outputs.detach(), 1)
print(predicted.shape)
print(predicted)
print(y)
print(acc(outputs, y))
print(acc(predicted, y))

torch.Size([32])
tensor([0, 3, 1, 3, 3, 3, 3, 1, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 3, 3, 0, 0, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
tensor(0.5938)
tensor(0.5938)


In [29]:
# Targets are a 1-D vector of class indices, one per sample (recorded output: [32]).
y.shape

torch.Size([32])

In [18]:
# Collect ground-truth and predicted class indices over the whole validation set.
model.eval()
y_true = []
y_pred = []

# Inference only: disable autograd so no graph is built per batch.
with torch.no_grad():
    for x, y in val_dataloader:
        # tolist() stores plain Python ints rather than 0-dim tensors.
        y_true.extend(y.tolist())

        # Only the inputs need to be on the model's device; the targets are
        # not used in the forward pass.
        x = x.to(device)

        outputs = model(x)
        _, predicted = torch.max(outputs, 1)
        y_pred.extend(predicted.tolist())

In [19]:
from sklearn.metrics import confusion_matrix, classification_report
# Confusion matrix of the collected validation labels against the model's predictions.
confusion_matrix(y_true, y_pred)

array([[ 37,   5,   0,  10],
       [  4, 163,   9,  12],
       [  2,  13, 136,   6],
       [  4,   4,   8,  53]])

In [20]:
# Per-class precision / recall / F1, labelled with the encoder's class names.
target_names = le.classes_
print(
    classification_report(
        y_true,
        y_pred,
        target_names=target_names,
        digits=4,
    )
)

              precision    recall  f1-score   support

        DMEL     0.7872    0.7115    0.7475        52
        DMFL     0.8811    0.8670    0.8740       188
        DMLI     0.8889    0.8662    0.8774       157
        DMTR     0.6543    0.7681    0.7067        69

    accuracy                         0.8348       466
   macro avg     0.8029    0.8032    0.8014       466
weighted avg     0.8397    0.8348    0.8363       466

