In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets, models, transforms
from transformers import BertModel
import os, sys, time
from sklearn import preprocessing
from scipy import stats
from collections import defaultdict
import numpy as np
import pandas as pd
import statistics
import transformers
from ipywidgets import FloatProgress
from torch.optim import lr_scheduler
import matplotlib.pyplot as plt
import copy
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from collections import OrderedDict
from PIL import Image
#plt.ion()   # interactive mode


In [2]:
data_folder=os.getcwd()
test_set="book30-listing-test.csv"
train_set="book30-listing-train.csv"

test=pd.read_csv(os.path.join(data_folder+"/"+test_set), sep=",", encoding='latin_1') #'unicode_escape'
train=pd.read_csv(os.path.join(data_folder+"/"+train_set), sep=",", encoding='latin_1')
#setting up columns' names
test=test.set_axis(["AMAZON INDEX (ASIN)","FILENAME","IMAGE URL","TITLE","AUTHOR","CATEGORY ID","CATEGORY"], axis=1, inplace=False)
train=train.set_axis(["AMAZON INDEX (ASIN)","FILENAME","IMAGE URL","TITLE","AUTHOR","CATEGORY ID","CATEGORY"], axis=1, inplace=False)

#selecting title and category id
test_set = test[['FILENAME','TITLE','CATEGORY ID', "CATEGORY"]]
train_set = train[['FILENAME','TITLE','CATEGORY ID', "CATEGORY"]]


In [3]:
length = len(train_set) + len(test_set)
print(0.8*length, 0.1* length, 0.1*length)

train_s=train_set[:45598] 
val_s=train_set[45598:] 

print(f'LENGTH OF VALIDATION SET: {len(val_s)}, \
      LENGTH OF TEST SET: {len(test_set)},  \
      LENGTH OF TRAIN SET: {len(train_s)}. ')


45598.4 5699.8 5699.8
LENGTH OF VALIDATION SET: 5701,       LENGTH OF TEST SET: 5699,        LENGTH OF TRAIN SET: 45598. 


In [4]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, X, path, train = False):

        self.labels = [label for label in X['CATEGORY ID']]  #list
        self.texts = [tokenizer(title, truncation=True, padding='max_length', max_length = 74,  #512
                    return_tensors="pt") for title in X['TITLE']]
        self.images_name = [img_name for img_name in X['FILENAME']] 
        self.path = path
        self.label_name = [lbl_name for lbl_name in X['CATEGORY']]
        self.train = train
        
    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]
    
    def get_batch_label_name(self, idx):
        return self.label_name[idx]
            
    def get_batch_images_names(self, idx):
        # Fetch a batch of image names
        return self.images_name[idx]
    
    def get_batch_images(self, idx):
        # Fetch a batch of images
        batch_images_name = self.get_batch_images_names(idx)
        batch_label_name = self.get_batch_label_name(idx)
        img = Image.open(os.path.join(self.path, batch_label_name, batch_images_name)).convert('RGB')
        
        self.train = train
        
        if train:
            train_transform  =  transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
            img = train_transform(img)
        else:
            val_transform  =  transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
            img = val_transform(img)
                
        return img
        

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        batch_image = self.get_batch_images(idx)
        
        return batch_texts, batch_y, batch_image

In [5]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                         do_lower_case=True)

#train, val = Dataset(train_s, "/home/gussikoju@GU.GU.SE/images/train", True), Dataset(val_s, "/home/gussikoju@GU.GU.SE/images/train")
train, val = Dataset(train_s, "/home/gussikoju@GU.GU.SE/Data/sorted/train", True), Dataset(val_s, "/home/gussikoju@GU.GU.SE/Data/sorted/train")

In [6]:
train.get_batch_images(2451)

tensor([[[2.1290, 2.1290, 2.1119,  ..., 2.0434, 1.9235, 1.9235],
         [2.1290, 2.1290, 2.1119,  ..., 2.0434, 1.9235, 1.9235],
         [2.1290, 2.1290, 2.0948,  ..., 2.0605, 1.9407, 1.9407],
         ...,
         [2.2489, 2.2489, 2.1804,  ..., 2.2318, 2.2489, 2.2489],
         [2.2489, 2.2489, 2.1633,  ..., 2.2318, 2.2489, 2.2489],
         [2.2489, 2.2489, 2.1633,  ..., 2.2318, 2.2489, 2.2489]],

        [[2.4286, 2.4286, 2.4286,  ..., 1.5707, 1.2381, 1.2206],
         [2.4286, 2.4286, 2.4286,  ..., 1.5707, 1.2381, 1.2206],
         [2.4286, 2.4286, 2.4286,  ..., 1.6057, 1.2731, 1.2556],
         ...,
         [2.3761, 2.3761, 2.3410,  ..., 2.0259, 2.3060, 2.3235],
         [2.3761, 2.3761, 2.3410,  ..., 2.0259, 2.2885, 2.3060],
         [2.3761, 2.3761, 2.3410,  ..., 2.0259, 2.2885, 2.3060]],

        [[2.6400, 2.6400, 2.6400,  ..., 0.7925, 0.6008, 0.5834],
         [2.6400, 2.6400, 2.6400,  ..., 0.7925, 0.6008, 0.5834],
         [2.6400, 2.6400, 2.6400,  ..., 0.9145, 0.6879, 0.

In [7]:
val.__getitem__(2)

({'input_ids': tensor([[  101,  1103,  2377,  3955,   112,   188,  3043,  9506,   131,  1121,
           1148,  3043,  1106,  2029, 10890,   113,  1103,  3043,  9506,  1116,
           1326,   114,   102,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
    

In [8]:
#loading saved texted model

model_state_dict = torch.load('best_model.pth') #trained on BertForSequenceClassification
model_text = BertModel.from_pretrained( 'bert-base-cased', state_dict = model_state_dict, num_labels=30,  output_attentions = False, 
   output_hidden_states = False)
#model_text = model_text.cuda()


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['linear.bias', 'linear.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
class CNN(torch.nn.Module):
    def __init__(self):
        super(CNN, self).__init__()

        self.pretrained = torchvision.models.resnet50(pretrained=False)
        self.pretrained.fc = torch.nn.Identity()
        
        self.linear = torch.nn.Linear(2048, 30)
       

    def forward(self, x):        
        x = self.pretrained(x)
        x = self.linear(x)
        return x

In [10]:
# loading saved image model
model_img = models.resnet50(pretrained=True)
num_ftrs = model_img.fc.in_features
model_img.fc = nn.Sequential(nn.Linear(2048, 512),
                         nn.ReLU(),
                         nn.Dropout(0.2),
                         nn.Linear(512, 30),
                         nn.LogSoftmax(dim=1))

model_img.load_state_dict(torch.load('CNN_model.pth'))


<All keys matched successfully>

In [11]:
#creating multimodal model

class MultimodalModel(nn.Module):
    def __init__(self, model_text, model_img):
        super(MultimodalModel, self).__init__()
        self.model_text = model_text
        self.model_img = model_img
        self.classifier = nn.Linear(30+30, 30)
        self.linear = nn.Linear(768, 30)
        self.relu = nn.ReLU()
        
    def forward(self, img, text, text_mask):
        
        input_img = self.model_img(img)
        input_text = self.model_text(text, text_mask)
        input_text = input_text[1]
        input_text = self.linear(input_text)
        input_text = self.relu(input_text)
        
        input_all = torch.cat((input_text, input_img), dim=1)
        input_all = self.classifier(nn.functional.relu(input_all))
        
        return input_all

my_model = MultimodalModel(model_text, model_img)

In [12]:
text_new, labels, img = train[17]
tekst = text_new['input_ids'][0]
text_mask = text_new['attention_mask']

predictions = my_model(img.unsqueeze(0), tekst.unsqueeze(0), text_mask.unsqueeze(0))
print(predictions)
print(img.type, tekst.unsqueeze(0).type, text_mask.type, labels)

tensor([[-0.1832, -0.1215, -0.0993,  0.0389, -0.1328, -0.0368,  0.0456, -0.1166,
         -0.1247,  0.0248,  0.0081,  0.1725, -0.0406,  0.2047, -0.0590,  0.0352,
         -0.1469,  0.2032, -0.0710,  0.0252,  0.0219, -0.0019, -0.0669, -0.0719,
         -0.1980, -0.0093, -0.1462,  0.0421,  0.1983,  0.0473]],
       grad_fn=<AddmmBackward0>)
<built-in method type of Tensor object at 0x7fc3c24a6d60> <built-in method type of Tensor object at 0x7fc3c2dc76d0> <built-in method type of Tensor object at 0x7fc3c36bf4a0> 15


In [13]:
train_new = Dataset(train_s[2450:3000], "/home/gussikoju@GU.GU.SE/Data/sorted/train", True)
train_all = Dataset(train_s, "/home/gussikoju@GU.GU.SE/Data/sorted/train", True)
val_new = Dataset(val_s, "/home/gussikoju@GU.GU.SE/Data/sorted/train", True)

In [14]:
my_model = my_model.cuda()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#criterion = nn.NLLLoss()
optimizer = optim.Adam(my_model.parameters(), lr=0.0003, weight_decay=1e-4)
#optimizer = optim.Adam(my_model.parameters(), lr=0.00000000001, weight_decay=1e-4)
loss_fn = torch.nn.CrossEntropyLoss()
#loss_fn = torch.nn.MSELoss()


In [15]:
model_text.parameters

<bound method Module.parameters of BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropo

In [16]:
def train_one_epoch(epoch_index):
    running_loss = 0.
    last_loss = 0.
    running_corrects = 0
    correct = 0
    total = 0
    training_loader = torch.utils.data.DataLoader(train_all, batch_size=50,shuffle=False) 
    
    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, out in enumerate(training_loader):
        title, label, picture = out         
        mask = title['attention_mask']
        title = title['input_ids'][0]
        
        label = label.to(device)
        picture = picture.to(device)
        mask = mask.to(device)
        title = title.to(device)

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = my_model(picture,title,mask)
        _, preds = torch.max(outputs.data, 1)
        total += label.size(0)
        correct += (preds == label).sum().item()
        #print(outputs, preds, label)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, label)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        
        #running_corrects += torch.sum(preds == label).item()
        accuracy = 100 * correct / total
        #print('Accuracy of the network : %d %%' % (accuracy))
        
        if i % 10 == 9:
            last_loss = running_loss / 10 # loss per batch
            print('  batch {} loss: {} acc: {}%'.format(i + 1, last_loss, round(accuracy, 2)))
            running_loss = 0.
            
    return last_loss

In [17]:
# Initializing in a separate cell so we can easily add more epochs to the same run

epoch_number = 0
validation_loader = torch.utils.data.DataLoader(val_new, batch_size=50,shuffle=False)

EPOCHS = 2

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    my_model.train(True)
    avg_loss = train_one_epoch(epoch_number)

    # We don't need gradients on to do reporting
    #my_model.train(False)
    with torch.no_grad(): 
        my_model.eval()

        running_vloss = 0.0
        for i, vdata in enumerate(validation_loader):      
            vtitle, vlabel, vpicture = vdata   
            vmask = vtitle['attention_mask']
            vtitle = vtitle['input_ids'][0]

            vmask = vmask.to(device)
            vtitle = vtitle.to(device)
            vpicture = vpicture.to(device)
            vlabel = vlabel.to(device)

            voutputs = my_model(vpicture,vtitle,vmask)
            voutputs = voutputs.to(device)
            vloss = loss_fn(voutputs, vlabel)


            running_vloss += vloss

        avg_vloss = running_vloss / (i + 1)
        print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

        # Track best performance, and save the model's state
        if avg_vloss < best_vloss:
            best_vloss = avg_vloss
            model_path = 'model_{}'.format(epoch_number+1)
            torch.save(my_model.state_dict(), model_path)
    
    epoch_number += 1

EPOCH 1:
  batch 10 loss: 3.386093997955322 acc: 4.4%
  batch 20 loss: 3.3378937244415283 acc: 5.8%
  batch 30 loss: 3.3321927547454835 acc: 6.67%
  batch 40 loss: 3.305198574066162 acc: 7.25%
  batch 50 loss: 3.3370755195617674 acc: 7.24%
  batch 60 loss: 3.294179821014404 acc: 7.57%
  batch 70 loss: 3.259646201133728 acc: 8.0%
  batch 80 loss: 3.2618519067764282 acc: 8.4%
  batch 90 loss: 3.259652614593506 acc: 8.4%
  batch 100 loss: 3.316861057281494 acc: 8.44%
  batch 110 loss: 3.2555410146713255 acc: 8.62%
  batch 120 loss: 3.2252800464630127 acc: 8.77%
  batch 130 loss: 3.2713401556015014 acc: 8.8%
  batch 140 loss: 3.291543984413147 acc: 8.76%
  batch 150 loss: 3.287844181060791 acc: 8.67%
  batch 160 loss: 3.2917920112609864 acc: 8.61%
  batch 170 loss: 3.2820165395736693 acc: 8.74%
  batch 180 loss: 3.3114949464797974 acc: 8.79%
  batch 190 loss: 3.280066442489624 acc: 8.71%
  batch 200 loss: 3.2107651948928835 acc: 8.8%
  batch 210 loss: 3.2623926639556884 acc: 8.88%
  batch 

In [21]:
test_all = Dataset(test_set, "/home/gussikoju@GU.GU.SE/Data/sorted/test", True)
test_new = Dataset(test_set[:200], "/home/gussikoju@GU.GU.SE/Data/sorted/test", True)

In [22]:
# Function to test the model 
def test(path_to_model): 
    # Load the model that we saved at the end of the training loop 
    test_loader = torch.utils.data.DataLoader(test_all, batch_size=10,shuffle=False)

    model = MultimodalModel(model_text, model_img)
    path = path_to_model
    model.load_state_dict(torch.load(path)) 
    model = model.to(device)
     
    running_accuracy = 0 
    total = 0 
 
    with torch.no_grad(): 
        for out in test_loader:
            title, label, picture = out         
            mask = title['attention_mask']
            title = title['input_ids'][0]
            
            label = label.to(device)
            picture = picture.to(device)
            mask = mask.to(device)
            title = title.to(device)            
            
            outputs = my_model(picture,title,mask)
            _, preds = torch.max(outputs.data, 1)
            #print(outputs.data)
            
            total += label.size(0) 
            print(preds,label)
            running_accuracy += (preds == label).sum().item() 
            print(running_accuracy)
 
        print('Accuracy of the model','inputs is: %d %%' % (100 * running_accuracy / total))    

In [23]:
 test('model_2')

tensor([20, 20, 20, 20, 20, 20, 20, 20, 20, 20], device='cuda:0') tensor([28,  1, 21,  0, 15, 10, 23,  9,  4, 25], device='cuda:0')
0
tensor([20, 20, 20, 20, 20, 20, 20, 20, 20, 20], device='cuda:0') tensor([14, 19, 11,  2, 18, 26,  6,  7,  8, 29], device='cuda:0')
0
tensor([20, 20, 20, 20, 20, 20, 20, 20, 20, 20], device='cuda:0') tensor([13, 20,  3, 12,  5, 27, 22, 23,  4,  2], device='cuda:0')
1
tensor([20, 20, 20, 20, 20, 20, 20, 20, 20, 20], device='cuda:0') tensor([ 9, 29,  6, 24, 17,  8, 12, 16, 21,  4], device='cuda:0')
1
tensor([20, 20, 20, 20, 20, 20, 20, 20, 20, 20], device='cuda:0') tensor([11, 29, 13, 16, 23, 14,  9,  7,  0, 27], device='cuda:0')
1
tensor([20, 20, 20, 20, 20, 20, 20, 20, 20, 20], device='cuda:0') tensor([ 7,  8, 21,  2,  4, 23, 15, 11,  1, 29], device='cuda:0')
1
tensor([20, 20, 20, 20, 20, 20, 20, 20, 20, 20], device='cuda:0') tensor([22, 26,  9,  5, 18,  6, 13, 16,  8, 20], device='cuda:0')
2
tensor([20, 20, 20, 20, 20, 20, 20, 20, 20, 20], device='cuda: