# Multimodal Modality Dataset Test
The dataset combines image and text data. The text data is represented as padded sequences.

In [1]:
# Reference to ../src
import os
import sys
# Resolve ../src to an absolute path and register it on sys.path (once)
# so the project's packages can be imported from this notebook.
module_path = os.path.abspath('../src')
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import numpy as np    
from dataset.MultimodalModalityDataset import MultimodalModalityDataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torchvision.transforms

In [2]:
# Caption/class CSV and the directory holding the training images.
csv = '../data/out/clef16_caption_classes.csv'
base_dir_images = '/mnt/clef/imageclef_2016/train'

df = pd.read_csv(csv)

# Add a repeated GPLI sample just to be able to divide in a stratified manner.
# Only the GPLI rows are appended: appending the whole frame back onto itself
# would double every class rather than repeat just this one.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
glpi_unique_sample = df[df['CLASS'] == 'GPLI']
df = pd.concat([df, glpi_unique_sample])

df.head()

Unnamed: 0,FIG_NAME,CLASS,CAPTION,ID
0,11373_2007_9226_Fig1_HTML-10,DMFL,"Colocalization of hNopp140, pol I and rDNA rep...",0
1,11373_2007_9226_Fig1_HTML-11,DMFL,"Colocalization of hNopp140, pol I and rDNA rep...",1
2,11373_2007_9226_Fig1_HTML-12,DMFL,"Colocalization of hNopp140, pol I and rDNA rep...",2
3,11373_2007_9226_Fig1_HTML-13,DMFL,"Colocalization of hNopp140, pol I and rDNA rep...",3
4,11373_2007_9226_Fig1_HTML-14,DMFL,"Colocalization of hNopp140, pol I and rDNA rep...",4


In [3]:
# Distinct values of the CLASS column; these are the labels handed to the dataset below.
class_column = df['CLASS']
classes = class_column.unique()
classes

array(['DMFL', 'GHDR', 'DMTR', 'DRXR', 'DRUS', 'GFIG', 'D3DR', 'GGEL',
       'DMEL', 'DRMR', 'DMLI', 'GTAB', 'DRCT', 'GSCR', 'GGEN', 'GCHE',
       'GMAT', 'GFLO', 'GSYS', 'GNCP', 'DVDM', 'GPLI', 'DSEM', 'DRCO',
       'DVOR', 'DSEE', 'DRAN', 'DVEN', 'DSEC', 'DRPE'], dtype=object)

In [4]:
# Forwarded to the dataset as max_words (presumably the vocabulary size cap -- confirm in MultimodalModalityDataset)
max_words = 20000
# keys to define the training/validation set: the integers 0..299
set_keys = np.arange(0, 300)

In [5]:
# Images are converted to tensors; captions are limited to 200 tokens via max_seq_len
# (assumed to be the padded sequence length -- confirm in MultimodalModalityDataset).
transform = torchvision.transforms.ToTensor()
dataset = MultimodalModalityDataset(
    base_dir_images,
    csv,
    set_keys,
    classes,
    max_seq_len=200,
    max_words=max_words,
    transform=transform,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.dataframe['CLEAN'] = self.dataframe['CAPTION'].apply(clean_str)


In [6]:
# Fetch a single sample by index to inspect what the dataset yields
# (the displayed result is a tuple whose first element is an image tensor).
dataset[1]

(tensor([[[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          ...,
          [0.9686, 0.9765, 0.9843,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          ...,
          [0.9922, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ...,

## In practice, we use a DataLoader...

In [7]:
from dataset.TrainDataLoader import TrainDataLoader

In [8]:
# Build the loader in its "multimodal" variant from the same image directory, CSV and class list.
train_data_loader = TrainDataLoader(
    base_dir_images,
    csv,
    classes,
    variant="multimodal",
)

There are 5422 training images and 1356 validation images


In [9]:
# Ask the loader for its training dataset.
train_dataset = train_data_loader.get_train_dataset()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.dataframe['CLEAN'] = self.dataframe['CAPTION'].apply(clean_str)


In [12]:
# Number of training samples; should agree with the training-image count printed when the loader was built.
len(train_dataset)

5422