In [1]:
import os
import sys

# The `datasets` library reads HF_DATASETS_CACHE when it is first imported,
# so the override must be in the environment BEFORE the import below;
# setting it afterwards leaves the library on its default cache location.
# NOTE(review): the path ends in "dataset" (singular) — confirm that is the intended directory.
os.environ["HF_DATASETS_CACHE"] = "/shared/.cache/huggingface/dataset"

# Put the parent directory on the import path so the project-local `utils`
# package can be imported from this notebook's working directory.
sys.path.append('../')

import numpy as np
from datasets import load_from_disk

from utils.config import training_config

# Directory holding the experiment's YAML configuration.
output_dir = "/efs-private/multimodal/" #0.8

# Load the training configuration from config_dropout.yaml in output_dir.
config = training_config(os.path.join(output_dir, 'config_dropout.yaml'))

batch_size: 8
clip: 2.0
dataset: /shared/how2_all_proc
ds_frac: 1.0
encoder_configs:
  aud:
    max_tokens: 320
    patch_size: [32, 8]
    type: PatchEncoder
  en:
    max_tokens: 64
    num_embeddings: 8192
    type: SequenceEncoder
  pt:
    max_tokens: 64
    num_embeddings: 8192
    type: SequenceEncoder
  sm:
    max_tokens: 64
    num_embeddings: 8192
    type: SequenceEncoder
  vid:
    max_tokens: 2048
    num_embeddings: 2048
    padding_idx: -1
    type: TabularEncoder
epochs: 64
inverse_doom: True
layers: 4
lr: 1e-05
lr_scheduler_type: constant_with_warmup
modality_config:
  aud:
    dropout: 0.3
    max_channels: 40
    pad_len: 2048
    padding:
      values: -10000
    predrop: True
    type: matrix
  en:
    data_col_name: tokens
    dropout: 0.3
    pad_len: 64
    padding:
      tokens: 0
    predrop: True
    type: sequence
  pt:
    data_col_name: tokens
    dropout: 0.3
    pad_len: 64
    padding:
      tokens: 0
    predrop: True
    type: sequence
  sm:
    data

In [2]:
from utils.dataset import setup_data

# Build the dataset from the path named in the loaded config, without
# splitting and at full size, passing through the config's seed, predrop
# flag and per-modality settings.
datasets = setup_data(
    config.dataset,
    split=False,
    ds_frac=1.0,
    ds_seed=config.ds_seed,
    predrop=config.predrop,
    predrop_config=config.modality_config,
)

Running preprocessing dropout of modalities
aud:
  dropout: 0.3
  max_channels: 40
  pad_len: 2048
  padding:
    values: -10000
  predrop: True
  type: matrix
en:
  data_col_name: tokens
  dropout: 0.3
  pad_len: 64
  padding:
    tokens: 0
  predrop: True
  type: sequence
pt:
  data_col_name: tokens
  dropout: 0.3
  pad_len: 64
  padding:
    tokens: 0
  predrop: True
  type: sequence
sm:
  data_col_name: tokens
  dropout: 0.3
  pad_len: 64
  padding:
    tokens: 0
  predrop: True
  type: sequence
vid:
  dropout: 0.3
  pad_len: 2048
  padding:
    indices: -1
    values: 0
  predrop: True
  type: sequence


In [3]:
# Persist the dataset returned by setup_data() to disk.
datasets.save_to_disk("/shared/how2_all_proc_30p")

Saving the dataset (0/42 shards):   0%|          | 0/184949 [00:00<?, ? examples/s]

In [6]:
# Reload the dataset stored at /shared/how2_all_proc (the path the config
# lists under `dataset`) so its rows can be inspected.
# NOTE(review): this import rebinds the name `datasets`, which an earlier cell
# assigned to the result of setup_data(); after this cell runs, `datasets` is
# the library module, so re-running the save_to_disk cell would no longer act
# on that dataset object — confirm this is intended.
# NOTE(review): `datset` looks like a typo for `dataset`; the name is left
# as-is because a later cell reads it.
import datasets
datset = datasets.load_from_disk('/shared/how2_all_proc')

In [9]:
# Display the 'sm' column for the first 100 rows of the reloaded dataset.
datset[:100]["sm"]

[{'tokens': tensor([  73,   30,  213, 6812, 3615,  835, 2653,   30, 6697,   69,  107, 6043,
            83, 2114,  336,  100, 1954,   50, 1549,   83,   64,  478,   30, 1372,
            58, 7963, 1299,   14,   90,  198,  136,   76,   30,  105, 1794, 3615,
            83,  336, 1149,   92,   30, 1322,  130,  272, 1056,   77,   80,  184,
            14])},
 {'tokens': tensor([  73,   30,  213, 6812, 3615,  835, 2653,   30, 6697,   69,  107, 6043,
            83, 2114,  336,  100, 1954,   50, 1549,   83,   64,  478,   30, 1372,
            58, 7963, 1299,   14,   90,  198,  136,   76,   30,  105, 1794, 3615,
            83,  336, 1149,   92,   30, 1322,  130,  272, 1056,   77,   80,  184,
            14])},
 {'tokens': tensor([  73,   30,  213, 6812, 3615,  835, 2653,   30, 6697,   69,  107, 6043,
            83, 2114,  336,  100, 1954,   50, 1549,   83,   64,  478,   30, 1372,
            58, 7963, 1299,   14,   90,  198,  136,   76,   30,  105, 1794, 3615,
            83,  336, 1149,   