## Data processing

Dependency import

In [1]:
import os
import sys

Add the project root (the parent of the notebook directory) to `sys.path` so that the `src` modules can be imported

In [2]:
# The notebook runs from a sub-directory of the project, so the parent of the
# current working directory is appended to the import path (only once) to make
# the `src` package importable.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [3]:
import mlflow
from datasets import load_from_disk
from hydra import compose, initialize
from loguru import logger
from omegaconf import OmegaConf
from torch import save

from src.parser import Text2EmojiParser
from src.dataset import Text2EmojiDataset
from src.utils import seed_all, set_logger
from src.transfer import get_glove_embbedings

Set logger

In [4]:
# Configure logging through the project helper imported from src.utils.
# NOTE(review): presumably this sets up the loguru `logger` format seen in the
# cell outputs ("<timestamp> | INFO | <message>") — confirm in src/utils.
set_logger()

Set paths

In [5]:
# All locations are relative to the notebook's working directory.

# Input: raw dataset previously saved to disk.
path_load_dataset = "../data/datasets/raw/KomeijiForce"

# Outputs: processed dataset, fitted parser and pretrained embedding matrix.
path_save_dataset = "../data/datasets/processed"
path_save_parser = "../data/parser"
path_save_embbeding = "../data/transfer/embbeding"

# Directory holding the Hydra config files.
path_config = "../configs"

Set configs

In [6]:
# Compose the "experiment" config from the Hydra config directory.
# `initialize` is used as a context manager so that Hydra's global state is
# restored when the block exits. With the bare function-call form the global
# state stays initialised, and re-running this cell in the same kernel fails
# because Hydra refuses to be initialised twice.
with initialize(version_base=None, config_path=path_config):
    cfg = compose(config_name="experiment")

# Show the resolved configuration so the run is self-documenting.
print(OmegaConf.to_yaml(cfg))

model:
  hidden_size: 350
  num_layers: 2
  dropout: 0.2
  sup_unsup_ratio: 0.9
processing:
  data:
    min_freq_emoji: 5
    min_freq_text: 10
    max_text_length: 128
    train_test_ratio: 0.007
  special_tokens:
    pad:
      id: 0
      token: <pad>
    sos:
      id: 1
      token: <sos>
    eos:
      id: 2
      token: <eos>
    unk:
      id: 3
      token: <unk>
train:
  epoch: 8
  batch_sizes:
  - 32
  - 64
  - 128
  - 256
  batch_milestones:
  - 2
  - 4
  - 7
  lr_0: 0.001
  lr_milestones:
  - 2
  - 4
  - 7
  gamma: 0.464159
  epoch_emb_requires_grad: 4
  print_step: 100
name: 1.0-process-data-and-train-GRU-model
mlflow_server: http://127.0.0.1:5000
seed: 42



In [7]:
# Shortcut to the special-token section of the config.
st = cfg.processing.special_tokens

# String form of each special token.
pad_token = st.pad.token
sos_token = st.sos.token
eos_token = st.eos.token
unk_token = st.unk.token

# Integer id assigned to each special token in the config.
pad_idx = st.pad.id
sos_idx = st.sos.id
eos_idx = st.eos.id
unk_idx = st.unk.id

Set seed

In [8]:
# Seed the run with the value from the config (`seed: 42` in the printed config).
# NOTE(review): which RNGs are seeded (random / numpy / torch) is decided inside
# src.utils.seed_all — confirm there.
seed_all(cfg.seed)

Load data

In [9]:
# Load the raw dataset that was previously saved to disk at `path_load_dataset`.
# The log message has no placeholders, so it is a plain string rather than an f-string.
logger.info('Data load')
dataset = load_from_disk(path_load_dataset)

2024-11-06 09:50:33 | INFO | Data load


Prepare data

In [10]:
logger.info(
    f'Data preprocessing started with test size: {cfg.processing.data.train_test_ratio}'
)

# The parser is built with the special tokens read from the config.
parser = Text2EmojiParser(
    pad_token=pad_token,
    sos_token=sos_token,
    eos_token=eos_token,
    unk_token=unk_token,
)

# NOTE(review): this rebinds `dataset` from the raw object returned by
# `load_from_disk` to the project wrapper, so re-running this cell without
# re-running the load cell would wrap the wrapper again.
dataset = Text2EmojiDataset(dataset)

2024-11-06 09:50:35 | INFO | Data preprocessing started with test size: 0.007


In [11]:
# Filter the dataset through the wrapper's `filter_none`; the progress output
# reports 503687 rows going in, and the following cells report 503682.
# NOTE(review): presumably this drops rows with missing text/emoji — confirm in src/dataset.
dataset.filter_none()
# Shuffle using the experiment seed from the config.
dataset.shuffle(cfg.seed)

Filter:   0%|          | 0/503687 [00:00<?, ? examples/s]

In [12]:
logger.info('Data tokenization')

# Tokenize via the parser's emoji and text tokenizers; the configured
# `max_text_length` is passed as the third argument.
dataset.tokenization_dataset(
    parser.tokenize_emoji,
    parser.tokenize_text,
    cfg.processing.data.max_text_length,
)

2024-11-06 09:50:45 | INFO | Data tokenization


Map:   0%|          | 0/503682 [00:00<?, ? examples/s]

Map:   0%|          | 0/503682 [00:00<?, ? examples/s]

Create vocab

In [13]:
logger.info('Create vocab')

# Build the vocabularies from the tokenized columns, passing the configured
# minimum frequencies for emoji and text tokens.
parser.create_vocab(
    dataset.dataset['tokenized_emoji'],
    dataset.dataset['tokenized_text'],
    cfg.processing.data.min_freq_emoji,
    cfg.processing.data.min_freq_text,
)

# Use the <unk> id from the config as the parser's default index.
parser.set_default_index(unk_idx)

2024-11-06 09:54:06 | INFO | Create vocab


In [14]:
logger.info('Data numericalize')
# Apply the parser's `numericalize_data` to the dataset.
# NOTE(review): presumably this maps the tokenized columns to vocabulary ids,
# so it must run after the vocab has been created — confirm in src/dataset.
dataset.numericalize_dataset(parser.numericalize_data)

2024-11-06 09:54:25 | INFO | Data numericalize


Map:   0%|          | 0/503682 [00:00<?, ? examples/s]

Get GloVe embeddings

In [15]:
logger.info('Get Glove embbedings')

# Index-to-token list of the text vocabulary, fetched once and reused below.
text_itos = parser.text_vocab.get_itos()

# The first vocabulary entry is skipped when looking up pretrained vectors.
# NOTE(review): presumably index 0 is the <pad> token (id 0 in the config) — confirm.
embbedings, glove_word_count = get_glove_embbedings(text_itos[1:])
logger.info(f'glove_word_count: {glove_word_count}, size of vocab: {len(text_itos) - 1}')

2024-11-06 09:55:42 | INFO | Get Glove embbedings
2024-11-06 09:56:20 | INFO | glove_word_count: 11234, size of vocab: 11525


Save everything to disk

In [16]:
# Create the output directories up front so that saving does not fail on a
# fresh checkout where they do not exist yet.
for out_dir in (path_save_parser, path_save_embbeding):
    os.makedirs(out_dir, exist_ok=True)

# Persist the fitted parser and the embedding matrix; file paths are built with
# os.path.join instead of manual string concatenation.
parser.save(os.path.join(path_save_parser, 'parser.pt'))
save(embbedings, os.path.join(path_save_embbeding, 'embbeding.pt'))

# Persist the processed dataset through the wrapper's own `save` method.
dataset.save(path_save_dataset)

Saving the dataset (0/1 shards):   0%|          | 0/503682 [00:00<?, ? examples/s]

Log the dataset to MLflow

In [98]:
# Point the MLflow client at the tracking server configured in `cfg.mlflow_server`
# (http://127.0.0.1:5000 in the printed config).
mlflow.set_tracking_uri(cfg.mlflow_server)

Create experiment for this repository

In [99]:
# Free-text note attached to the experiment through the "mlflow.note.content" tag.
experiment_description = (
    "This experiment processes simply KomeijiForce dataset and train Text2Emoji Seq2Seq model."
)

# Tags attached to the experiment when it is created.
experiment_tags = {
    "project_name": "Text2Emoji",
    "team": "solo-ilmerkul",
    "project_quarter": "Q4-2024",
    "mlflow.note.content": experiment_description,
}

# Create the experiment only if it does not exist yet: `create_experiment`
# raises when the name is already taken, which would make this cell fail on
# every run after the first one.
if mlflow.get_experiment_by_name(cfg.name) is None:
    mlflow.create_experiment(name=cfg.name, tags=experiment_tags)

# Make it the active experiment; the returned Experiment is shown as the cell output.
mlflow.set_experiment(cfg.name)

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1730564946354, experiment_id='1', last_update_time=1730564946354, lifecycle_stage='active', name='1.0-process-data-and-train-GRU-model', tags={'mlflow.note.content': 'This experiment processes simply KomeijiForce dataset '
                        'and train Text2Emoji Seq2Seq model.',
 'project_name': 'Text2Emoji',
 'project_quarter': 'Q4-2024',
 'team': 'solo-ilmerkul'}>

In [100]:
# Name given to the MLflow run that logs the processed dataset.
run_name = "1.0-data-process"

In [101]:
# Wrap the processed Hugging Face dataset for MLflow under its own name.
# Rebinding `dataset` here would replace the project wrapper with the MLflow
# object, and a second run of this cell would then fail with
# "AttributeError: 'HuggingFaceDataset' object has no attribute 'dataset'".
mlflow_dataset = mlflow.data.from_huggingface(
    dataset.dataset,
    data_dir='KomeijiForce/Text2Emoji',
    name="KomeijiForce dataset with simply processing",
)

# Record the dataset as a "train" input of a dedicated run.
with mlflow.start_run(run_name=run_name) as run:
    mlflow.log_input(mlflow_dataset, context="train")

2024/11/02 19:29:24 INFO mlflow.tracking._tracking_service.client: 🏃 View run 1.0-data-process at: http://127.0.0.1:5000/#/experiments/1/runs/87951d7efbbf4ec396910e5b251645ba.
2024/11/02 19:29:24 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.
