## Data processing

Import dependencies

In [36]:
import os
import sys

Add the project root directory to `sys.path` so that the project modules can be imported

In [37]:
# Parent of the current working directory; the `src` package imported in the
# following cell is expected to be found there.
nb_dir = os.path.dirname(os.getcwd())
# Append only once so that re-running the cell does not duplicate the entry.
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [38]:
from datasets import load_from_disk
from torch import save

from omegaconf import OmegaConf
from loguru import logger

from src.parser import Text2EmojiParser
from src.dataset import Text2EmojiDataset
from src.utils import seed_all, set_logger
from src.transfer import get_glove_embbedings

Set logger

In [39]:
# Configure logging through the project's `set_logger` helper (imported from
# src.utils); the configuration it applies is defined there, not in this notebook.
set_logger()

Set paths

In [54]:
# Base directories, relative to the notebook's working directory.
config_dir = '../configs'
data_dir = '../data'

# YAML configuration files read by this notebook.
path_experiment_config = f'{config_dir}/experiment.yaml'
path_train_config = f'{config_dir}/train.yaml'
path_processing_config = f'{config_dir}/processing.yaml'

# Output locations for the processed dataset, the parser and the embeddings.
path_save_dataset = f'{data_dir}/datasets/processed'
path_save_parser = f'{data_dir}/parser'
path_save_embbeding = f'{data_dir}/transfer/embbeding'

Set configs

In [55]:
# Load the three YAML files with OmegaConf, in the same order as the names
# on the left-hand side.
experiment_config, train_config, processing_config = map(
    OmegaConf.load,
    (path_experiment_config, path_train_config, path_processing_config),
)

In [42]:
# Special-token definitions from the processing config; each entry exposes
# a `token` and an `id` attribute.
st = processing_config.special_tokens
special_specs = (st.pad, st.sos, st.eos, st.unk)

pad_token, sos_token, eos_token, unk_token = (spec.token for spec in special_specs)
pad_idx, sos_idx, eos_idx, unk_idx = (spec.id for spec in special_specs)

Set seed

In [43]:
# Seed the run with the value from the training config. Which random number
# generators are seeded is decided by `seed_all` in src.utils.
seed_all(train_config.seed)

Load data

In [44]:
# Location of the raw dataset previously saved to disk.
raw_dataset_path = '../data/datasets/raw/KomeijiForce'

logger.info('Data load')
dataset = load_from_disk(raw_dataset_path)

2024-11-02 17:29:08 | INFO | Data load


Prepare data

In [45]:
test_size = processing_config.data.train_test_ratio
logger.info(f'Data preprocessing started with test size: {test_size}')

# The parser is built with the special tokens read from the processing config.
special_token_kwargs = dict(
    pad_token=pad_token,
    sos_token=sos_token,
    eos_token=eos_token,
    unk_token=unk_token,
)
parser = Text2EmojiParser(**special_token_kwargs)

# From here on `dataset` refers to the project wrapper around the loaded data.
dataset = Text2EmojiDataset(dataset)

2024-11-02 17:29:10 | INFO | Data preprocessing started with test size: 0.007


In [46]:
# Neither call's result is assigned, so both presumably modify `dataset` in
# place; re-running this cell would then repeat them on already-processed data.
# NOTE(review): `filter_none` presumably drops rows with missing values —
# confirm in src.dataset.
dataset.filter_none()
# Shuffle using the same seed value as the one passed to `seed_all`.
dataset.shuffle(train_config.seed)

In [47]:
logger.info('Data tokenization')

# Tokenize using the parser's emoji and text tokenizers; the maximum text
# length comes from the processing config.
max_text_length = processing_config.data.max_text_length
dataset.tokenization_dataset(
    parser.tokenize_emoji,
    parser.tokenize_text,
    max_text_length,
)

2024-11-02 17:29:13 | INFO | Data tokenization


Create vocab

In [48]:
logger.info('Create vocab')

data_config = processing_config.data
tokenized = dataset.dataset

# Build the vocabularies from the tokenized columns, passing the minimum
# frequency thresholds configured for emoji and text.
parser.create_vocab(
    tokenized['tokenized_emoji'],
    tokenized['tokenized_text'],
    data_config.min_freq_emoji,
    data_config.min_freq_text,
)

# Use the unknown-token id as the parser's default index.
parser.set_default_index(unk_idx)

2024-11-02 17:29:16 | INFO | Create vocab


In [49]:
logger.info('Data numericalize')

# Hand the parser's numericalization callable to the dataset wrapper.
to_indices = parser.numericalize_data
dataset.numericalize_dataset(to_indices)

2024-11-02 17:29:35 | INFO | Data numericalize


Map:   0%|          | 0/503682 [00:00<?, ? examples/s]

Get embeddings

In [19]:
logger.info('Get Glove embbedings')

# Index-to-token list of the text vocabulary. The entry at index 0 is left out
# of the GloVe lookup and of the reported vocabulary size.
# NOTE(review): index 0 is presumably a special token — confirm against the
# special-token ids in the processing config.
text_itos = parser.text_vocab.get_itos()
embbedings, glove_word_count = get_glove_embbedings(text_itos[1:])

logger.info(f'glove_word_count: {glove_word_count}, size of vocab: {len(text_itos) - 1}')

2024-11-02 16:38:25 | INFO | Get Glove embbedings
2024-11-02 16:38:57 | INFO | glove_word_count: 11234, size of vocab: 11525


Save everything to disk

In [50]:
# Make sure the output directories exist before writing: `torch.save` does not
# create missing parent directories, so a fresh checkout would fail here.
for output_dir in (path_save_parser, path_save_embbeding):
    os.makedirs(output_dir, exist_ok=True)

# Persist the parser, the embeddings and the processed dataset.
parser.save(os.path.join(path_save_parser, 'parser.pt'))
save(embbedings, os.path.join(path_save_embbeding, 'embbeding.pt'))
dataset.save(path_save_dataset)

Saving the dataset (0/1 shards):   0%|          | 0/503682 [00:00<?, ? examples/s]

Log to MLflow

In [22]:
import mlflow
from mlflow import MlflowClient

In [31]:
# MLflow client bound to the tracking server named in the experiment config.
mlflow_tracking_uri = experiment_config.mlflow_server
client = MlflowClient(tracking_uri=mlflow_tracking_uri)

In [33]:
# Free-text description of the experiment; also stored in the tags below.
experiment_description = "This experiment processes simply KomeijiForce dataset for Text2Emoji model."

# Tags attached to the experiment when it is created.
experiment_tags = dict(
    project_name="Text2Emoji",
    team="solo-ilmerkul",
    project_quarter="Q4-2024",
)
experiment_tags["mlflow.note.content"] = experiment_description

# Create the experiment under a unique name, or reuse it when it already exists.
# `create_experiment` raises if the name is taken, which would make this cell
# fail on every re-run; looking the experiment up first keeps it idempotent.
# Either way `experiment` ends up holding the experiment id.
experiment_name = "1.0_exp_2.0-ilmerkul-data-process"
existing_experiment = client.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment = client.create_experiment(
        name=experiment_name, tags=experiment_tags
    )
else:
    experiment = existing_experiment.experiment_id

In [35]:
# `run_name` is the name given to the MLflow run started below;
# `artifact_path` is defined here but not used in the cells shown.
run_name, artifact_path = "1.0-data-process", "data-process"

In [52]:
# Wrap the processed data as an MLflow dataset so it can be logged as a run input.
# The Hub repository id belongs in `path`; `data_dir` is the data directory of a
# dataset configuration, so passing the repository id there recorded the wrong
# source information.
# NOTE: this rebinds `dataset` from the project wrapper to the MLflow dataset,
# so the cell cannot be re-run without re-running the processing cells first.
dataset = mlflow.data.from_huggingface(
    dataset.dataset,
    path='KomeijiForce/Text2Emoji',
    name="KomeijiForce dataset with simply processing",
)

In [53]:
# The fluent `mlflow.*` API does not inherit the tracking URI that was passed
# to `MlflowClient`, so point it at the same server explicitly; otherwise the
# run is recorded in the default tracking location.
mlflow.set_tracking_uri(experiment_config.mlflow_server)

# Attach the run to the experiment prepared above (`experiment` holds its id)
# instead of the default experiment, and log the dataset as a training input.
with mlflow.start_run(run_name=run_name, experiment_id=experiment) as run:
    mlflow.log_input(dataset, context="training")

KeyboardInterrupt: 