In [3]:
import pandas as pd
import random
import numpy as np
from tqdm import tqdm

# Load the training and test corpora into DataFrames, in that order.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('corpus_train_all.csv', 'corpus_test.csv')
)

In [1]:
# Chuvash alphabet: every letter as an (uppercase, lowercase) pair, in
# alphabetical order.
chuvash_alphabet = ['А', 'а', 'Ӑ', 'ӑ', 'Б', 'б', 'В', 'в', 'Г', 'г', 'Д', 'д',
                    'Е', 'е', 'Ё', 'ё', 'Ӗ', 'ӗ', 'Ж', 'ж', 'З', 'з', 'И', 'и',
                    'Й', 'й', 'К', 'к', 'Л', 'л', 'М', 'м', 'Н', 'н', 'О', 'о',
                    'П', 'п', 'Р', 'р', 'С', 'с', 'Ҫ', 'ҫ', 'Т', 'т', 'У', 'у',
                    'Ӳ', 'ӳ', 'Ф', 'ф', 'Х', 'х', 'Ц', 'ц', 'Ч', 'ч', 'Ш', 'ш',
                    'Щ', 'щ', 'Ъ', 'ъ', 'Ы', 'ы', 'Ь', 'ь', 'Э', 'э', 'Ю', 'ю',
                    'Я', 'я']

# Backward-compatible alias: the list was originally published under this
# misspelled name, so keep it pointing at the same object.
chuvash_alpahbet = chuvash_alphabet


# Russian alphabet
# Generated from the lowercase letters: each one contributes its uppercase
# form immediately followed by the lowercase form, in alphabetical order.
russian_alphabet = [
    glyph
    for small in 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
    for glyph in (small.upper(), small)
]



# Maps each Chuvash-specific letter to the Cyrillic letter it is replaced
# with when diacritics are stripped. Every value is a Cyrillic character.
char_map = {'Ӑ': 'А',
            'ӑ': 'а',
            'Ӗ': 'Е',
            'ӗ': 'е',
            'Ӳ': 'У',
            'ӳ': 'у',
            'Ҫ': 'С',
            'ҫ': 'с',
            # Latin-script look-alikes of the lowercase Chuvash letters.
            'ĕ': 'е',
            'ă': 'а',
            # Written as an escape on purpose: Latin 'c' and Cyrillic 'с'
            # (U+0441) are visually identical, and the Latin one would leave
            # a stray non-Cyrillic character in the augmented text.
            'ç': '\u0441',
            'ÿ': 'у'}

In [4]:
def replace_chars(text, replacement_dict, replace_probability=0.5, rng=None):
    """Randomly substitute characters of ``text`` according to a mapping.

    Each character that is a key of ``replacement_dict`` is replaced by its
    mapped value with probability ``replace_probability``; every other
    character is copied unchanged.

    Args:
        text: The string to perturb. A non-string value (for example the
            float NaN pandas uses for an empty CSV cell) is returned as-is
            instead of raising ``TypeError``.
        replacement_dict: Mapping from a single character to its replacement.
        replace_probability: Chance in [0, 1] of replacing each eligible
            character. 0 never replaces, 1 always replaces.
        rng: Optional ``random.Random`` instance to draw from, which makes
            the result reproducible. Defaults to the module-level ``random``
            generator.

    Returns:
        The perturbed string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    draw = random.random if rng is None else rng.random
    # A random number is drawn only for characters that can be replaced, so
    # the random stream is consumed once per eligible character.
    return ''.join(
        replacement_dict[ch]
        if ch in replacement_dict and draw() < replace_probability
        else ch
        for ch in text
    )

def add_similar_rows(csv_path, percentage, replacement_dict,
                     replace_probability=0.5, column='chv', seed=None):
    """Append character-perturbed copies of randomly chosen rows to a corpus.

    Reads ``csv_path``, samples ``percentage`` percent of its rows (with
    replacement, so a row can be picked more than once), perturbs the text in
    ``column`` of each sampled row with ``replace_chars`` and returns the
    original rows followed by the perturbed copies.

    Args:
        csv_path: Path of the CSV file to read.
        percentage: Number of rows to add, as a percentage of the row count
            (10 adds 10% more rows). The count is truncated to an integer.
        replacement_dict: Character mapping handed to ``replace_chars``.
        replace_probability: Per-character replacement probability handed to
            ``replace_chars``.
        column: Name of the text column to perturb.
        seed: Optional integer making the augmentation reproducible. When
            given, it seeds the row sampling and also re-seeds the global
            ``random`` module, which ``replace_chars`` draws from.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the rows of the CSV
        followed by the perturbed copies.

    Raises:
        ValueError: If ``percentage`` is negative.
    """
    if percentage < 0:
        raise ValueError('percentage must be non-negative, got %r' % (percentage,))

    df = pd.read_csv(csv_path)
    num_rows = len(df)
    num_new_rows = int(num_rows * percentage / 100)

    # Nothing to add (empty file or tiny percentage): randint(0, 0, ...) would
    # raise on an empty frame, so return the data unchanged.
    if num_new_rows == 0:
        return df.reset_index(drop=True)

    if seed is None:
        index_rng = np.random
    else:
        random.seed(seed)
        index_rng = np.random.RandomState(seed)

    random_indices = index_rng.randint(0, num_rows, num_new_rows)
    selected_rows = df.iloc[random_indices].copy()

    selected_rows[column] = [
        replace_chars(text, replacement_dict, replace_probability)
        for text in tqdm(selected_rows[column], desc="Replacing characters")
    ]

    new_df = pd.concat([df, selected_rows], ignore_index=True)
    return new_df


In [5]:
# Extend the training corpus by 10% with diacritic-perturbed copies of
# randomly sampled rows.
new_df = add_similar_rows(
    csv_path='corpus_train_all.csv',
    percentage=10,
    replacement_dict=char_map,
)

Replacing characters: 100%|██████████| 90181/90181 [00:01<00:00, 74309.18it/s]


In [6]:
# Spot-check the end of the frame: add_similar_rows concatenates the perturbed
# copies after the original rows, so the last rows are augmented ones.
new_df.tail(10)

Unnamed: 0,rus,chv
991982,Мутная снежная пыль клубами вилась над ним.,Ун теленче юр тусанӗ мӑкӑрланса тӑрать.
991983,Какая ужасная картина!,Ҫав тери хӑрушла картина!
991984,"Нашелся какой-то добрый человек, помог ей в беде.",Пер ырӑ сын тупӑнна та ӑна инкекрен хатарнӑ.
991985,За посторонними разговорами ты можешь проглоти...,Ун-кун халапланса эс Альбина Альбертовна чунта...
991986,"Поеду без разговоров, пока вакан есть.",Вырӑн пур чух пер сӑмахсар каятӑп!
991987,Кто-то загудел басом:,Такам баспала мӗкӗрттерсе янӑ:
991988,"За пределами знакомого села и ближайших полей,...",Хӑй пелсе хӑнахса ҫитнӗ ялпа ҫывӑхри хирсенчен...
991989,— И я вас узнал также.,— Эпӗ те сире тӳрех палларам.
991990,"Я вас уверяю, что Турция не сделает и попытки ...","Шантарсах калатап сире, Турци сак палхава хеҫ-..."
991991,"Той девушки уже нет — мы даже не заметили, на ...",Леш хӗр сук ӗнтӗ — вӑл хӑш станцире анса юлнин...


In [7]:
# index=False: the row index carries no information, and writing it would add
# another unnamed column every time the file is read back and saved again.
new_df.to_csv('corpus_train_10_replacement.csv', index=False)

**Training**

In [8]:
import sockeye
import tensorboard

In [9]:
import torch
# Report how many CUDA devices are visible, then the name of each one.
gpu_count = torch.cuda.device_count()
print(gpu_count)
for gpu_index in range(gpu_count):
    gpu_name = torch.cuda.get_device_name(gpu_index)
    print(gpu_name)

  from .autonotebook import tqdm as notebook_tqdm


1
Tesla V100-SXM2-32GB


In [10]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datasets import load_dataset

In [11]:
# Experiment identifiers. They are interpolated into the shell commands of
# the later cells: source/target language codes and the run name used for the
# prepared-data and model directories.
src, tgt = 'chv', 'rus'
name = '1M_Corpus_diacritics_10'

In [12]:
train_path = 'corpus_train_10_replacement.csv'
test_path = 'corpus_test.csv'

df_test = pd.read_csv(test_path)
# The CSV loader puts all rows under the 'train' key; shuffle with a fixed
# seed so the row order is reproducible.
df_train = load_dataset('csv', data_files=train_path).shuffle(seed=42)

# Hold out 10% of the rows as the dev set. The split is seeded as well;
# without it the train/dev partition changes on every run even though the
# shuffle above is fixed.
# NOTE(review): the training CSV contains perturbed copies of original rows,
# so a row-level random split can put a sentence in train and its copy in
# dev. Confirm that this is acceptable for dev-set BLEU.
split_dataset = df_train['train'].train_test_split(test_size=0.1, seed=42)

Downloading and preparing dataset csv/default to /home/kvzhirnov/.cache/huggingface/datasets/csv/default-eb44580390bdea7e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 2437.13it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 117.60it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                                                     

Dataset csv downloaded and prepared to /home/kvzhirnov/.cache/huggingface/datasets/csv/default-eb44580390bdea7e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 11.41it/s]


In [13]:
!mkdir 'data'
!mkdir 'data/train'
!mkdir 'data/dev'
!mkdir 'data/test'

In [14]:
def writelines(list, filename):
  """Write each item of ``list`` to ``filename`` as exactly one UTF-8 line.

  Args:
    list: Iterable of items; each is formatted with ``%s``.
    filename: Path of the file to create or overwrite.

  Line breaks inside an item are replaced by a single space. The files
  written here are parallel corpora that are matched line by line, so an
  embedded line break would add an extra line and shift every following
  sentence against its translation.
  """
  with open(filename, 'w', encoding='utf-8') as file:
    for item in list:
      text = '%s' % item
      # '\r\n' first so a Windows line break becomes one space, not two.
      text = text.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
      file.write('%s\n' % text)

# Export every split as a pair of plain-text files, one sentence per line,
# with the Chuvash and Russian files aligned line by line.
train_rows = split_dataset['train']
dev_rows = split_dataset['test']  # the held-out 10% serves as the dev set

# Training split
chv_train = [row['chv'] for row in train_rows]
rus_train = [row['rus'] for row in train_rows]
writelines(chv_train, 'data/train/train.chv.txt')
writelines(rus_train, 'data/train/train.rus.txt')

# Dev split
chv_dev = [row['chv'] for row in dev_rows]
rus_dev = [row['rus'] for row in dev_rows]
writelines(chv_dev, 'data/dev/dev.chv.txt')
writelines(rus_dev, 'data/dev/dev.rus.txt')

# Test split (comes from its own CSV, loaded as a DataFrame)
chv_test = df_test['chv'].to_list()
rus_test = df_test['rus'].to_list()
writelines(chv_test, 'data/test/test.chv.txt')
writelines(rus_test, 'data/test/test.rus.txt')

In [15]:
file_name = 'data/train/train.chv.txt'
number_of_rows = 5  # The number of rows you want to read

# Preview the first lines of the exported file. It was written as UTF-8, so
# read it as UTF-8 explicitly instead of relying on the platform default.
with open(file_name, 'r', encoding='utf-8') as file:
    # zip() stops at the shorter input, so a file with fewer lines than
    # number_of_rows prints only the lines that exist.
    for _row, line in zip(range(number_of_rows), file):
        print(line.strip())

Унӑн пӗтӗмпе те утмӑл пус кӑна, тенкӗ ваклама ҫитмест.
Хӗвелпе пиҫӗхнӗ пит-куҫӗ шурнӑ та халӗ куҫӗсем путса ларнӑ.
Вӑл пулас ӑрусем валли ҫеҫ маяк лартса хӑварнӑ: ҫак кӗтес ҫине ҫаврӑнса пӑхӑр, тесшӗн пулнӑ.
«Уставра кӑтартман пек» вӗҫнӗшӗн ӑна ҫакӑнта нумай хут лартнӑ.
Ташласамчче манпа, чунам!»


In [16]:
file_name = 'data/train/train.rus.txt'
number_of_rows = 5  # The number of rows you want to read

# Preview the first lines of the exported file. It was written as UTF-8, so
# read it as UTF-8 explicitly instead of relying on the platform default.
with open(file_name, 'r', encoding='utf-8') as file:
    # zip() stops at the shorter input, so a file with fewer lines than
    # number_of_rows prints only the lines that exist.
    for _row, line in zip(range(number_of_rows), file):
        print(line.strip())

Всего набралось шестьдесят копеек, на сдачу не хватает.
Лицо побледнело под загаром, глаза провалились.
Он просто ставил веху для будущих поколений — обратите, мол, внимание на этот уголок.
Много раз доставалось ему за «неуставные полеты».
Пропляши со мной, голубчик!»


In [17]:
# Learn one joint BPE model from both sides of the training text and write a
# vocabulary file per language. {src}/{tgt} are filled in by IPython from the
# notebook variables; -s 15000 is the symbol budget passed to subword-nmt.
!python subword-nmt/subword_nmt/learn_joint_bpe_and_vocab.py --input data/train/train.{src}.txt data/train/train.{tgt}.txt  -s 15000 -o data/bpe.codes  --write-vocabulary data/bpe.vocab.{src} data/bpe.vocab.{tgt}

 75%|##########################8         | 11177/15000 [03:44<00:12, 303.37it/s]


In [18]:
!python subword-nmt/subword_nmt/apply_bpe.py -c data/bpe.codes --vocabulary data/bpe.vocab.{src} --vocabulary-threshold 50 < data/train/train.{src}.txt > data/train/train.{src}.bpe
!python subword-nmt/subword_nmt/apply_bpe.py -c data/bpe.codes --vocabulary data/bpe.vocab.{tgt} --vocabulary-threshold 50 < data/train/train.{tgt}.txt > data/train/train.{tgt}.bpe

!python subword-nmt/subword_nmt/apply_bpe.py -c data/bpe.codes --vocabulary data/bpe.vocab.{src} --vocabulary-threshold 50 < data/dev/dev.{src}.txt > data/dev/dev.{src}.bpe
!python subword-nmt/subword_nmt/apply_bpe.py -c data/bpe.codes --vocabulary data/bpe.vocab.{tgt} --vocabulary-threshold 50 < data/dev/dev.{tgt}.txt > data/dev/dev.{tgt}.bpe

!python subword-nmt/subword_nmt/apply_bpe.py -c data/bpe.codes --vocabulary data/bpe.vocab.{src} --vocabulary-threshold 50 < data/test/test.{src}.txt > data/test/test.{src}.bpe
!python subword-nmt/subword_nmt/apply_bpe.py -c data/bpe.codes --vocabulary data/bpe.vocab.{tgt} --vocabulary-threshold 50 < data/test/test.{tgt}.txt > data/test/test.{tgt}.bpe

In [19]:
# Pre-process the BPE-encoded training pair into Sockeye's prepared-data
# folder ({src}_{tgt}_{name}_data), with one vocabulary shared by both
# languages (--shared-vocab).
!python -m sockeye.prepare_data -s data/train/train.{src}.bpe -t data/train/train.{tgt}.bpe --shared-vocab -o {src}_{tgt}_{name}_data

[INFO:sockeye.utils] Sockeye: 3.1.34, commit 4c30942ddb523533bccb4d2cbb3e894e45b1db93, path /home/kvzhirnov/.conda/envs/diploma/lib/python3.9/site-packages/sockeye/__init__.py
[INFO:sockeye.utils] PyTorch: 1.13.1 (/home/kvzhirnov/.conda/envs/diploma/lib/python3.9/site-packages/torch/__init__.py)
[INFO:sockeye.utils] Command: /home/kvzhirnov/.conda/envs/diploma/lib/python3.9/site-packages/sockeye/prepare_data.py -s data/train/train.chv.bpe -t data/train/train.rus.bpe --shared-vocab -o chv_rus_1M_Corpus_diacritics_10_data
[INFO:sockeye.utils] Arguments: Namespace(config=None, source='data/train/train.chv.bpe', source_factors=[], source_factors_use_source_vocab=[], target_factors=[], target_factors_use_target_vocab=[], target='data/train/train.rus.bpe', end_of_prepending_tag=None, source_vocab=None, target_vocab=None, source_factor_vocabs=[], target_factor_vocabs=[], shared_vocab=True, num_words=(0, 0), word_min_count=(1, 1), pad_vocab_to_multiple_of=8, no_bucketing=False, bucket_width=8,

In [20]:
# Train a 6-layer transformer encoder/decoder (model size 512, feed-forward
# size 2048, embedding size 512) on the prepared data, validating on the BPE
# dev files and writing to {src}_{tgt}_{name}_model. BLEU is the optimized
# metric.
# NOTE(review): --max-num-checkpoint-not-improved 5 is presumably the
# early-stopping patience and --amp presumably enables mixed precision.
# Confirm both against the Sockeye CLI documentation.
!python -m sockeye.train -d {src}_{tgt}_{name}_data -vs data/dev/dev.{src}.bpe -vt data/dev/dev.{tgt}.bpe --encoder transformer --decoder transformer --transformer-model-size 512 --transformer-feed-forward-num-hidden 2048 --transformer-dropout-prepost 0.1 --num-embed 512 --max-seq-len 100 --decode-and-evaluate 500 -o {src}_{tgt}_{name}_model --num-layers 6 --keep-last-params 3 --batch-size 2048 --optimized-metric bleu --max-num-checkpoint-not-improved 5 --amp

[INFO:sockeye.utils] Sockeye: 3.1.34, commit 4c30942ddb523533bccb4d2cbb3e894e45b1db93, path /home/kvzhirnov/.conda/envs/diploma/lib/python3.9/site-packages/sockeye/__init__.py
[INFO:sockeye.utils] PyTorch: 1.13.1 (/home/kvzhirnov/.conda/envs/diploma/lib/python3.9/site-packages/torch/__init__.py)
[INFO:sockeye.utils] Command: /home/kvzhirnov/.conda/envs/diploma/lib/python3.9/site-packages/sockeye/train.py -d chv_rus_1M_Corpus_diacritics_10_data -vs data/dev/dev.chv.bpe -vt data/dev/dev.rus.bpe --encoder transformer --decoder transformer --transformer-model-size 512 --transformer-feed-forward-num-hidden 2048 --transformer-dropout-prepost 0.1 --num-embed 512 --max-seq-len 100 --decode-and-evaluate 500 -o chv_rus_1M_Corpus_diacritics_10_model --num-layers 6 --keep-last-params 3 --batch-size 2048 --optimized-metric bleu --max-num-checkpoint-not-improved 5 --amp
[INFO:sockeye.utils] Arguments: Namespace(config=None, source=None, source_factors=[], source_factors_use_source_vocab=[], target_f

In [21]:
# Translate the BPE-encoded test source with the trained model (beam size 5,
# batch size 128). The output file is still BPE-segmented.
!python -m sockeye.translate -m {src}_{tgt}_{name}_model -i data/test/test.{src}.bpe -o data/test/test.{tgt}.pred.bpe --batch-size 128 --beam-size 5

[INFO:sockeye.utils] Sockeye: 3.1.34, commit 4c30942ddb523533bccb4d2cbb3e894e45b1db93, path /home/kvzhirnov/.conda/envs/diploma/lib/python3.9/site-packages/sockeye/__init__.py
[INFO:sockeye.utils] PyTorch: 1.13.1 (/home/kvzhirnov/.conda/envs/diploma/lib/python3.9/site-packages/torch/__init__.py)
[INFO:sockeye.utils] Command: /home/kvzhirnov/.conda/envs/diploma/lib/python3.9/site-packages/sockeye/translate.py -m chv_rus_1M_Corpus_diacritics_10_model -i data/test/test.chv.bpe -o data/test/test.rus.pred.bpe --batch-size 128 --beam-size 5
[INFO:sockeye.utils] Arguments: Namespace(config=None, input='data/test/test.chv.bpe', input_factors=None, json_input=False, output='data/test/test.rus.pred.bpe', models=['chv_rus_1M_Corpus_diacritics_10_model'], checkpoints=None, nbest_size=1, beam_size=5, greedy=False, beam_search_stop='all', batch_size=128, chunk_size=None, sample=None, seed=None, ensemble_mode='linear', bucket_width=10, max_input_length=None, max_output_length_num_stds=2, max_output_l

In [22]:
# Undo the BPE segmentation of the predictions: delete every '@@ ' joiner
# (and a trailing '@@' at the end of a line) so the subword pieces are glued
# back into whole words.
!sed -r 's/(@@ )|(@@ ?$)//g' data/test/test.{tgt}.pred.bpe > data/test/test.{tgt}.pred.txt

In [23]:
!sacrebleu data/test/test.rus.txt -tok none -i data/test/test.{tgt}.pred.txt

{
 "name": "BLEU",
 "score": 18.4,
 "signature": "nrefs:1|case:mixed|eff:no|tok:none|smooth:exp|version:2.3.1",
 "verbose_score": "46.4/24.3/14.3/8.9 (BP = 0.948 ratio = 0.950 hyp_len = 1033398 ref_len = 1088181)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "none",
 "smooth": "exp",
 "version": "2.3.1"
}
[0m

**Testing**

In [24]:
# Load the perturbed test set.
# NOTE(review): test_replacement_50.csv is not created in any visible cell.
# Presumably it is the test corpus with a 50% replacement setting; confirm
# its provenance.
df_test_replacement_50 = pd.read_csv('test_replacement_50.csv')

In [25]:
def writelines(list, filename):
  """Write each item of ``list`` to ``filename`` as exactly one UTF-8 line.

  Args:
    list: Iterable of items; each is formatted with ``%s``.
    filename: Path of the file to create or overwrite.

  Line breaks inside an item are replaced by a single space. The files
  written here are parallel corpora that are matched line by line, so an
  embedded line break would add an extra line and shift every following
  sentence against its translation.
  """
  with open(filename, 'w', encoding='utf-8') as file:
    for item in list:
      text = '%s' % item
      # '\r\n' first so a Windows line break becomes one space, not two.
      text = text.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
      file.write('%s\n' % text)

# Export both columns of the perturbed test set as line-aligned text files.
chv_test_replacement_50, rus_test_replacement_50 = (
    df_test_replacement_50[column_name].to_list()
    for column_name in ('chv', 'rus')
)

writelines(chv_test_replacement_50, 'data/test/test_replacement_50.chv.txt')
writelines(rus_test_replacement_50, 'data/test/test_replacement_50.rus.txt')

In [27]:
!python subword-nmt/subword_nmt/apply_bpe.py -c data/bpe.codes --vocabulary data/bpe.vocab.{src} --vocabulary-threshold 50 < data/test/test_replacement_50.{src}.txt > data/test/test_replacement_50.{src}.bpe
!python subword-nmt/subword_nmt/apply_bpe.py -c data/bpe.codes --vocabulary data/bpe.vocab.{tgt} --vocabulary-threshold 50 < data/test/test_replacement_50.{tgt}.txt > data/test/test_replacement_50.{tgt}.bpe

In [28]:
# Translate the BPE-encoded perturbed test source with the same trained model
# (beam size 5, batch size 128). The output file is still BPE-segmented.
!python -m sockeye.translate -m {src}_{tgt}_{name}_model -i data/test/test_replacement_50.{src}.bpe -o data/test/test_replacement_50.{tgt}.pred.bpe --batch-size 128 --beam-size 5

[INFO:sockeye.utils] Sockeye: 3.1.34, commit 4c30942ddb523533bccb4d2cbb3e894e45b1db93, path /home/kvzhirnov/.conda/envs/diploma/lib/python3.9/site-packages/sockeye/__init__.py
[INFO:sockeye.utils] PyTorch: 1.13.1 (/home/kvzhirnov/.conda/envs/diploma/lib/python3.9/site-packages/torch/__init__.py)
[INFO:sockeye.utils] Command: /home/kvzhirnov/.conda/envs/diploma/lib/python3.9/site-packages/sockeye/translate.py -m chv_rus_1M_Corpus_diacritics_10_model -i data/test/test_replacement_50.chv.bpe -o data/test/test_replacement_50.rus.pred.bpe --batch-size 128 --beam-size 5
[INFO:sockeye.utils] Arguments: Namespace(config=None, input='data/test/test_replacement_50.chv.bpe', input_factors=None, json_input=False, output='data/test/test_replacement_50.rus.pred.bpe', models=['chv_rus_1M_Corpus_diacritics_10_model'], checkpoints=None, nbest_size=1, beam_size=5, greedy=False, beam_search_stop='all', batch_size=128, chunk_size=None, sample=None, seed=None, ensemble_mode='linear', bucket_width=10, max_i

In [29]:
# Undo the BPE segmentation of the predictions: delete every '@@ ' joiner
# (and a trailing '@@' at the end of a line) so the subword pieces are glued
# back into whole words.
!sed -r 's/(@@ )|(@@ ?$)//g' data/test/test_replacement_50.{tgt}.pred.bpe > data/test/test_replacement_50.{tgt}.pred.txt

In [31]:
!sacrebleu data/test/test_replacement_50.rus.txt -tok none -i data/test/test_replacement_50.{tgt}.pred.txt

{
 "name": "BLEU",
 "score": 16.3,
 "signature": "nrefs:1|case:mixed|eff:no|tok:none|smooth:exp|version:2.3.1",
 "verbose_score": "43.9/21.7/12.3/7.4 (BP = 0.951 ratio = 0.952 hyp_len = 1036678 ref_len = 1088642)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "none",
 "smooth": "exp",
 "version": "2.3.1"
}
[0m

In [32]:
# Prints a literal 0 and does nothing else.
# TODO(review): looks like a leftover "kernel is alive" check; remove if it
# is not needed.
print(0)

0
