In [1]:
import pandas as pd
import pyarrow.parquet as pq
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


## Load the Wikipedia Dataset and view contents
Note: The original wiki_en_data.parquet is a ~10 GB file. We have provided a sample parquet file with 1000 entries for demo purposes.

In [2]:
# Read the bundled sample parquet into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_parquet("sample_data/sample_wiki_en_data.parquet")

In [3]:
# Summarize the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      1000 non-null   object
 1   body       1000 non-null   object
 2   source     1000 non-null   object
 3   url        1000 non-null   object
 4   langCode   1000 non-null   object
 5   timestamp  1000 non-null   object
dtypes: object(6)
memory usage: 47.0+ KB


In [4]:
# Preview the first rows of the sample.
df.head()

Unnamed: 0,title,body,source,url,langCode,timestamp
0,Boycott (album),\nBoycott (album)\n\n\n\n,wiki,https://en.wikipedia.org/wiki?curid=63405427,en,24/07/23 15:06
1,Javier Gallo,\nJavier Gallo\n\nJavier Gallo González (born ...,wiki,https://en.wikipedia.org/wiki?curid=31637473,en,24/07/23 14:49
2,R. M. Tristram,\nR. M. Tristram\n\n\n\n,wiki,https://en.wikipedia.org/wiki?curid=56333678,en,24/07/23 15:02
3,T. J. Carter (defensive back),\nT. J. Carter (defensive back)\n\nT. J. Carte...,wiki,https://en.wikipedia.org/wiki?curid=72605589,en,24/07/23 15:11
4,Wrestling at the 2015 Pan American Games - Men...,\nWrestling at the 2015 Pan American Games - M...,wiki,https://en.wikipedia.org/wiki?curid=49831530,en,24/07/23 14:59


In [5]:
# Count how many rows carry each distinct value of the `source` column.
df['source'].value_counts()

source
wiki    1000
Name: count, dtype: int64

## First, we perform the Templating Stage on the dataset

In [12]:
# Templating stage: runs stages/perform_templating.py on the sample parquet.
# - HF_DATASETS_CACHE is set only for this one command (prefix-assignment form).
# - --text_col / --url_col / --timestamp_col name columns of the input parquet.
# - --split "train[:100]" is passed, and the recorded run below processed 100 examples.
# - Output goes to --save_path (templated dataset) and --base_save_path (doc_csvs).
# NOTE(review): script and data paths are absolute and machine-specific; adjust them
# for your checkout before running.
!HF_DATASETS_CACHE=/mnt/sea/tmp python /home/kd/Desktop/proj/dec/setu-translate/stages/perform_templating.py \
    --glob_path "/home/kd/Desktop/proj/dec/setu-translate/examples/sample_data/sample_wiki_en_data.parquet" \
    --cache_dir_for_original_data "/mnt/sea/setu-translate/examples/cache" \
    --base_save_path "/mnt/sea/setu-translate/examples/output/wiki_en/doc_csvs" \
    --save_path "/mnt/sea/setu-translate/examples/output/wiki_en/templated" \
    --text_col body \
    --url_col url \
    --timestamp_col timestamp \
    --source_type wiki_en \
    --translation_type sentence \
    --use_cache False \
    --split "train[:100]"

Downloading data files: 100%|██████████████████| 1/1 [00:00<00:00, 23045.63it/s]
Extracting data files: 100%|████████████████████| 1/1 [00:00<00:00, 2799.94it/s]
Setting num_proc from 64 back to 1 for the train split to disable multiprocessing as it only contains one shard.
Generating train split: 1000 examples [00:00, 114223.97 examples/s]
Loaded Dataset from path - /home/shanks/setu-translate/examples/sample_data/sample_wiki_en_data.parquet
Map (num_proc=64): 100%|██████████████| 100/100 [00:00<00:00, 171.11 examples/s]
Performed `templating`
Filter (num_proc=64): 100%|███████████| 100/100 [00:00<00:00, 277.07 examples/s]
Filtered `null` text docs
Map (num_proc=64): 100%|██████████████| 100/100 [00:00<00:00, 194.53 examples/s]
Saving the dataset (64/64 shards): 100%|█| 100/100 [00:00<00:00, 178.82 examples
Saved `templated` dataset to /home/shanks/setu-translate/examples/output/wiki_en/templated


Also, when viewing a stage's output .arrow files with Dataset.from_file(), make sure the file path points to the directory where that stage actually saved its output.

In [6]:
# Open one shard (data-00000-of-00064) of the templated output directly from its Arrow file.
ds = Dataset.from_file("/mnt/sea/setu-translate/examples/output/wiki_en/templated/data-00000-of-00064.arrow")

In [21]:
# Display the loaded shard's feature names and row count.
ds

Dataset({
    features: ['source', 'url', 'timestamp', 'doc_id', 'text', 'sub_strs', 'sids', 'tlt_folder'],
    num_rows: 2
})

#### View the templated dataset output

In [7]:
# Inspect the first record of the shard as a plain dict.
ds[0]

{'source': 'en_wikipedia',
 'url': 'https://en.wikipedia.org/wiki?curid=63405427',
 'timestamp': '24/07/23 15:06',
 'doc_id': '7299c62f59ec33baca764b3cc9f6aa529a64ca0e784d963b48649db5500c0b96',
 'text': '\nboycott (album)\n\n\n\n',
 'sub_strs': '["boycott (album)"]',
 'sids': '["545490bdf181f0d46f6c7bf3a1d2ee08d8266c11fd40c96d3dc6f2238387fffc"]',
 'tlt_folder': '/mnt/sea/setu-translate/examples/output/wiki_en/doc_csvs/923682bea6d517dc178d480c88e129e485ed902f4fa024866666658cd4ea6836/7299c62f59ec33baca764b3cc9f6aa529a64ca0e784d963b48649db5500c0b96'}

In [8]:
# Slice up to ten `text` values; the shard shown above has only 2 rows, so two come back.
ds[0:10]['text']

['\nboycott (album)\n\n\n\n',
 '\njavier gallo\n\njavier gallo gonzález (born 6 august 1983) is a mexican professional boxer.\nprofessional career.\nin may 2011, gallo lost a majority decision to former world champion rodel mayol on showtime\'s televised portion of the pacquiao vs. mosley undercard.\non september 9, 2011 at the "war at woodland hills 5", gallo won with a tko over jason rorie.\n\n']

## Create the Global Sentence Level Dataset

In [9]:
# Global sentence-level dataset: runs stages/create_global_ds.py over every templated
# .arrow shard (--paths_data glob) and saves the result under --global_sent_ds_path.
# HF_DATASETS_CACHE is set only for this one command.
!HF_DATASETS_CACHE=/mnt/sea/tmp python /home/kd/Desktop/proj/dec/setu-translate/stages/create_global_ds.py \
    --paths_data "/mnt/sea/setu-translate/examples/output/wiki_en/templated/*.arrow" \
    --cache_dir "/mnt/sea/setu-translate/examples/cache" \
    --global_sent_ds_path "/mnt/sea/setu-translate/examples/output/wiki_en/sentences"

Resolving data files: 100%|█████████████████| 64/64 [00:00<00:00, 171196.08it/s]
Generating train split: 100 examples [00:00, 195.94 examples/s]
Loading dataset shards: 100%|█████████████████| 64/64 [00:00<00:00, 7417.19it/s]
Map (num_proc=64): 100%|██████████████| 100/100 [00:00<00:00, 239.25 examples/s]
Map (num_proc=64): 100%|█████████████| 612/612 [00:00<00:00, 1365.21 examples/s]
Saving the dataset (64/64 shards): 100%|█| 612/612 [00:00<00:00, 1336.78 example


In [10]:
# Open the first shard of the sentence-level dataset (rebinds `ds`).
ds = Dataset.from_file("/mnt/sea/setu-translate/examples/output/wiki_en/sentences/data-00000-of-00064.arrow")

In [11]:
# Show the first ten `sub_strs` values of the sentence-level shard.
ds[0:10]["sub_strs"]

['boycott (album)',
 'javier gallo',
 'javier gallo gonzález (born 6 august 1983) is a mexican professional boxer.',
 'professional career.',
 "in may 2011, gallo lost a majority decision to former world champion rodel mayol on showtime's televised portion of the pacquiao vs.",
 'mosley undercard.',
 'on september 9, 2011 at the "war at woodland hills 5", gallo won with a tko over jason rorie.',
 'r.',
 'm.',
 'tristram']

## Now Binarize the Sentence Level Dataset

In [12]:
# Binarize stage: runs stages/binarize.py over the sentence-level .arrow shards
# (--data_files glob) and writes the result to --binarized_dir.
# Language pair passed here: eng_Latn -> pan_Guru; padding mode is max_length.
# HF_DATASETS_CACHE is set only for this one command.
!HF_DATASETS_CACHE=/mnt/sea/tmp python /home/kd/Desktop/proj/dec/setu-translate/stages/binarize.py \
    --root_dir "/mnt/sea/setu-translate" \
    --data_files "/mnt/sea/setu-translate/examples/output/wiki_en/sentences/*.arrow" \
    --cache_dir "/mnt/sea/setu-translate/examples/cache" \
    --binarized_dir "/mnt/sea/setu-translate/examples/output/wiki_en/binarized_sentences" \
    --batch_size 2048 \
    --total_procs 1 \
    --padding max_length \
    --src_lang eng_Latn \
    --tgt_lang pan_Guru \
    --return_format pt

Resolving data files: 100%|█████████████████| 64/64 [00:00<00:00, 483667.49it/s]
Generating train split: 612 examples [00:00, 35631.29 examples/s]
Loaded Dataset....
Map: 100%|███████████████████████████| 612/612 [00:00<00:00, 1856.05 examples/s]
Saving the dataset (1/1 shards): 100%|█| 612/612 [00:00<00:00, 149143.81 example


In [13]:
# Load the binarized shard from the directory that was passed to binarize.py as
# --binarized_dir; the path must match that stage's output location.
ds = Dataset.from_file("/mnt/sea/setu-translate/examples/output/wiki_en/binarized_sentences/data-00000-of-00001.arrow")

In [14]:
# Show the first ten `sub_strs` values of the binarized shard.
# NOTE(review): the output recorded below is a Dataset repr (what a bare `ds` prints),
# not this slice — the saved output looks stale; re-run the cell to refresh it.
ds[0:10]["sub_strs"]

Dataset({
    features: ['doc_id', 'sub_strs', 'sids', 'tlt_file_loc', 'tlt_idx', 'input_ids', 'attention_mask', 'placeholder_entity_map'],
    num_rows: 612
})

In [15]:
# Print the token-id list of the first two rows, one list per line.
for token_ids in ds[0:2]["input_ids"]:
    print(token_ids)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 46, 18178, 53, 4994, 52, 2]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

## Now perform translation on the binarized dataset

In [16]:
# Translation stage: runs stages/tlt_pipelines/translate_joblib.py over the binarized
# .arrow files (--data_files glob) and writes model output under --base_save_dir.
# --devices "0" and --total_procs 1 are passed for this demo run.
# HF_DATASETS_CACHE is set only for this one command.
!HF_DATASETS_CACHE=/mnt/sea/tmp python /home/kd/Desktop/proj/dec/setu-translate/stages/tlt_pipelines/translate_joblib.py \
    --root_dir "/mnt/sea/setu-translate" \
    --data_files "/mnt/sea/setu-translate/examples/output/wiki_en/binarized_sentences/*.arrow" \
    --cache_dir "/mnt/sea/setu-translate/examples/cache" \
    --base_save_dir "/mnt/sea/setu-translate/examples/output/wiki_en/model_out" \
    --joblib_temp_folder "/mnt/sea/setu-translate/tmp" \
    --batch_size 16 \
    --total_procs 1 \
    --devices "0"

Generating train split: 612 examples [00:00, 319944.42 examples/s]
100%|████████████████████████████| 10/10 [00:28<00:00,  2.89s/ba: 64 samples/ba]
Saving the dataset (1/1 shards): 100%|█| 612/612 [00:00<00:00, 92782.26 examples


In [17]:
# Load the translation output shard from the rank_0-device_cuda subfolder of the
# --base_save_dir used above, then display its features and row count.
ds = Dataset.from_file("/mnt/sea/setu-translate/examples/output/wiki_en/model_out/rank_0-device_cuda/data-00000-of-00001.arrow")
ds

Dataset({
    features: ['doc_id', 'sid', 'sub_str', 'tlt_idx', 'placeholder_entity_map', 'translated_input_ids', 'tlt_file_loc'],
    num_rows: 612
})

In [18]:
# Print the translated token-id list of the first two rows, one list per line.
for token_ids in ds[0:2]["translated_input_ids"]:
    print(token_ids)

[2, 15330, 40997, 22, 363, 13075, 143, 21, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[2, 4505, 748, 4093, 7837, 178, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


## Now let's decode the translated inputs

In [19]:
# Decode stage: runs stages/decode.py over the model output .arrow files
# (--data_files glob across all model_out subfolders) and writes to --decode_dir.
# HF_DATASETS_CACHE is set only for this one command.
# NOTE(review): the output recorded below is an argparse error about `--root_dir`,
# an argument this command does not pass — the saved output appears to come from an
# earlier version of the cell; re-run to refresh it.
!HF_DATASETS_CACHE=/mnt/sea/tmp python /home/kd/Desktop/proj/dec/setu-translate/stages/decode.py \
    --data_files "/mnt/sea/setu-translate/examples/output/wiki_en/model_out/*/*.arrow" \
    --cache_dir "/mnt/sea/setu-translate/examples/cache" \
    --decode_dir "/mnt/sea/setu-translate/examples/output/wiki_en/decode" \
    --format arrow \
    --batch_size 64 \
    --total_procs 1 \
    --src_lang eng_Latn \
    --tgt_lang pan_Guru 

usage: decode.py [-h] [--data_files DATA_FILES] [--format FORMAT]
                 [--cache_dir CACHE_DIR] [--decode_dir DECODE_DIR]
                 [--batch_size BATCH_SIZE] [--total_procs TOTAL_PROCS]
                 [--save_strs SAVE_STRS] [--src_lang SRC_LANG]
                 [--tgt_lang TGT_LANG]
decode.py: error: unrecognized arguments: --root_dir /home/shanks/setu-translate


In [20]:
# Load the decoded shard from the directory that was passed to decode.py as
# --decode_dir; the path must match that stage's output location.
ds = Dataset.from_file("/mnt/sea/setu-translate/examples/output/wiki_en/decode/data-00000-of-00001.arrow")

In [21]:
# Show the first ten decoded translations.
# NOTE(review): the output recorded below is a Dataset repr (what a bare `ds` prints),
# not this slice — the saved output looks stale; re-run the cell to refresh it.
ds[0:10]["translated"]

Dataset({
    features: ['doc_id', 'sid', 'sub_str', 'tlt_idx', 'placeholder_entity_map', 'translated_input_ids', 'tlt_file_loc', 'translated', 'written'],
    num_rows: 612
})

## Now replace the text with translations in the original templated dataset

In [22]:
# Replace stage: runs stages/replace.py over the templated .arrow shards
# (--paths_data glob) and saves the translated documents to --translated_save_path.
# HF_DATASETS_CACHE is set only for this one command.
!HF_DATASETS_CACHE=/mnt/sea/tmp python /home/kd/Desktop/proj/dec/setu-translate/stages/replace.py \
    --paths_data "/mnt/sea/setu-translate/examples/output/wiki_en/templated/*.arrow" \
    --cache_dir "/mnt/sea/setu-translate/examples/cache" \
    --batch_size 64 \
    --num_procs 1 \
    --translated_save_path "/mnt/sea/setu-translate/examples/output/wiki_en/translated"

Resolving data files: 100%|█████████████████| 64/64 [00:00<00:00, 508400.48it/s]
Generating train split: 100 examples [00:00, 5780.78 examples/s]
Map: 100%|████████████████████████████████| 4/4 [00:00<00:00, 280.66 examples/s]
Saving the dataset (1/1 shards): 100%|███| 4/4 [00:00<00:00, 1216.80 examples/s]


In [23]:
# Load the final translated shard from the directory that was passed to replace.py
# as --translated_save_path; the path must match that stage's output location.
ds = Dataset.from_file("/mnt/sea/setu-translate/examples/output/wiki_en/translated/data-00000-of-00001.arrow")

In [24]:
# Show the first ten translated document texts.
# NOTE(review): the output recorded below is a Dataset repr (what a bare `ds` prints),
# not this slice — the saved output looks stale; re-run the cell to refresh it.
ds[0:10]["translated"]

Dataset({
    features: ['source', 'url', 'timestamp', 'doc_id', 'text', 'sub_strs', 'sids', 'tlt_folder', 'translated', 'substr_tlt'],
    num_rows: 4
})

In [69]:
# Inspect the first translated record: the original fields plus `translated` and `substr_tlt`.
ds[0]

{'source': 'en_wikipedia',
 'url': 'https://en.wikipedia.org/wiki?curid=63405427',
 'timestamp': '24/07/23 15:06',
 'doc_id': '7299c62f59ec33baca764b3cc9f6aa529a64ca0e784d963b48649db5500c0b96',
 'text': '\nboycott (album)\n\n\n\n',
 'sub_strs': '["boycott (album)"]',
 'sids': '["545490bdf181f0d46f6c7bf3a1d2ee08d8266c11fd40c96d3dc6f2238387fffc"]',
 'tlt_folder': '/mnt/sea/setu-translate/examples/output/wiki_en/doc_csvs/923682bea6d517dc178d480c88e129e485ed902f4fa024866666658cd4ea6836/7299c62f59ec33baca764b3cc9f6aa529a64ca0e784d963b48649db5500c0b96',
 'translated': '\nਬਾਈਕਾਟ (ਐਲਬਮ)\n\n\n\n',
 'substr_tlt': ['ਬਾਈਕਾਟ (ਐਲਬਮ)']}

In [None]:
!HF_DATASETS_CACHE=/mnt/sea/tmp python /home/kd/Desktop/proj/dec/setu-translate/stages/perform_templating.py \
    --glob_path {initial_parquet_file_path} \
    --cache_dir_for_original_data "/mnt/sea/setu-translate/examples/cache" \
    --base_save_path "/mnt/sea/setu-translate/examples/output/{dir_name}/doc_csvs" \
    --save_path "/mnt/sea/setu-translate/examples/output/{dir_name}/templated" \
    --text_col value \
    --source_type {dir_name} \
    --translation_type sentence \
    --use_cache False \
    --id_col doc_id \
    --write_style batch \
    --split "train"




# Translation stage for $dir_name (run in a terminal).
# `export` is required: a bare `VAR=value;` only sets a shell variable and is not
# passed into the environment of the python process started afterwards.
export HF_DATASETS_CACHE=/mnt/sea/tmp
initial_parquet_file_path=/mnt/sea/dolma/openhermes-readability_it2.parquet
dir_name=open_hermes
python /home/kd/Desktop/proj/dec/setu-translate/stages/tlt_pipelines/translate_joblib.py \
    --root_dir "/mnt/sea/setu-translate" \
    --data_files "/mnt/sea/setu-translate/examples/output/$dir_name/binarized_sentences/*.arrow" \
    --cache_dir "/mnt/sea/setu-translate/examples/cache" \
    --base_save_dir "/mnt/sea/setu-translate/examples/output/$dir_name/model_out" \
    --joblib_temp_folder "/mnt/sea/setu-translate/tmp" \
    --total_procs 24 \
    --batch_size 400 \
    --devices "0"


# Decode stage for $dir_name (run in a terminal).
# `export` is required: a bare `VAR=value;` only sets a shell variable and is not
# passed into the environment of the python process started afterwards.
export HF_DATASETS_CACHE=/mnt/sea/tmp
initial_parquet_file_path=/mnt/sea/dolma/openhermes-readability_it2.parquet
dir_name=open_hermes
python /home/kd/Desktop/proj/dec/setu-translate/stages/decode.py \
    --data_files "/mnt/sea/setu-translate/examples/output/$dir_name/model_out/*/*.arrow" \
    --cache_dir "/mnt/sea/setu-translate/examples/cache" \
    --decode_dir "/mnt/sea/setu-translate/examples/output/$dir_name/decode" \
    --format arrow \
    --batch_size 256 \
    --total_procs 24 \
    --src_lang eng_Latn \
    --tgt_lang pan_Guru

# Replace stage for $dir_name (run in a terminal).
# `export` is required: a bare `VAR=value;` only sets a shell variable and is not
# passed into the environment of the python process started afterwards.
# Both paths use $dir_name so this step reads and writes the same dataset folder that
# dir_name selects, rather than the wiki_en demo output.
export HF_DATASETS_CACHE=/mnt/sea/tmp
initial_parquet_file_path=/mnt/sea/dolma/openhermes-readability_it2.parquet
dir_name=open_hermes
python /home/kd/Desktop/proj/dec/setu-translate/stages/replace.py \
    --paths_data "/mnt/sea/setu-translate/examples/output/$dir_name/templated/*.arrow" \
    --cache_dir "/mnt/sea/setu-translate/examples/cache" \
    --batch_size 128 \
    --num_procs 1 \
    --translated_save_path "/mnt/sea/setu-translate/examples/output/$dir_name/translated"