# Install Dependencies

In [None]:
# !pip install datasets==1.18.1

## Please uncomment and run the install command above at least once in this environment.

# Download Parallel Dataset

In [1]:
from datasets import load_dataset

In [2]:
# Load the "cfilt/iitb-english-hindi" parallel corpus; the result is a
# DatasetDict with train / validation / test splits.
dataset = load_dataset("cfilt/iitb-english-hindi")

Using custom data configuration cfilt--iitb-english-hindi-930ee63dc3ad2bff
Reusing dataset parquet (/home/harpreet/.cache/huggingface/datasets/parquet/cfilt--iitb-english-hindi-930ee63dc3ad2bff/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/3 [00:00<?, ?it/s]

### View Parallel Corpus Details

In [3]:
# Bare expression: renders each split's features and row count as the cell output.
dataset

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
})

### Extract Dataset in Source and Target Text files

In [4]:
def write_parallel_files(translation_pairs, source_path, target_path,
                         source_lang="en", target_lang="hi"):
    """Write aligned source/target sentences to two line-oriented text files.

    Line i of ``source_path`` and line i of ``target_path`` come from the same
    translation pair, so the two files stay parallel.

    Args:
        translation_pairs: Iterable of mappings holding one sentence per
            language code, e.g. ``{"en": "...", "hi": "..."}``.
        source_path: Output path for the source-language sentences.
        target_path: Output path for the target-language sentences.
        source_lang: Key of the source sentence in each pair.
        target_lang: Key of the target sentence in each pair.

    Raises:
        KeyError: If a pair lacks one of the requested language keys.
        OSError: If either output file cannot be opened or written.
    """
    # `with` closes both files even if a write or key lookup fails part-way.
    with open(source_path, "w", encoding="utf8") as source_file, \
            open(target_path, "w", encoding="utf8") as target_file:
        for pair in translation_pairs:
            # Only leading/trailing newlines are removed; a newline embedded
            # inside a sentence would still add an extra line and break the
            # alignment between the two files.
            source_file.write(pair[source_lang].strip("\n") + "\n")
            target_file.write(pair[target_lang].strip("\n") + "\n")


# (dataset split name, suffix used in the output file names)
for split_name, file_suffix in (("train", "train"),
                                ("validation", "valid"),
                                ("test", "test")):
    write_parallel_files(
        dataset[split_name]["translation"],
        f"source_{file_suffix}.txt",
        f"target_{file_suffix}.txt",
    )


### Parallel Corpus Sentence Pairs (Linux)

In [5]:
# Count lines per file; the source and target file of each split should report
# the same number, otherwise the sentence pairs are no longer aligned.
! wc -l source_train.txt target_train.txt source_valid.txt target_valid.txt source_test.txt target_test.txt

  1659083 source_train.txt
  1659083 target_train.txt
      520 source_valid.txt
      520 target_valid.txt
     2507 source_test.txt
     2507 target_test.txt
  3324220 total


# Byte Pair Encoding (BPE)

In [None]:
# ! pip install subword-nmt

## Please uncomment and run the install command above at least once in this environment.

## Learn BPE codes

Note: Please make sure to generate the text files by running the previous cells

In [6]:
import os

# Publish the BPE merge-operation count through the process environment so the
# shell commands in later cells can reference $NUM_OF_MERGE_OPERATIONS.
os.environ.update({"NUM_OF_MERGE_OPERATIONS": "16000"})

# Or skip the cell above (it overwrites any existing value) and set the environment variable NUM_OF_MERGE_OPERATIONS to the desired number of BPE merge operations

In [7]:
# Concatenate the train, test and validation text of each language into a
# single file; these *_full.txt files are the input for learning BPE codes.
# NOTE(review): test and validation sentences are included in BPE learning —
# confirm this is intended.
! cat source_train.txt source_test.txt source_valid.txt > source_full.txt
! cat target_train.txt target_test.txt target_valid.txt > target_full.txt

In [8]:
# Learn a separate set of BPE merge rules for each language from its
# concatenated corpus. `-s` is given the merge-operation count taken from
# $NUM_OF_MERGE_OPERATIONS; the learned rules go to the *-bpe.codes files.
! subword-nmt learn-bpe -s $NUM_OF_MERGE_OPERATIONS < source_full.txt > source-bpe.codes
! subword-nmt learn-bpe -s $NUM_OF_MERGE_OPERATIONS < target_full.txt > target-bpe.codes


100%|####################################| 16000/16000 [00:39<00:00, 405.30it/s]
100%|####################################| 16000/16000 [00:52<00:00, 303.78it/s]


## Apply BPE

In [9]:
# Segment every source-language split with the source codes, writing one
# *_bpe.txt file per split.
! subword-nmt apply-bpe -c source-bpe.codes < source_train.txt > source_train_bpe.txt
! subword-nmt apply-bpe -c source-bpe.codes < source_valid.txt > source_valid_bpe.txt
! subword-nmt apply-bpe -c source-bpe.codes < source_test.txt > source_test_bpe.txt

# Same for the target-language splits, using the target codes.
! subword-nmt apply-bpe -c target-bpe.codes < target_train.txt > target_train_bpe.txt
! subword-nmt apply-bpe -c target-bpe.codes < target_valid.txt > target_valid_bpe.txt
! subword-nmt apply-bpe -c target-bpe.codes < target_test.txt > target_test_bpe.txt


## BPE Codes (Linux)

In [10]:
# Count the lines in each learned BPE codes file.
! wc -l source-bpe.codes target-bpe.codes

 16001 source-bpe.codes
 16001 target-bpe.codes
 32002 total


## BPE Parallel Corpus Sentence Pairs (Linux)

In [11]:
# Count lines after BPE segmentation; the numbers should match the pre-BPE
# counts so that source/target sentence pairs remain aligned.
! wc -l source_train_bpe.txt target_train_bpe.txt source_valid_bpe.txt target_valid_bpe.txt source_test_bpe.txt target_test_bpe.txt

  1659083 source_train_bpe.txt
  1659083 target_train_bpe.txt
      520 source_valid_bpe.txt
      520 target_valid_bpe.txt
     2507 source_test_bpe.txt
     2507 target_test_bpe.txt
  3324220 total


In [12]:
# Preview the first five BPE-segmented source lines; a trailing "@@" marks a
# subword that continues into the next token.
!head -n 5 source_train_bpe.txt

Give your application an acces@@ sibility work@@ out
Ac@@ cer@@ c@@ is@@ er Acc@@ es@@ sibility Expl@@ ore@@ r
The default plugin layout for the bottom panel
The default plugin layout for the top panel
A list of plugins that are disabled by default


In [13]:
# The same five source lines before BPE, for comparison with the segmented output.
!head -n 5 source_train.txt

Give your application an accessibility workout
Accerciser Accessibility Explorer
The default plugin layout for the bottom panel
The default plugin layout for the top panel
A list of plugins that are disabled by default


In [15]:
# Preview the first five BPE-segmented target (Hindi) lines.
!head -n 5 target_train_bpe.txt

अपने अनुप्रयोग को पहुंच@@ नीयता व्यायाम का लाभ दें
एक्से@@ र्@@ सा@@ इस@@ र पहुंच@@ नीयता अन्वे@@ षक
निचले पटल के लिए डि@@ फो@@ ल्ट प्लग@@ -इन खाका
ऊपरी पटल के लिए डि@@ फो@@ ल्ट प्लग@@ -इन खाका
उन प्लग@@ -@@ इ@@ नों की सूची जिन्हें डि@@ फो@@ ल्ट रूप से निष्क्रिय किया गया है


In [1]:
# Preview the concatenated source corpus; it starts with the training lines
# because source_train.txt is the first file passed to `cat`.
!head -n 5 source_full.txt

Give your application an accessibility workout
Accerciser Accessibility Explorer
The default plugin layout for the bottom panel
The default plugin layout for the top panel
A list of plugins that are disabled by default
