In [1]:
import numpy as np
import pandas as pd

In [2]:
# Download the dataset
!wget "https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar"

--2022-04-18 05:22:53--  https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.127.128, 142.250.153.128, 142.250.145.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.127.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2008340480 (1.9G) [application/x-tar]
Saving to: ‘dakshina_dataset_v1.0.tar’


2022-04-18 05:23:12 (103 MB/s) - ‘dakshina_dataset_v1.0.tar’ saved [2008340480/2008340480]



In [3]:
!tar -xvf /content/dakshina_dataset_v1.0.tar

dakshina_dataset_v1.0/bn/
dakshina_dataset_v1.0/bn/lexicons/
dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.test.tsv
dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.train.tsv
dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.dev.tsv
dakshina_dataset_v1.0/bn/native_script_wikipedia/
dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.valid.text.shuf.txt.gz
dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-full.info.sorted.tsv.gz
dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.train.info.sorted.tsv.gz
dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.train.text.sorted.tsv.gz
dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.train.text.shuf.txt.gz
dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-full.nonblock.sections.tsv.gz
dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-full.omit_pages.txt.gz
dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-full.text.sorted.tsv.gz
dakshina_dataset_v1.0/bn/na

In [38]:
# Telugu ("te") lexicon splits: tab-separated files without a header row.
train_data_path = '/content/dakshina_dataset_v1.0/te/lexicons/te.translit.sampled.train.tsv'
validation_data_path = '/content/dakshina_dataset_v1.0/te/lexicons/te.translit.sampled.dev.tsv'
test_data_path = '/content/dakshina_dataset_v1.0/te/lexicons/te.translit.sampled.test.tsv'

# By default pandas converts fields such as "nan", "null", "NA" or "None" into float NaN.
# A word that happens to be spelled like one of those markers would then be lost
# (and later stringified as "nan"). Read the two text columns strictly as strings and
# disable the default NA markers so every word is kept exactly as written in the file.
_lexicon_read_options = dict(
    sep='\t',
    header=None,
    dtype={0: str, 1: str},
    keep_default_na=False,
)

df_train = pd.read_csv(train_data_path, **_lexicon_read_options)
df_val = pd.read_csv(validation_data_path, **_lexicon_read_options)
df_test = pd.read_csv(test_data_path, **_lexicon_read_options)

In [39]:
def load_data(df, input_texts, target_texts, input_characters=None, target_characters=None, is_test_data=False):
    """Append the word pairs of ``df`` to the given lists and collect the vocabularies.

    Column ``0`` of ``df`` is used as the input text and column ``1`` as the target
    text. Every target is wrapped as ``'\\t' + text + '\\n'`` (start and end markers)
    before it is stored.

    Args:
        df: DataFrame with the input text in column ``0`` and the target text in column ``1``.
        input_texts: List that receives the input strings (extended in place).
        target_texts: List that receives the wrapped target strings (extended in place).
        input_characters: Set that receives every input character (updated in place).
            Ignored when ``is_test_data`` is True. When omitted, a private set is used,
            so only the returned token count reflects the vocabulary.
        target_characters: Same as ``input_characters`` for the wrapped target strings.
        is_test_data: When True, no vocabulary is collected and only the two
            maximum lengths are returned.

    Returns:
        ``(max_encoder_seq_length, max_decoder_seq_length)`` when ``is_test_data`` is
        True, otherwise ``(max_encoder_seq_length, max_decoder_seq_length,
        num_encoder_tokens, num_decoder_tokens)``. The maxima are taken over the full
        contents of ``input_texts`` / ``target_texts`` (including anything they held
        before the call) and are ``0`` when the lists are empty.
    """
    collect_vocabulary = not is_test_data
    if collect_vocabulary:
        # Tolerate the default None arguments instead of failing on the first character.
        if input_characters is None:
            input_characters = set()
        if target_characters is None:
            target_characters = set()

    # Walk the two columns directly; this avoids building a Series per row via iterrows().
    for source_value, target_value in zip(df[0], df[1]):
        input_text = str(source_value)
        target_text = '\t' + str(target_value) + '\n'
        input_texts.append(input_text)
        target_texts.append(target_text)

        if collect_vocabulary:
            input_characters.update(input_text)
            target_characters.update(target_text)

    # default=0 keeps an empty frame/list from raising ValueError inside max().
    max_encoder_seq_length = max(map(len, input_texts), default=0)
    max_decoder_seq_length = max(map(len, target_texts), default=0)

    if is_test_data:
        return max_encoder_seq_length, max_decoder_seq_length

    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    return max_encoder_seq_length, max_decoder_seq_length, num_encoder_tokens, num_decoder_tokens

In [40]:
def get_vectors(input_texts, target_texts, input_token_index, target_token_index,
                max_encoder_seq_length, num_encoder_tokens, 
                max_decoder_seq_length=None, num_decoder_tokens=None,
                is_test_data=False):
    """One-hot encode text pairs into float32 arrays.

    Each character is looked up in the corresponding token index and marked with
    ``1.0``; every timestep after the end of a text is filled with the one-hot
    vector of the space character ``" "``, which acts as padding.

    Args:
        input_texts: Sequence of input strings.
        target_texts: Sequence of target strings, paired with ``input_texts`` by
            position (pairs are formed with ``zip``, so extra entries are ignored).
        input_token_index: Mapping from input character to one-hot index; must contain ``" "``.
        target_token_index: Mapping from target character to one-hot index; must
            contain ``" "`` unless ``is_test_data`` is True.
        max_encoder_seq_length: Size of the time axis of the encoder array.
        num_encoder_tokens: Size of the one-hot axis of the encoder array.
        max_decoder_seq_length: Size of the time axis of the decoder arrays
            (required unless ``is_test_data`` is True).
        num_decoder_tokens: Size of the one-hot axis of the decoder arrays
            (required unless ``is_test_data`` is True).
        is_test_data: When True, only the encoder array is built and returned.

    Returns:
        ``encoder_input_data`` when ``is_test_data`` is True, otherwise the tuple
        ``(encoder_input_data, decoder_input_data, decoder_target_data)`` where
        ``decoder_target_data`` is ``decoder_input_data`` shifted one timestep to
        the left (it does not contain the first character of each target).

    Raises:
        KeyError: If a character is missing from its token index.
        IndexError: If a text is longer than the corresponding maximum length.
    """
    num_samples = len(input_texts)
    encoder_input_data = np.zeros((num_samples, max_encoder_seq_length, num_encoder_tokens), dtype="float32")

    if not is_test_data:
        decoder_input_data = np.zeros((num_samples, max_decoder_seq_length, num_decoder_tokens), dtype="float32")
        decoder_target_data = np.zeros((num_samples, max_decoder_seq_length, num_decoder_tokens), dtype="float32")

    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            encoder_input_data[i, t, input_token_index[char]] = 1.0
        # Pad from the text length rather than from the loop variable: for an empty
        # text the loop never runs, so the variable would be undefined or stale.
        encoder_input_data[i, len(input_text):, input_token_index[" "]] = 1.0

        if not is_test_data:
            for t, char in enumerate(target_text):
                decoder_input_data[i, t, target_token_index[char]] = 1.0
                if t > 0:
                    # decoder_target_data is ahead of decoder_input_data by one
                    # timestep and does not include the start character.
                    decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
            target_length = len(target_text)
            decoder_input_data[i, target_length:, target_token_index[" "]] = 1.0
            # The target sequence is one step shorter; clamp so an empty text pads from 0.
            decoder_target_data[i, max(target_length - 1, 0):, target_token_index[" "]] = 1.0

    if is_test_data:
        return encoder_input_data

    return encoder_input_data, decoder_input_data, decoder_target_data

### Load Train, Validation and Test data

In [41]:
# Containers for the training pairs; load_data appends to them in place.
input_texts, target_texts = [], []

# Both vocabularies start out holding the space character, which get_vectors
# looks up as the padding token.
input_characters = {' '}
target_characters = {' '}

(
    max_encoder_seq_length,
    max_decoder_seq_length,
    num_encoder_tokens,
    num_decoder_tokens,
) = load_data(df_train, input_texts, target_texts, input_characters, target_characters)

print(f"Number of samples: {len(input_texts)}")
print(f"Number of unique input tokens: {num_encoder_tokens}")
print(f"Number of unique output tokens: {num_decoder_tokens}")
print(f"Max sequence length for inputs: {max_encoder_seq_length}")
print(f"Max sequence length for outputs: {max_decoder_seq_length}")

Number of samples: 58550
Number of unique input tokens: 64
Number of unique output tokens: 29
Max sequence length for inputs: 20
Max sequence length for outputs: 27


In [42]:
# Containers for the validation pairs; load_data appends to them in place.
val_input_texts = []
val_target_texts = []

# load_data adds characters to the sets it is given. Hand it copies so that
# validation-only characters cannot enlarge the training vocabulary after
# num_encoder_tokens / num_decoder_tokens have already been derived from it.
# The reported token counts therefore cover the training vocabulary plus any
# characters that occur only in the validation split.
val_max_encoder_seq_length, val_max_decoder_seq_length, val_num_encoder_tokens, val_num_decoder_tokens = load_data(
    df_val, val_input_texts, val_target_texts, set(input_characters), set(target_characters))

print("Number of samples:", len(val_input_texts))
print("Number of unique input tokens:", val_num_encoder_tokens)
print("Number of unique output tokens:", val_num_decoder_tokens)
print("Max sequence length for inputs:", val_max_encoder_seq_length)
print("Max sequence length for outputs:", val_max_decoder_seq_length)

Number of samples: 5683
Number of unique input tokens: 64
Number of unique output tokens: 29
Max sequence length for inputs: 19
Max sequence length for outputs: 23


In [43]:
# Containers for the test pairs; load_data appends to them in place.
test_input_texts, test_target_texts = [], []

# In test mode load_data collects no vocabulary and returns only the two maximum lengths.
test_lengths = load_data(df_test, test_input_texts, test_target_texts, is_test_data=True)
test_max_encoder_seq_length, test_max_decoder_seq_length = test_lengths

print(f"Number of Test samples: {len(test_input_texts)}")
print(f"Test Max sequence length for inputs: {test_max_encoder_seq_length}")
print(f"Test Max sequence length for outputs: {test_max_decoder_seq_length}")

Number of Test samples: 5747
Test Max sequence length for inputs: 18
Test Max sequence length for outputs: 25


In [44]:
# Map every character to its one-hot position. The vocabularies are sets, whose
# iteration order for strings can differ between interpreter runs (hash
# randomization); sorting first makes the mapping identical on every run, so
# encoded data and anything trained on it stay compatible across sessions.
input_token_index = {char: i for i, char in enumerate(sorted(input_characters))}
target_token_index = {char: i for i, char in enumerate(sorted(target_characters))}

In [45]:
# One-hot encode the training pairs (encoder input, decoder input, decoder target).
encoder_input_data, decoder_input_data, decoder_target_data = get_vectors(
    input_texts=input_texts,
    target_texts=target_texts,
    input_token_index=input_token_index,
    target_token_index=target_token_index,
    max_encoder_seq_length=max_encoder_seq_length,
    num_encoder_tokens=num_encoder_tokens,
    max_decoder_seq_length=max_decoder_seq_length,
    num_decoder_tokens=num_decoder_tokens,
)

In [46]:
# One-hot encode the validation pairs. The time axis uses the validation maxima,
# but the one-hot axis uses the training token counts so that its width always
# matches the training tensors (the test tensors are built the same way).
val_encoder_input_data, val_decoder_input_data, val_decoder_target_data = get_vectors(
    val_input_texts, val_target_texts, input_token_index, target_token_index,
    val_max_encoder_seq_length, num_encoder_tokens, 
    val_max_decoder_seq_length, num_decoder_tokens)

In [47]:
# One-hot encode only the test inputs; in test mode get_vectors returns just the encoder array.
test_encoder_input_data = get_vectors(
    input_texts=test_input_texts,
    target_texts=test_target_texts,
    input_token_index=input_token_index,
    target_token_index=target_token_index,
    max_encoder_seq_length=test_max_encoder_seq_length,
    num_encoder_tokens=num_encoder_tokens,
    is_test_data=True,
)