In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from nltk.lm import Vocabulary
import torch
import torch.utils.data as tud
import sys
sys.path.append("../../lib")
from metrics import levenshtein
import pickle
from pathlib import Path
import re

In [2]:
# Data directory (relative path): the cells below read the aligned pickles and
# the vocabulary from here and write the encoded tensors and lookup tables back.
folder = Path("../../data/en/data/")

In [3]:
# Load the pickled training frame; later cells read its `source` and `target` columns.
train = pd.read_pickle(folder.joinpath("train_aligned.pkl"))
train.shape

(217904, 2)

In [4]:
# Load the pickled dev frame; later cells read its `source` and `target` columns.
dev = pd.read_pickle(folder.joinpath("dev_aligned.pkl"))
dev.shape

(7305, 2)

In [5]:
# Restore the pickled vocabulary object.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with folder.joinpath("vocabulary.pkl").open("rb") as fh:
    vocabulary = pickle.load(fh)

In [6]:
# Symbol -> index table. Indices 0-2 are reserved for the special tokens, so the
# vocabulary symbols are numbered from 3 upwards in sorted order.
char2i = {
    symbol: index
    for index, symbol in enumerate(sorted(vocabulary), start=3)
}
char2i.update({"<PAD>": 0, "<START>": 1, "<END>": 2})
len(char2i)

164

In [7]:
# Index -> symbol table, used to decode index tensors back into text.
# Built by inverting char2i rather than re-enumerating the vocabulary, so the
# two tables cannot drift apart if the index assignment is ever changed.
i2char = {index: symbol for symbol, index in char2i.items()}
len(i2char)

164

In [8]:
# NOTE(review): `length` is not used anywhere in this cell. Presumably it is the
# maximum raw sequence length (the printed width of 102 would then be
# length + <START> + <END>) — confirm before relying on it.
length = 100


def encode_sequences(sequences, mapping):
    """Encode an iterable of strings as a single padded index tensor.

    Every string becomes ``[<START>] + [index of each character] + [<END>]``;
    the resulting rows are stacked and right-padded with the ``<PAD>`` index
    up to the longest row.

    Args:
        sequences: iterable of strings; each character must be a key of
            ``mapping``.
        mapping: dict from symbol to integer index that also contains the
            "<PAD>", "<START>" and "<END>" entries.

    Returns:
        An integer tensor with one row per input string.

    Raises:
        KeyError: if a character (or a special token) is missing from
            ``mapping``.
    """
    start, end = mapping["<START>"], mapping["<END>"]
    rows = [
        torch.tensor([start] + [mapping[ch] for ch in text] + [end])
        for text in tqdm(sequences)
    ]
    return torch.nn.utils.rnn.pad_sequence(
        rows, batch_first=True, padding_value=mapping["<PAD>"]
    )


train_source = encode_sequences(train.source, char2i)
print(train_source.shape)

train_target = encode_sequences(train.target, char2i)
print(train_target.shape)

  0%|          | 0/217904 [00:00<?, ?it/s]

torch.Size([217904, 102])


  0%|          | 0/217904 [00:00<?, ?it/s]

torch.Size([217904, 102])


In [9]:
# Round-trip check: decode row 0 of train_source, strip the special-token
# markers, and compare with the original training source string.
re.sub(
    r"<START>|<END>|<PAD>",
    "",
    "".join(i2char[idx] for idx in train_source[0].tolist()),
) == train.source[0]

True

In [10]:
# Round-trip check: decode row 0 of train_target, strip the special-token
# markers, and compare with the original training target string.
re.sub(
    r"<START>|<END>|<PAD>",
    "",
    "".join(i2char[idx] for idx in train_target[0].tolist()),
) == train.target[0]

True

In [11]:
# Encode the dev split the same way as the training split: each string becomes
# [1] + character indices + [2] (1 and 2 are the <START>/<END> indices in
# char2i), and pad_sequence right-pads the rows with its default value 0 (<PAD>).
output = [
    torch.tensor([1, *(char2i[ch] for ch in text), 2])
    for text in tqdm(dev.source)
]
dev_source = torch.nn.utils.rnn.pad_sequence(output, batch_first=True)
print(dev_source.shape)

output = [
    torch.tensor([1, *(char2i[ch] for ch in text), 2])
    for text in tqdm(dev.target)
]
dev_target = torch.nn.utils.rnn.pad_sequence(output, batch_first=True)
print(dev_target.shape)

  0%|          | 0/7305 [00:00<?, ?it/s]

torch.Size([7305, 102])


  0%|          | 0/7305 [00:00<?, ?it/s]

torch.Size([7305, 102])


In [12]:
# Round-trip check: decode row 0 of dev_source, strip the special-token
# markers, and compare with the original dev source string.
re.sub(
    r"<START>|<END>|<PAD>",
    "",
    "".join(i2char[idx] for idx in dev_source[0].tolist()),
) == dev.source[0]

True

In [13]:
# Round-trip check: decode row 0 of dev_target, strip the special-token
# markers, and compare with the original dev target string.
re.sub(
    r"<START>|<END>|<PAD>",
    "",
    "".join(i2char[idx] for idx in dev_target[0].tolist()),
) == dev.target[0]

True

In [14]:
# Persist the padded training-source index tensor in the data folder.
torch.save(train_source, folder.joinpath("train_source.pt"))

In [15]:
# Persist the padded training-target index tensor in the data folder.
torch.save(train_target, folder.joinpath("train_target.pt"))

In [16]:
# Persist the padded dev-source index tensor in the data folder.
torch.save(dev_source, folder.joinpath("dev_source.pt"))

In [17]:
# Persist the padded dev-target index tensor in the data folder.
torch.save(dev_target, folder.joinpath("dev_target.pt"))

In [18]:
# Store the symbol -> index table alongside the tensors so they can be decoded later.
with folder.joinpath("char2i.pkl").open("wb") as fh:
    pickle.dump(char2i, fh)

In [19]:
# Store the index -> symbol table alongside the tensors so they can be decoded later.
with folder.joinpath("i2char.pkl").open("wb") as fh:
    pickle.dump(i2char, fh)