# Step 0: Generating Training, Validating, and Testing Data

This notebook shows how [this dataset](https://www.kaggle.com/datasets/lonnieqin/englishspanish-translation-dataset) is converted into a 60-20-20 split for training, validating, and testing (respectively) the NMT models in the other notebooks.

In [1]:
#@title Import dependencies
import pandas as pd
from string import punctuation
import re
import numpy as np
import tensorflow as tf
import os
import time
import random

import matplotlib.pyplot as plt
%matplotlib inline

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

import unicodedata
import io

In [2]:
#@title Set up paths
# Absolute location of the dataset CSV. The "/content/..." prefix presumably
# refers to the Colab runtime filesystem (the loader docstring mentions a
# colab dir) -- TODO confirm, and adjust this path when running elsewhere.
DATA_FILEPATH = "/content/sample_data/data.csv"

# Aux functions for importing dataset

def load_doc(filepath):
  """
  Read a CSV file into a DataFrame.

  Input:
    filepath  : (str) path to the CSV file to read

  Output:
                (pd df) contents of the file, parsed with the default
                        pandas read_csv settings
  """
  frame = pd.read_csv(filepath)
  return frame


In [3]:
#@title Load in the data
# Work on a random subset of 95,000 sentence pairs. The draw is seeded so that
# a fresh "Restart & Run All" selects the same rows; without a seed the
# subset (and therefore every split derived from it) changes on each run.
data = load_doc(DATA_FILEPATH).sample(n=95000, random_state=42)
data.head()

Unnamed: 0,english,spanish
88516,It was a mistake to break up with you.,Terminar contigo fue un error.
103227,We have to see how much cash we have on hand.,Hay que ver cuánto efectivo tenemos a la mano.
22596,Why worry about Tom?,¿Por qué preocuparse por Tom?
75490,Do you know how beautiful you are?,¿Sabes lo hermosa que eres?
8872,Someone's there.,Alguien está allí.


In [4]:
# Print a summary of the sampled frame: index range, column names,
# non-null counts per column, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 95000 entries, 88516 to 109908
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   english  95000 non-null  object
 1   spanish  95000 non-null  object
dtypes: object(2)
memory usage: 2.2+ MB


In [5]:
#@title Train-Val-Test Split

# Spanish sentences are the model inputs (X) and English sentences the
# targets (y), because the models translate from span to eng.
# Stage 1: keep 60% of the pairs for training and hold out the other 40%.
X_train, X_rest, y_train, y_rest = train_test_split(
    data["spanish"],
    data["english"],
    test_size=0.4,
    random_state=42,
)

# Stage 2: cut the held-out 40% in half -> 20% validation, 20% test.
X_val, X_test, y_val, y_test = train_test_split(
    X_rest,
    y_rest,
    test_size=0.5,
    random_state=42,
)

# Report the size of each split as a sanity check on the 60-20-20 ratio.
print(f"Number of training samples   : {len(X_train)}")
print(f"Number of validation samples : {len(X_val)}")
print(f"Number of test samples       : {len(X_test)}")

Number of training samples   : 57000
Number of validation samples : 19000
Number of test samples       : 19000


In [6]:
def to_list(dataset):
  """
  Convert a data split to a plain Python list.

  Input:
    dataset : (pd.core.series.Series, or any other iterable) the split to
              convert. Objects exposing .tolist() (Series, numpy arrays) are
              converted with it; anything else, including a value that is
              already a list, is copied with list().

  Output:
              (list) the elements of dataset, in order
  """
  # A list has no .tolist(); handling it here keeps cells of the form
  # `X = to_list(X)` safe to re-run after X has already been converted.
  if isinstance(dataset, list):
    return list(dataset)
  if hasattr(dataset, "tolist"):
    return dataset.tolist()
  return list(dataset)

In [7]:
#@title Saving the data splits as lists

# Convert all six splits in one pass; each name is rebound to its list form,
# in the same order as the tuple on the right-hand side.
X_train, X_val, X_test, y_train, y_val, y_test = (
    to_list(split)
    for split in (X_train, X_val, X_test, y_train, y_val, y_test)
)

In [8]:
#@title Test if the conversion worked

# Verify every split (not only X_train) is now a list. isinstance is the
# idiomatic type test and, unlike `type(x) == list`, also accepts subclasses.
all(
    isinstance(split, list)
    for split in (X_train, X_val, X_test, y_train, y_val, y_test)
)

True

In [9]:
#@title Generating samples for translating
# Use a dedicated seeded generator so the same (source, target) pairs are
# chosen on every run, independently of the global `random` state.
sample_rng = random.Random(42)

# Pick up to 100 distinct positions in the test split. Sampling indices
# without replacement avoids duplicate pairs, and the min() guard keeps this
# working when the test split holds fewer than 100 items.
sample_indices = sample_rng.sample(range(len(X_test)), k=min(100, len(X_test)))

# Each entry pairs a Spanish source sentence with its English target.
samples_test = [(X_test[idx], y_test[idx]) for idx in sample_indices]

In [11]:
#@title Saving data splits in external file

import pickle

SAVED_PKL_PATH = "./data_splits.pkl"

# Bundle everything into one tuple. The order here is the order in which a
# reader of the pickle has to unpack it:
# (X_train, X_val, X_test, y_train, y_val, y_test, samples_test)
data_splits = (
    X_train,
    X_val,
    X_test,
    y_train,
    y_val,
    y_test,
    samples_test,
)

# Binary write mode: the file is created, or overwritten if it already exists.
with open(SAVED_PKL_PATH, 'wb') as pkl_file:
    pickle.dump(data_splits, pkl_file)