# Process Dataset

#### References
* https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html
* https://ascii.cl

In [1]:
import utils_char
import pickle
import numpy as np
from sklearn.model_selection import train_test_split

# Output locations for the processed dataset: the full set plus its train/test split.
pickle_filename = "data/shakespeare_corpus_data.pkl"
pickle_filename_train = "data/shakespeare_corpus_data_train.pkl"
pickle_filename_test = "data/shakespeare_corpus_data_test.pkl"

# Show the character vocabulary exposed by utils_char.
print('All letters:', utils_char.all_letters)
print('All set_classes:', utils_char.set_classes)
print('Number of all letters:', utils_char.n_letters)

# Quick check of the ASCII conversion helper on a string with accents and symbols.
print(utils_char.unicodeToAscii('CrazY12^@s%g O\'Néàl'))

# Load the raw text and report its size and the length of its longest entry.
data_lst = utils_char.readLines('./data/char_data.txt')
print('Lines on dataset:', len(data_lst))
biggest_line_size = np.max(list(map(len, data_lst)))
print('Biggest line size:', biggest_line_size, 'characters')

All letters: abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'-0123456789
All set_classes: [' ', "'", ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ';', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Number of all letters: 69
CrazY12sg O'Neal
Lines on dataset: 124453
Biggest line size: 84 characters


#### Split phrases into words

In [None]:
# Optional step: switch the dataset from one sample per line to one sample per word.
# NOTE(review): in the saved notebook this cell has no execution count, so the
# stored outputs further down may reflect line-level data — confirm before relying on them.

# Whitespace-tokenise every line, then flatten the per-line lists into a single list.
data_lst_words = [line.split() for line in data_lst]
big_lst_words = [word for line_words in data_lst_words for word in line_words]
biggest_word_size = np.max(list(map(len, big_lst_words)))

# Later cells read `data_lst` and `biggest_line_size`, so rebind both to the
# word-level values.
data_lst, biggest_line_size = big_lst_words, biggest_word_size

##### Create Codemap

In [2]:
# Create codemap: class id (int) -> character, numbered in the order of
# utils_char.set_classes.
codemap = dict(enumerate(utils_char.set_classes))

# The end-of-sequence token needs a class id of its own; give it the next free
# id unless set_classes already contains it.
if utils_char.EOS_token not in codemap.values():
    print('EOS: Not included adding....')
    codemap[len(codemap)] = utils_char.EOS_token
else:
    print('EOS already on codemap')

EOS: Not included adding....


In [3]:
# Serialise the codemap (class id -> character) to disk.
with open('codemap_LM.pickle', 'wb') as codemap_file:
    pickle.dump(codemap, codemap_file)

##### Training sample for Character Language Model
This cell only illustrates the input and the expected output of the character language model. In the actual dataset, every character is replaced by its class id from the codemap.

In [4]:
# Toy example; any entry of data_lst (e.g. data_lst[0]) could be used instead.
sample = 'Hello'
print('Timesteps:', len(sample))
print('Input: <SOS>', sample)
print('Target:', sample[1:], '<EOS>')

# Encoded form of the sample. '<SOS>' above is only a label in the printout:
# X holds exactly the characters of the sample, nothing is prepended.
X = [
    utils_char.class_id_from_char(char, codemap)
    for char in sample
]
# Y is the input shifted left by one character, closed with the EOS class id.
Y = [
    utils_char.class_id_from_char(char, codemap)
    for char in sample[1:]
]
Y.append(utils_char.class_id_from_char(utils_char.EOS_token, codemap))
print('X:', X)
print('Y:', Y)

# Map the class ids back to characters to check the round trip.
X_dec = [
    utils_char.char_from_class_id(class_id, codemap)
    for class_id in X
]
Y_dec = [
    utils_char.char_from_class_id(class_id, codemap)
    for class_id in Y
]
print('X_dec:', X_dec)
print('Y_dec:', Y_dec)

Timesteps: 5
Input: <SOS> Hello
Target: ello <EOS>
X: [23, 46, 53, 53, 56]
Y: [46, 53, 53, 56, 68]
X_dec: ['H', 'e', 'l', 'l', 'o']
Y_dec: ['e', 'l', 'l', 'o', '<EOS>']


##### Create Dataset

In [5]:
# Build the dataset: one entry per non-empty sample, stored as
# (input class ids, target class ids, input length, target length).
# The target is the input shifted left by one position and closed with the
# EOS class id.

# The EOS class id is the same for every sample, so look it up once.
eos_class_id = utils_char.class_id_from_char(utils_char.EOS_token, codemap)

char_language_model_data = {}
cnt_data = 0
for sample in data_lst:
    X_class_id = [utils_char.class_id_from_char(char, codemap) for char in sample]
    # Avoid empty sequences on the dataset
    if not X_class_id:
        continue
    # Reuse the ids already computed for the input rather than encoding the
    # same characters a second time; slicing yields a new list, so X and Y
    # never share storage.
    Y_class_id = X_class_id[1:] + [eos_class_id]
    len_x = len(X_class_id)
    len_y = len(Y_class_id)
    # Keys are consecutive integers that count only the samples kept.
    char_language_model_data[cnt_data] = X_class_id, Y_class_id, len_x, len_y
    cnt_data += 1

#### Pad Dataset
This step is necessary in order to use mini-batches

In [6]:
# Pad the encoded samples so they can be grouped into mini-batches.
# NOTE(review): biggest_line_size is presumably the length every sequence is
# padded to — confirm against utils_char.pad_data.
char_language_model_data_pad = utils_char.pad_data(char_language_model_data, biggest_line_size)

In [7]:
# Write the padded dataset to the main pickle file.
with open(pickle_filename, 'wb') as dataset_file:
    pickle.dump(char_language_model_data_pad, dataset_file)

#### Divide between train/test

In [8]:
# Reload the processed dataset from the pickle written above.
dataset = utils_char.load_pickle(pickle_filename)
print('Total dataset size:', len(dataset))

# train_test_split works on sequences, so drop the dictionary keys and keep
# only the samples.
dataset_lst = list(dataset.values())

# Hold out one tenth of the samples for testing; the fixed random_state keeps
# the split reproducible between runs.
dataset_lst_train, dataset_lst_test = train_test_split(
    dataset_lst,
    test_size=1/10,
    random_state=42,
)
print('Train dataset size:', len(dataset_lst_train))
print('Test dataset size:', len(dataset_lst_test))

# Save each split to its own pickle file.
for split_path, split_samples in (
    (pickle_filename_train, dataset_lst_train),
    (pickle_filename_test, dataset_lst_test),
):
    with open(split_path, 'wb') as split_file:
        pickle.dump(split_samples, split_file)

Total dataset size: 115000
Train dataset size: 103500
Test dataset size: 11500
