In [1]:
# default_exp transformers

# transformers

> Deep dive into transformers

In [15]:
#export
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *
import jovsatools
from fastcore.test import *
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Attention

### Dataset construction

In [58]:
#export
def download_keras_data(fname, floc, origin):
    """Download an archive through Keras and build a path next to it.

    Args:
        fname: file name handed to `tf.keras.utils.get_file`.
        floc: suffix appended verbatim to the download directory (no
            separator is inserted, so it should start with one).
        origin: URL handed to `tf.keras.utils.get_file`.

    Returns:
        The directory of the path returned by `get_file` (called with
        `extract=True`), concatenated with `floc`.
    """
    archive_path = tf.keras.utils.get_file(fname, origin, extract=True)
    download_dir = os.path.dirname(archive_path)
    return download_dir + floc

# Strips accents (combining marks) from a unicode string
def unicode_to_ascii(s):
  """Return `s` with every combining mark (Unicode category 'Mn') removed.

  The string is decomposed with NFD normalization first, so accented letters
  split into a base character followed by combining marks. Characters that
  are not combining marks are kept as they are, including non-ASCII ones.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)


def preprocess_sentence(w):
  """Normalize a sentence and wrap it in '<start>' / '<end>' markers.

  The input is lowercased, stripped and passed through `unicode_to_ascii`,
  then cleaned by the regex substitutions below, stripped again and finally
  surrounded by the start and end tokens so that a model knows when to start
  and stop predicting.

  Example: "May I borrow this book?" -> "<start> may i borrow this book ? <end>"
  """
  cleaned = unicode_to_ascii(w.lower().strip())

  # Applied in order; each entry is (pattern, replacement).
  substitutions = (
      # put a space on both sides of ? . ! , ¿  ("he is a boy." => "he is a boy .")
      # Reference: https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
      (r"([?.!,¿])", r" \1 "),
      # collapse runs of spaces and double quotes into a single space
      (r'[" "]+', " "),
      # replace every run of characters other than a-z, A-Z, ? . ! , ¿ with a space
      (r"[^a-zA-Z?.!,¿]+", " "),
  )
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)

  return '<start> ' + cleaned.strip() + ' <end>'

# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENGLISH, SPANISH]
def create_attention_dataset(path, num_examples):
  """Read a tab-separated sentence file and return its columns, preprocessed.

  Args:
    path: path of a UTF-8 text file with one tab-separated example per line.
    num_examples: number of leading lines to use; `None` uses every line.

  Returns:
    A `zip` iterator yielding one tuple per column of the file. Each tuple
    holds the `preprocess_sentence` output of that column for the selected
    lines, in file order.
  """
  # Read inside a context manager so the file handle is closed as soon as the
  # contents are in memory instead of lingering until garbage collection.
  with io.open(path, encoding='UTF-8') as f:
    lines = f.read().strip().split('\n')

  word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]
                for l in lines[:num_examples]]

  # Transpose rows of [col0, col1, ...] into per-column tuples.
  return zip(*word_pairs)

def tokenize(lang):
  """Fit a Keras `Tokenizer` on `lang` and return the padded id sequences.

  Args:
    lang: the texts passed to both `fit_on_texts` and `texts_to_sequences`.

  Returns:
    A `(tensor, tokenizer)` pair: the output of `pad_sequences` called with
    `padding='post'`, and the fitted `Tokenizer` instance.
  """
  # An empty `filters` string means the tokenizer strips no characters.
  fitted_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  fitted_tokenizer.fit_on_texts(lang)

  sequences = fitted_tokenizer.texts_to_sequences(lang)
  padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

  return padded, fitted_tokenizer

def load_attention_dataset(path, num_examples=None):
  """Build padded input/target tensors and their tokenizers from a sentence file.

  Args:
    path: file path forwarded to `create_attention_dataset`.
    num_examples: forwarded to `create_attention_dataset`; defaults to `None`.

  Returns:
    `(input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer)`
    where each tensor/tokenizer pair comes from `tokenize`.
  """
  # The first column of the file is used as the target side and the second
  # column as the input side; the file must therefore have exactly two columns.
  target_sentences, input_sentences = create_attention_dataset(path, num_examples)

  input_tensor, input_tokenizer = tokenize(input_sentences)
  target_tensor, target_tokenizer = tokenize(target_sentences)

  return input_tensor, target_tensor, input_tokenizer, target_tokenizer

In [59]:
# tests: preprocess_sentence lowercases, spaces out punctuation and adds the
# <start>/<end> markers

en_sentence = u"May I borrow this book?"
sp_sentence = u"¿Puedo tomar prestado este libro?"

# Pass actual and expected straight to test_eq so that a failure reports both
# values instead of the uninformative "False != True".
test_eq(
    preprocess_sentence(sp_sentence).encode('utf-8'),
    b'<start> \xc2\xbf puedo tomar prestado este libro ? <end>'
    )
test_eq(
    preprocess_sentence(en_sentence),
    "<start> may i borrow this book ? <end>"
    )

In [72]:
# Number of leading sentence pairs to load from the file
num_examples = 10

dataset_path = download_keras_data(
    fname='spa-eng.zip',
    floc='/spa-eng/spa.txt',
    origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
)

# inp_lang / targ_lang are the fitted tokenizers returned by load_attention_dataset
(input_tensor,
 target_tensor,
 inp_lang,
 targ_lang) = load_attention_dataset(dataset_path, num_examples)

In [74]:
# tests: both tensors hold exactly one row per requested example.
# Checked separately so a failure names the tensor and shows its length.
test_eq(len(input_tensor), num_examples)
test_eq(len(target_tensor), num_examples)

In [85]:
# Sequence lengths (second axis) of the target and input tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

# Creating training and validation sets using an 80-20 split.
# A fixed random_state makes the split identical on every Restart & Run All.
(input_tensor_train, input_tensor_val,
 target_tensor_train, target_tensor_val) = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

8 8 2 2


In [100]:
BUFFER_SIZE = len(input_tensor_train)
# Cap the batch size at the number of training examples: with
# drop_remainder=True a batch size larger than the dataset yields zero batches
# and steps_per_epoch == 0 (e.g. 8 training rows against a batch size of 64).
# The max(1, ...) guard avoids a zero batch size on an empty training set.
BATCH_SIZE = max(1, min(64, BUFFER_SIZE))
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
# +1 on top of the number of entries in each tokenizer's word_index
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

## References
* [Attention dataset](http://www.manythings.org/anki/)
* [NMT with attention - tensorflow](https://www.tensorflow.org/tutorials/text/nmt_with_attention)