In [None]:
### INSTALL DEPS QUIETLY
!pip install -U -q tensorflow keras datasets tensorflow-estimator tfds-nightly tf-models-official==2.7.0 "tensorflow-text==2.8.*"

In [None]:
import os
import time
import pathlib
import re
import string
import warnings
import tensorflow as tf
import datasets as huggingface
import tensorflow_text as text
import tensorflow_hub as hub
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab
import tensorflow_datasets as tfds
import tensorflow_addons as tfa
from official.nlp import optimization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from typing import List, Dict, Tuple
from typing_extensions import Literal, ClassVar

warnings.filterwarnings("ignore")
tf.get_logger().setLevel('ERROR')

# LANGUAGE DETECTION

In [None]:
### Select TF Hub's "UNCOMPRESSED" model load format through its environment variable.
### NOTE(review): presumably required so the TPU path can read hub models from remote storage - confirm.
os.environ["TFHUB_MODEL_LOAD_FORMAT"] = "UNCOMPRESSED"

In [None]:
from enum import Enum

class Split(Enum):
  """Dataset partitions; members are used as keys of the per-split dataframe dict."""

  TRAIN = "train"
  VALIDATION = "validation"
  TEST = "test"

class Languages(Enum):
  """Integer label ids of the supported languages (member name is the language code)."""

  ru = 1
  en = 0

In [None]:
### UTILS
def convert_dataframe_column_to_tensor(seq: pd.Series, dtype, name:str) -> tf.Tensor:
  """Convert one dataframe column into a tensor.

  Args:
    seq: column values to convert.
    dtype: TensorFlow dtype requested for the result.
    name: name forwarded to `tf.convert_to_tensor`.

  Returns:
    The tensor produced by `tf.convert_to_tensor`.
  """
  column_tensor = tf.convert_to_tensor(seq, dtype=dtype, name=name)
  return column_tensor


### MODEL

In [None]:
class LanguageDetection:
  """English/Russian language detector built on a multilingual BERT encoder from TF Hub.

  Workflow: construct the object, call `fit` with per-split dataframes, call
  `export` to write a SavedModel that takes raw strings, then use the
  `evaluate` classmethod to label sentences with the exported model.
  """
  # TF Hub handles of the text preprocessor and of the BERT encoder.
  tensorflow_preprocess_handle: ClassVar[str] = 'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3'
  tensorflow_model: ClassVar[str] = 'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3'
  AUTOTUNE: ClassVar[int] = tf.data.AUTOTUNE
  # Language codes indexed by label id (index 0 is "en", index 1 is "ru").
  classes: ClassVar[List[str]] = ["en", "ru"]
  # Directory the SavedModel is exported to and reloaded from.
  model_path: ClassVar[str] = os.path.join(".", "language-detection")
  # Dataframe columns holding the input text and the integer label.
  features: ClassVar[List[str]] = ["Text"]
  label_name: ClassVar[str] = "Language"

  class Classifier(tf.keras.Model):
    """BERT encoder followed by dropout and a dense layer producing one logit per class."""

    def __init__(self, num_classes: int):
      super(LanguageDetection.Classifier, self).__init__(name="language_classifier")
      self.encoder = hub.KerasLayer(LanguageDetection.tensorflow_model, trainable=True)
      self.dropout = tf.keras.layers.Dropout(0.1)
      # No activation: the training loss is configured with from_logits=True.
      self.dense = tf.keras.layers.Dense(num_classes)

    def call(self, preprocessed_text):
      """Return the class logits for already-preprocessed BERT inputs."""
      encoder_outputs = self.encoder(preprocessed_text)
      pooled_output = encoder_outputs["pooled_output"]
      x = self.dropout(pooled_output)
      x = self.dense(x)
      return x

  def __init__(self, seq_length=128):
    """Pick a distribution strategy and build the preprocessing model.

    Args:
      seq_length: sequence length handed to the BERT input packer.
    """
    if os.environ.get('COLAB_TPU_ADDR', None):
      cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
      tf.config.experimental_connect_to_cluster(cluster_resolver)
      tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
      self.strategy = tf.distribute.TPUStrategy(cluster_resolver)
    elif tf.config.list_physical_devices('GPU'):
      self.strategy = tf.distribute.MirroredStrategy()
    else:
      # NOT RECOMMENDED.
      self.strategy = tf.distribute.OneDeviceStrategy("/device:CPU:0")

    self.__preprocess_model = self.__make_preprocess_model(self.features, seq_length=seq_length)
    # Set by fit(); export() checks it so that a missing fit() gives a clear error.
    self.__model = None
    self.__reloaded_model = None

  def load_dataset(self, dataframes: Dict[Split, pd.DataFrame], split: Split, batch_size=32) -> Tuple[tf.data.Dataset, int]:
    """Build the batched, preprocessed dataset for one split.

    Args:
      dataframes: dataframe per split, each with the feature and label columns.
      split: which split to load; only the TRAIN split is shuffled and repeated.
      batch_size: number of rows per batch.

    Returns:
      A tuple of (dataset yielding (preprocessed inputs, labels), row count of the split).
    """
    df = dataframes[split]
    data_count = len(df)

    tensor_slice: Dict[str, tf.Tensor] = {
        self.label_name: convert_dataframe_column_to_tensor(df[self.label_name], dtype=tf.int32, name=f"{split}-{self.label_name}")
    }
    for feature in self.features:
      tensor_slice[feature] = convert_dataframe_column_to_tensor(df[feature], dtype=tf.string, name=f"{split}-{feature}")

    dataset = tf.data.Dataset.from_tensor_slices(tensor_slice)
    if split == Split.TRAIN:
      dataset = dataset.shuffle(data_count)
      dataset = dataset.repeat()
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(lambda ex: (self.__preprocess_model(ex), ex[self.label_name]))
    dataset = dataset.cache().prefetch(buffer_size=LanguageDetection.AUTOTUNE)
    return dataset, data_count

  def __make_preprocess_model(self, features: List[str], seq_length=128):
    """Build a Keras model mapping raw string features to packed BERT inputs.

    Args:
      features: names of the string inputs; one Keras input is created per name.
      seq_length: sequence length passed to the hub `bert_pack_inputs` layer.
    """
    # Honour the `features` argument (the original ignored it and read self.features).
    text_inputs: List[tf.keras.layers.Input] = [
      tf.keras.layers.Input(shape=(), dtype=tf.string, name=ft)
      for ft in features
    ]

    preprocessor = hub.load(LanguageDetection.tensorflow_preprocess_handle)
    tokenize = hub.KerasLayer(preprocessor.tokenize)
    tokenized_inputs = [tokenize(s) for s in text_inputs]

    bert_pack_inputs = hub.KerasLayer(
      preprocessor.bert_pack_inputs,
      arguments=dict(seq_length=seq_length),
      name='bert_pack_inputs'
    )
    model_inputs = bert_pack_inputs(tokenized_inputs)
    return tf.keras.Model(text_inputs, model_inputs)

  def __make_classifier_model(self):
    """Create a fresh classifier with one output per entry of `classes`."""
    return LanguageDetection.Classifier(len(LanguageDetection.classes))

  def fit(self, dataframes: Dict[Split, pd.DataFrame], epochs=3, init_lr=2e-5, batch_size=32):
    """Fine-tune a new classifier on the TRAIN split, validating on the VALIDATION split.

    Args:
      dataframes: dataframe per split; TRAIN and VALIDATION are required.
      epochs: number of training epochs.
      init_lr: initial learning rate handed to the AdamW optimizer factory.
      batch_size: rows per batch for both splits.
    """
    with self.strategy.scope():
      metrics = [tfa.metrics.MatthewsCorrelationCoefficient(num_classes=len(LanguageDetection.classes))]
      loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

      train_dataset, train_datacount = self.load_dataset(dataframes=dataframes, split=Split.TRAIN, batch_size=batch_size)
      validation_dataset, validation_datacount = self.load_dataset(dataframes=dataframes, split=Split.VALIDATION, batch_size=batch_size)

      # The training dataset repeats forever, so the epoch length is given explicitly.
      steps_per_epoch = train_datacount // batch_size
      num_train_steps = steps_per_epoch * epochs
      # The first tenth of all training steps is used for warm-up.
      num_warmup_steps = num_train_steps // 10

      validation_steps = validation_datacount // batch_size

      optimizer = optimization.create_optimizer(
          init_lr=init_lr,
          num_train_steps=num_train_steps,
          num_warmup_steps=num_warmup_steps,
          optimizer_type='adamw'
      )
      self.__model = self.__make_classifier_model()
      self.__model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
      self.__model.fit(
          x=train_dataset,
          validation_data=validation_dataset,
          steps_per_epoch=steps_per_epoch,
          epochs=epochs,
          validation_steps=validation_steps
      )

  def export(self) -> None:
    """Save preprocessing + classifier as one SavedModel under `model_path`.

    Raises:
      RuntimeError: if `fit` has not been called on this instance yet.
    """
    if self.__model is None:
      raise RuntimeError("export() needs a trained model; call fit() first.")

    # Chain the preprocessing model and the classifier so the export accepts raw strings.
    bert_outputs = self.__model(self.__preprocess_model(self.__preprocess_model.inputs))
    exported_model = tf.keras.Model(self.__preprocess_model.inputs, bert_outputs)

    save_options = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
    exported_model.save(LanguageDetection.model_path, include_optimizer=False,
                      options=save_options)

  @classmethod
  def evaluate(cls, sentence: List[str]) -> pd.DataFrame:
    """Label sentences with the SavedModel stored at `model_path`.

    Args:
      sentence: the texts to classify.

    Returns:
      A dataframe with a `Text` column (the inputs) and a `Language` column
      (the predicted entry of `classes` for each input).
    """
    with tf.device('/job:localhost'):
      reloaded_model = tf.saved_model.load(cls.model_path)
      test_dataset = tf.data.Dataset.from_tensor_slices({
          "Text": sentence
      })

      results: List[str] = []

      # Each record becomes one single-element batch per feature.
      for features in test_dataset.map(lambda rec: [[rec[ft]] for ft in cls.features]):
        if len(cls.features) == 1:
          result = reloaded_model(features[0])
        else:
          result = reloaded_model(list(features))
        classification = tf.argmax(result, axis=1)[0].numpy().item()
        # Index into the class's own label list rather than a module-level enum.
        results.append(cls.classes[classification])

      return pd.DataFrame.from_dict(data=dict(Text=sentence, Language=results))


### LOAD DATASET

In [None]:
def clean_text(text: str):
    """Normalise a sentence: lowercase, expand contractions, strip punctuation and digit tokens.

    Args:
        text: the raw sentence.

    Returns:
        The lowercased text with common English contractions expanded,
        ASCII punctuation removed, every remaining non-word character
        replaced by a space, and every whitespace-delimited token that
        contains a digit removed together with the whitespace after it.
    """
    text = text.lower()

    # Applied strictly in this order: the specific forms ("won't", "can't")
    # must run before the generic "n't", which must run before "n'".
    substitutions = (
        (r"i'm", "i am"),
        (r"\r", ""),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"it's", "it is"),
        (r"that's", "that is"),
        (r"what's", "what is"),  # was "that is" in the original (copy-paste slip)
        (r"where's", "where is"),
        (r"how's", "how is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        (r"n'", "ng"),
        (r"'bout", "about"),
        (r"'til", "until"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Remove ASCII punctuation; string.punctuation is a superset of the
    # character class the original stripped with a separate regex first.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Any other non-word character (tabs, newlines, non-ASCII punctuation) becomes a space.
    text = re.sub(r"\W", " ", text)
    # Drop tokens containing a digit, plus the whitespace that follows them.
    text = re.sub(r"\S*\d\S*\s*", "", text)

    return text


In [None]:
examples, metadata = tfds.load('ted_hrlr_translate/ru_to_en', with_info=True, as_supervised=True)
### HUGGINGFACE_DATASET_URL = https://huggingface.co/datasets/papluca/language-identification
huggingface_dataset: huggingface.DatasetDict = huggingface.load_dataset("papluca/language-identification")


def build_cleaned_split_csv(split_name: str) -> None:
  """Write `./cleaned_<split_name>.csv` unless that file already exists.

  The CSV has the columns `Text` and `Language` and concatenates:
    * the English and Russian rows of the Hugging Face split (text left as is), and
    * every second English and every second Russian sentence of the TED split,
      decoded from UTF-8 and passed through `clean_text`.

  Args:
    split_name: "train", "test" or "validation"; used as key into both datasets.
  """
  csv_path = f"./cleaned_{split_name}.csv"
  # Existing files act as a cache; the original probed this with open() + bare except.
  if os.path.isfile(csv_path):
    return

  # Defined locally so that every split has it (the original only set it in the
  # "train" branch, which raised NameError when just test/validation were missing).
  columns = ["Text", "Language"]

  ted_frame = tfds.as_dataframe(examples[split_name], metadata)
  from_tfds = pd.concat([
    pd.DataFrame(dict(Text=ted_frame["en"], Language="en"))[::2],
    pd.DataFrame(dict(Text=ted_frame["ru"], Language="ru"))[::2]
  ])
  from_tfds["Text"] = from_tfds["Text"].apply(lambda raw: clean_text(raw.decode("utf-8")))

  from_huggingface = huggingface_dataset[split_name].to_pandas()
  from_huggingface = from_huggingface.rename(columns={ "labels": "Language", "text": "Text" })
  from_huggingface = from_huggingface[from_huggingface["Language"].isin(["en", "ru"])]
  from_huggingface = from_huggingface.reindex(columns=columns)

  pd.concat([from_huggingface, from_tfds]).to_csv(csv_path, columns=columns, index=False)


for split_name in ("train", "test", "validation"):
  build_cleaned_split_csv(split_name)

In [None]:
### LOAD DATASET FOR LANGUAGE DETECTION
def load_language_detection_csv(csv_path: str) -> pd.DataFrame:
  """Read one cleaned CSV and encode it for training.

  Args:
    csv_path: path of a CSV with the columns `Text` and `Language`.

  Returns:
    The dataframe with `Language` replaced by the integer value of the
    matching `Languages` member and `Text` cast to str.

  Raises:
    ValueError / pandas error from `astype('int')` if a row carries a
    language code that is not a member of `Languages`.
  """
  frame = pd.read_csv(csv_path)
  # Derive the code -> id mapping from the enum so the two cannot drift apart.
  label_ids = {language.name: language.value for language in Languages}
  frame["Language"] = frame["Language"].map(label_ids).astype('int')
  frame['Text'] = frame['Text'].astype("str")
  return frame


cleaned_ld_training_data = load_language_detection_csv("cleaned_train.csv")
cleaned_ld_test_data = load_language_detection_csv("cleaned_test.csv")
cleaned_ld_validation_data = load_language_detection_csv("cleaned_validation.csv")

# Per-split dataframes in the shape LanguageDetection.fit / load_dataset expect.
language_detection_dataset = {
    Split.TRAIN: cleaned_ld_training_data,
    Split.TEST: cleaned_ld_test_data,
    Split.VALIDATION: cleaned_ld_validation_data
}

### TRAINING

In [None]:
# Build the detector with its default sequence length; the constructor also
# selects the distribution strategy and builds the preprocessing model.
language_detection_model = LanguageDetection()

In [None]:
# Fine-tune for 3 epochs with batches of 16, then write the combined
# preprocessing + classifier model to LanguageDetection.model_path.
language_detection_model.fit(dataframes=language_detection_dataset, epochs=3, batch_size=16)
language_detection_model.export()


### EVALUATION

In [None]:
# Confusion matrix of the exported detector on the TEST split.
# Russian (label 1) is the positive class, English (label 0) the negative one.
ld_test_frame = language_detection_dataset[Split.TEST]
ld_texts = ld_test_frame['Text'].to_numpy().tolist()
ld_labels = ld_test_frame['Language'].to_numpy().tolist()

sentences_language = LanguageDetection.evaluate(ld_texts)

# Compare whole columns at once instead of one `.iloc` lookup per row.
actual_is_ru = np.asarray(ld_labels) == Languages.ru.value
predicted_is_ru = (sentences_language['Language'] == Languages.ru.name).to_numpy()

true_positive = int(np.sum(actual_is_ru & predicted_is_ru))
false_negative = int(np.sum(actual_is_ru & ~predicted_is_ru))
true_negative = int(np.sum(~actual_is_ru & ~predicted_is_ru))
false_positive = int(np.sum(~actual_is_ru & predicted_is_ru))

# Layout: first row "TP FP", second row "FN TN".
print(true_positive, false_positive)
print(false_negative, true_negative)

# MACHINE TRANSLATION

### LOAD DATASET

In [None]:
# Same tfds.load call as in the language-detection data cell.
# NOTE(review): presumably repeated so this section can run on its own - confirm.
examples, metadata = tfds.load('ted_hrlr_translate/ru_to_en', with_info=True, as_supervised=True)
# Only the train and validation splits are used by the translation code below.
train_examples, val_examples = examples['train'], examples['validation']

### TOKENIZATION

In [None]:
# Split the (ru, en) sentence pairs into one single-language dataset each;
# these feed the per-language vocabulary learning.
train_en = train_examples.map(lambda ru, en: en)
train_ru = train_examples.map(lambda ru, en: ru)

In [None]:
# Special tokens placed in every learned vocabulary.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

# Keyword arguments shared by both `bert_vocab_from_dataset` calls.
bert_vocab_args = {
    # The target vocabulary size
    "vocab_size": 8000,
    # Reserved tokens that must be included in the vocabulary
    "reserved_tokens": reserved_tokens,
    # Arguments for `text.BertTokenizer`
    "bert_tokenizer_params": {"lower_case": True},
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    "learn_params": {},
}

In [None]:
# THIS PART WILL AUTOMATICALLY CHECK YOUR VOCAB
# Directory (under the current working directory) holding the learned vocab files.
VOCAB_FILE_DIR = os.path.join(os.getcwd(), 'machine-translation-vocab')

def write_vocab_file(filename: str, vocab: List[str]):
  """Write a vocabulary to disk, one token per line, encoded as UTF-8.

  Args:
    filename: file name inside `VOCAB_FILE_DIR`. It is combined with
      `os.path.join`, so an absolute path is used unchanged.
    vocab: the tokens to write, in order.
  """
  os.makedirs(VOCAB_FILE_DIR, exist_ok=True)
  # Explicit UTF-8: the vocabulary contains Cyrillic tokens, and the platform
  # default encoding is not guaranteed to be able to represent them.
  with open(os.path.join(VOCAB_FILE_DIR, filename), 'w', encoding='utf-8') as f:
    for token in vocab:
      print(token, file=f)

# Learn one WordPiece vocabulary per language, skipping any that is already on
# disk (learning is slow, so existing files act as a cache).
for vocab_filename, sentences in (('en_vocab.txt', train_en), ('ru_vocab.txt', train_ru)):
  if os.path.isfile(os.path.join(VOCAB_FILE_DIR, vocab_filename)):
    continue

  learned_vocab = bert_vocab.bert_vocab_from_dataset(
      sentences.batch(1000).prefetch(2),
      **bert_vocab_args
  )
  # write_vocab_file prepends VOCAB_FILE_DIR itself, so pass the bare file name.
  write_vocab_file(vocab_filename, learned_vocab)


In [None]:
# Token ids of "[START]" and "[END]", taken as their positions in `reserved_tokens`.
# NOTE(review): assumes the vocab files list the reserved tokens first and in this order - confirm.
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

def cleanup_text(reserved_tokens, token_txt):
  """Remove reserved tokens (all but "[UNK]") and join the rest into sentences.

  Args:
    reserved_tokens: the special token strings.
    token_txt: string tokens to clean; joined along the last axis.

  Returns:
    The remaining tokens of each row joined with single spaces.
  """
  # One alternation that matches exactly the tokens to discard.
  droppable_re = "|".join(
      re.escape(token) for token in reserved_tokens if token != "[UNK]"
  )

  is_reserved = tf.strings.regex_full_match(token_txt, droppable_re)
  kept_tokens = tf.ragged.boolean_mask(token_txt, ~is_reserved)

  return tf.strings.reduce_join(kept_tokens, separator=' ', axis=-1)

def add_start_end(ragged):
  """Prepend the START id and append the END id to every row of `ragged`."""
  row_count = ragged.bounding_shape()[0]
  # One single-element column per marker, with as many rows as the batch.
  marker_shape = [row_count, 1]
  return tf.concat(
      [tf.fill(marker_shape, START), ragged, tf.fill(marker_shape, END)],
      axis=1,
  )

class CustomTokenizer(tf.Module):
  """Exportable wrapper around `text.BertTokenizer` for one vocabulary file.

  Adds START/END markers when tokenizing, removes reserved tokens when
  detokenizing, and traces concrete functions in the constructor so the
  methods are available on the saved module.
  """

  def __init__(self, reserved_tokens, vocab_path):
    """
    Args:
      reserved_tokens: the special token strings of the vocabulary.
      vocab_path: path of the vocabulary file (one token per line).
    """
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    self._reserved_tokens = reserved_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    # Read as UTF-8 explicitly: the vocabulary holds Cyrillic tokens and the
    # locale's default encoding may not be UTF-8.
    vocab = pathlib.Path(vocab_path).read_text(encoding="utf-8").splitlines()
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:

    # Include a tokenize signature for a batch of strings.
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for:
    #   * `Tensors` with shapes [tokens] and [batch, tokens]
    #   * `RaggedTensors` with shape [batch, tokens]
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    """Turn strings into token ids wrapped in START/END markers."""
    enc = self.tokenizer.tokenize(strings)
    # Merge the `word` and `word-piece` axes.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    """Turn token ids back into text with the reserved tokens removed."""
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._reserved_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    """Return the vocabulary string of each token id."""
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    """Return the number of vocabulary entries as a scalar tensor."""
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    """Return the vocabulary file asset."""
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    """Return the reserved tokens as a string tensor."""
    return tf.constant(self._reserved_tokens)

# Container module holding one tokenizer per language, so both can be saved together.
tokenizers = tf.Module()
tokenizers.ru = CustomTokenizer(reserved_tokens, os.path.join(VOCAB_FILE_DIR, 'ru_vocab.txt'))
tokenizers.en = CustomTokenizer(reserved_tokens, os.path.join(VOCAB_FILE_DIR, 'en_vocab.txt'))

In [None]:

def tokenize_pairs(ru, en):
  """Tokenize a batch of (Russian, English) sentences into two dense id tensors.

  Each language goes through its own tokenizer; the ragged result is
  converted to a dense tensor with `to_tensor()`.
  """
  ru_ids = tokenizers.ru.tokenize(ru).to_tensor()
  en_ids = tokenizers.en.tokenize(en).to_tensor()
  return ru_ids, en_ids

  
# Sentence pairs per training batch, and size of the shuffle buffer used by make_batches.
BATCH_SIZE = 32
BUFFER_SIZE = 20000

def make_batches(ds):
  """Turn a dataset of sentence pairs into shuffled, tokenized, prefetched batches.

  Pipeline order: cache -> shuffle(BUFFER_SIZE) -> batch(BATCH_SIZE) ->
  tokenize_pairs -> prefetch.
  """
  cached = ds.cache()
  shuffled = cached.shuffle(BUFFER_SIZE)
  batched = shuffled.batch(BATCH_SIZE)
  tokenized = batched.map(tokenize_pairs, num_parallel_calls=tf.data.AUTOTUNE)
  return tokenized.prefetch(tf.data.AUTOTUNE)

In [None]:
# Tokenized training batches consumed by the training loop.
train_batches = make_batches(train_examples)

In [None]:
# Export both tokenizers as a SavedModel under ./machine-translation in the working directory.
tf.saved_model.save(tokenizers, os.path.join(os.getcwd(), 'machine-translation'))

### MODEL

#### POSITIONAL ENCODING

In [None]:
def positional_encoding(position, d_model):
  """Build the sinusoidal positional-encoding table.

  Args:
    position: number of positions (rows) to generate.
    d_model: embedding width (columns).

  Returns:
    A float32 tensor of shape (1, position, d_model): sine in the even
    columns, cosine in the odd columns.
  """
  column_ids = np.arange(d_model)[np.newaxis, :]
  row_ids = np.arange(position)[:, np.newaxis]

  # Columns 2k and 2k+1 share the same frequency.
  exponents = (2 * (column_ids // 2)) / np.float32(d_model)
  angles = row_ids * (1 / np.power(10000, exponents))

  angles[:, 0::2] = np.sin(angles[:, 0::2])
  angles[:, 1::2] = np.cos(angles[:, 1::2])

  return tf.cast(angles[np.newaxis, ...], dtype=tf.float32)

#### MULTIHEAD ATTENTION

In [None]:
def scaled_dot_product_attention(q, k, v, mask):
    """Compute softmax(q k^T / sqrt(d_k)) v.

    Args:
        q, k, v: query, key and value tensors; `k` is transposed on its last
            two axes for the product with `q`.
        mask: optional tensor with 1.0 at positions to hide (they receive
            -1e9 before the softmax), or None for no masking.

    Returns:
        A tuple (attention output, attention weights).
    """
    key_depth = tf.cast(tf.shape(k)[-1], tf.float32)

    # Similarity of every query with every key, scaled by sqrt(d_k).
    logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)

    if mask is not None:
        logits += (mask * -1e9)

    weights = tf.nn.softmax(logits, axis=-1)

    return tf.matmul(weights, v), weights

class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention: project q/k/v, attend per head, merge, project again."""

  def __init__(self, d_model, num_heads):
    """
    Args:
      d_model: width of the projections and of the output.
      num_heads: number of heads; each head gets `d_model // num_heads` features.
    """
    super().__init__()

    self.d_model = d_model
    self.num_heads = num_heads

    self.depth = self.d_model // num_heads

    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Reshape to (batch, seq, heads, depth) and move the head axis in front of seq."""
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    """Attend with queries `q` over keys `k` and values `v` (note the v, k, q order).

    Returns:
      A tuple (projected attention output, per-head attention weights).
    """
    batch_size = tf.shape(q)[0]

    q_heads = self.split_heads(self.wq(q), batch_size)
    k_heads = self.split_heads(self.wk(k), batch_size)
    v_heads = self.split_heads(self.wv(v), batch_size)

    per_head_output, attention_weights = scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)

    # Undo split_heads: heads back next to depth, then flatten both into d_model.
    heads_last = tf.transpose(per_head_output, perm=[0, 2, 1, 3])
    merged = tf.reshape(heads_last, (batch_size, -1, self.d_model))

    return self.dense(merged), attention_weights

#### ENCODER & DECODER

In [None]:
def ffnn(d_model: int, d_ff: int):
  """Build the two-layer feed-forward block: Dense(d_ff, relu) then Dense(d_model)."""
  hidden = tf.keras.layers.Dense(d_ff, activation='relu')
  projection = tf.keras.layers.Dense(d_model)
  return tf.keras.Sequential([hidden, projection])

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention and feed-forward, each with dropout, residual and layer norm."""

  def __init__(self, d_model: int, num_heads: int, d_ffnn: int, dropout_rate = 0.1):
    super().__init__()

    self.multi_head_attention = MultiHeadAttention(d_model, num_heads)
    self.ffnn = ffnn(d_model, d_ffnn)

    self.layer_norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layer_norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout_1 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout_2 = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x, training, mask):
    """Apply the block to `x`; `training` toggles dropout, `mask` is passed to attention."""
    # Sub-layer 1: self-attention (x serves as value, key and query).
    attended, _ = self.multi_head_attention(x, x, x, mask)
    attended = self.dropout_1(attended, training=training)
    after_attention = self.layer_norm_1(x + attended)

    # Sub-layer 2: position-wise feed-forward network.
    transformed = self.ffnn(after_attention)
    transformed = self.dropout_2(transformed, training=training)
    return self.layer_norm_2(after_attention + transformed)

class Encoder(tf.keras.layers.Layer):
  """Token embedding + positional encoding followed by a stack of encoder layers."""

  def __init__(self, num_layers: int, d_model: int, num_heads: int, d_ffnn: int, input_vocab_size: int, maximum_position_encoding, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)

    self.enc_layers = [EncoderLayer(d_model, num_heads, d_ffnn, dropout_rate) for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x, training, mask):
    """Encode token ids `x`; `training` toggles dropout, `mask` is passed to every layer."""
    seq_len = tf.shape(x)[1]

    # Scale the embeddings by sqrt(d_model) before adding the position signal.
    hidden = self.embedding(x)
    hidden *= tf.sqrt(tf.cast(self.d_model, tf.float32))
    hidden += self.pos_encoding[:, :seq_len, :]

    hidden = self.dropout(hidden, training=training)
    for encoder_layer in self.enc_layers:
      hidden = encoder_layer(hidden, training, mask)

    return hidden

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block: masked self-attention, attention over the encoder output, feed-forward."""

  def __init__(self, d_model: int, num_heads: int, d_ffnn: int, dropout_rate=0.1):
    super().__init__()

    # mmha: masked self-attention; mha: attention over the encoder output.
    self.mmha = MultiHeadAttention(d_model, num_heads)
    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffnn = ffnn(d_model, d_ffnn)

    self.layer_norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layer_norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layer_norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout_1 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout_2 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout_3 = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x, encoder_out, training, look_ahead_mask, padding_mask):
    """Apply the block.

    Returns:
      A tuple (block output, self-attention weights, encoder-attention weights).
    """
    # Sub-layer 1: self-attention over the target, restricted by look_ahead_mask.
    self_attended, self_attention_weights = self.mmha(x, x, x, look_ahead_mask)
    self_attended = self.dropout_1(self_attended, training=training)
    after_self_attention = self.layer_norm_1(self_attended + x)

    # Sub-layer 2: queries come from the decoder, keys/values from the encoder.
    cross_attended, cross_attention_weights = self.mha(encoder_out, encoder_out, after_self_attention, padding_mask)
    cross_attended = self.dropout_2(cross_attended, training=training)
    after_cross_attention = self.layer_norm_2(cross_attended + after_self_attention)

    # Sub-layer 3: position-wise feed-forward network.
    transformed = self.ffnn(after_cross_attention)
    transformed = self.dropout_3(transformed, training=training)
    block_output = self.layer_norm_3(transformed + after_cross_attention)

    return block_output, self_attention_weights, cross_attention_weights


class Decoder(tf.keras.layers.Layer):
  """Token embedding + positional encoding followed by a stack of decoder layers."""

  def __init__(self, num_layers: int, d_model: int, num_heads: int, d_ff: int, target_vocab_size: int, maximum_position_encoding, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

    self.dec_layers = [DecoderLayer(d_model, num_heads, d_ff, dropout_rate) for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x, enc_output, training,  look_ahead_mask, padding_mask):
    """Decode target token ids `x` against the encoder output.

    Returns:
      A tuple (decoder output, dict of attention weights keyed
      'decoder_layer<N>_block1' / 'decoder_layer<N>_block2', N starting at 1).
    """
    seq_len = tf.shape(x)[1]

    # Scale the embeddings by sqrt(d_model) before adding the position signal.
    hidden = self.embedding(x)
    hidden *= tf.sqrt(tf.cast(self.d_model, tf.float32))
    hidden += self.pos_encoding[:, :seq_len, :]

    hidden = self.dropout(hidden, training=training)

    attn_weights = {}
    for layer_number, decoder_layer in enumerate(self.dec_layers, start=1):
        hidden, self_weights, cross_weights = decoder_layer(hidden, enc_output, training, look_ahead_mask, padding_mask)

        attn_weights[f'decoder_layer{layer_number}_block1'] = self_weights
        attn_weights[f'decoder_layer{layer_number}_block2'] = cross_weights

    return hidden, attn_weights

#### HYPERPARAMETER

In [None]:
# Transformer size: number of encoder/decoder layers, model width,
# feed-forward width, attention heads and dropout rate.
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
dropout_rate = 0.1
# Number of passes over the training batches.
EPOCHS = 20
# NOTE(review): this module-level `step` is not referenced in the visible part of the file - confirm it is needed.
step = 0

#### TRANSFORMER

In [None]:
class Transformer(tf.keras.Model):
  """Encoder-decoder transformer with a final projection onto the target vocabulary."""

  def __init__(self, num_layers: int, d_model: int, num_heads: int, d_ff: int, input_vocab_size: int, target_vocab_size: int, pe_input, pe_target, dropout_rate=0.1):
    """
    Args:
      num_layers: number of layers in the encoder and in the decoder.
      d_model: model width.
      num_heads: attention heads per layer.
      d_ff: width of the feed-forward sub-layers.
      input_vocab_size: size of the source vocabulary (encoder embedding).
      target_vocab_size: size of the target vocabulary (decoder embedding and output layer).
      pe_input: number of positions of the encoder's positional encoding.
      pe_target: number of positions of the decoder's positional encoding.
      dropout_rate: dropout rate used throughout.
    """
    super(Transformer, self).__init__()

    self.encoder = Encoder(num_layers, d_model, num_heads, d_ff, input_vocab_size, pe_input, dropout_rate)
    # The decoder embeds TARGET tokens, so it must be sized by the target
    # vocabulary (the original passed input_vocab_size here).
    # NOTE(review): checkpoints written before this fix may not restore if the two vocabularies differ in size.
    self.decoder = Decoder(num_layers, d_model, num_heads, d_ff, target_vocab_size, pe_target, dropout_rate)
    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
    """Run encoder and decoder.

    Returns:
      A tuple (logits over the target vocabulary, decoder attention weights).
    """
    enc_output = self.encoder(inp, training, enc_padding_mask)
    dec_output, attention_weights  = self.decoder(tar, enc_output, training, look_ahead_mask,dec_padding_mask)

    return self.final_layer(dec_output), attention_weights

# get_vocab_size() returns a scalar tensor; convert it to a plain Python int
# so the Keras layers are configured with ordinary integers.
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    d_ff=dff,
    input_vocab_size=int(tokenizers.ru.get_vocab_size().numpy()),
    target_vocab_size=int(tokenizers.en.get_vocab_size().numpy()),
    # Number of positions covered by the encoder / decoder positional encodings.
    pe_input=10000,
    pe_target=6000,
    dropout_rate=dropout_rate,
)

### TRAINING

#### OPTIMIZER

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning rate d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)."""

  def __init__(self, d_model: int, warmup_steps=4000):
    """
    Args:
      d_model: model width; stored as a float32 tensor.
      warmup_steps: step count at which the two terms of the minimum are equal.
    """
    super().__init__()

    self.warmup_steps = warmup_steps
    self.d_model = tf.cast(d_model, dtype=tf.float32)

  def __call__(self, step):
    """Return the learning rate for optimizer step `step`."""
    step = tf.cast(step, dtype=tf.float32)
    decay_term = tf.math.rsqrt(step)
    warmup_term = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(decay_term, warmup_term)

# Adam driven by the warm-up schedule above (default warmup_steps).
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

#### LOSSES

In [None]:
# Per-token cross-entropy on logits; reduction is done by loss_function below.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
def loss_function(label, pred):
  """Mean cross-entropy over the positions whose label is not 0.

  Args:
    label: target token ids; id 0 marks positions to ignore.
    pred: logits for each position.
  """
  per_token_loss = loss_object(label, pred)

  # 1 where the label is a real token, 0 where it is id 0.
  keep = tf.cast(tf.math.not_equal(label, 0), dtype=per_token_loss.dtype)

  return tf.reduce_sum(per_token_loss * keep)/tf.reduce_sum(keep)

def accuracy_function(label, pred):
  """Fraction of positions with a non-zero label whose argmax prediction equals the label."""
  is_real_token = tf.math.not_equal(label, 0)
  is_correct = tf.equal(label, tf.argmax(pred, axis=2))

  # Count a hit only where the label is not id 0.
  hits = tf.cast(tf.math.logical_and(is_real_token, is_correct), dtype=tf.float32)
  real_tokens = tf.cast(is_real_token, dtype=tf.float32)

  return tf.reduce_sum(hits)/tf.reduce_sum(real_tokens)

# Running means of the per-batch loss and accuracy; reset at the start of every epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

#### PREPARATIONS

In [None]:
def create_padding_mask(seq):
  """Return a float mask that is 1.0 where `seq` equals 0, with two axes inserted after the first."""
  is_padding = tf.cast(tf.math.equal(seq, 0), tf.float32)
  return is_padding[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size: int):
  """Return a (size, size) mask that is 1 strictly above the diagonal and 0 elsewhere."""
  lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return 1 - lower_triangle

def create_masks(inp, tar):
  """Build the three attention masks for one batch.

  Returns:
    A tuple (encoder padding mask, combined look-ahead/padding mask for the
    decoder's self-attention, padding mask for the decoder's attention over
    the encoder output). The first and third are both derived from `inp`.
  """
  encoder_mask = create_padding_mask(inp)
  cross_attention_mask = create_padding_mask(inp)

  # A target position is hidden if it is padding OR lies in the future.
  future_mask = create_look_ahead_mask(tf.shape(tar)[1])
  target_padding_mask = create_padding_mask(tar)
  self_attention_mask = tf.maximum(target_padding_mask, future_mask)

  return encoder_mask, self_attention_mask, cross_attention_mask

#### CHECKPOINTS

In [None]:
# Directory for training checkpoints (relative to the working directory).
checkpoint_path = "./machine-translation/checkpoints/train"

# Track both the model and the optimizer; keep only the 5 most recent checkpoints.
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# Resume from the newest checkpoint when one exists.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print('Latest checkpoint restored!!')

In [None]:
# Fixed input signature (two int64 matrices with unknown dimensions) so that
# differently shaped batches reuse one traced graph.
train_step_signature = [
  tf.TensorSpec(shape=(None, None), dtype=tf.int64),
  tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
  """Run one optimisation step on a batch and update the running metrics.

  Args:
    inp: source token ids.
    tar: target token ids; all but the last column feed the decoder, all but
      the first column are the labels.
  """
  decoder_input = tar[:, :-1]
  expected = tar[:, 1:]

  encoder_mask, self_attention_mask, cross_attention_mask = create_masks(inp, decoder_input)

  with tf.GradientTape() as tape:
    logits, _ = transformer(inp, decoder_input, True, encoder_mask, self_attention_mask, cross_attention_mask)
    batch_loss = loss_function(expected, logits)

  grads = tape.gradient(batch_loss, transformer.trainable_variables)
  optimizer.apply_gradients(zip(grads, transformer.trainable_variables))

  train_loss(batch_loss)
  train_accuracy(accuracy_function(expected, logits))

#### TRAINING

In [None]:
# Train for EPOCHS passes over train_batches, saving a checkpoint after every epoch.
for epoch in range(EPOCHS):
  start = time.time()

  # Metrics are running means, so clear them to report per-epoch values.
  train_loss.reset_states()
  train_accuracy.reset_states()

  # inp -> russian, tar -> english
  for (batch, (inp, tar)) in enumerate(train_batches):
    train_step(inp, tar)

    # Progress report every 300 batches (running means since the epoch started).
    if batch % 300 == 0:
      print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

  ckpt_save_path = ckpt_manager.save()
  
  print(f'Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}')
  print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')
  print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')

### EVALUATION

In [None]:
def evaluate(sentence, max_length=40):
  """Translate one Russian sentence into English with greedy decoding.

  Args:
    sentence: the Russian input text (a single string).
    max_length: upper bound on the number of decoding steps.

  Returns:
    The detokenized English text as a Python `str` (decoded from UTF-8).
  """
  # Tokenize the Russian input as a batch of one and densify the ragged
  # result; the tokenizer presumably adds the start/end tokens itself.
  encoder_input = tokenizers.ru.tokenize(
      tf.convert_to_tensor([sentence])).to_tensor()

  # Tokenizing an empty string leaves exactly two ids, taken here as the
  # English start and end tokens. Decoding is seeded with the start token.
  start, end = tokenizers.en.tokenize([''])[0]
  output = tf.expand_dims(tf.convert_to_tensor([start]), 0)

  for _step in range(max_length):
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        encoder_input, output)

    # predictions.shape == (batch_size, seq_len, vocab_size)
    # The attention weights returned alongside are not needed here.
    predictions, _ = transformer(encoder_input,
                                 output,
                                 False,
                                 enc_padding_mask,
                                 combined_mask,
                                 dec_padding_mask)

    # Keep only the distribution for the last position:
    # (batch_size, 1, vocab_size)
    predictions = predictions[:, -1:, :]

    # Greedy choice: the most likely next token id.
    predicted_id = tf.argmax(predictions, axis=-1)

    # Append the predicted id to the decoder input for the next step.
    output = tf.concat([output, predicted_id], axis=-1)

    # Stop as soon as the end token has been produced.
    if predicted_id == end:
      break

  # output.shape (1, tokens); detokenize and take the single batch entry.
  # The local is not called `text` so that it does not shadow the
  # `tensorflow_text as text` module alias.
  translated = tokenizers.en.detokenize(output)[0]  # shape: ()

  return translated.numpy().decode("utf-8")


# TOXIC CLASSIFICATION

In [None]:
# Load the training set from ./toxic_classification/train.csv (resolved
# against the current working directory); missing values are replaced by ' '.
train_df = pd.read_csv(os.path.join(os.getcwd(), 'toxic_classification', 'train.csv')).fillna(' ')

In [None]:
# Normalise every training comment with clean_text (overwrites the column).
train_df['comment_text'] = train_df['comment_text'].apply(clean_text)

In [None]:
max_features = 20000
max_text_length = 100

# Model inputs are the comment texts; the target is the `toxic` column.
x = train_df['comment_text'].values
y = train_df['toxic'].values

# Build the vocabulary index from word frequencies in the training comments.
x_tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_features)
x_tokenizer.fit_on_texts(list(x))

# Map every comment to the sequence of its words' vocabulary indices.
x_tokenized = x_tokenizer.texts_to_sequences(x)

# Force a common length: longer sequences are truncated, shorter ones are
# padded with 0.
x_train_val = tf.keras.preprocessing.sequence.pad_sequences(
    x_tokenized,
    maxlen=max_text_length,
)

#### Pre-trained GloVe (Global Vectors) Embeddings

In [None]:
embedding_dim = 100 # dimensionality of each word vector (matches glove.6B.100d)
embeddings_index = dict()

# Map every word in the GloVe file to its coefficient vector. Each line holds
# the word followed by its float coefficients, separated by whitespace.
glove_path = os.path.join(os.getcwd(), 'toxic_classification', 'glove', 'glove.6B.100d.txt')
# The context manager closes the file even if parsing a line raises.
with open(glove_path, encoding="utf8") as f:
  for line in f:
    values = line.split()
    if not values:
      # Skip blank lines instead of failing on values[0].
      continue
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Row i holds the GloVe vector of the word with vocabulary index i; words
# that have no GloVe entry keep an all-zero row.
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, index in x_tokenizer.word_index.items():
    # Stop at the first index outside the matrix (assumes word_index is
    # iterated in ascending index order -- TODO confirm).
    if index >= max_features:
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[index] = embedding_vector

### EMBEDDING LAYER

In [None]:
# Start the classifier with a frozen embedding layer initialised from
# `embedding_matrix`, followed by dropout.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(
        max_features,
        embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    ),
    tf.keras.layers.Dropout(0.2),
])

### MODEL

In [None]:
# Define model parameters
filters = 150  # passed as the filter count of both Conv1D layers
kernel_size = 3  # passed as the kernel size of both Conv1D layers
hidden_dims = 150  # units of the Dense layer that precedes the output layer

In [None]:
# Convolutional head stacked on the embedding layers: conv -> max-pool ->
# conv -> global max-pool -> dense + dropout -> single sigmoid output.
for head_layer in (
    tf.keras.layers.Conv1D(filters, kernel_size, activation='relu'),
    tf.keras.layers.MaxPooling1D(),
    tf.keras.layers.Conv1D(filters, kernel_size, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(hidden_dims, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid'),
):
  model.add(head_layer)

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit; training
# uses the Adam optimizer and reports accuracy.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

### TRAINING

In [None]:
# Hold out 30% of the samples for validation. The split is seeded so the same
# rows land in each partition on every run, which keeps validation metrics
# comparable between runs.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y, test_size=0.30, random_state=42)
batch_size = 64
epochs = 10

model.fit(x_train, y_train, batch_size = batch_size,
          epochs = epochs, validation_data=(x_val, y_val))

### EVALUATION

In [None]:
# Load the raw test set. The text is deliberately not cleaned here:
# make_test_predictions applies clean_text to `comment_text` itself, so
# cleaning at load time as well would run it twice over the same column.
test_df = pd.read_csv(os.path.join(os.getcwd(), 'toxic_classification', 'test.csv'))

In [None]:
def make_test_predictions(df: pd.DataFrame, threshold: float = 0.5, batch_size: int = 64) -> None:
    """Label every comment in `df` as 'toxic' or 'not toxic', in place.

    The `comment_text` column is overwritten with its cleaned version, run
    through the fitted tokenizer and the trained model, and the resulting
    labels are stored in a new `is_toxic` column.

    Args:
        df: frame with a `comment_text` column; it is modified in place.
        threshold: scores strictly below this value are labelled 'not toxic',
            all others 'toxic'.
        batch_size: batch size forwarded to `model.predict`.
    """
    # Bracket assignment always targets the column, never a plain attribute.
    df['comment_text'] = df['comment_text'].apply(clean_text)
    x_test = df['comment_text'].values
    x_test_tokenized = x_tokenizer.texts_to_sequences(x_test)
    # Pad/truncate to the same length that was used for training.
    x_testing = tf.keras.preprocessing.sequence.pad_sequences(x_test_tokenized, maxlen=max_text_length)
    y_testing = model.predict(x_testing, verbose=1, batch_size=batch_size)
    # Each prediction row carries its score in its first element.
    df['is_toxic'] = ['not toxic' if score[0] < threshold else 'toxic' for score in y_testing]

make_test_predictions(test_df)
test_df

# PIPELINE INTEGRATION

### LANGUAGE DETECTION

In [None]:
# Detect the language of the first five sentences of the test split.
ld_inputs = (
    language_detection_dataset[Split.TEST]['Text']
    .to_numpy()[:5]
    .tolist()
)
sentences_language = LanguageDetection.evaluate(ld_inputs)
sentences_language

### MACHINE TRANSLATION

In [None]:
def test(series: pd.Series):
  return evaluate(series[0]) if series[1] == 'ru' else series[0]

# Apply `test` to every row (axis=1) and store the result in `comment_text`,
# the column name that make_test_predictions reads.
sentences_language['comment_text'] = sentences_language.apply(test, axis=1)
sentences_language

### TOXIC CLASSIFICATION

In [None]:
# Classify the sentences; make_test_predictions modifies the frame in place
# (it overwrites `comment_text` and adds `is_toxic`), so display it afterwards.
make_test_predictions(sentences_language)
sentences_language