In [None]:
### INSTALL DEPS QUIETLY
!pip install -U -q tfds-nightly tf-models-official==2.7.0 "tensorflow-text==2.8.*"

In [25]:
import os
import tensorflow as tf
import tensorflow_text as text  # A dependency of the preprocessing model
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_addons as tfa
from official.nlp import optimization
import numpy as np
import pandas as pd
from typing import List, Dict
from typing_extensions import Literal, ClassVar

# Only let TensorFlow log records at ERROR level or above through.
tf.get_logger().setLevel('ERROR')

In [None]:
### LOAD TF HUB MODELS IN UNCOMPRESSED FORMAT
### Must be set before any hub.load / hub.KerasLayer call. Per the TF Hub
### caching docs this reads models directly from remote storage instead of
### downloading a compressed archive to a local cache (needed on Colab TPUs).
os.environ.update(TFHUB_MODEL_LOAD_FORMAT="UNCOMPRESSED")

In [None]:
from enum import Enum


class Split(Enum):
  """Names of the dataset partitions used to key the per-split DataFrames."""

  # Partition the model is fitted on.
  TRAIN = "train"
  # Partition held out for validation.
  VALIDATION = "validation"
  # Partition held out for final testing.
  TEST = "test"


In [None]:
### UTILS

def convert_dataframe_to_tensor(seq: pd.Series, dtype, name:str) -> tf.Tensor:
  """
  Turn one pandas column into a TensorFlow tensor.

  Args:
    seq: the column (a pandas Series) whose values are converted.
    dtype: the TensorFlow dtype requested for the resulting tensor.
    name: the name passed on to `tf.convert_to_tensor`.

  Returns:
    The result of `tf.convert_to_tensor` for `seq`.
  """
  tensor = tf.convert_to_tensor(seq, dtype=dtype, name=name)
  return tensor

In [32]:
class LanguageDetection:
  """
  Language classifier built on a multilingual cased BERT encoder from TF Hub.

  The class bundles three things:
    * a preprocessing Keras model that turns raw strings into BERT inputs,
    * a `Classifier` Keras model (BERT encoder + dropout + dense logits),
    * a `tf.data` input pipeline built from pandas DataFrames.

  Training (`fit`) is not implemented yet; see the note on that method.
  """

  # TF Hub handle of the preprocessing SavedModel (tokenizer and input packer).
  tensorflow_preprocess_handle: ClassVar[str] = 'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3'
  # TF Hub handle of the BERT encoder that is loaded as a trainable layer.
  tensorflow_model: ClassVar[str] = 'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3'
  AUTOTUNE: ClassVar[int] = tf.data.AUTOTUNE
  # Class names; the classifier emits one logit per entry.
  # NOTE(review): "Sweedish" is kept verbatim because it is runtime data that
  # presumably mirrors the dataset's own label spelling -- confirm before changing.
  classes: ClassVar[List[str]] = ["English", "Malayalam", "Hindi", "Tamil", "Kannada", "French", "Spanish", "Portuguese", "Italian", "Russian", "Sweedish", "Dutch", "Arabic", "Turkish", "German", "Danish", "Greek"]

  class Classifier(tf.keras.Model):
    """BERT encoder followed by dropout and a dense layer producing logits."""

    def __init__(self, num_classes: int):
      """
      Args:
        num_classes: number of output units (one logit per class).
      """
      super().__init__(name="language_classifier")
      self.encoder = hub.KerasLayer(LanguageDetection.tensorflow_model, trainable=True)
      self.dropout = tf.keras.layers.Dropout(0.1)
      # No activation: the layer outputs raw logits.
      self.dense = tf.keras.layers.Dense(num_classes)

    def call(self, preprocessed_text):
      """
      Args:
        preprocessed_text: the encoder inputs, as produced by the
          preprocessing model.

      Returns:
        The dense layer's output for the encoder's "pooled_output".
      """
      encoder_outputs = self.encoder(preprocessed_text)
      pooled_output = encoder_outputs["pooled_output"]
      x = self.dropout(pooled_output)
      x = self.dense(x)
      return x

  def __init__(self, seq_length=128):
    """
    Pick a distribution strategy and build the preprocessing and classifier models.

    Args:
      seq_length: sequence length handed to the BERT input packer.
    """
    # Strategy selection: Colab TPU if advertised through the environment,
    # otherwise any visible GPU(s), otherwise a single CPU device.
    if os.environ.get('COLAB_TPU_ADDR', None):
      cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
      tf.config.experimental_connect_to_cluster(cluster_resolver)
      tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
      self.strategy = tf.distribute.TPUStrategy(cluster_resolver)
    elif tf.config.list_physical_devices('GPU'):
      self.strategy = tf.distribute.MirroredStrategy()
    else:
      self.strategy = tf.distribute.OneDeviceStrategy("/device:CPU:0")

    self.features: List[str] = ["Text"]
    self.label_name: str = "Language"

    self.__preprocess_model = self.__make_preprocess_model(self.features, seq_length=seq_length)
    # The classifier's variables must be created under the strategy scope so
    # that the chosen strategy owns them when the model is later compiled/trained.
    with self.strategy.scope():
      self.__model = self.__make_classifier_model()

  def __load_dataset(self, dataframes: Dict[Split, pd.DataFrame], split: Split, batch_size = 128):
    """
    Build the batched, preprocessed `tf.data.Dataset` for one split.

    Args:
      dataframes: mapping from split to the DataFrame holding that split.
        Each frame needs the label column (`self.label_name`, converted to
        int32) and every column listed in `self.features` (converted to string).
      split: which entry of `dataframes` to load. Only `Split.TRAIN` is
        shuffled and repeated.
      batch_size: rows per batch; a value <= 0 puts the whole split in one batch.

    Returns:
      A dataset of `(preprocessed_inputs, labels)` pairs.
    """
    df = dataframes[split]
    data_count = len(df)

    tensor_slice: Dict[str, tf.Tensor] = {
        self.label_name: convert_dataframe_to_tensor(df[self.label_name], dtype=tf.int32, name=self.label_name)
    }
    for feature in self.features:
      tensor_slice[feature] = convert_dataframe_to_tensor(df[feature], dtype=tf.string, name=feature)

    dataset = tf.data.Dataset.from_tensor_slices(tensor_slice)
    if split == Split.TRAIN:
      # Shuffle buffer spans the whole split; repeat() makes the stream endless.
      dataset = dataset.shuffle(data_count)
      dataset = dataset.repeat()
    if batch_size > 0:
      dataset = dataset.batch(batch_size)
    else:
      dataset = dataset.batch(data_count)
    # Fixed: the label column is stored in `self.label_name`; the previous
    # `self.label` attribute does not exist and raised AttributeError.
    dataset = dataset.map(lambda ex: (self.__preprocess_model(ex), ex[self.label_name]))
    # NOTE(review): for the TRAIN split cache() sits after repeat(), i.e. on an
    # unbounded stream -- confirm this ordering is intended.
    dataset = dataset.cache().prefetch(buffer_size=LanguageDetection.AUTOTUNE)
    return dataset

  def __make_preprocess_model(self, features: List[str], seq_length = 128):
    """
    Returns Model mapping string features to BERT inputs.

    See: https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3#:~:text=seq_length%3D128.-,General%20usage,-For%20pairs%20of

    Args:
      features: a list with the names of string-valued features.
      seq_length: an integer that defines the sequence length of BERT inputs.

    Returns:
      A Keras Model that can be called on a list or dict of string Tensors
      (with the order or names, resp., given by features) and
      returns a dict of tensors for input to BERT.
    """
    # One scalar string input per feature, named after the feature.
    text_inputs: List[tf.keras.layers.Input] = [
      tf.keras.layers.Input(shape=(), dtype=tf.string, name=ft)
      for ft in features
    ]

    # Tokenize the text to word pieces.
    preprocessor = hub.load(LanguageDetection.tensorflow_preprocess_handle)
    tokenize = hub.KerasLayer(preprocessor.tokenize)
    # tokenize() returns an int32 RaggedTensor of shape [batch_size, (words), (tokens_per_word)].
    tokenized_inputs = [tokenize(s) for s in text_inputs]

    # Pack inputs. The details (start/end token ids, dict of output tensors)
    # are model-dependent, so this gets loaded from the SavedModel.
    bert_pack_inputs = hub.KerasLayer(
      preprocessor.bert_pack_inputs,
      arguments=dict(seq_length=seq_length),
      name='bert_pack_inputs'
    )
    model_inputs = bert_pack_inputs(tokenized_inputs)
    return tf.keras.Model(text_inputs, model_inputs)

  def __make_classifier_model(self):
    """Returns a new `Classifier` with one output unit per entry of `classes`."""
    return LanguageDetection.Classifier(len(LanguageDetection.classes))

  def fit(self, training_data: tf.data.Dataset, validation_data: tf.data.Dataset):
    """
    Train the classifier. NOT IMPLEMENTED YET.

    The method currently only sets up hyperparameters, the metric and the
    loss, enters the strategy scope and returns None; neither dataset is
    consumed and no training happens.

    Args:
      training_data: dataset intended for training (unused for now).
      validation_data: dataset intended for validation (unused for now).
    """
    epochs = 3
    batch_size = 32
    init_lr = 2e-5

    ### TODO: Change the metric, or keep only the "English" and "Russian" classes.
    ### The MatthewsCorrelationCoefficient metric works best with 2 classes.
    metrics = [tfa.metrics.MatthewsCorrelationCoefficient(num_classes=len(LanguageDetection.classes))]
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    with self.strategy.scope():
      # TODO: build the optimizer, compile the model and run training here.
      pass


In [33]:
# Smoke test: construct the detector, which selects a distribution strategy
# and loads the preprocessing and encoder models from their TF Hub handles.
LanguageDetection()

<__main__.LanguageDetection at 0x7f363c7c8ed0>