In [None]:
### INSTALL DEPS QUIETLY
!pip install -U -q tfds-nightly tf-models-official==2.7.0 "tensorflow-text==2.8.*"

In [None]:
import os
import tensorflow as tf
import tensorflow_text as text  # A dependency of the preprocessing model
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_addons as tfa
from official.nlp import optimization
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple
from typing_extensions import Literal, ClassVar

# Raise the TensorFlow logger threshold to ERROR to keep notebook output short.
tf.get_logger().setLevel('ERROR')

In [None]:
### LOAD TF HUB MODELS IN UNCOMPRESSED FORMAT
# NOTE(review): per the TF Hub caching docs this presumably makes hub.load /
# hub.KerasLayer read the model directly from remote storage instead of
# downloading a compressed archive first - confirm for the installed version.
os.environ.update({"TFHUB_MODEL_LOAD_FORMAT": "UNCOMPRESSED"})

In [None]:
from enum import Enum

class Split(Enum):
  """Dataset partitions; used as the keys of the per-split dataframe dict."""

  TRAIN = "train"
  VALIDATION = "validation"
  TEST = "test"

class Languages(Enum):
  """Language codes paired with the integer class id used as the label."""

  # Definition order is kept as-is because it determines iteration order.
  ru = 1
  en = 0

In [None]:
### UTILS
def convert_dataframe_column_to_tensor(seq: pd.Series, dtype, name:str) -> tf.Tensor:
  """Turn one dataframe column into a tensor.

  Args:
    seq: The column to convert.
    dtype: TensorFlow dtype requested for the resulting tensor.
    name: Name forwarded to `tf.convert_to_tensor`.

  Returns:
    The result of `tf.convert_to_tensor` for `seq`.
  """
  tensor = tf.convert_to_tensor(value=seq, dtype=dtype, name=name)
  return tensor


In [None]:
class LanguageDetection:
  """Fine-tune, export and query a BERT-based en/ru language classifier.

  Typical flow: construct an instance (selects a distribution strategy and
  builds the text preprocessing model), call `fit` with per-split dataframes,
  call `export` to write a SavedModel to `model_path`, then use the
  classmethod `evaluate` to classify raw sentences with the exported model.
  """
  # TF Hub handles of the text preprocessing model and of the BERT encoder.
  tensorflow_preprocess_handle: ClassVar[str] = 'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3'
  tensorflow_model: ClassVar[str] = 'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3'
  AUTOTUNE: ClassVar[int] = tf.data.AUTOTUNE
  batch_size: ClassVar[int] = 32
  classes: ClassVar[List[str]] = ["en", "ru"]
  # Directory the SavedModel is written to by `export` and read from by `evaluate`.
  model_path: ClassVar[str] = os.path.join(".", "language-detection")
  # Dataframe columns holding the input text and the integer label.
  features: ClassVar[List[str]] = ["Text"]
  label_name: ClassVar[str] = "Language"

  class Classifier(tf.keras.Model):
    """BERT encoder followed by dropout and a dense layer emitting one logit per class."""

    def __init__(self, num_classes: int):
      super().__init__(name="language_classifier")
      self.encoder = hub.KerasLayer(LanguageDetection.tensorflow_model, trainable=True)
      self.dropout = tf.keras.layers.Dropout(0.1)
      self.dense = tf.keras.layers.Dense(num_classes)

    def call(self, preprocessed_text):
      """Return the dense-layer output (logits) for already preprocessed text."""
      encoder_outputs = self.encoder(preprocessed_text)
      pooled_output = encoder_outputs["pooled_output"]
      x = self.dropout(pooled_output)
      return self.dense(x)

  def __init__(self, seq_length=128):
    """Pick a distribution strategy and build the preprocessing model.

    A TPU strategy is used when the COLAB_TPU_ADDR environment variable is
    set, a mirrored strategy when a GPU is visible, and a single-CPU strategy
    otherwise.

    Args:
      seq_length: Sequence length passed to the BERT input packer.
    """
    if os.environ.get('COLAB_TPU_ADDR', None):
      cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
      tf.config.experimental_connect_to_cluster(cluster_resolver)
      tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
      self.strategy = tf.distribute.TPUStrategy(cluster_resolver)
    elif tf.config.list_physical_devices('GPU'):
      self.strategy = tf.distribute.MirroredStrategy()
    else:
      self.strategy = tf.distribute.OneDeviceStrategy("/device:CPU:0")

    self.__preprocess_model = self.__make_preprocess_model(self.features, seq_length=seq_length)
    # Set by `fit`; `export` checks it to report a clear error when training was skipped.
    self.__model = None

  def load_dataset(self, dataframes: Dict[Split, pd.DataFrame], split: Split) -> Tuple[tf.data.Dataset, int]:
    """Build the batched, preprocessed `tf.data` pipeline for one split.

    Args:
      dataframes: Mapping from split to a dataframe containing the columns
        named in `features` and `label_name`.
      split: Which entry of `dataframes` to load.

    Returns:
      A `(dataset, row_count)` tuple. The dataset yields
      `(preprocessed_inputs, labels)` batches; for `Split.TRAIN` it is
      shuffled and repeated indefinitely.
    """
    df = dataframes[split]
    data_count = len(df)

    tensor_slice: Dict[str, tf.Tensor] = {
        self.label_name: convert_dataframe_column_to_tensor(df[self.label_name], dtype=tf.int32, name=f"{split}-{self.label_name}")
    }
    for feature in self.features:
      tensor_slice[feature] = convert_dataframe_column_to_tensor(df[feature], dtype=tf.string, name=f"{split}-{feature}")

    is_training = split == Split.TRAIN
    dataset = tf.data.Dataset.from_tensor_slices(tensor_slice)
    if is_training:
      dataset = dataset.shuffle(data_count)
      dataset = dataset.repeat()
    dataset = dataset.batch(LanguageDetection.batch_size)
    dataset = dataset.map(lambda ex: (self.__preprocess_model(ex), ex[self.label_name]))
    if not is_training:
      # Only finite splits are cached: the training split is repeated forever,
      # so its cache would never be completed or read back and would only
      # accumulate batches in memory.
      dataset = dataset.cache()
    dataset = dataset.prefetch(buffer_size=LanguageDetection.AUTOTUNE)
    return dataset, data_count

  def __make_preprocess_model(self, features: List[str], seq_length = 128):
    """Build a Keras model mapping raw string features to packed BERT inputs.

    Args:
      features: Names of the string inputs; one Keras input is created per name.
      seq_length: Sequence length passed to the BERT input packer.
    """
    text_inputs: List[tf.keras.layers.Input] = [
      tf.keras.layers.Input(shape=(), dtype=tf.string, name=ft)
      for ft in features
    ]

    preprocessor = hub.load(LanguageDetection.tensorflow_preprocess_handle)
    tokenize = hub.KerasLayer(preprocessor.tokenize)
    tokenized_inputs = [tokenize(s) for s in text_inputs]

    bert_pack_inputs = hub.KerasLayer(
      preprocessor.bert_pack_inputs,
      arguments=dict(seq_length=seq_length),
      name='bert_pack_inputs'
    )
    model_inputs = bert_pack_inputs(tokenized_inputs)
    return tf.keras.Model(text_inputs, model_inputs)

  def __make_classifier_model(self):
    """Create a `Classifier` with one output per entry in `classes`."""
    return LanguageDetection.Classifier(len(LanguageDetection.classes))

  def fit(self, dataframes: Dict[Split, pd.DataFrame], epochs=3, init_lr=2e-5):
    """Fine-tune a fresh classifier on the train split, validating each epoch.

    Args:
      dataframes: Mapping that must contain `Split.TRAIN` and
        `Split.VALIDATION` dataframes.
      epochs: Number of training epochs.
      init_lr: Initial learning rate handed to the AdamW optimizer factory.
    """
    with self.strategy.scope():
      metrics = [tfa.metrics.MatthewsCorrelationCoefficient(num_classes=len(LanguageDetection.classes))]
      loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

      # Use this instance rather than a notebook-level global so that `fit`
      # works regardless of the variable name the object is bound to.
      train_dataset, train_datacount = self.load_dataset(dataframes=dataframes, split=Split.TRAIN)
      validation_dataset, validation_datacount = self.load_dataset(dataframes=dataframes, split=Split.VALIDATION)

      # Clamp to one step so a split smaller than a single batch does not
      # produce a zero-step schedule.
      steps_per_epoch = max(1, train_datacount // LanguageDetection.batch_size)
      num_train_steps = steps_per_epoch * epochs
      # One tenth of all training steps are used for learning-rate warmup.
      num_warmup_steps = num_train_steps // 10

      validation_steps = max(1, validation_datacount // LanguageDetection.batch_size)

      optimizer = optimization.create_optimizer(
          init_lr=init_lr,
          num_train_steps=num_train_steps,
          num_warmup_steps=num_warmup_steps,
          optimizer_type='adamw'
      )
      self.__model = self.__make_classifier_model()
      self.__model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
      self.__model.fit(
          x=train_dataset,
          validation_data=validation_dataset,
          steps_per_epoch=steps_per_epoch,
          epochs=epochs,
          validation_steps=validation_steps
      )

  def export(self) -> None:
    """Save preprocessing + classifier as one SavedModel under `model_path`.

    Raises:
      RuntimeError: If `fit` has not been called on this instance.
    """
    if self.__model is None:
      raise RuntimeError("No trained model to export; call fit() before export().")

    # Chain the preprocessing model into the classifier so the exported model
    # takes the raw string inputs directly.
    bert_outputs = self.__model(self.__preprocess_model(self.__preprocess_model.inputs))
    exported_model = tf.keras.Model(self.__preprocess_model.inputs, bert_outputs)

    save_options = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
    exported_model.save(LanguageDetection.model_path, include_optimizer=False,
                      options=save_options)

  @classmethod
  def evaluate(cls, sentence: List[str]) -> List[str]:
    """Classify sentences with the SavedModel stored at `model_path`.

    Args:
      sentence: Raw input strings, classified one at a time.

    Returns:
      One `Languages` member name per input, in input order.
    """
    if len(sentence) == 0:
      # Nothing to classify; avoid loading the SavedModel.
      return []

    with tf.device('/job:localhost'):
      reloaded_model = tf.saved_model.load(cls.model_path)
      test_dataset = tf.data.Dataset.from_tensor_slices({
          "Text": sentence
      })

      results: List[str] = []

      # Each record is wrapped in a list so the model receives a batch of one.
      for features in test_dataset.map(lambda rec: [[rec[ft]] for ft in cls.features]):
        if len(cls.features) == 1:
          result = reloaded_model(features[0])
        else:
          result = reloaded_model(list(features))
        classification = tf.argmax(result, axis=1)[0].numpy().item()
        results.append(Languages(classification).name)

      return results


In [None]:
### LOAD DATASET FOR LANGUAGE DETECTION
def load_language_dataframe(csv_path: str) -> pd.DataFrame:
  """Read one cleaned CSV split and normalise its columns for training.

  Every "Language" cell equal to a `Languages` member name is replaced by
  that member's integer value, after which the column is cast to int. The
  "Text" column is cast to str.

  Args:
    csv_path: Path of the CSV file to read.

  Returns:
    The loaded dataframe with the normalised "Language" and "Text" columns.
  """
  frame = pd.read_csv(csv_path)
  for language in Languages:
    frame.loc[frame["Language"] == language.name, "Language"] = language.value
  frame["Language"] = frame["Language"].astype('int')
  frame['Text'] = frame['Text'].astype("str")
  return frame


cleaned_ld_training_data = load_language_dataframe("cleaned_train.csv")
cleaned_ld_test_data = load_language_dataframe("cleaned_test.csv")
cleaned_ld_validation_data = load_language_dataframe("cleaned_validation.csv")

language_detection_dataset = {
    Split.TRAIN: cleaned_ld_training_data,
    Split.TEST: cleaned_ld_test_data,
    Split.VALIDATION: cleaned_ld_validation_data
}

In [None]:
# Constructing the object selects the distribution strategy and builds the
# BERT preprocessing model (see LanguageDetection.__init__).
language_detection_model = LanguageDetection()

In [None]:
# Fine-tune on the train split (validating on the validation split), then save
# the combined preprocessing + classifier model to LanguageDetection.model_path.
language_detection_model.fit(dataframes=language_detection_dataset)
language_detection_model.export()


In [None]:
# Classify the first five test sentences with the exported SavedModel.
test_text_column = language_detection_dataset[Split.TEST]['Text']
ld_inputs = test_text_column.iloc[:5].to_numpy().tolist()
LanguageDetection.evaluate(ld_inputs)
