In [None]:
import os
import sys
import boto3
import dotenv
import numpy as np
import tensorflow as tf
from datetime import date
from tensorflow import keras
import tensorflow_text as tf_text

# Load environment variables from ../.env (resolved against the working directory).
# NOTE(review): presumably this is where S3_BUCKET, read by the upload cell, is
# defined — confirm.
env_path = os.path.join("..", ".env")
dotenv.load_dotenv(env_path);

In [None]:
# Constants and utilities
PAD_LENGTH: int = 128  # Length to which sequences are padded.
BATCH_SIZE: int = 512  # Number of examples per dataset batch.
# Vocabulary size (we use ASCII characters).
# NOTE(review): the tokenizer below emits Unicode code points, so this
# presumably relies on the dataset containing no code point >= 256 — confirm.
VOCAB_SIZE: int = 256
DATASET_PATH: str = os.path.join("..", "data", "processed-text")  # Root folder of the dataset files.

# Character-level tokenizer shared by the preprocessing functions.
tok = tf_text.UnicodeCharTokenizer()

# Labels, stored in sorted order so that a label id can be used directly as an
# index into this list.
LANGUAGES = sorted([
    "English",
    "French",
    "Italian",
    "Portuguese",
    "Spanish",
    "Turkish",
])

NUM_CLASSES: int = len(LANGUAGES)  # Number of categories to classify.

# Functions to prepare input for model
def drop_newlines(strings):
    """Return `strings` with every newline character deleted.

    Args:
        strings: String tensor (or a value TensorFlow can convert to one).

    Returns:
        The output of `tf.strings.regex_replace` with each newline match
        replaced by the empty string.
    """
    newline_pattern = r"\n"
    return tf.strings.regex_replace(strings, newline_pattern, "")

def prepare_input(strings, labels):
    """Convert a batch of raw strings into fixed-length token-id tensors.

    The strings are stripped of newlines, split into Unicode code points,
    padded (or truncated) to PAD_LENGTH with 0 as the padding id, and finally
    clamped so that every id is a valid index for an embedding table with
    VOCAB_SIZE rows.

    Args:
        strings: Batch of strings (a string tensor or a list of Python strings).
        labels: Labels accompanying the batch; returned untouched so the
            function can be used directly with `Dataset.map`.

    Returns:
        A `(token_ids, labels)` tuple where `token_ids` is a dense integer
        tensor whose second dimension is PAD_LENGTH.
    """
    # Remove newline characters from strings
    strings_clean = drop_newlines(strings)

    # Tokenize into one Unicode code point per character (ragged result)
    tokens = tok.tokenize(strings_clean)

    # Pad sequences: 0 is the padding id, and the requested shape also
    # truncates rows longer than PAD_LENGTH.
    padded = tokens.to_tensor(0, shape=(tokens.shape[0], PAD_LENGTH))

    # Code points are not bounded by VOCAB_SIZE (e.g. Turkish "ı" is 305), and
    # an id >= VOCAB_SIZE is an out-of-range embedding lookup. Clamp such ids
    # onto the last vocabulary slot; ids already in range are unchanged.
    token_ids = tf.minimum(padded, VOCAB_SIZE - 1)
    return token_ids, labels

In [None]:
# Training and validation datasets
# The validation split is read from the "test" sub-directory.
train_dir = os.path.join(DATASET_PATH, "train")
valid_dir = os.path.join(DATASET_PATH, "test")

train_raw = keras.preprocessing.text_dataset_from_directory(train_dir, batch_size=BATCH_SIZE)
valid_raw = keras.preprocessing.text_dataset_from_directory(valid_dir, batch_size=BATCH_SIZE)

# Preprocess data: tokenize/pad each batch and prefetch up to 2 batches
train_dataset = train_raw.map(prepare_input).prefetch(2)
valid_dataset = valid_raw.map(prepare_input).prefetch(2)

In [None]:
def make_model(embedding_dim: int = 32, lstm_dim: int = 32) -> keras.Model:
    """
    Build the (uncompiled) character-level language classifier.

    Architecture: Embedding -> Bidirectional LSTM -> Conv1D -> global average
    pooling -> Dense(16, LeakyReLU) -> softmax over NUM_CLASSES.

    Args:
        embedding_dim: Size of each character embedding vector.
        lstm_dim: Units per LSTM direction; the Conv1D layer uses
            2 * lstm_dim filters.

    Returns:
        The Keras functional model. As side effects, the global Keras session
        is cleared and the model summary is printed.
    """
    keras.backend.clear_session()

    # Integer token ids, one row of PAD_LENGTH ids per example
    token_ids = keras.layers.Input(shape=(PAD_LENGTH,), dtype=tf.int32)

    # Layers listed in the order they are applied.
    # NOTE(review): a layer instance passed as `activation` has been reported to
    # cause problems when reloading HDF5 models in some Keras versions —
    # confirm the saved model reloads.
    pipeline = [
        # Embed tokens; id 0 is treated as padding (mask_zero)
        keras.layers.Embedding(
            input_dim=VOCAB_SIZE,
            output_dim=embedding_dim,
            mask_zero=True,
            input_length=PAD_LENGTH,
        ),
        # Recurrent / convolutional feature extractors
        keras.layers.Bidirectional(
            keras.layers.LSTM(lstm_dim, activation="tanh", return_sequences=True)
        ),
        keras.layers.Conv1D(2 * lstm_dim, 3, padding="same", activation="tanh"),
        # Pooling and classification head
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(16, activation=keras.layers.LeakyReLU()),
        keras.layers.Dense(NUM_CLASSES, activation="softmax"),
    ]

    features = token_ids
    for layer in pipeline:
        features = layer(features)

    # Wrap into a model and show its structure
    classifier = keras.Model(inputs=token_ids, outputs=features)
    classifier.summary()
    return classifier

In [None]:
# Build the network with 16 LSTM units per direction; embedding size keeps its default.
LSTM_UNITS = 16
model = make_model(lstm_dim=LSTM_UNITS)

In [None]:
# FIT Params
EPOCHS: int = 1   # Value passed as `epochs` to model.fit
VERBOSE: int = 1  # Value passed as `verbose` to model.fit

In [None]:
# Compile and fit
# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
# `metrics` is given as a list: Keras documents it as a list of metrics, and
# newer Keras releases reject a bare string.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.02),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)


# Train, evaluating on the validation dataset after every epoch.
history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=valid_dataset,
    verbose=VERBOSE
)

In [None]:
# Benchmark
# Quick sanity check of the trained model on a few hand-written sentences.
test_sentences = [
    "Mary had a little lamb, its fleece was white as snow",
    "nel mezzo del cammin di nostra vita mi ritrovai per una selva oscura",
    "No tengo penas ni tengo amores y asi no sufro de sinsabores"
]

# No labels are needed for inference, so None is passed through and discarded.
prepared, _ = prepare_input(test_sentences, None)
pred = model.predict(prepared)

# Per sentence: index of the highest-scoring class and that class's score.
langs = pred.argmax(axis=1).tolist()
conf = pred.max(axis=1).tolist()

separator = "=" * 50
for text, class_id, score in zip(test_sentences, langs, conf):
    print("Sentence:")
    print(text)
    print()

    print("Predicted Language:")
    print(LANGUAGES[class_id])
    print(f"Confidence: {100 * score:.2f}")
    print(separator)

In [None]:
# Save model and upload to S3
model_dir = os.path.join("..", "data", "models")
# exist_ok avoids the separate "check, then create" step.
os.makedirs(model_dir, exist_ok=True)

# Always keep a local copy, even if the upload below cannot proceed.
model_file = os.path.join(model_dir, "model.h5")
model.save(model_file)

# Fail with a clear message instead of handing None to boto3 as the bucket name.
s3_bucket = os.getenv("S3_BUCKET")
if not s3_bucket:
    raise RuntimeError(
        f"S3_BUCKET is not set; model saved locally at {model_file} but not uploaded."
    )

# The object key carries today's date, e.g. models/model-2024-01-31.h5
s3 = boto3.client("s3")
s3.upload_file(
    model_file,
    s3_bucket,
    f"models/model-{date.today():%Y-%m-%d}.h5"
);