In [1]:
from transformers import TFBertForSequenceClassification, BertTokenizer
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow.keras.backend as K
import numpy as np
import pandas as pd
import os
import glob
import IPython

import sys
sys.path.append("..")
from data_preparation.data_preparation_sentiment import Example, convert_examples_to_tf_dataset, make_batches
import utils.utils as utils

### Training language setup

In [32]:
# Pick the next language to train: the first one listed in the sentiment table
# that has no sentiment checkpoint on disk yet and is not in the skip list.
code_dicts = utils.make_lang_code_dicts()
code_to_name = code_dicts["code_to_name"]
name_to_code = code_dicts["name_to_code"]

# The second "&"-separated field of every table row is read as the language name.
# The context manager closes the file handle, which a bare open() never did.
with open("../data_exploration/sentiment_table.txt", "r") as file:
    all_langs = [line.split("&")[1].strip() for line in file]

# A language counts as trained when its checkpoint folder contains a sentiment
# weights file. The folder name (the language code) is extracted with os.path so
# it does not depend on glob returning backslash-separated paths.
trained_langs = [code_to_name[os.path.basename(os.path.dirname(x))]
                 for x in glob.glob("E:/TFM_CCIL/checkpoints/*/*sentiment.hdf5")]
# Turkish, Japanese and Russian are always skipped (hard-coded exclusion list).
remaining_langs = [lang for lang in all_langs if lang not in (trained_langs + ["Turkish", "Japanese", "Russian"])]

if remaining_langs:
    training_lang = remaining_langs[0]
    print("{:<20}".format("Training language:"), training_lang, "\n")
    # From here on training_lang holds the language code, not the name.
    training_lang = name_to_code[training_lang]
    print(IPython.utils.text.columnize(["Already trained:   "] + trained_langs, displaywidth=150))
    print(IPython.utils.text.columnize(["Not yet trained:   "] + remaining_langs[1:], displaywidth=150))
else:
    # NOTE(review): training_lang is not assigned on this path, so the cells
    # below that read it cannot run on a fresh kernel once every language is done.
    print("No languages remaining")

Training language:   Korean 

Already trained:     Bulgarian  English  Basque  Finnish  Hebrew  Croatian  Slovak  Thai  Vietnamese  Chinese

Not yet trained:     Arabic



### Model setup

In [3]:
# Enable on-demand GPU memory growth on every visible GPU.
# Iterating (instead of indexing gpu_devices[0]) keeps the cell from raising
# IndexError on a machine without a GPU, and applies the same setting to all
# GPUs, which TensorFlow requires when more than one is present.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in gpu_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [4]:
# Hyperparameters and names shared by the following cells
model_name = "bert-base-multilingual-cased"
max_length = 512        # examples with more tokens than this are dropped during data preparation
batch_size = 4
learning_rate = 2e-5
epochs = 20
use_class_weights = False   # False: balance the training set by subsampling instead

# Pretrained model with a sequence classification head, plus its tokenizer
model = TFBertForSequenceClassification.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Keep only the weights of the epoch with the highest validation accuracy
checkpoint_dir = "E:/TFM_CCIL/checkpoints/" + training_lang + "/"
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_path = checkpoint_dir + model_name + "_sentiment_checkpoint.hdf5"
checkpoint = ModelCheckpoint(checkpoint_path,
                             monitor="val_sparse_categorical_accuracy", mode="max",
                             save_best_only=True, save_weights_only=True,
                             verbose=1)

# Compile: the loss is computed from raw logits (from_logits=True)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy()
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertForSequenceClassification: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier', 'dropout_37']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Data preparation

In [5]:
import logging

# Only let ERROR (and above) through from the tokenizer's logger; presumably this
# hides the per-example warnings emitted while encoding long reviews - confirm.
tokenizer_logger = logging.getLogger("transformers.tokenization_utils_base")
tokenizer_logger.setLevel(logging.ERROR)

In [6]:
# Build the batched train/dev TF datasets for the selected language.
datasets = {}
path = "../data/sentiment/"
# Seeded generator so the balanced training subset is identical on every run
rng = np.random.default_rng(42)

for dataset_name in ["train", "dev"]:
    # Load and preprocess
    dataset = pd.read_csv(path + training_lang + "/" + dataset_name + ".csv", header=None)
    dataset.columns = ["sentiment", "review"]
    dataset["sentiment"] = pd.to_numeric(dataset["sentiment"]) # Sometimes label gets read as string
    lengths = dataset["review"].apply(lambda x: len(tokenizer.encode(x)))
    dataset = dataset[lengths <= max_length].reset_index(drop=True) # Remove long examples

    # Calculate class weights or balance dataset (training split only)
    if dataset_name == "train":
        if use_class_weights:
            # Each class is weighted by the proportion of the other class
            positive_prop = dataset["sentiment"].mean()
            class_weights = {0: positive_prop, 1: 1 - positive_prop}
        else:
            class_weights = None
            ones_idx = np.flatnonzero((dataset["sentiment"] != 0).values)
            zeros_idx = np.flatnonzero((dataset["sentiment"] == 0).values)
            n = min(len(ones_idx), len(zeros_idx))

            if training_lang == "ar":
                # Testing whether a smaller dataset will work better
                # (capped so it can never exceed the smaller class)
                n = min(n, 2500)

            # Sample WITHOUT replacement: the default (with replacement) fills the
            # "balanced" set with duplicated rows while leaving other rows out.
            ones_idx = rng.choice(ones_idx, size=n, replace=False)
            zeros_idx = rng.choice(zeros_idx, size=n, replace=False)
            dataset = dataset.iloc[np.concatenate([ones_idx, zeros_idx])].reset_index(drop=True)

    # Convert to TF dataset
    examples = [Example(text=text, category_index=label) for label, text in dataset.values]
    dataset = convert_examples_to_tf_dataset(examples, tokenizer, max_length=max_length)
    if dataset_name == "train":
        # Training data is repeated once per epoch and shuffled
        dataset, batches = make_batches(dataset, batch_size, repetitions=epochs, shuffle=True)
    else:
        # Dev data keeps its file order so predictions can be matched to labels later
        dataset, batches = make_batches(dataset, batch_size, repetitions=1, shuffle=False)

    datasets[dataset_name] = (dataset, batches)

train_dataset, train_batches = datasets["train"]
dev_dataset, dev_batches = datasets["dev"]
print(class_weights)

None


### Training

In [7]:
# Fine-tune the model. The step counts come from make_batches; the training
# dataset was built with repetitions=epochs, so steps_per_epoch marks the epoch
# boundaries. The checkpoint callback stores the best weights seen so far.
fit_options = dict(
    epochs=epochs,
    steps_per_epoch=train_batches,
    validation_data=dev_dataset,
    validation_steps=dev_batches,
    class_weight=class_weights,
    callbacks=[checkpoint],
)
model.fit(train_dataset, **fit_options)

Epoch 1/20
Epoch 00001: val_sparse_categorical_accuracy improved from -inf to 0.79530, saving model to E:/TFM_CCIL/checkpoints/th/bert-base-multilingual-cased_sentiment_checkpoint.hdf5
Epoch 2/20
Epoch 00002: val_sparse_categorical_accuracy improved from 0.79530 to 0.81882, saving model to E:/TFM_CCIL/checkpoints/th/bert-base-multilingual-cased_sentiment_checkpoint.hdf5
Epoch 3/20
Epoch 00003: val_sparse_categorical_accuracy did not improve from 0.81882
Epoch 4/20
Epoch 00004: val_sparse_categorical_accuracy did not improve from 0.81882
Epoch 5/20
Epoch 00005: val_sparse_categorical_accuracy did not improve from 0.81882
Epoch 6/20
Epoch 00006: val_sparse_categorical_accuracy did not improve from 0.81882
Epoch 7/20
Epoch 00007: val_sparse_categorical_accuracy improved from 0.81882 to 0.82491, saving model to E:/TFM_CCIL/checkpoints/th/bert-base-multilingual-cased_sentiment_checkpoint.hdf5
Epoch 8/20
Epoch 00008: val_sparse_categorical_accuracy did not improve from 0.82491
Epoch 9/20
Epo

KeyboardInterrupt: 

In [8]:
# Restore the weights written by the checkpoint callback for this language
best_weights_path = checkpoint_dir + model_name + "_sentiment_checkpoint.hdf5"
model.load_weights(best_weights_path)

In [9]:
# Run the restored model over the dev set; steps=dev_batches bounds the number of batches consumed
preds = model.predict(dev_dataset, verbose=1, steps=dev_batches)



In [10]:
from sklearn.metrics import classification_report

In [11]:
# Rebuild the dev labels with the same preprocessing used to create dev_dataset,
# so that row i of val_data lines up with prediction i.
val_data = pd.read_csv(path + training_lang + "/" + "dev" + ".csv", header=None)
val_data.columns = ["sentiment", "review"]
# Labels are sometimes read as strings; coerce them as the data preparation cell
# does, otherwise they would not match the integer predictions in the report.
val_data["sentiment"] = pd.to_numeric(val_data["sentiment"])
lengths = val_data["review"].apply(lambda x: len(tokenizer.encode(x)))
val_data = val_data[lengths <= max_length].reset_index(drop=True) # Remove long examples
# preds[0] is indexed as the model's first output; argmax over the last axis gives the class
print(classification_report(val_data["sentiment"].values, preds[0].argmax(axis=-1), digits=3))

              precision    recall  f1-score   support

           0      0.832     0.887     0.859       689
           1      0.812     0.732     0.770       459

    accuracy                          0.825      1148
   macro avg      0.822     0.809     0.814      1148
weighted avg      0.824     0.825     0.823      1148



In [10]:
# NOTE(review): this cell repeats the report of the previous cell, but its
# execution counter does not follow on from it and the saved numbers differ, so
# the output below presumably comes from a different run - confirm before citing.
dev_labels = val_data["sentiment"].values
dev_predictions = preds[0].argmax(axis=-1)
print(classification_report(dev_labels, dev_predictions, digits=3))

              precision    recall  f1-score   support

           0      0.857     0.914     0.885       689
           1      0.857     0.771     0.812       459

    accuracy                          0.857      1148
   macro avg      0.857     0.843     0.848      1148
weighted avg      0.857     0.857     0.856      1148

