In [1]:
from transformers import TFBertForSequenceClassification, BertTokenizer
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow.keras.backend as K
import numpy as np
import pandas as pd
import os
import glob
import IPython
from sklearn.metrics import classification_report, f1_score
from tqdm.notebook import tqdm

import sys
sys.path.append("..")
from data_preparation.data_preparation_sentiment import Example, convert_examples_to_tf_dataset, make_batches
import utils.utils as utils

### Training language setup

In [2]:
# Lookup tables between language names and language codes, built by the project helper.
code_dicts = utils.make_lang_code_dicts()
code_to_name = code_dicts["code_to_name"]
name_to_code = code_dicts["name_to_code"]

# The second "&"-separated field of every table row is used as the language name.
# The context manager closes the handle, which a bare open() would leave open.
with open("../data_exploration/sentiment_table.txt", "r") as file:
    all_langs = [line.split("&")[1].strip() for line in file]

# A language counts as trained when its checkpoint folder contains a file matching
# "*sentiment.hdf5". The folder name (the language code) is extracted with os.path so
# that it works with either path separator, not only with backslashes.
trained_langs = [code_to_name[os.path.basename(os.path.dirname(x))]
                 for x in glob.glob("E:/TFM_CCIL/checkpoints/*/*sentiment.hdf5")]
# Turkish, Japanese and Russian are always excluded from the remaining list.
remaining_langs = [lang for lang in all_langs if lang not in (trained_langs + ["Turkish", "Japanese", "Russian"])]

if remaining_langs:
    # Train the first language that is still pending; report it by name, then keep its code.
    training_lang = remaining_langs[0]
    print("{:<20}".format("Training language:"), training_lang, "\n")
    training_lang = name_to_code[training_lang]
    print(IPython.utils.text.columnize(["Already trained:   "] + trained_langs, displaywidth=150))
    print(IPython.utils.text.columnize(["Not yet trained:   "] + remaining_langs[1:], displaywidth=150))
else:
    print("No languages remaining")
    # NOTE: if the answer is anything other than "y", this cell does not assign training_lang.
    if input("Retrain language? ") == "y":
        training_lang = input("Language to re-train: ")
        # Raises KeyError when the typed name is not a known language name.
        training_lang = name_to_code[training_lang]

No languages remaining
Retrain language? y
Language to re-train: Hebrew


### Model setup

In [3]:
# Enable on-demand GPU memory growth. Iterating over the device list (instead of
# indexing [0]) avoids an IndexError on machines without a GPU and applies the same
# setting to every visible GPU.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in gpu_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [4]:
# --- Hyperparameters ---
model_name = "bert-base-multilingual-cased"
max_length = 512
batch_size = 4
learning_rate = 2e-5
epochs = 20
use_class_weights = True

# --- Pretrained classifier and matching tokenizer ---
model = TFBertForSequenceClassification.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# --- Per-language folder where the best weights are stored ---
checkpoint_dir = "E:/TFM_CCIL/checkpoints/" + training_lang + "/"
os.makedirs(checkpoint_dir, exist_ok=True)

# --- Compilation: Adam on a sparse categorical cross-entropy computed from logits ---
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(loss=loss, optimizer=optimizer)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier', 'dropout_37']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Data preparation

In [5]:
import logging

# Raise the threshold of the tokenizer base module's logger so that only ERROR and
# above are emitted (presumably to hide warnings about over-long inputs - TODO confirm).
tokenizer_logger = logging.getLogger("transformers.tokenization_utils_base")
tokenizer_logger.setLevel(logging.ERROR)

In [33]:
# Builds, for each split, a (tf dataset, number of batches, filtered DataFrame) triple.
# "train_eval" is the training file loaded a second time, batched like "dev"
# (batch size 64, one pass, no shuffling) and never balanced.
datasets = {}
path = "../data/sentiment/"
dataset_names = ["train", "dev", "train_eval"]
pbar = tqdm(total=4*len(dataset_names))  # four progress ticks per split
tqdm.pandas()

for dataset_name in dataset_names:
    # Load and preprocess ("train_eval" reads the same CSV as "train")
    source_name = "train" if dataset_name == "train_eval" else dataset_name
    df = pd.read_csv(path + training_lang + "/" + source_name + ".csv", header=None)
    df.columns = ["sentiment", "review"]
    df["sentiment"] = pd.to_numeric(df["sentiment"]) # Sometimes label gets read as string
    lengths = df["review"].progress_apply(lambda x: len(tokenizer.encode(x)))
    df = df[lengths <= max_length].reset_index(drop=True) # Remove long examples
    pbar.update(1)

    # Calculate class weights or balance dataset (training split only)
    if dataset_name == "train":
        if use_class_weights:
            # Each class is weighted by the proportion of the *other* class,
            # so the rarer class gets the larger weight.
            positive_prop = df["sentiment"].mean()
            class_weights = {0: positive_prop, 1: 1 - positive_prop}
        else:
            class_weights = None
            positive_examples = df["sentiment"].sum()
            # int() guards against a float label column, which np.random.choice rejects as a size
            n = int(min(positive_examples, df.shape[0] - positive_examples))

            if training_lang == "ar":
                # Testing whether a smaller dataset will work better
                # (capped so it can never exceed the size of the smaller class)
                n = min(n, 5000)

            # Undersample both classes to n rows each. replace=False is essential:
            # the default samples with replacement, which duplicates some rows and
            # silently drops others.
            ones_idx = np.random.choice(np.where(df["sentiment"])[0], size=n, replace=False)
            zeros_idx = np.random.choice(np.where(df["sentiment"] == 0)[0], size=n, replace=False)
            df = df.loc[list(ones_idx) + list(zeros_idx)].reset_index(drop=True)
    pbar.update(1)

    # Convert to TF dataset
    examples = [Example(text=text, category_index=label) for label, text in df.values]
    dataset = convert_examples_to_tf_dataset(examples, tokenizer, max_length=max_length)
    pbar.update(1)
    if dataset_name == "train":
        dataset, batches = make_batches(dataset, batch_size, repetitions=epochs, shuffle=True)
    else:
        dataset, batches = make_batches(dataset, 64, repetitions=1, shuffle=False)

    datasets[dataset_name] = (dataset, batches, df)
    pbar.update(1)

pbar.close()  # release the progress-bar widget once every split is processed

train_dataset, train_batches, train_df = datasets["train"]
dev_dataset, dev_batches, dev_df = datasets["dev"]
train_eval_dataset, train_eval_batches, train_eval_df = datasets["train_eval"]
print(class_weights)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=8702.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1240.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8702.0), HTML(value='')))


{0: 0.6863016863016863, 1: 0.3136983136983137}


### Training

In [None]:
# Trains one epoch at a time, scores macro-F1 on the train and dev splits after each
# epoch, and keeps the weights (plus a classification report) of the best dev epoch.
best_dev_f1 = 0
checkpoint_path = checkpoint_dir + model_name + "_sentiment_checkpoint.hdf5"

for epoch in tqdm(range(epochs)):
    print("Epoch", epoch, "\n")
    # fit() is called once per epoch so that F1 can be computed between epochs
    hist = model.fit(train_dataset, epochs=1, steps_per_epoch=train_batches, class_weight=class_weights)

    # First element of the predict() output holds the per-class scores
    train_scores = model.predict(train_eval_dataset, steps=train_eval_batches, verbose=1)[0]
    train_preds = train_scores.argmax(axis=-1)
    dev_scores = model.predict(dev_dataset, steps=dev_batches, verbose=1)[0]
    dev_preds = dev_scores.argmax(axis=-1)

    train_f1 = f1_score(train_eval_df["sentiment"].values, train_preds, average="macro")
    dev_f1 = f1_score(dev_df["sentiment"].values, dev_preds, average="macro")
    print("\nTrain F1:", train_f1)
    print("Dev F1:", dev_f1)

    # No improvement on dev: report it and move on to the next epoch
    if not dev_f1 > best_dev_f1:
        print("\nDev F1 did not improve from", best_dev_f1, "\n", "-"*50, "\n")
        continue

    # New best dev score: persist the weights and the matching report
    model.save_weights(checkpoint_path)
    print("\nDev F1 improved from", best_dev_f1, "to", dev_f1,
          ", saving to " + checkpoint_path + "\n")
    report = classification_report(dev_df["sentiment"].values, dev_preds, output_dict=True)
    pd.DataFrame(report).transpose().to_excel(checkpoint_dir + "checkpoint_report.xlsx")
    best_dev_f1 = dev_f1

In [8]:
# Asks for confirmation before promoting the best checkpoint and its report
# to their final file names (replacing any existing final files).
confirm = input("Confirm weight file overwrite: ")

if confirm != "y":
    print("Aborting")
else:
    print("Overwriting")
    renames = (
        (model_name + "_sentiment_checkpoint.hdf5", model_name + "_sentiment.hdf5"),
        ("checkpoint_report.xlsx", "last_report.xlsx"),
    )
    for src_name, dst_name in renames:
        os.replace(checkpoint_dir + src_name, checkpoint_dir + dst_name)

Confirm weight file overwrite: y
Overwriting
