In [1]:
from transformers import TFBertForSequenceClassification, BertTokenizer, AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import glob
from tqdm.notebook import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import IPython

import sys
sys.path.append("..")
from data_preparation.data_preparation_sentiment import Example, convert_examples_to_tf_dataset, make_batches
import utils.utils as utils

### Training language setup

In [2]:
code_dicts = utils.make_lang_code_dicts()
code_to_name = code_dicts["code_to_name"]
name_to_code = code_dicts["name_to_code"]

results_path = "../results/results_sentiment.xlsx"


def _print_lang_row(label, langs):
    """Print `label` followed by the given language names, laid out in columns."""
    print(IPython.utils.text.columnize([label] + langs, displaywidth=150))


# Look for languages that have sentiment weights but are not in the results file.
# Each row of the table file keeps the language name in its second "&"-separated field.
# The context manager closes the file handle as soon as the rows are read.
with open("../data_exploration/sentiment_table.txt", "r") as file:
    all_langs = [line.split("&")[1].strip() for line in file]
all_langs = [lang for lang in all_langs if lang not in ["Turkish", "Japanese", "Russian"]]

# The language code is the name of the directory that holds each checkpoint file.
# os.path extracts it using the platform's own separator rules instead of assuming
# that glob returned backslash-separated paths.
trained_langs = [code_to_name[os.path.basename(os.path.dirname(x))]
                 for x in glob.glob("E:/TFM_CCIL/checkpoints/*/*sentiment.hdf5")]

if os.path.isfile(results_path):
    # One sheet per metric; a language counts as evaluated once it has a column in "Accuracy"
    results = pd.read_excel(results_path, sheet_name=None)
    remaining_langs = [lang for lang in trained_langs if lang not in results["Accuracy"].columns]
else:
    remaining_langs = trained_langs

untrained_langs = [lang for lang in all_langs if lang not in trained_langs]
evaluated_langs = [lang for lang in trained_langs if lang not in remaining_langs]

if remaining_langs:
    training_lang = remaining_langs[0]
    print("Evaluating with:   ", training_lang, "\n")
    # From here on `training_lang` holds the language code, not the full name
    training_lang = name_to_code[training_lang]
    _print_lang_row("Already evaluated:", evaluated_langs)
    _print_lang_row("Not yet evaluated:", remaining_langs[1:])
    _print_lang_row("Still to train:   ", untrained_langs)
else:
    # Reset explicitly so later cells cannot silently reuse a value left over
    # from a previous run of this cell in the same kernel
    training_lang = None
    print("No languages remaining")
    _print_lang_row("Already evaluated:", evaluated_langs)
    _print_lang_row("Still to train:   ", untrained_langs)

Evaluating with:    Korean 

Already evaluated:  Arabic  Bulgarian  English  Basque  Finnish  Hebrew  Croatian  Slovak  Thai  Vietnamese  Chinese

Not yet evaluated:

Still to train:   



### Model setup

In [3]:
# Turn on memory growth for every visible GPU. Looping (instead of indexing the
# first device) keeps the setting identical across GPUs and lets the cell run
# without error on a machine where no GPU is detected.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in gpu_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [4]:
# Pretrained model name, maximum sequence length and batch size
model_name = "bert-base-multilingual-cased"
max_length = 512
batch_size = 64

# Instantiate the classifier from the pretrained checkpoint, then load the
# sentiment weights stored under the current training language's folder
model = TFBertForSequenceClassification.from_pretrained(model_name)
weights_path = "".join(["E:/TFM_CCIL/checkpoints/", training_lang, "/"])
weights_filename = "".join([model_name, "_sentiment.hdf5"])
weights_file = weights_path + weights_filename
model.load_weights(weights_file)
print("Using weights from", weights_file)
tokenizer = BertTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier', 'dropout_37']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using weights from E:/TFM_CCIL/checkpoints/ko/bert-base-multilingual-cased_sentiment.hdf5


### Evaluation

In [5]:
import logging

# Only report errors from the transformers tokenization logger
tokenization_logger = logging.getLogger("transformers.tokenization_utils_base")
tokenization_logger.setLevel(logging.ERROR)

In [6]:
path = "../data/sentiment/"
sentiment_eval = []

for lang in tqdm(os.listdir(path)):
    # Skip the excluded languages and any stray entry that is not a language folder
    if lang in ["tr", "ja", "ru"] or not os.path.isdir(path + lang):
        continue

    # Load and preprocess
    test = pd.read_csv(path + lang + "/test.csv", header=None)
    test.columns = ["sentiment", "review"]
    lengths = test["review"].apply(lambda x: len(tokenizer.encode(x)))
    # Remove examples whose encoding is longer than max_length (same limit used below)
    test = test[lengths <= max_length].reset_index(drop=True)

    # Convert to TF dataset
    examples = [Example(text=text, category_index=label) for label, text in test.values]
    test_dataset = convert_examples_to_tf_dataset(examples, tokenizer, max_length=max_length)
    test_dataset, test_batches = make_batches(test_dataset, batch_size, repetitions=1, shuffle=False)

    # Predict; the number of steps must be an integer, np.ceil alone returns a float
    n_steps = int(np.ceil(test.shape[0] / batch_size))
    preds = model.predict(test_dataset, steps=n_steps, verbose=1)
    # presumably preds[0] holds the per-class scores — TODO confirm against the model output
    clean_preds = preds[0].argmax(axis=-1)

    # Metrics. zero_division=0 is passed to all three macro metrics so that classes
    # without predictions/samples score 0 without emitting an undefined-metric warning.
    y_true = test["sentiment"].values
    accuracy = accuracy_score(y_true, clean_preds)
    precision = precision_score(y_true, clean_preds, average="macro", zero_division=0)
    recall = recall_score(y_true, clean_preds, average="macro", zero_division=0)
    f1 = f1_score(y_true, clean_preds, average="macro", zero_division=0)
    sentiment_eval.append((lang, accuracy, precision, recall, f1))

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))



  _warn_prf(average, modifier, msg_start, len(result))





Build the table for this training language

In [7]:
# Column names, in the same order as the fields of each tuple in sentiment_eval
table_columns = ["Language", "Accuracy", "Macro_Precision", "Macro_Recall", "Macro_F1"]

# Turn the list of result tuples into a 2-D object array and slice one column per field
sentiment_eval = np.array(sentiment_eval, dtype=object)
table = pd.DataFrame({column: sentiment_eval[:, position]
                      for position, column in enumerate(table_columns)})

# Replace the language codes with their full names
table["Language"] = table["Language"].apply(code_to_name.__getitem__)

Reorder the rows so that languages of the same type are grouped together

In [8]:
# Read the reference language order from the table file (the language name is the
# second "&"-separated field of each row); the context manager closes the handle.
with open("../data_exploration/sentiment_table.txt", "r") as file:
    lang_order = [line.split("&")[1].strip() for line in file]
lang_order = [lang for lang in lang_order if lang not in ["Turkish", "Japanese", "Russian"]]

# Sort the rows by each language's position in the reference order, using a
# temporary helper column that is dropped afterwards
table["sort"] = table["Language"].apply(lambda x: lang_order.index(x))
table = table.sort_values(by=["sort"]).drop("sort", axis=1).reset_index(drop=True)

In [9]:
# Display the reordered metrics table for the current training language
table

Unnamed: 0,Language,Accuracy,Macro_Precision,Macro_Recall,Macro_F1
0,Bulgarian,0.552301,0.541514,0.564149,0.502776
1,English,0.667765,0.695968,0.668076,0.655618
2,Slovak,0.724624,0.624845,0.764238,0.620669
3,Croatian,0.526316,0.584019,0.617898,0.509891
4,Chinese,0.59124,0.635051,0.628417,0.590373
5,Vietnamese,0.590643,0.600901,0.594133,0.584966
6,Thai,0.513699,0.540079,0.53873,0.513039
7,Finnish,0.642857,0.630442,0.673469,0.613521
8,Basque,0.625551,0.562279,0.615104,0.535385
9,Korean,0.75054,0.745549,0.750737,0.746769


Update the results Excel file

In [10]:
results_path = "../results/results_sentiment.xlsx"

if os.path.isfile(results_path):
    # Load every sheet into a dict of DataFrames keyed by sheet name
    results = pd.read_excel(results_path, sheet_name=None)
else:
    # Start one sheet per metric column, each holding only the "Language" column.
    # A comprehension builds a separate DataFrame per sheet; dict.fromkeys would
    # bind the same DataFrame object to every key, so the sheets would share columns.
    results = {metric: pd.DataFrame({"Language": table["Language"].values})
               for metric in table.columns[1:].values}

In [11]:
full_training_lang = code_to_name[training_lang]

# Index the metrics by language name so values are matched to the rows of each
# sheet by language rather than by row position
metrics_by_language = table.set_index("Language")

# Update every sheet before opening the writer: if a lookup fails, the error is
# raised before the existing results file is touched
for sheet_name, df in results.items():
    # Add the column for this training language to the sheet of the corresponding metric
    df[full_training_lang] = df["Language"].map(metrics_by_language[sheet_name])

# Make sure the output folder exists, then write all sheets back
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with pd.ExcelWriter(results_path) as writer:
    for sheet_name, df in results.items():
        df.to_excel(writer, index=False, sheet_name=sheet_name)