In [1]:
from transformers import TFBertForSequenceClassification, BertTokenizer, AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import glob
from tqdm.notebook import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import IPython

import sys
sys.path.append("..")
from data_preparation.data_preparation_sentiment import Example, convert_examples_to_tf_dataset, make_batches
import utils.utils as utils

### Language selection

In [2]:
# Lookup tables between language codes and language names
code_dicts = utils.make_lang_code_dicts()
code_to_name, name_to_code = code_dicts["code_to_name"], code_dicts["name_to_code"]

# Excel workbook that collects the evaluation results
results_path = "../results/results_sentiment.xlsx"

# Testing language, stored as its language code from here on
target_lang = name_to_code["Slovak"]

### Model setup

In [3]:
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# The setting is applied to every visible GPU (TensorFlow requires memory growth
# to be configured identically across GPUs) and is skipped on CPU-only machines,
# where indexing the empty device list would raise an IndexError.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in gpu_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [4]:
# Checkpoint name and input settings
model_name = "bert-base-multilingual-cased"
max_length = 512  # maximum sequence length handed to the dataset conversion
batch_size = 64   # batch size used when batching and predicting

# Tokenizer and classifier built from the pretrained checkpoint.
# Fine-tuned weights are loaded later, inside the evaluation loop.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier', 'dropout_37']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Data preparation

In [5]:
# Load the test split of the target language (CSV without a header row)
test = pd.read_csv("../data/sentiment/" + target_lang + "/test.csv", header=None)
test.columns = ["sentiment", "review"]

# Remove examples whose tokenized length exceeds the model input size.
# The threshold is `max_length` (not a separate literal) so that the filter
# always agrees with the length passed to the dataset conversion below.
lengths = test["review"].apply(lambda review: len(tokenizer.encode(review)))
test = test[lengths <= max_length].reset_index(drop=True)

# Convert to TF dataset
examples = [Example(text=text, category_index=label) for label, text in test.values]
test_dataset = convert_examples_to_tf_dataset(examples, tokenizer, max_length=max_length)
test_dataset, test_batches = make_batches(test_dataset, batch_size, repetitions=1, shuffle=False)

### Evaluation

In [6]:
import logging

# Only let errors through from the tokenizer's base module logger
tokenizer_logger = logging.getLogger("transformers.tokenization_utils_base")
tokenizer_logger.setLevel(logging.ERROR)

In [7]:
# One entry per training language: (lang_code, accuracy, precision, recall, f1)
sentiment_eval = []

# Loop invariants: gold labels and the number of prediction batches.
# Keras expects an integer step count, so the ceiling is cast explicitly.
true_labels = test["sentiment"].values
prediction_steps = int(np.ceil(test.shape[0] / batch_size))

# Sorted so that the order of training languages (and thus of the table
# columns built afterwards) does not depend on the file system's listing order.
weights_filenames = sorted(glob.glob("E:/TFM_CCIL/checkpoints/*/*sentiment.hdf5"))

for weights_filename in tqdm(weights_filenames):
    # Load weights for training language. The language code is the name of the
    # directory holding the checkpoint; os.path handles whichever separator
    # the platform uses instead of assuming backslashes.
    lang = os.path.basename(os.path.dirname(weights_filename))
    model.load_weights(weights_filename)

    # Predict
    preds = model.predict(test_dataset, steps=prediction_steps, verbose=1)
    # preds[0] is presumably the logits array (depends on the transformers
    # version in use); argmax over the last axis gives the predicted class.
    clean_preds = preds[0].argmax(axis=-1)

    # Metrics (macro-averaged over classes, except accuracy)
    accuracy = accuracy_score(true_labels, clean_preds)
    precision = precision_score(true_labels, clean_preds, average="macro")
    recall = recall_score(true_labels, clean_preds, average="macro")
    f1 = f1_score(true_labels, clean_preds, average="macro")
    sentiment_eval.append((lang, accuracy, precision, recall, f1))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




Build the results table for this testing language (one column per training language)

In [8]:
# Rows of sentiment_eval are (lang_code, accuracy, precision, recall, f1)
sentiment_eval = np.array(sentiment_eval, dtype=object)

# First column names the metrics; every further column is one training
# language, labelled with its full name and holding that language's scores.
table_columns = {"Metric": ["Accuracy", "Macro_Precision", "Macro_Recall", "Macro_F1"]}
for eval_row in sentiment_eval:
    table_columns[code_to_name[eval_row[0]]] = eval_row[1:]

table = pd.DataFrame(table_columns)
table

Unnamed: 0,Metric,Bulgarian,English,Basque,Finnish,Hebrew,Croatian,Slovak,Thai,Vietnamese,Chinese
0,Accuracy,0.835526,0.718045,0.844925,0.900376,0.821429,0.87218,0.953947,0.347744,0.800752,0.737782
1,Macro_Precision,0.558827,0.63208,0.60013,0.796236,0.653602,0.704619,0.888882,0.558079,0.633757,0.504236
2,Macro_Recall,0.538757,0.786734,0.570363,0.67088,0.734134,0.707297,0.901561,0.598553,0.715782,0.505989
3,Macro_F1,0.54351,0.623114,0.580071,0.710106,0.676623,0.705944,0.895075,0.338467,0.653218,0.50092


Update results file

In [9]:
# Read every sheet of the results workbook; with sheet_name=None pandas returns
# a dict that maps each sheet name to its DataFrame.
results = pd.read_excel(results_path, sheet_name=None)

In [10]:
def _float_columns(frame):
    """Return the sorted names of the columns of `frame` whose values are all floats."""
    return sorted(
        col_name for col_name in frame.columns
        if frame[col_name].apply(lambda value: isinstance(value, (np.floating, float))).all()
    )


# Training languages present in the freshly computed table (same for every sheet)
current_langs = _float_columns(table)

# Phase 1: validate every sheet and prepare its update BEFORE opening the
# workbook for writing. Opening the writer replaces the file, so a failure
# inside the `with` block would leave the results file truncated or partial.
sheet_updates = {}
for sheet_name, df in results.items():
    # Make sure the same training languages are present
    file_langs = _float_columns(df)
    assert current_langs == file_langs, "Language mismatch between table and results file"

    # Row of the testing language in this sheet (IndexError if it is missing)
    target_row = df.index[df["Language"] == code_to_name[target_lang]][0]

    # New values for that row: the table row whose metric matches the sheet name
    metric_values = table.loc[table["Metric"] == sheet_name, current_langs].to_dict("list")
    sheet_updates[sheet_name] = pd.DataFrame(metric_values, index=[target_row])

# Phase 2: apply the updates and write all sheets back to the workbook
with pd.ExcelWriter(results_path) as writer:
    for sheet_name, df in results.items():
        # Update values in testing language row
        df.update(sheet_updates[sheet_name])
        df.to_excel(writer, index=False, sheet_name=sheet_name)