In [1]:
from transformers import TFBertForSequenceClassification, BertTokenizer, AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
import pandas as pd
import numpy as np
from data_preparation_sentiment import Example, convert_examples_to_tf_dataset, make_batches
import os
from tqdm.notebook import tqdm

In [2]:
# Sequence-length limit and batch size used by the dataset-building cells below.
max_length = 512
batch_size = 128

# Tokenizer and classifier are both created from the same pretrained checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')
# Overwrite the weights of the freshly built model with the ones saved in the checkpoint file.
model.load_weights("../checkpoints/multibert_sentiment_0.883.hdf5")

# Compile with a loss and an accuracy metric only (no optimizer argument is given);
# the loss is configured to receive raw logits.
metric = tf.keras.metrics.SparseCategoricalAccuracy()
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(metrics=[metric], loss=loss)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertForSequenceClassification: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['dropout_37', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
import logging

# Raise this logger's threshold to ERROR so that lower-severity records are dropped
# (presumably the over-length-sequence warnings emitted while encoding — confirm).
tokenization_logger = logging.getLogger("transformers.tokenization_utils_base")
tokenization_logger.setLevel(logging.ERROR)

In [108]:
# Load the Croatian test split; the CSV has no header row, so name the columns here.
test = pd.read_csv("../data/sentiment/hr/test.csv", header=None)
test.columns = ["sentiment", "review"]

# Number of token ids tokenizer.encode produces for each review.
lengths = test["review"].apply(lambda x: len(tokenizer.encode(x)))
# Remove long examples. The threshold is max_length (the same value handed to
# convert_examples_to_tf_dataset below) instead of a second hard-coded 512, so the
# filter and the dataset builder cannot drift apart.
test = test[lengths <= max_length].reset_index(drop=True)

# Each row is (sentiment, review); wrap it in an Example for the dataset builder.
examples = [Example(text=text, category_index=label) for label, text in test.values]
test_dataset = convert_examples_to_tf_dataset(examples, tokenizer, max_length=max_length)
# make_batches returns the batched dataset together with a second value that the
# evaluation loop in this notebook passes to model.evaluate as `steps`.
test_dataset, test_batches = make_batches(test_dataset, batch_size, repetitions=1)

In [109]:
# Mean of the "sentiment" label column of the test split currently held in `test`.
test["sentiment"].mean()

0.7803203661327232

In [105]:
# Evaluate the compiled model on the current `test_dataset`. With the compile() call in
# this notebook (one loss, one accuracy metric) the result is [loss, accuracy].
# NOTE(review): the per-language loop passes steps=test_batches but this call does not —
# confirm both evaluate the same number of batches.
model.evaluate(test_dataset)



[1.1601811647415161, 0.6071428656578064]

In [110]:
path = "../data/sentiment/"

# Print the mean label of every language's test split.
# os.listdir returns entries in arbitrary order, so sort them to make the printed
# order reproducible across machines and filesystems.
for lang in tqdm(sorted(os.listdir(path))):
    # These three language directories are skipped (reason not stated in this notebook).
    if lang in ("tr", "ja", "ru"):
        continue
    test = pd.read_csv(os.path.join(path, lang, "test.csv"), header=None)
    test.columns = ["sentiment", "review"]
    # NOTE(review): the recorded output shows a mean of about 2.88 for "ko", so the labels
    # do not appear to be 0/1 for every language — confirm the label scheme per dataset.
    print(lang, test["sentiment"].mean())

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))

ar 0.8325724493846764
bg 0.8111050626020686
en 0.49917627677100496
eu 0.8458149779735683
fi 0.7455919395465995
he 0.040654997176736304
hr 0.7803203661327232
ko 2.8757886435331232
sl 0.9219924812030075
th 0.40784982935153585
vi 0.5138686131386861
zh 0.6045710139669871



In [4]:
path = "../data/sentiment/"
# Maps language directory name -> second element of model.evaluate's result
# (the accuracy metric, given the compile() call in this notebook).
sentiment_eval = {}

# os.listdir returns entries in arbitrary order; sorting keeps the evaluation order
# (and therefore the insertion order of sentiment_eval) reproducible.
for lang in tqdm(sorted(os.listdir(path))):
    # These three language directories are skipped (reason not stated in this notebook).
    if lang in ("tr", "ja", "ru"):
        continue

    test = pd.read_csv(os.path.join(path, lang, "test.csv"), header=None)
    test.columns = ["sentiment", "review"]

    # Remove long examples. Filtering on max_length keeps this threshold identical to
    # the one passed to convert_examples_to_tf_dataset instead of repeating a literal 512.
    lengths = test["review"].apply(lambda x: len(tokenizer.encode(x)))
    test = test[lengths <= max_length]

    # Each row is (sentiment, review); wrap it in an Example for the dataset builder.
    examples = [Example(text=text, category_index=label) for label, text in test.values]
    test_dataset = convert_examples_to_tf_dataset(examples, tokenizer, max_length=max_length)
    test_dataset, test_batches = make_batches(test_dataset, batch_size, repetitions=1)

    # Index 1 skips the loss and keeps the metric value.
    sentiment_eval[lang] = model.evaluate(test_dataset, steps=test_batches)[1]

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [5]:
# Show the per-language values stored by the evaluation loop.
sentiment_eval

{'ar': 0.6229491233825684,
 'bg': 0.7152966856956482,
 'en': 0.8638110756874084,
 'eu': 0.4361233413219452,
 'fi': 0.4719387888908386,
 'he': 0.19739818572998047,
 'hr': 0.52173912525177,
 'ko': 0.12322555482387543,
 'sl': 0.6785714030265808,
 'th': 0.6288527250289917,
 'vi': 0.6198830604553223,
 'zh': 0.5985096096992493}

In [6]:
# Order the languages from highest to lowest score, then render each score as a
# percentage string rounded to two decimals.
ranked = sorted(sentiment_eval.items(), key=lambda pair: pair[1], reverse=True)
{lang: f"{round(score * 100, 2)}%" for lang, score in ranked}

{'en': '86.38%',
 'bg': '71.53%',
 'sl': '67.86%',
 'th': '62.89%',
 'ar': '62.29%',
 'vi': '61.99%',
 'zh': '59.85%',
 'hr': '52.17%',
 'fi': '47.19%',
 'eu': '43.61%',
 'he': '19.74%',
 'ko': '12.32%'}

In [2]:
# Name of the pretrained checkpoint whose tokenizer is used for the length statistics below.
model_name = "bert-base-multilingual-cased"
# Rebinds `tokenizer`; no model is loaded in this cell, only the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
import logging

# Only records of severity ERROR or above pass this logger after the call below.
noisy_logger_name = "transformers.tokenization_utils_base"
logging.getLogger(noisy_logger_name).setLevel(logging.ERROR)

In [5]:
path = "../data/sentiment/"
# Maps language directory name -> (longest token count,
#                                  number of reviews above 512 tokens,
#                                  number of reviews above 256 tokens).
max_lengths = {}

# os.listdir returns entries in arbitrary order; sorting fixes the insertion order of
# max_lengths, which is the row order of the table printed from it.
for directory in tqdm(sorted(os.listdir(path))):
    # These three language directories are skipped (reason not stated in this notebook).
    if directory in ("tr", "ja", "ru"):
        continue

    lang_path = os.path.join(path, directory)
    test = pd.read_csv(os.path.join(lang_path, "test.csv"), header=None)
    test.columns = ["sentiment", "review"]

    # Number of token ids tokenizer.encode produces for each review.
    lengths = test["review"].apply(lambda x: len(tokenizer.encode(x)))
    max_lengths[directory] = (lengths.max(), (lengths > 512).sum(), (lengths > 256).sum())

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [9]:
# Fixed-width, left-aligned layout: two header lines, a blank line, then one row per
# language taken from max_lengths in its insertion order.
header_fmt = "{:<15}{:<20}{:<20}{:<20}"
row_fmt = "{:<15}{:<20}{:<20}{:<10}"

print(header_fmt.format("Language", "Length of", "Number of examples", "Number of examples"))
print(header_fmt.format("", "longest example", "above 512 tokens", "above 256 tokens") + "\n")
for lang, values in max_lengths.items():
    # values holds (longest length, count above 512, count above 256).
    print(row_fmt.format(lang, *values[:3]))

Language       Length of           Number of examples  Number of examples  
               longest example     above 512 tokens    above 256 tokens    

ar             5040                446                 1242      
bg             65                  0                   0         
en             77                  0                   0         
eu             79                  0                   0         
fi             657                 5                   26        
he             1183                3                   16        
hr             316                 0                   1         
ko             212                 0                   0         
sl             411                 0                   9         
th             1336                8                   48        
vi             826                 1                   3         
zh             847                 11                  106       


In [31]:
# Fill the two count columns of an existing LaTeX table with the over-length statistics
# from max_lengths, leaving rows of languages without statistics untouched.
lang_codes = pd.read_excel("../data_exploration/lang_codes.xlsx", header=0)
# NOTE(review): the statistics stored under "sl" are reused for "sk", which is how the
# "Slovak" row of the table gets its numbers — confirm which language the data in the
# "sl" directory actually is.
max_lengths["sk"] = max_lengths["sl"]

new_lines = []
# The context manager closes the file once the loop finishes or if it raises; the
# original handle was never closed.
with open("../data_exploration/pos_table.txt", "r") as file:
    for line in file:
        # The language name is the second "&"-separated cell of the row.
        lang_name = line.split("&")[1].strip()
        # Look up the two-letter code by the English language name.
        # Raises IndexError if the name is missing from the spreadsheet.
        name_matches = lang_codes["English name of Language"] == lang_name
        lang_code = lang_codes.loc[name_matches, "ISO 639-1 Code"].values[0]

        if lang_code in max_lengths:
            values = max_lengths[lang_code]
            # Splitting on a single backslash yields: the leading indent, the row text
            # after the first macro backslash, and the pieces around the closing "\\".
            split_line = line.split("\\")
            # Keep the first two cells (morphology macro and language name).
            start = split_line[0] + "\\" + "&".join(split_line[1].split("&")[:2])
            # Re-attach the row terminator plus whatever followed it (spaces, newline).
            end = r"\\" + "".join(split_line[2:])
            # values[0] (the longest length) is not part of the table; only the two
            # counts are written.
            new_line = start + "& " + " & ".join(str(count) for count in values[1:]) + end
        else:
            new_line = line

        new_lines.append(new_line)

# Join once instead of growing a string with += on every row.
output = "".join(new_lines)

In [32]:
# Print the rewritten LaTeX table rows so they can be copied out of the notebook.
print(output)

    \fusional{Fusional}  & Bulgarian & 0 & 0\\ 
    \fusional{Fusional} & English & 0 & 0\\
    \fusional{Fusional}  & Russian &  &  \\ 
    \fusional{Fusional} & Slovak & 0 & 9\\
    \fusional{Fusional}  & Croatian & 0 & 1\\
    \isolating{Isolating} & Chinese & 11 & 106\\ 
    \isolating{Isolating} & Vietnamese  & 1 & 3\\
    \isolating{Isolating} & Thai & 8 & 48\\
    \agglutinative{Agglutinative} & Finnish & 5 & 26\\ 
    \agglutinative{Agglutinative} & Basque & 0 & 0\\
    \agglutinative{Agglutinative} & Japanese & \\ 
    \agglutinative{Agglutinative} & Korean & 0 & 0\\ 
    \agglutinative{Agglutinative} & Turkish & \\
    \introflexive{Introflexive} & Arabic & 446 & 1242\\
    \introflexive{Introflexive} & Hebrew & 3 & 16\\
