# Multilingual BERT Approach

Train on English and then Arabic and then Amharic

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_datasets as tfds

import sklearn
from sklearn.model_selection import train_test_split

import transformers

from transformers import BertTokenizer, TFBertModel
from tensorflow.keras import backend as K

import logging
tf.get_logger().setLevel(logging.ERROR)

In [2]:
#Check for GPU presence
tf.config.list_physical_devices('GPU')


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Read in the Data

In [8]:
amharic_data = pd.read_csv('data/amharic/amharic.csv')
amharic_data

Unnamed: 0,id,tweet,subtask_a
0,0,አስቀድሜ ጥያቄዬ በጨዋነት በውስጥ መስመር እንዲደርስዎ አድርጌ ፍትህን ለ...,NOT
1,1,እነዚህን ወሳኝ ጉዳዮችን የሚያስፈፅም አካል እንዲቋቋምና ክትትል እንዲደረ...,NOT
2,2,የአማራ ህዝብ በአእምሮ ክንፉ ያልበረረበት ጥበብና ፍልስፍና ያልከፈተው የ...,NOT
3,3,ከአማራ ህዝብ የሀገሪቱ ዘርፈ ብዙ እውቀት መንጭቶ የሞላበትከሙላቱም በመል...,NOT
4,4,ዛሬ በየትኛውም መለኪያ ይሁን መመዘኛ ኢትዮጵያዊነት የሚንፀባረቀው በአማራ...,OFF
...,...,...,...
29995,29995,በአሉ የሁሉም ኢትዮጵያዊ ስላልሆነ በኦሮምኛው ቢለፋደድ ምን አገባን,OFF
29996,29996,ተባረክ አብቹ ፈር ቀዳጅ ስለሆንህ መጋረጃው መቀደድ ስለጀመረ,NOT
29997,29997,እስከ አሁን አንተ ብቻ ነው በ መፅሀፍ ያልቻልከው አንተም ታሪክ እን...,NOT
29998,29998,ህገወጥት ጠቅላይ ሚንስትር ፅቤት የተፈቀደ ሆኖ ህዝብን እንዴት ህግ አክብ...,OFF


In [9]:
#Creating new column with 0/1
#df.loc[(df["Q35"]=="N/A (no satisfaction surveys conducted)"),"Q37_F"]=
amharic_data["label"] = np.nan
amharic_data.loc[(amharic_data["subtask_a"] == "OFF"), "label"] = 1
amharic_data.loc[(amharic_data["subtask_a"] == "NOT"), "label"] = 0
amharic_data

Unnamed: 0,id,tweet,subtask_a,label
0,0,አስቀድሜ ጥያቄዬ በጨዋነት በውስጥ መስመር እንዲደርስዎ አድርጌ ፍትህን ለ...,NOT,0.0
1,1,እነዚህን ወሳኝ ጉዳዮችን የሚያስፈፅም አካል እንዲቋቋምና ክትትል እንዲደረ...,NOT,0.0
2,2,የአማራ ህዝብ በአእምሮ ክንፉ ያልበረረበት ጥበብና ፍልስፍና ያልከፈተው የ...,NOT,0.0
3,3,ከአማራ ህዝብ የሀገሪቱ ዘርፈ ብዙ እውቀት መንጭቶ የሞላበትከሙላቱም በመል...,NOT,0.0
4,4,ዛሬ በየትኛውም መለኪያ ይሁን መመዘኛ ኢትዮጵያዊነት የሚንፀባረቀው በአማራ...,OFF,1.0
...,...,...,...,...
29995,29995,በአሉ የሁሉም ኢትዮጵያዊ ስላልሆነ በኦሮምኛው ቢለፋደድ ምን አገባን,OFF,1.0
29996,29996,ተባረክ አብቹ ፈር ቀዳጅ ስለሆንህ መጋረጃው መቀደድ ስለጀመረ,NOT,0.0
29997,29997,እስከ አሁን አንተ ብቻ ነው በ መፅሀፍ ያልቻልከው አንተም ታሪክ እን...,NOT,0.0
29998,29998,ህገወጥት ጠቅላይ ሚንስትር ፅቤት የተፈቀደ ሆኖ ህዝብን እንዴት ህግ አክብ...,OFF,1.0


In [3]:
english = pd.read_csv("data/olid/olid-training-v1.0.tsv", sep ="\t")
english

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,
...,...,...,...,...,...
13235,95338,@USER Sometimes I get strong vibes from people...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,@USER And why report this garbage. We don't g...,OFF,TIN,OTH
13238,27429,@USER Pussy,OFF,UNT,


In [4]:
english["label"] = np.nan
english.loc[(english["subtask_a"] == "OFF"), "label"] = 1
english.loc[(english["subtask_a"] == "NOT"), "label"] = 0
english

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c,label
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,,1.0
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND,1.0
2,16820,Amazon is investigating Chinese employees who ...,NOT,,,0.0
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,,1.0
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,,0.0
...,...,...,...,...,...,...
13235,95338,@USER Sometimes I get strong vibes from people...,OFF,TIN,IND,1.0
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,,0.0
13237,82921,@USER And why report this garbage. We don't g...,OFF,TIN,OTH,1.0
13238,27429,@USER Pussy,OFF,UNT,,1.0


In [5]:
arabic = pd.read_csv("data/Arabic/train.tsv", sep = "\t")
arabic

Unnamed: 0,id,tweet,subtask_a
0,1,الحمدلله يارب فوز مهم يا زمالك.. كل الدعم ليكم...,NOT
1,2,فدوه يا بخت فدوه يا زمن واحد منكم يجيبه,NOT
2,3,RT @USER: يا رب يا واحد يا أحد بحق يوم الاحد ا...,OFF
3,4,RT @USER: #هوا_الحرية يا وجع قلبي عليكي يا امي...,NOT
4,5,يا بكون بحياتك الأهم يا إما ما بدي أكون 🎼,NOT
...,...,...,...
7834,7996,RT @USER: انتو بتوزعوا زيت وسكر فعلا يا عباس؟<...,NOT
7835,7997,RT @USER: كدا يا عمر متزعلهاش يا حبيبي 😂 URL,NOT
7836,7998,هدا سكن اطفال امارتين من شارقة طالبين فزعتكم ي...,NOT
7837,7999,RT @USER: ومدني بمدد من قوتك أواجه به ضعفي.. و...,NOT


In [6]:
arabic["label"] = np.nan
arabic.loc[(arabic["subtask_a"] == "OFF"), "label"] = 1
arabic.loc[(arabic["subtask_a"] == "NOT"), "label"] = 0
arabic

Unnamed: 0,id,tweet,subtask_a,label
0,1,الحمدلله يارب فوز مهم يا زمالك.. كل الدعم ليكم...,NOT,0.0
1,2,فدوه يا بخت فدوه يا زمن واحد منكم يجيبه,NOT,0.0
2,3,RT @USER: يا رب يا واحد يا أحد بحق يوم الاحد ا...,OFF,1.0
3,4,RT @USER: #هوا_الحرية يا وجع قلبي عليكي يا امي...,NOT,0.0
4,5,يا بكون بحياتك الأهم يا إما ما بدي أكون 🎼,NOT,0.0
...,...,...,...,...
7834,7996,RT @USER: انتو بتوزعوا زيت وسكر فعلا يا عباس؟<...,NOT,0.0
7835,7997,RT @USER: كدا يا عمر متزعلهاش يا حبيبي 😂 URL,NOT,0.0
7836,7998,هدا سكن اطفال امارتين من شارقة طالبين فزعتكم ي...,NOT,0.0
7837,7999,RT @USER: ومدني بمدد من قوتك أواجه به ضعفي.. و...,NOT,0.0


In [10]:
#load our tuned Amharic dataset
amharic_train, amharic_test = train_test_split(amharic_data, train_size=0.9)
amharic_test, amharic_dev = train_test_split(amharic_test, train_size=0.5)
print(amharic_train.shape)
print(amharic_test.shape)
print(amharic_dev.shape)

(27000, 4)
(1500, 4)
(1500, 4)


In [11]:
amharic_train

Unnamed: 0,id,tweet,subtask_a,label
20950,20950,አትቀባጥር ማፊያዎች መጥፊያቹ እየደረሰ ነውየትግራይን ህዝብ መጡብህ እያል...,NOT,0.0
28510,28510,ስለ ሙስሊም መብት ሙስሊም ይጠይቅ አንተ ሙስሊምን ወክለህ መጠየቅ አትችልም,NOT,0.0
15669,15669,ለሱ የተመኘሀው ሞተ ለጠላቱ ክፉ ለሚመኝለት ይሁን,OFF,1.0
15875,15875,ሚኪ በመምጣትህ ደስ ብሎናል,NOT,0.0
13655,13655,ደደብ,OFF,1.0
...,...,...,...,...
7776,7776,ጌጡ የሚረባ ነገር ፃፍ አታበሳጨኝ በአንድ ጊዜ የድራማ ደረሲ አደረገህ ፌ...,OFF,1.0
573,573,አምላክ ሆይ ለኢትዮጵያ የገባህላትን ቃል አስታውስ,NOT,0.0
646,646,ያማል በጣም ተዋዶ ተጋብቶ ተዋልዶ የኖረ ያ ፍቅር የሆነ ህዝብ ሲገዳደልአ...,NOT,0.0
19360,19360,እዚህ ጋር የተጋሩስ መንጫጫት የነገረኝ መገንጠልን ያወሩታል እንጂ ለመፈፀ...,OFF,1.0


In [13]:
#Find the maximum length tweet
print("max length tweet is:",np.max([len(x.split()) for x in amharic_data.tweet]))
print("min length tweet is:",np.min([len(x.split()) for x in amharic_data.tweet]))
print("mean length tweet is:",np.mean([len(x.split()) for x in amharic_data.tweet]))

max length tweet is: 254
min length tweet is: 1
mean length tweet is: 18.026133333333334


## Preparing to run BERT

In [14]:
#NEED to import and load both of these
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

Some layers from the model checkpoint at bert-base-multilingual-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [10]:
bert_model.summary()

Model: "tf_bert_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  167356416 
Total params: 167,356,416
Trainable params: 167,356,416
Non-trainable params: 0
_________________________________________________________________


In [15]:

max_length = 100 #use to to a bit larger than the mean tweet length

x_train = tokenizer([x for x in amharic_train.tweet], 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')
y_train = amharic_train.label

eng_x_train = tokenizer([x for x in english.tweet], 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')
eng_y_train = english.label


arabic_x_train = tokenizer([x for x in arabic.tweet], 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')
arabic_y_train = arabic.label


x_dev = tokenizer([x for x in amharic_dev.tweet], 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')
y_dev = amharic_dev.label

In [13]:
#Let's look at class imbalance
print('ratio of positive examples: ', np.sum(y_train==1)/len(y_train))
#y_train

ratio of positive examples:  0.5070370370370371


In [14]:
x_train

{'input_ids': <tf.Tensor: shape=(27000, 100), dtype=int32, numpy=
array([[101, 100, 100, ...,   0,   0,   0],
       [101, 100, 100, ...,   0,   0,   0],
       [101, 100, 100, ...,   0,   0,   0],
       ...,
       [101, 100, 100, ...,   0,   0,   0],
       [101, 100, 100, ...,   0,   0,   0],
       [101, 100, 100, ...,   0,   0,   0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(27000, 100), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(27000, 100), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

## Building the Classification Model

In [16]:
#From BERT_Fine_tuning Walkthrough Notebook/Session

def create_classification_model(hidden_size = 200, 
                                train_layers = -1, 
                                optimizer=tf.keras.optimizers.Adam()):
    """
    Build a simple classification model with BERT. Let's keep it simple and don't add dropout, layer norms, etc.
    """

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                  'token_type_ids': token_type_ids,
                  'attention_mask': attention_mask}


    #restrict training to the train_layers outer transformer layers
    if not train_layers == -1:

            retrain_layers = []

            for retrain_layer_number in range(train_layers):

                layer_code = '_' + str(11 - retrain_layer_number)
                retrain_layers.append(layer_code)

            for w in bert_model.weights:
                if not any([x in w.name for x in retrain_layers]):
                    w._trainable = False


    bert_out = bert_model(bert_inputs) #same as x_tiny example above, always set ouput to model acting on input

    
    #getting the CLS token, could change to bert_out[1]
    classification_token = tf.keras.layers.Lambda(lambda x: x[:,0,:], name='get_first_vector')(bert_out[0]) 


    hidden = tf.keras.layers.Dense(hidden_size, name='hidden_layer')(classification_token)

    classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], 
                                          outputs=[classification])
    
    classification_model.compile(optimizer=optimizer,
                            loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                            metrics='accuracy')


    return classification_model

## Experimentation
Creating models and changing parameters

#### Model 1: Fit on English and then Amharic

In [17]:
classification_model = create_classification_model()  

In [18]:
#Fit on English
classification_model.fit([eng_x_train.input_ids, eng_x_train.token_type_ids, eng_x_train.attention_mask],
                         eng_y_train,
                         validation_data=([eng_x_train.input_ids, eng_x_train.token_type_ids, eng_x_train.attention_mask],
                         eng_y_train),
                        epochs=5,
                        batch_size=8)


#Fit on Amharic train
classification_model.fit([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask],
                         y_train,
                         validation_data=([x_dev.input_ids, x_dev.token_type_ids, x_dev.attention_mask],
                         y_dev),
                        epochs=5,
                        batch_size=8)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f2bbc3b1700>

In [30]:
pred = classification_model.predict([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask]) 
pred.shape

(27000, 1)

In [27]:
classification_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_mask_layer (InputLaye [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_ids_layer (InputLayer)    [(None, 100)]        0                                            
__________________________________________________________________________________________________
token_type_ids_layer (InputLaye [(None, 100)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 167356416   attention_mask_layer[0][0]       
                                                                 input_ids_layer[0][0]        

In [26]:
x_train.token_type_ids.shape

TensorShape([27000, 100])

In [32]:
score = classification_model.evaluate([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask],
                         y_train, verbose=0)
print(f'Train loss: {score[0]} / Train accuracy: {score[1]}')

Train loss: 0.6932680010795593 / Train accuracy: 0.5084444284439087


In [20]:
# Generate generalization metrics
score = classification_model.evaluate([x_dev.input_ids, x_dev.token_type_ids, x_dev.attention_mask],
                         y_dev, verbose=0)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')

Test loss: 0.6944733262062073 / Test accuracy: 0.4933333396911621


#### Model 2: Following Model 2 from BERT Walkthrough notebook
Updating learning rate

In [33]:
#do same thing as above but change learning rate in Adam below, need to get fresh bert model
try:
    del classification_model
except:
    pass

try:
    del bert_model
except:
    pass

tf.keras.backend.clear_session()
bert_model = TFBertModel.from_pretrained('bert-base-multilingual-uncased')

classification_model = create_classification_model(optimizer=tf.keras.optimizers.Adam(0.00005))

classification_model.fit([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask],
                         y_train,
                         validation_data=([x_dev.input_ids, x_dev.token_type_ids, x_dev.attention_mask],
                         y_dev),
                        epochs=5,
                        batch_size=8)

classification_model.predict([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask], 
                             batch_size=8, 
                             steps=2)


Some layers from the model checkpoint at bert-base-multilingual-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


array([[0.4711727 ],
       [0.4711727 ],
       [0.4711727 ],
       [0.4711727 ],
       [0.47117275],
       [0.4711727 ],
       [0.4711727 ],
       [0.4711727 ],
       [0.4711727 ],
       [0.47117275],
       [0.4711727 ],
       [0.4711727 ],
       [0.4711727 ],
       [0.4711727 ],
       [0.4711727 ],
       [0.4711727 ]], dtype=float32)

In [34]:
# Generate generalization metrics
score = classification_model.evaluate([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask],
                         y_train, verbose=0)
print(f'Train loss: {score[0]} / Train accuracy: {score[1]}')

Train loss: 0.6957873106002808 / Train accuracy: 0.4915555417537689


In [22]:
# Generate generalization metrics
score = classification_model.evaluate([x_dev.input_ids, x_dev.token_type_ids, x_dev.attention_mask],
                         y_dev, verbose=0)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')

Test loss: 0.6977327466011047 / Test accuracy: 0.4959999918937683


In [21]:
classification_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_mask_layer (InputLaye [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_ids_layer (InputLayer)    [(None, 100)]        0                                            
__________________________________________________________________________________________________
token_type_ids_layer (InputLaye [(None, 100)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 108310272   attention_mask_layer[0][0]       
                                                                 input_ids_layer[0][0]        

In [24]:
classification_model.save_model("models/BERT_multilingual_adam_v1")
tokenizer.save_pretrained("tokenizers/BERT_multilingual_v1")

AttributeError: 'Functional' object has no attribute 'save_model'