# Multilingual BERT Approach

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_datasets as tfds

import sklearn
from sklearn.model_selection import train_test_split

import transformers

from transformers import BertTokenizer, TFBertModel
from tensorflow.keras import backend as K

import logging
tf.get_logger().setLevel(logging.ERROR)

In [2]:
#Check for GPU presence
tf.config.list_physical_devices('GPU')


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Read in the Data

In [31]:
amharic_data = pd.read_csv('data/amharic/amharic.csv')
amharic_data

Unnamed: 0,id,tweet,subtask_a
0,0,አስቀድሜ ጥያቄዬ በጨዋነት በውስጥ መስመር እንዲደርስዎ አድርጌ ፍትህን ለ...,NOT
1,1,እነዚህን ወሳኝ ጉዳዮችን የሚያስፈፅም አካል እንዲቋቋምና ክትትል እንዲደረ...,NOT
2,2,የአማራ ህዝብ በአእምሮ ክንፉ ያልበረረበት ጥበብና ፍልስፍና ያልከፈተው የ...,NOT
3,3,ከአማራ ህዝብ የሀገሪቱ ዘርፈ ብዙ እውቀት መንጭቶ የሞላበትከሙላቱም በመል...,NOT
4,4,ዛሬ በየትኛውም መለኪያ ይሁን መመዘኛ ኢትዮጵያዊነት የሚንፀባረቀው በአማራ...,OFF
...,...,...,...
29995,29995,በአሉ የሁሉም ኢትዮጵያዊ ስላልሆነ በኦሮምኛው ቢለፋደድ ምን አገባን,OFF
29996,29996,ተባረክ አብቹ ፈር ቀዳጅ ስለሆንህ መጋረጃው መቀደድ ስለጀመረ,NOT
29997,29997,እስከ አሁን አንተ ብቻ ነው በ መፅሀፍ ያልቻልከው አንተም ታሪክ እን...,NOT
29998,29998,ህገወጥት ጠቅላይ ሚንስትር ፅቤት የተፈቀደ ሆኖ ህዝብን እንዴት ህግ አክብ...,OFF


In [35]:
#Creating new column with 0/1
#df.loc[(df["Q35"]=="N/A (no satisfaction surveys conducted)"),"Q37_F"]=
amharic_data["label"] = np.nan
amharic_data.loc[(amharic_data["subtask_a"] == "OFF"), "label"] = 1
amharic_data.loc[(amharic_data["subtask_a"] == "NOT"), "label"] = 0
amharic_data

Unnamed: 0,id,tweet,subtask_a,label
0,0,አስቀድሜ ጥያቄዬ በጨዋነት በውስጥ መስመር እንዲደርስዎ አድርጌ ፍትህን ለ...,NOT,0.0
1,1,እነዚህን ወሳኝ ጉዳዮችን የሚያስፈፅም አካል እንዲቋቋምና ክትትል እንዲደረ...,NOT,0.0
2,2,የአማራ ህዝብ በአእምሮ ክንፉ ያልበረረበት ጥበብና ፍልስፍና ያልከፈተው የ...,NOT,0.0
3,3,ከአማራ ህዝብ የሀገሪቱ ዘርፈ ብዙ እውቀት መንጭቶ የሞላበትከሙላቱም በመል...,NOT,0.0
4,4,ዛሬ በየትኛውም መለኪያ ይሁን መመዘኛ ኢትዮጵያዊነት የሚንፀባረቀው በአማራ...,OFF,1.0
...,...,...,...,...
29995,29995,በአሉ የሁሉም ኢትዮጵያዊ ስላልሆነ በኦሮምኛው ቢለፋደድ ምን አገባን,OFF,1.0
29996,29996,ተባረክ አብቹ ፈር ቀዳጅ ስለሆንህ መጋረጃው መቀደድ ስለጀመረ,NOT,0.0
29997,29997,እስከ አሁን አንተ ብቻ ነው በ መፅሀፍ ያልቻልከው አንተም ታሪክ እን...,NOT,0.0
29998,29998,ህገወጥት ጠቅላይ ሚንስትር ፅቤት የተፈቀደ ሆኖ ህዝብን እንዴት ህግ አክብ...,OFF,1.0


In [36]:
#load our tuned Amharic dataset
amharic_train, amharic_test = train_test_split(amharic_data, train_size=0.7)
amharic_test, amharic_dev = train_test_split(amharic_test, train_size=0.5)
print(amharic_train.shape)
print(amharic_test.shape)
print(amharic_dev.shape)

(21000, 4)
(4500, 4)
(4500, 4)


In [37]:
amharic_train

Unnamed: 0,id,tweet,subtask_a,label
8761,8761,ሀሳብህ ታላቅ ሩቅ ተመልካአሳመነው ህዝቡን ወዳጅ ነፍስህን ሰጠኽን በአዋጅ,OFF,1.0
16929,16929,በርቱ ማሻ አሏህ,NOT,0.0
21761,21761,ትክክል የገዳ ስርአተ ትምህርት በመሰጠቱ ደስ ብሎኛልምክንያቱም የኦሮሞ ህ...,NOT,0.0
5449,5449,ፋና እውነትም ብታዎሩ እናተ ውሸታም ስለሆናችሁ የሚሰማችሁ የለም ከአንበጣ...,OFF,1.0
20279,20279,ሰለሞን ዘባህርዳር ተረት ተረት የመሰረት ድሮ አንድ ሰው ይኖር ነበረ ...,OFF,1.0
...,...,...,...,...
9642,9642,ልምን ተብሎ የወጣ ዶላር ነው እንክት አርገህ የበላሀው በአሁን ሰአት ሁሉ...,OFF,1.0
6390,6390,ለማስመሰል እንኳን ንጉሱን ወይ መንጌን ወይ ዜናዊን እንዲህ ያታዩበትን ማ...,OFF,1.0
5851,5851,የፖለቲካ ፓርቲዎች የጋራ ምክር ቤት በኢትዮጵያ ብሄራዊ ምርጫ ቦርድ በኩል...,OFF,1.0
19518,19518,ተራና ነገ በታሪክ ፊት የሚያሳፍር ብሽሽቅ ውስጥ አንድ ድርጅት ሲገባ ዝም...,OFF,1.0


In [38]:
#Find the maximum length tweet
print("max length tweet is:",np.max([len(x) for x in amharic_data.tweet]))
print("min length tweet is:",np.min([len(x) for x in amharic_data.tweet]))
print("mean length tweet is:",np.mean([len(x) for x in amharic_data.tweet]))

max length tweet is: 1326
min length tweet is: 2
mean length tweet is: 88.00763333333333


## Preparing to run BERT

In [5]:
#NEED to import and load both of these
#using the pretrained model called bert-base-cased
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-multilingual-uncased')

Some layers from the model checkpoint at bert-base-multilingual-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [40]:
#Creating a few dev and train examples

num_train_examples = 2500
num_dev_examples = 500
num_tiny_set = 5

max_length = 100 #use to to a bit larger than the mean tweet length

x_train = tokenizer([x for x in amharic_train.tweet[:num_train_examples]], 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')
y_train = amharic_train.label[:num_train_examples]




x_dev = tokenizer([x for x in amharic_dev.tweet[:num_dev_examples]], 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')
y_dev = amharic_dev.label[:num_dev_examples]


x_tiny = tokenizer([x for x in amharic_dev.tweet[:num_tiny_set]], 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')
y_tiny = amharic_dev.label[:num_tiny_set]

In [42]:
#Let's look at class imbalance
print('ratio of positive examples: ', np.sum(y_train==1)/len(y_train))
#y_train

ratio of positive examples:  0.5124


In [43]:
x_train.input_ids

<tf.Tensor: shape=(2500, 100), dtype=int32, numpy=
array([[101, 100, 100, ...,   0,   0,   0],
       [101, 100, 100, ...,   0,   0,   0],
       [101, 100, 100, ...,   0,   0,   0],
       ...,
       [101, 100, 100, ...,   0,   0,   0],
       [101, 100, 100, ...,   0,   0,   0],
       [101, 100, 100, ..., 100, 100, 102]], dtype=int32)>

## Building the Classification Model

In [27]:
#From BERT_Fine_tuning Walkthrough Notebook/Session

def create_classification_model(hidden_size = 200, 
                                train_layers = -1, 
                                optimizer=tf.keras.optimizers.Adam()):
    """
    Build a simple classification model with BERT. Let's keep it simple and don't add dropout, layer norms, etc.
    """

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                  'token_type_ids': token_type_ids,
                  'attention_mask': attention_mask}


    #restrict training to the train_layers outer transformer layers
    if not train_layers == -1:

            retrain_layers = []

            for retrain_layer_number in range(train_layers):

                layer_code = '_' + str(11 - retrain_layer_number)
                retrain_layers.append(layer_code)

            for w in bert_model.weights:
                if not any([x in w.name for x in retrain_layers]):
                    w._trainable = False


    bert_out = bert_model(bert_inputs) #same as x_tiny example above, always set ouput to model acting on input

    
    #getting the CLS token, could change to ber_out[1]
    classification_token = tf.keras.layers.Lambda(lambda x: x[:,0,:], name='get_first_vector')(bert_out[0]) 


    hidden = tf.keras.layers.Dense(hidden_size, name='hidden_layer')(classification_token)

    classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], 
                                          outputs=[classification])
    
    classification_model.compile(optimizer=optimizer,
                            loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                            metrics='accuracy')


    return classification_model

## Experimentation
Creating models and changing parameters

#### Model 1: Following Model 1 from BERT_Fine_tuning walkthrough notebook

In [44]:
classification_model = create_classification_model()  

In [45]:
classification_model.fit([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask],
                         y_train,
                         validation_data=([x_dev.input_ids, x_dev.token_type_ids, x_dev.attention_mask],
                         y_dev),
                        epochs=5,
                        batch_size=8)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7ff1f9d5eca0>

In [46]:
classification_model.predict([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask], 
                             batch_size=8, 
                             steps=2) #output represents likelihood example was in the positive class
#these are all about the same and not very confident either way about whether example is in the class or not

array([[0.43347943],
       [0.43347943],
       [0.43347943],
       [0.43347943],
       [0.43347937],
       [0.43347943],
       [0.43347937],
       [0.43347943],
       [0.43347943],
       [0.43347943],
       [0.43347943],
       [0.43347937],
       [0.43347937],
       [0.43347937],
       [0.43347943],
       [0.43347943]], dtype=float32)

In [None]:
# Generate generalization metrics
score = classification_model.evaluate([x_dev.input_ids, x_dev.token_type_ids, x_dev.attention_mask],
                         y_dev, verbose=0)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')

#### Model 2: Following Model 2 from BERT Walkthrough notebook
Updating learning rate

In [48]:
#do same thing as above but change learning rate in Adam below, need to get fresh bert model
try:
    del classification_model
except:
    pass

try:
    del bert_model
except:
    pass

tf.keras.backend.clear_session()
bert_model = TFBertModel.from_pretrained('bert-base-cased')

classification_model = create_classification_model(optimizer=tf.keras.optimizers.Adam(0.00005))

classification_model.fit([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask],
                         y_train,
                         validation_data=([x_dev.input_ids, x_dev.token_type_ids, x_dev.attention_mask],
                         y_dev),
                        epochs=5,
                        batch_size=8)

classification_model.predict([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask], 
                             batch_size=8, 
                             steps=2)


#This looks even worse, as now model is predicting even close to 50% of the time

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


array([[0.50230324],
       [0.50211066],
       [0.50244266],
       [0.5025217 ],
       [0.5025738 ],
       [0.50223875],
       [0.5025535 ],
       [0.50233537],
       [0.5023818 ],
       [0.50226843],
       [0.50230324],
       [0.5026371 ],
       [0.50241315],
       [0.50262165],
       [0.5023784 ],
       [0.50214475]], dtype=float32)

In [49]:
# Generate generalization metrics
score = classification_model.evaluate([x_dev.input_ids, x_dev.token_type_ids, x_dev.attention_mask],
                         y_dev, verbose=0)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')

Test loss: 0.6930242776870728 / Test accuracy: 0.515999972820282
