# Multilingual BERT Approach

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_datasets as tfds

import sklearn
from sklearn.model_selection import train_test_split

import transformers

from transformers import BertTokenizer, TFBertModel
from tensorflow.keras import backend as K

import logging
tf.get_logger().setLevel(logging.ERROR)

In [2]:
#Check for GPU presence
tf.config.list_physical_devices('GPU')


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Read in the Data

In [3]:
amharic_data = pd.read_csv('data/amharic/amharic.csv')
amharic_data

Unnamed: 0,id,tweet,subtask_a
0,0,አስቀድሜ ጥያቄዬ በጨዋነት በውስጥ መስመር እንዲደርስዎ አድርጌ ፍትህን ለ...,NOT
1,1,እነዚህን ወሳኝ ጉዳዮችን የሚያስፈፅም አካል እንዲቋቋምና ክትትል እንዲደረ...,NOT
2,2,የአማራ ህዝብ በአእምሮ ክንፉ ያልበረረበት ጥበብና ፍልስፍና ያልከፈተው የ...,NOT
3,3,ከአማራ ህዝብ የሀገሪቱ ዘርፈ ብዙ እውቀት መንጭቶ የሞላበትከሙላቱም በመል...,NOT
4,4,ዛሬ በየትኛውም መለኪያ ይሁን መመዘኛ ኢትዮጵያዊነት የሚንፀባረቀው በአማራ...,OFF
...,...,...,...
29995,29995,በአሉ የሁሉም ኢትዮጵያዊ ስላልሆነ በኦሮምኛው ቢለፋደድ ምን አገባን,OFF
29996,29996,ተባረክ አብቹ ፈር ቀዳጅ ስለሆንህ መጋረጃው መቀደድ ስለጀመረ,NOT
29997,29997,እስከ አሁን አንተ ብቻ ነው በ መፅሀፍ ያልቻልከው አንተም ታሪክ እን...,NOT
29998,29998,ህገወጥት ጠቅላይ ሚንስትር ፅቤት የተፈቀደ ሆኖ ህዝብን እንዴት ህግ አክብ...,OFF


In [4]:
#Creating new column with 0/1
#df.loc[(df["Q35"]=="N/A (no satisfaction surveys conducted)"),"Q37_F"]=
amharic_data["label"] = np.nan
amharic_data.loc[(amharic_data["subtask_a"] == "OFF"), "label"] = 1
amharic_data.loc[(amharic_data["subtask_a"] == "NOT"), "label"] = 0
amharic_data

Unnamed: 0,id,tweet,subtask_a,label
0,0,አስቀድሜ ጥያቄዬ በጨዋነት በውስጥ መስመር እንዲደርስዎ አድርጌ ፍትህን ለ...,NOT,0.0
1,1,እነዚህን ወሳኝ ጉዳዮችን የሚያስፈፅም አካል እንዲቋቋምና ክትትል እንዲደረ...,NOT,0.0
2,2,የአማራ ህዝብ በአእምሮ ክንፉ ያልበረረበት ጥበብና ፍልስፍና ያልከፈተው የ...,NOT,0.0
3,3,ከአማራ ህዝብ የሀገሪቱ ዘርፈ ብዙ እውቀት መንጭቶ የሞላበትከሙላቱም በመል...,NOT,0.0
4,4,ዛሬ በየትኛውም መለኪያ ይሁን መመዘኛ ኢትዮጵያዊነት የሚንፀባረቀው በአማራ...,OFF,1.0
...,...,...,...,...
29995,29995,በአሉ የሁሉም ኢትዮጵያዊ ስላልሆነ በኦሮምኛው ቢለፋደድ ምን አገባን,OFF,1.0
29996,29996,ተባረክ አብቹ ፈር ቀዳጅ ስለሆንህ መጋረጃው መቀደድ ስለጀመረ,NOT,0.0
29997,29997,እስከ አሁን አንተ ብቻ ነው በ መፅሀፍ ያልቻልከው አንተም ታሪክ እን...,NOT,0.0
29998,29998,ህገወጥት ጠቅላይ ሚንስትር ፅቤት የተፈቀደ ሆኖ ህዝብን እንዴት ህግ አክብ...,OFF,1.0


In [5]:
#load our tuned Amharic dataset
amharic_train, amharic_test = train_test_split(amharic_data, train_size=0.9)
amharic_test, amharic_dev = train_test_split(amharic_test, train_size=0.5)
print(amharic_train.shape)
print(amharic_test.shape)
print(amharic_dev.shape)

(27000, 4)
(1500, 4)
(1500, 4)


In [6]:
amharic_train

Unnamed: 0,id,tweet,subtask_a,label
5137,5137,ትገረማለህ አንተ ግን ጠምንሰተሪ ሰትባል ፐሪኮንሽዮሰ የለህም ጣውላ ራስ,OFF,1.0
28676,28676,ሚኪ እናመስግናለን እውነት_ትደበቃለች_እንጂ_አትጠፋም ስልጣኔውም ሆነ ፊደ...,NOT,0.0
18291,18291,መጀመሪያም የተናገርኩት ይህን ነው መብራት እና ውሀ በፈረቃ ባለበት ሀገር...,OFF,1.0
4824,4824,አየ ደክተር አብይ እዚህ ህዝብ እያለቀ አንተ ሽርጉድ ትላለህ ማለቃችን ተ...,OFF,1.0
7249,7249,ሰው ሁሉ በእንተርኔት ሆነ እንዴ ፀሌቱ,NOT,0.0
...,...,...,...,...
10111,10111,የወያኔ ጊዜውን የዘይት እጥረት ለመሸፈን የአለም ሀገራት የሸሹትንና ያገዱ...,NOT,0.0
2577,2577,ዶር አለሙ ስሜ የሜቴክ ዋና ዳሬክተር,NOT,0.0
6264,6264,እውነትን እውነት ሀሰትን ሀሰት በል ሀብታሙ አያሌው በእውነት ይለያልደስ ...,NOT,0.0
27175,27175,ጃዋር በእናትህ ሜዳ ላይ ከታደለ ጋ ተጋጠሙ ያኔ ወንድ ጀግና እልሀለሁ ካ...,OFF,1.0


In [7]:
#Find the maximum length tweet
print("max length tweet is:",np.max([len(x.split()) for x in amharic_data.tweet]))
print("min length tweet is:",np.min([len(x.split()) for x in amharic_data.tweet]))
print("mean length tweet is:",np.mean([len(x.split()) for x in amharic_data.tweet]))

max length tweet is: 254
min length tweet is: 1
mean length tweet is: 18.026133333333334


## Preparing to run BERT

In [8]:
#NEED to import and load both of these
#using the pretrained model called bert-base-cased
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-multilingual-uncased')

Some layers from the model checkpoint at bert-base-multilingual-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [9]:
bert_model.summary()

Model: "tf_bert_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  167356416 
Total params: 167,356,416
Trainable params: 167,356,416
Non-trainable params: 0
_________________________________________________________________


In [10]:

max_length = 100 #use to to a bit larger than the mean tweet length

x_train = tokenizer([x for x in amharic_train.tweet], 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')
y_train = amharic_train.label




x_dev = tokenizer([x for x in amharic_dev.tweet], 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')
y_dev = amharic_dev.label

In [11]:
#Let's look at class imbalance
print('ratio of positive examples: ', np.sum(y_train==1)/len(y_train))
#y_train

ratio of positive examples:  0.5066666666666667


In [12]:
x_train

{'input_ids': <tf.Tensor: shape=(27000, 100), dtype=int32, numpy=
array([[101, 100, 100, ...,   0,   0,   0],
       [101, 100, 100, ...,   0,   0,   0],
       [101, 100, 100, ...,   0,   0,   0],
       ...,
       [101, 100, 100, ...,   0,   0,   0],
       [101, 100, 100, ...,   0,   0,   0],
       [101, 100, 100, ...,   0,   0,   0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(27000, 100), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(27000, 100), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

## Building the Classification Model

In [13]:
#From BERT_Fine_tuning Walkthrough Notebook/Session

def create_classification_model(hidden_size = 200, 
                                train_layers = -1, 
                                optimizer=tf.keras.optimizers.Adam()):
    """
    Build a simple classification model with BERT. Let's keep it simple and don't add dropout, layer norms, etc.
    """

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                  'token_type_ids': token_type_ids,
                  'attention_mask': attention_mask}


    #restrict training to the train_layers outer transformer layers
    if not train_layers == -1:

            retrain_layers = []

            for retrain_layer_number in range(train_layers):

                layer_code = '_' + str(11 - retrain_layer_number)
                retrain_layers.append(layer_code)

            for w in bert_model.weights:
                if not any([x in w.name for x in retrain_layers]):
                    w._trainable = False


    bert_out = bert_model(bert_inputs) #same as x_tiny example above, always set ouput to model acting on input

    
    #getting the CLS token, could change to bert_out[1]
    classification_token = tf.keras.layers.Lambda(lambda x: x[:,0,:], name='get_first_vector')(bert_out[0]) 


    hidden = tf.keras.layers.Dense(hidden_size, name='hidden_layer')(classification_token)

    classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], 
                                          outputs=[classification])
    
    classification_model.compile(optimizer=optimizer,
                            loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                            metrics='accuracy')


    return classification_model

## Experimentation
Creating models and changing parameters

#### Model 1: Following Model 1 from BERT_Fine_tuning walkthrough notebook

In [14]:
classification_model = create_classification_model()  

In [16]:
#This took a long time, may want to increase batch_size for next run?
classification_model.fit([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask],
                         y_train,
                         validation_data=([x_dev.input_ids, x_dev.token_type_ids, x_dev.attention_mask],
                         y_dev),
                        epochs=3,
                        batch_size=10)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f798c2155b0>

In [17]:
predictions = classification_model.predict([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask], ) #output represents likelihood example was in the positive class
#these are all about the same and not very confident either way about whether example is in the class or not
predictions

array([[0.5082261],
       [0.5082261],
       [0.5082261],
       ...,
       [0.5082261],
       [0.5082261],
       [0.5082261]], dtype=float32)

In [18]:
# Generate generalization metrics
score = classification_model.evaluate([x_dev.input_ids, x_dev.token_type_ids, x_dev.attention_mask],
                         y_dev, verbose=0)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')

Test loss: 0.6933264136314392 / Test accuracy: 0.4986666738986969


In [22]:
#Creating some new columns & printing out a csv with the predicted labels
amharic_train["predicted_label"] = np.nan
amharic_train["predicted_stat"] = predictions
amharic_train.loc[(amharic_train["predicted_stat"] >= 0.5), "predicted_label"] = 1
amharic_train.loc[(amharic_train["predicted_stat"] < 0.5), "predicted_label"] = 0
amharic_train.to_csv("Amharic_train_predictions")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amharic_train["predicted_label"] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amharic_train["predicted_stat"] = predictions
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col

#### Model 2: Following Model 2 from BERT Walkthrough notebook
Updating learning rate

In [21]:
#do same thing as above but change learning rate in Adam below, need to get fresh bert model
try:
    del classification_model
except:
    pass

try:
    del bert_model
except:
    pass

tf.keras.backend.clear_session()
bert_model = TFBertModel.from_pretrained('bert-base-multilingual-uncased')

classification_model = create_classification_model(optimizer=tf.keras.optimizers.Adam(0.00005))

classification_model.fit([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask],
                         y_train,
                         validation_data=([x_dev.input_ids, x_dev.token_type_ids, x_dev.attention_mask],
                         y_dev),
                        epochs=5,
                        batch_size=8)

classification_model.predict([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask], 
                             batch_size=8, 
                             steps=2)


#This looks a little worse, not sure why it's now predicting 54% consistently

Some layers from the model checkpoint at bert-base-multilingual-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


array([[0.5439622],
       [0.5439622],
       [0.5439622],
       [0.5439622],
       [0.5439622],
       [0.5439622],
       [0.5439622],
       [0.5439621],
       [0.5439622],
       [0.5439622],
       [0.5439622],
       [0.5439622],
       [0.5439622],
       [0.5439622],
       [0.5439622],
       [0.5439622]], dtype=float32)

In [22]:
# Generate generalization metrics
score = classification_model.evaluate([x_dev.input_ids, x_dev.token_type_ids, x_dev.attention_mask],
                         y_dev, verbose=0)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')

Test loss: 0.6977327466011047 / Test accuracy: 0.4959999918937683


In [21]:
classification_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_mask_layer (InputLaye [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_ids_layer (InputLayer)    [(None, 100)]        0                                            
__________________________________________________________________________________________________
token_type_ids_layer (InputLaye [(None, 100)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 108310272   attention_mask_layer[0][0]       
                                                                 input_ids_layer[0][0]        

In [24]:
classification_model.save_model("models/BERT_multilingual_adam_v1")
tokenizer.save_pretrained("tokenizers/BERT_multilingual_v1")

AttributeError: 'Functional' object has no attribute 'save_model'