<a href="https://colab.research.google.com/github/YahyaGrb/Fine-tuned-CamelBERT-APCD-Arabic-Poem-Meter_detection/blob/main/CamelBERT_finteune_poem_meter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers datasets

In [None]:
import numpy as np
import tensorflow
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding, DefaultDataCollator, AutoConfig

In [None]:
# The dataset loaded below is private, so authenticate with the Hugging Face Hub first.
# Setup guide: https://huggingface.co/docs/transformers/model_sharing?highlight=login#setup
# !huggingface-cli login  # paste an access token created at https://huggingface.co/settings/tokens

In [None]:
from datasets import load_dataset

# Fetch the APCD meter dataset from the Hugging Face Hub.
# use_auth_token=True asks `datasets` to send the locally stored Hub token,
# so a prior `huggingface-cli login` is required.
# NOTE(review): newer `datasets` releases appear to rename this argument to
# `token` — confirm against the installed version.
dataset = load_dataset(
    "Yah216/APCD_only_meter_data",
    use_auth_token=True,
)

Using custom data configuration Yah216--APCD_only_meter_data-1b6538109644b18a


Downloading and preparing dataset csv/Yah216--APCD_only_meter_data to /root/.cache/huggingface/datasets/Yah216___csv/Yah216--APCD_only_meter_data-1b6538109644b18a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/213M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/Yah216___csv/Yah216--APCD_only_meter_data-1b6538109644b18a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

## Prep the data

In [None]:
# Display the DatasetDict summary (splits, column names, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'target'],
        num_rows: 1739436
    })
})

In [None]:
# Shuffle with a fixed seed so the resulting order is reproducible.
# NOTE(review): the cells visible below keep using `dataset`, not
# `shuffled_dataset` — confirm whether this result is actually needed.
shuffled_dataset = dataset.shuffle(seed=42)

In [None]:
# Distinct meter names, sorted so that the class ids derived from this
# collection are identical on every run. Iterating a bare set of strings
# follows hash order, which Python randomises per process, so without the sort
# the label -> id mapping would change after each kernel restart and a saved
# model could not be decoded consistently.
# key=str keeps the ordering well-defined even if a missing (None) target exists.
LABELS = sorted(set(dataset['train']['target']), key=str)

In [None]:
# dataset.train_test_split(test_size=0.1)

In [None]:
# Number of distinct target classes; passed as num_labels when the model is built.
NB_LABELS = len(LABELS)

In [None]:
# Map each label name to an integer class id, numbered in LABELS iteration order.
lbl_dict = {name: idx for idx, name in enumerate(LABELS)}

In [None]:
def encode_label(text, target):
  """Return one example with its target name replaced by its integer class id.

  Args:
    text: the example's text, passed through unchanged.
    target: the label name, looked up in the module-level `lbl_dict`.

  Returns:
    A dict with keys "text" and "target" (the integer id).

  Raises:
    KeyError: if `target` is not a key of `lbl_dict`.
  """
  return {"text": text, "target": lbl_dict[target]}

In [None]:
# Replace every string target with its integer class id, using 8 worker processes.
# NOTE(review): this rebinds `dataset`, so re-running the cell feeds already
# encoded targets back into encode_label — run it once per kernel session.
dataset = dataset.map(
    encode_label,
    input_columns=['text', 'target'],
    num_proc=8,
)



          

#0:   0%|          | 0/217430 [00:00<?, ?ex/s]

#1:   0%|          | 0/217430 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/217430 [00:00<?, ?ex/s]

  

#3:   0%|          | 0/217430 [00:00<?, ?ex/s]

#4:   0%|          | 0/217429 [00:00<?, ?ex/s]

  

#5:   0%|          | 0/217429 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/217429 [00:00<?, ?ex/s]

#7:   0%|          | 0/217429 [00:00<?, ?ex/s]

## Tokenize the data

In [None]:
def tokenize_dataset(dataset):
    """Tokenize the "text" column of a batch of examples.

    Uses the module-level `tokenizer` with padding enabled, truncation to at
    most 20 tokens, and NumPy arrays as the return format.

    Args:
        dataset: a batch exposing a "text" entry (as supplied by a batched map).

    Returns:
        The plain dict stored in the tokenizer output's `.data` attribute.
    """
    tokenizer_options = dict(
        padding=True,
        truncation=True,
        max_length=20,
        return_tensors='np',
    )
    batch_encoding = tokenizer(dataset["text"], **tokenizer_options)
    return batch_encoding.data

## Load the pre-trained model

In [None]:
# Hub id of the pre-trained checkpoint; the same id is used for the tokenizer
# here and for the model weights later on.
checkpoint = "CAMeL-Lab/bert-base-arabic-camelbert-ca" # Classical Arabic variant, presumably closer to the language of poems
# Tokenizer matching the checkpoint's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/468 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/297k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Tokenize the whole dataset batch by batch, spread over 8 worker processes.
tokenized_dataset = dataset.map(
    tokenize_dataset,
    batched=True,
    num_proc=8,
)

         

#0:   0%|          | 0/218 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/218 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/218 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/218 [00:00<?, ?ba/s]

  

#4:   0%|          | 0/218 [00:00<?, ?ba/s]

#5:   0%|          | 0/218 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/218 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/218 [00:00<?, ?ba/s]

In [None]:
# TPU run
# import tensorflow as tf
# print("Tensorflow version " + tf.__version__)

# try:
#   tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
#   print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
# except ValueError:
#   raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

# tf.config.experimental_connect_to_cluster(tpu)
# tf.tpu.experimental.initialize_tpu_system(tpu)
# tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
# resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
# strategy = tf.distribute.TPUStrategy(resolver)
# batch_size= 8*tpu_strategy.num_replicas_in_sync

In [None]:
# Examples per training batch.
batch_size = 8  # multiply by tpu_strategy.num_replicas_in_sync when running on TPU

In [None]:
# Collator that pads each batch with the tokenizer and returns TensorFlow tensors.
# (DefaultDataCollator(return_tensors="tf") was the earlier alternative.)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Model inputs and the label column taken from the tokenized train split.
feature_columns = ["attention_mask", "input_ids", "token_type_ids"]
label_columns = ["target"]

# Expose the train split as a shuffled, batched tf.data pipeline.
train_dts = tokenized_dataset['train'].to_tf_dataset(
    columns=feature_columns,
    label_cols=label_columns,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

In [None]:
#instantiate the model
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam 


# Training length and learning-rate schedule settings.
num_epochs = 3
start_lr = 1e-5       # rate returned for epoch 0
min_lr = 1e-5         # floor that the decay phase approaches
max_lr = 5e-5         # peak rate; scale by tpu_strategy.num_replicas_in_sync on TPU
rampup_epochs = 1     # epochs spent ramping linearly from start_lr to max_lr
sustain_epochs = 0    # epochs held at max_lr before decaying
exp_decay = .8        # per-epoch multiplier applied during the decay phase

def lrfn(epoch):
  """Return the learning rate for a zero-based epoch index.

  The schedule has three phases: a linear ramp from start_lr towards max_lr
  over `rampup_epochs`, a plateau at max_lr for `sustain_epochs`, then an
  exponential decay towards min_lr with factor `exp_decay` per epoch.
  """
  # Phase 1: linear warm-up.
  if epoch < rampup_epochs:
    slope = (max_lr - start_lr)/rampup_epochs
    return slope * epoch + start_lr

  # Phase 2: hold the peak rate.
  if epoch < rampup_epochs + sustain_epochs:
    return max_lr

  # Phase 3: exponential decay of the part above min_lr.
  epochs_into_decay = epoch-rampup_epochs-sustain_epochs
  return (max_lr - min_lr) * exp_decay**epochs_into_decay + min_lr
import tensorflow as tf
# Keras callback that queries lrfn for the learning rate of each epoch and,
# with verbose enabled, logs the value it sets.
lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=True)


def create_model():
  """Build and compile the sequence classifier used for meter detection.

  Loads the pre-trained `checkpoint` with a classification head of NB_LABELS
  outputs and stores the meter-name <-> class-id mapping from `lbl_dict` in the
  model config, so that a directory written by `save_pretrained` can translate
  predicted ids back to meter names.

  The previous version also computed an unused step count by materialising the
  whole `input_ids` column in memory; that has been removed.

  Returns:
    The compiled TFAutoModelForSequenceClassification model.
  """
  # The classification head returns unnormalised scores, hence from_logits=True.
  loss = tensorflow.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

  # str() keeps the mapping JSON-serialisable whatever the raw label type is.
  label2id = {str(label): idx for label, idx in lbl_dict.items()}
  id2label = {idx: label for label, idx in label2id.items()}

  model = TFAutoModelForSequenceClassification.from_pretrained(
      checkpoint,
      num_labels=NB_LABELS,
      id2label=id2label,
      label2id=label2id,
  )

  # 'adam' starts from the Keras default learning rate; the effective rate is
  # expected to come from a LearningRateScheduler callback passed to fit().
  model.compile(optimizer='adam', loss=loss, metrics=["accuracy"])

  return model

In [None]:
# To train on a TPU, build the model inside `with strategy.scope():` so it is
# created under the TPUStrategy; here it is built without a distribution strategy.
model = create_model()

# Print the layers and their parameter counts.
model.summary()

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-ca and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109081344 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  17687     
                                                                 
Total params: 109,099,031
Trainable params: 109,099,031
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train on the already-batched tf.data pipeline. batch_size is intentionally
# not passed: Keras requires it to be omitted for dataset inputs, because the
# dataset itself yields complete batches (sized when train_dts was built).
# lr_callback sets the optimizer learning rate at the start of every epoch.
# NOTE(review): num_epochs is defined as 3 but this run trains for 2 epochs —
# confirm which value is intended.
history = model.fit(
    train_dts,
    epochs=2,
    callbacks=[lr_callback],
)


Epoch 1: LearningRateScheduler setting learning rate to 1e-05.
Epoch 1/2

In [None]:
# test the model


In [None]:
# Save the fine-tuned model in Hugging Face format under model/meter_classifier.
# NOTE(review): the tokenizer is not saved alongside it — confirm whether the
# directory is meant to be loadable on its own.
model.save_pretrained("model/meter_classifier")
