This notebook fine-tunes a model for comment generation. We do so by fine-tuning the model on a masked language modeling (MLM) task, using the following dataset: https://github.com/IsraelAbebe/An-Amharic-News-Text-classification-Dataset

In [None]:
from google.colab import drive

# Mount Google Drive at /content/gdrive; later cells read the dataset from,
# and write the splits and the fine-tuned model to, paths under this mount.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pip install transformers sentencepiece torch datasets --quiet

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the Amharic-adapted XLM-RoBERTa checkpoint
# (the same checkpoint name is used further down to load the model).
tokenizer = AutoTokenizer.from_pretrained("Davlan/xlm-roberta-base-finetuned-amharic")
# Alternative checkpoint kept for reference (multilingual BERT variant):
#tokenizer = AutoTokenizer.from_pretrained("Davlan/bert-base-multilingual-cased-finetuned-amharic")

In [None]:
from datasets import load_dataset
from datasets import Dataset
import pandas as pd

from sklearn.model_selection import train_test_split

# Build a single-column text corpus from the Amharic news CSV and write
# train/validation splits back to Drive, where `load_dataset` picks them up.
df = pd.read_csv('/content/gdrive/MyDrive/amharic/AmharicNewsDataset.csv')
# Only the free-text columns (headline, article) are used from here on.
df = df.drop(['category','date','views', 'link'], axis=1)

headline = df['headline']
article = df['article']

# Stack headlines and articles into one Series. `Series.append` was deprecated
# in pandas 1.4 and removed in 2.0; `pd.concat` is the supported equivalent.
headline = pd.concat([headline, article])

# `.values` discards the (now duplicated) index; rows with missing text are dropped.
textdf = pd.DataFrame({'text':headline.values})
textdf = textdf.dropna()

# Fixed seed so the 80/20 split is identical on every Restart & Run All.
train, test = train_test_split(textdf, test_size=0.2, random_state=42)

train.to_csv("/content/gdrive/MyDrive/amharic/AmharicNewsDatasetTrunc2.csv", index=False)
test.to_csv("/content/gdrive/MyDrive/amharic/AmharicNewsDatasetTruncTest2.csv", index=False)

In [None]:
# Load the two CSV splits written to Drive as a DatasetDict with
# "train" and "validation" entries.
split_files = {
    "train": '/content/gdrive/MyDrive/amharic/AmharicNewsDatasetTrunc2.csv',
    "validation": '/content/gdrive/MyDrive/amharic/AmharicNewsDatasetTruncTest2.csv',
}
dataset = load_dataset('csv', data_files=split_files)

Using custom data configuration default-a15eda5d8a6626ab


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-a15eda5d8a6626ab/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff...


0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-a15eda5d8a6626ab/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff. Subsequent calls will reuse this data.


In [None]:
def tokenize_function(examples):
    """Tokenize a batch of rows from the ``text`` column.

    No padding is applied here. The resulting token lists are concatenated and
    re-cut into fixed-size blocks by ``group_texts``; padding every text to the
    tokenizer's maximum length first would make most of those blocks consist
    of pad tokens rather than real text.

    Args:
        examples: Batch mapping that contains a ``"text"`` list of strings.

    Returns:
        The tokenizer output for the batch (``truncation=True`` still caps each
        individual text at the tokenizer's maximum length).
    """
    return tokenizer(examples["text"], truncation=True)

# Replace the raw ``text`` column with the tokenizer's output columns.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text'])

  0%|          | 0/83 [00:00<?, ?ba/s]

  0%|          | 0/21 [00:00<?, ?ba/s]

In [None]:
# Length, in tokens, of each chunk produced by `group_texts`.
# block_size = tokenizer.model_max_length
block_size = 128

In [None]:
def group_texts(examples, chunk_size=None):
    """Concatenate a batch of tokenized columns and re-split them into equal blocks.

    Args:
        examples: Mapping from column name to a list of per-row lists, as
            passed by ``Dataset.map(..., batched=True)``.
        chunk_size: Length of every output block. When ``None`` (the default)
            the notebook-level ``block_size`` is used.

    Returns:
        A dict with the same keys as ``examples``, each holding a list of
        blocks of exactly ``chunk_size`` items, plus a ``"labels"`` entry that
        is a shallow copy of the ``"input_ids"`` blocks. Trailing items that
        do not fill a complete block are dropped.

    Raises:
        KeyError: If ``examples`` has no ``"input_ids"`` column.
    """
    from itertools import chain

    size = block_size if chunk_size is None else chunk_size

    # Flatten each column in linear time; ``sum(lists, [])`` re-copies the
    # accumulated list for every row and is quadratic in the batch size.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder so every block has exactly ``size`` items.
    total_length = (total_length // size) * size
    # Split each flattened column into consecutive blocks.
    result = {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated_examples.items()
    }
    # Labels start out as the unmasked inputs.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
from transformers import Trainer, TrainingArguments

# Hyperparameters for the MLM fine-tuning run; checkpoints are written to ./xlmr-mlm.
training_args = TrainingArguments(
    output_dir="xlmr-mlm",
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # save_total_limit=3,  # enable to keep only the 3 most recent checkpoints
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Re-chunk the tokenized splits into fixed-length blocks with `group_texts`,
# processing 1000 rows per call across 4 worker processes.
map_options = dict(batched=True, batch_size=1000, num_proc=4)
lm_datasets = tokenized_datasets.map(group_texts, **map_options)

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-a15eda5d8a6626ab/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-6532b8f078945fbd.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-a15eda5d8a6626ab/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-02d1b2ec21efedeb.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-a15eda5d8a6626ab/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-22da69721761c2b1.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-a15eda5d8a6626ab/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-9f051f20527189aa.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-a15eda5d8a6626ab/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-58013143e87f044a.arrow
Loadi

In [None]:
from transformers import AutoModelForMaskedLM
# Load the pretrained checkpoint with a masked-LM head; the checkpoint name
# matches the one used for the tokenizer.
model = AutoModelForMaskedLM.from_pretrained("Davlan/xlm-roberta-base-finetuned-amharic")

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [None]:
from transformers import DataCollatorForLanguageModeling
# Collator that applies random masking when batches are assembled;
# mlm_probability is the per-token probability of being selected for masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Wire the model, the chunked train/validation splits and the masking collator
# into a Trainer configured by `training_args`.
trainer_inputs = {
    "model": model,
    "args": training_args,
    "data_collator": data_collator,
    "train_dataset": lm_datasets["train"],
    "eval_dataset": lm_datasets["validation"],
}
trainer = Trainer(**trainer_inputs)

In [None]:
# Run fine-tuning. The recorded run below took about 27,000 s (~7.5 h) for 3 epochs.
trainer.train()

***** Running training *****
  Num examples = 328984
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 123369


Epoch,Training Loss,Validation Loss
1,1.012,0.878551


***** Running Evaluation *****
  Num examples = 82212
  Batch size = 8
Saving model checkpoint to xlmr-mlm/checkpoint-41123
Configuration saved in xlmr-mlm/checkpoint-41123/config.json
Model weights saved in xlmr-mlm/checkpoint-41123/pytorch_model.bin


Epoch,Training Loss,Validation Loss
1,1.012,0.878551
2,0.9526,0.841523
3,0.8989,0.81407


***** Running Evaluation *****
  Num examples = 82212
  Batch size = 8
Saving model checkpoint to xlmr-mlm/checkpoint-82246
Configuration saved in xlmr-mlm/checkpoint-82246/config.json
Model weights saved in xlmr-mlm/checkpoint-82246/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 82212
  Batch size = 8
Saving model checkpoint to xlmr-mlm/checkpoint-123369
Configuration saved in xlmr-mlm/checkpoint-123369/config.json
Model weights saved in xlmr-mlm/checkpoint-123369/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from xlmr-mlm/checkpoint-123369 (score: 0.8140695691108704).


TrainOutput(global_step=123369, training_loss=0.9556707426063638, metrics={'train_runtime': 27063.4934, 'train_samples_per_second': 36.468, 'train_steps_per_second': 4.559, 'total_flos': 6.510899036141568e+16, 'train_loss': 0.9556707426063638, 'epoch': 3.0})

In [None]:
# Persist the fine-tuned model (weights + config) to Drive.
SAVE_DIR = '/content/gdrive/MyDrive/amharic/xlmr_save2'
trainer.save_model(SAVE_DIR)
# The Trainer was constructed without a `tokenizer`, so `save_model` writes only
# the model files. Save the tokenizer next to them so the directory can be
# loaded on its own with `from_pretrained(SAVE_DIR)`.
tokenizer.save_pretrained(SAVE_DIR)

Saving model checkpoint to /content/gdrive/MyDrive/amharic/xlmr_save2
Configuration saved in /content/gdrive/MyDrive/amharic/xlmr_save2/config.json
Model weights saved in /content/gdrive/MyDrive/amharic/xlmr_save2/pytorch_model.bin
