In [None]:
# run this first cell and then do "restart and run all" from the run menu
!pip install -U accelerate

In [None]:
##### CONFIG ####

# Dataset identifier; it is part of the output directory name below.
dataset_name = "ritter"

# Candidate pretrained checkpoints, addressed by a small integer key.
model_dict = {
    1: "camembert-base",
    2: "cmarkea/distilcamembert-base",
    3: "bert-base-multilingual-cased",
    4: "distilbert-base-multilingual-cased",
    5: "bert-base-multilingual-uncased",
    6: "distilbert-base-multilingual-uncased",
}

# Checkpoint selected for this run.
model_key = 2
model_ckpt = model_dict[model_key]

# Training hyper-parameters.
batch_size = 16
max_epochs = 20

# Boolean switches.
use_drive = False
save_model = False

# Output dir format: <checkpoint>__finetuned__<dataset>
output_dir = "__".join([model_ckpt, "finetuned", dataset_name])

In [None]:
# Print the version of the `python` executable found by the shell.
!python --version

In [None]:
! pip install transformers datasets evaluate sentencepiece

In [None]:
import os
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, IntervalStrategy
from datasets import Features, Value, ClassLabel, Dataset, DatasetDict
import evaluate
import pandas as pd
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report, accuracy_score, f1_score
import matplotlib.pyplot as plt
import torch

Les données sont sur seafile mais on peut aussi importer le document depuis son Drive 

In [None]:
![[ ! -d data ]] && mkdir data || echo "data directory exists"
!wget -O "data/ritter.csv" https://seafile.unistra.fr/f/0a0d7f59e86e4684b1d1/?dl=1

In [None]:
# Mount Google Drive only when requested. `use_drive` is set once in the CONFIG
# cell and is intentionally not reassigned here, so the configured value is
# the one that takes effect.
if use_drive:
    from google.colab import drive
    drive.mount("/content/drive")

In [None]:
# Path of the CSV file to load (same path the wget cell writes to).
mydf = "data/ritter.csv"

In [None]:
# Name of the column that holds the class label.
labelCol = 'label'

In [None]:
# Load the CSV; the description column is read as object and the label
# column as a categorical.
column_dtypes = {'description': 'object', labelCol: 'category'}
df = pd.read_csv(mydf, sep=",", dtype=column_dtypes)

In [None]:
# Drop duplicate rows, then every row that has at least one missing value,
# and print a summary of what is left.
df = df.drop_duplicates().dropna(how='any')
df.info()

In [None]:
# Number of rows per class. Uses labelCol instead of a hard-coded column name
# so this cell stays consistent with the rest of the notebook.
df[labelCol].value_counts()

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Sorted list of classes, plus a numeric identifier for each class
# (the identifier is the position of the class in the sorted list).
class_names = sorted(df[labelCol].unique().categories.to_list())
label2id = {name: idx for idx, name in enumerate(class_names)}
id2label = dict(enumerate(class_names))

In [None]:
# Display the class list and both lookup tables.
class_names, label2id, id2label

In [None]:
# Parameters

# batch_size is defined once in the CONFIG cell and is intentionally not
# reassigned here, so a change made in CONFIG is the value actually used.
# History from earlier runs: the original value was 64, lowered because of a
# CUDA OutOfMemory error; 16 was the value tried for camembert-base.

# Proportion of the data that will be used.
# NOTE(review): `scale` is not read anywhere else in the visible notebook — confirm it is still needed.
scale = 0.2

In [None]:
# New DataFrame holding the model inputs: the description as text, and the
# target class (the region / province) as its numeric identifier.
data_df = pd.DataFrame({
    'text': df.description,
    'label': df[labelCol].map(label2id),
})

In [None]:
# Replace the index with a fresh default one, discarding the old index.
data_df = data_df.reset_index(drop=True)

In [None]:
# Convert the DataFrame into the Dataset object type used by HuggingFace,
# declaring 'text' as a string and 'label' as a ClassLabel over class_names.
label_feature = ClassLabel(names=class_names)
features = Features({'text': Value('string'), 'label': label_feature})
data = Dataset.from_pandas(data_df, features=features)


In [None]:

# Two successive shuffled 80/20 splits with a fixed seed: the first holds out
# the test set, the second carves the validation set out of the remainder.
split_options = dict(test_size=0.2, shuffle=True, seed=12)
trainvalid_test = data.train_test_split(**split_options)
trainvalid_trainvalid = trainvalid_test["train"].train_test_split(**split_options)
named_splits = {
    "train": trainvalid_trainvalid["train"],
    "valid": trainvalid_trainvalid["test"],
    "test": trainvalid_test["test"],
}
data = DatasetDict(named_splits)

In [None]:
# Display the train / valid / test splits.
data

In [None]:
# steps = (epochs * training examples) / batch size
# Number of training steps if all max_epochs are run. Computed from the
# configured max_epochs / batch_size and the size of the training split rather
# than from hard-coded numbers and the length of the full frame.
total_steps = (max_epochs * len(data['train'])) / batch_size
total_steps

In [None]:
# Number of batches in one pass over the training split (not rounded).
# This is unused unless steps are used as the strategy in the training args.
epoch_size = len(data['train'])/batch_size

In [None]:
# Report the value of epoch_size.
print(f"EPOCH SIZE IS: {epoch_size}")

When needed, export the dataset to verify later that it's indeed the same for all experiments

In [None]:
# Export every split to CSV on Drive so the exact data used can be compared
# across experiments. Nothing is written when Drive is not in use.
if use_drive:
    dataset_bkps = "/content/drive/MyDrive/ner_model/datasets"
    current_dataset = "ritter"
    export_dir = os.path.join(dataset_bkps, current_dataset)
    # Make sure the target folder exists before writing into it.
    os.makedirs(export_dir, exist_ok=True)
    for split, split_data in data.items():
        split_data.to_csv(os.path.join(export_dir, f"{current_dataset}-{split}.csv"))

## Entraînement du modèle

In [None]:
# Chargement du tokéniseur pré-entraîné correspondant au modèle utilisé
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are truncated but deliberately not padded here. Padding is left
    to the ``DataCollatorWithPadding`` handed to the ``Trainer``, so each batch
    is padded only to its own longest sequence. Padding at this stage would
    pad every example to the longest sequence of the whole mapped batch, which
    wastes memory and compute.

    Args:
        examples: Batch of dataset rows exposing a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True)

In [None]:
%pdb

In [None]:
# Tokenize all of the data: each unit is replaced by a numeric identifier.
# NOTE(review): batch_size=None presumably passes each whole split to
# preprocess_function as a single batch — confirm against the datasets docs.
tokenized_data = data.map(preprocess_function, batched=True, batch_size=None)

In [None]:
# Vocabulary size
tokenizer.vocab_size

In [None]:
# Maximum context size
tokenizer.model_max_length

In [None]:
# Load the "accuracy" metric; it is used by compute_metrics.
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The predicted class of each
            row is the argmax of ``predictions`` along axis 1.

    Returns:
        The result of ``accuracy.compute`` for these predictions and labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Use CUDA when it is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Sequence-classification model with one output per class and both label
# lookup tables attached, moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(class_names),
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

In [None]:
# batch_size is set once in the CONFIG cell and is not reassigned here, so a
# change made in CONFIG is the value actually used for training.
assert isinstance(batch_size, int) and batch_size > 0, "batch_size must be a positive integer"

In [None]:
# Number of batches needed to cover the training split once, rounded up.
# NOTE(review): in the visible notebook this is only referenced by the
# disabled step-based TrainingArguments cell.
from math import ceil
intended_eval_steps = ceil(len(data['train'])/batch_size)

In [None]:
# Disabled alternative: step-based evaluation/checkpointing every
# `intended_eval_steps` steps. The `if False:` guard means this never runs.
# NOTE(review): `fraction_kept` and `run_number` are not defined anywhere in
# the visible notebook, so enabling this block as-is would raise a NameError.
# was recommended at https://stackoverflow.com/questions/69087044/early-stopping-in-bert-trainer-instances, but seems
# to no longer be necessary
# on 'eval_loss' as metric https://discuss.huggingface.co/t/early-stopping-training-using-validation-loss-as-the-metric-for-best-model/31378
if False:
  training_args = TrainingArguments(
      output_dir=f"{model_ckpt}__finetuned__{dataset_name}__{fraction_kept}__{run_number}",
      learning_rate=2e-5,
      num_train_epochs=max_epochs,
      evaluation_strategy=IntervalStrategy.STEPS,
      eval_steps = intended_eval_steps,
      save_steps = intended_eval_steps,
      save_total_limit = 3,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      weight_decay=0.01,
      save_strategy=IntervalStrategy.STEPS,
      load_best_model_at_end=True,
      metric_for_best_model='eval_loss',
  )

In [None]:
# Epoch-based evaluation and checkpointing; at the end of training the best
# checkpoint according to eval_loss is reloaded.
# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. The install cell does not pin
# a version, so use whichever name the installed TrainingArguments defines.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=max_epochs,
    weight_decay=0.01,
    save_total_limit=3,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    **{eval_strategy_arg: "epoch"},
)

Now we use the train and validation splits for fine-tuning. The test split is only used to evaluate the model after fine-tuning.

In [None]:
# Collator built from the tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Early stopping with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
# Fine-tune on the train split and evaluate on the valid split.
train_split = tokenized_data["train"]
valid_split = tokenized_data["valid"]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=valid_split,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Display the batch size in effect just before training.
batch_size

In [None]:
# Run the fine-tuning.
trainer.train()

In [None]:
# Run the trainer's model on the test split.
preds_output = trainer.predict(tokenized_data['test'])

In [None]:
# Predicted class id per test example: argmax of the predictions along axis 1.
y_preds = np.argmax(preds_output.predictions, axis=1)

In [None]:
# Reference labels of the test split.
y_test = tokenized_data['test']['label']

In [None]:
# Class names, read from the 'label' feature of the test split.
labels = tokenized_data['test'].features['label'].names

In [None]:
# fraction kept to color map


In [None]:
def plot_confusion_matrix(y_preds, y_true, labels, save_dir=None, file_stem="confusion_matrix_norm"):
    """Plot the confusion matrix normalized over the true labels.

    Args:
        y_preds: Predicted class ids.
        y_true: Reference class ids.
        labels: Class names; each one is shortened to its first four
            characters followed by a dot for the tick labels.
        save_dir: Directory in which to save the figure as PDF and PNG. It is
            created if missing. When ``None`` (default) the figure is only
            shown and nothing is written to disk.
        file_stem: File name, without extension, used for the saved figures.
    """
    cm = confusion_matrix(y_true, y_preds, normalize="true")
    fig, ax = plt.subplots(figsize=(6, 6))
    labels_for_fig = [l[0:4]+'.' for l in labels]
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=labels_for_fig)
    disp.plot(cmap="Greens", values_format=".2f", ax=ax, colorbar=False)
    plt.title("Normalized confusion matrix")
    if save_dir is not None:
        os.makedirs(save_dir, exist_ok=True)
        # Same stem for both formats, so the two file names cannot drift apart.
        for ext in ("pdf", "png"):
            plt.savefig(os.path.join(save_dir, f"{file_stem}.{ext}"), format=ext, bbox_inches='tight')
    plt.show()


# Save on Drive only when Drive is in use; otherwise write to a local folder
# instead of failing on a Drive path that does not exist.
figure_dir = "/content/drive/MyDrive/ner_model/datasets/output" if use_drive else "figures"
# Name the files after the selected checkpoint (without its hub namespace) so
# runs with different models do not overwrite each other.
model_tag = model_ckpt.split("/")[-1]
plot_confusion_matrix(y_preds, y_test, labels, save_dir=figure_dir, file_stem=f"{model_tag}_norm")

In [None]:
# Raw-count confusion matrix drawn with mlxtend. The import is aliased so it
# does not replace the notebook's own plot_confusion_matrix function.
from mlxtend.plotting import plot_confusion_matrix as mlxtend_plot_confusion_matrix

conf_matrix = confusion_matrix(y_test, y_preds)
fig, ax = mlxtend_plot_confusion_matrix(conf_mat=conf_matrix, figsize=(6, 6), cmap=plt.cm.Greens, class_names=class_names)
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Réels', fontsize=18)
plt.title('Matrice de confusion', fontsize=18)

# Save on Drive only when Drive is in use; otherwise write to a local folder
# instead of failing on a Drive path that does not exist.
figure_dir = "/content/drive/MyDrive/ner_model/datasets/output" if use_drive else "figures"
os.makedirs(figure_dir, exist_ok=True)
# Name the files after the selected checkpoint (without its hub namespace);
# one shared stem keeps the PDF and PNG names identical apart from the extension.
model_tag = model_ckpt.split("/")[-1]
for ext in ("pdf", "png"):
    plt.savefig(os.path.join(figure_dir, f"{model_tag}.{ext}"), format=ext, bbox_inches='tight')

In [None]:
# Classification report for the test predictions, with 4 decimal places.
report = classification_report(y_test, y_preds, digits=4)
print(f"Classification report:\n\n{report}")

In [None]:
# Write the fine-tuned model and tokenizer only when the `save_model` flag
# from the CONFIG cell is set; the flag was otherwise never consulted.
if save_model:
    # Same folder name in both cases; it lives under the Drive project folder
    # when Drive is in use, otherwise under the current working directory.
    save_path = f"{model_ckpt}-finetuned-ritter"
    if use_drive:
        save_path = os.path.join("/content/drive/MyDrive/ner_model", save_path)
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
else:
    print("save_model is False: the fine-tuned model was not written to disk")