In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
! pip install transformers datasets
! pip install -U accelerate
! pip install -U transformers
! pip install evaluate

# Fine-tune a pretrained model

## Prepare a dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/receipt-ocr/PROCCESS_DATA.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102538 entries, 0 to 102537
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    102538 non-null  object
 1   label   102538 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.6+ MB


In [6]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
from datasets import Dataset, DatasetDict


# Assuming df is your Pandas DataFrame
train_text, val_text, train_labels, val_labels = train_test_split(df['text'].to_list() ,df['label'].to_list(),
                                                                            random_state = 42,
                                                                            test_size = 0.2,
                                                                            stratify = df['label'])



dataset_dict = {
    'train': {
        'label': train_labels,
        'text': train_text
    },
    'val': {
        'label': val_labels,
        'text': val_text
    }
}

In [None]:
for split in ['train', 'val']:
    temp = pd.DataFrame(dataset_dict[split])[["text","label"]]
    dataset_dict[split] = Dataset.from_pandas(temp)

# DatasetDict'i Dataset formatına dönüştürme
dataset = DatasetDict(dataset_dict)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 82030
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 20508
    })
})


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dbmdz/convbert-base-turkish-cased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/82030 [00:00<?, ? examples/s]

Map:   0%|          | 0/20508 [00:00<?, ? examples/s]

## Train

## Train with PyTorch Trainer

In [None]:
pd.DataFrame(dataset["train"]["label"]).value_counts()

0    42007
1    26337
2    13686
dtype: int64

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("dbmdz/convbert-base-turkish-cased", num_labels=3)

model.safetensors:   0%|          | 0.00/427M [00:00<?, ?B/s]

Some weights of ConvBertForSequenceClassification were not initialized from the model checkpoint at dbmdz/convbert-base-turkish-cased and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Evaluate

In [None]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

If you'd like to monitor your evaluation metrics during fine-tuning, specify the `evaluation_strategy` parameter in your training arguments to report the evaluation metric at the end of each epoch:

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="Fine-tune-dbmdz/convbert-base-turkish-cased",
    push_to_hub=True,
    do_train=True,
    do_eval=True,
    num_train_epochs=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=1,
    load_best_model_at_end=True
)


### Trainer

Create a [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) object with your model, training arguments, training and test datasets, and evaluation function:

In [None]:
from transformers import Trainer, EarlyStoppingCallback

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
    compute_metrics=compute_metrics,
)

Then fine-tune your model by calling [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train):

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3662,0.378602,0.871904
2,1.008,1.006377,0.512142
3,1.0066,1.007328,0.512093
4,1.0092,1.006884,0.512093
5,1.0062,1.006561,0.512093


TrainOutput(global_step=51270, training_loss=0.8443850358606245, metrics={'train_runtime': 13638.1024, 'train_samples_per_second': 601.477, 'train_steps_per_second': 75.186, 'total_flos': 1.038700086547968e+17, 'train_loss': 0.8443850358606245, 'epoch': 5.0})

In [None]:
trainer.push_to_hub()

events.out.tfevents.1710202977.870d41ddbf10.3152.0:   0%|          | 0.00/28.6k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Apoksk1/convbert-base-turkish-cased/commit/6adea47795037df63555ddc64f26f393ed17e742', commit_message='End of training', commit_description='', oid='6adea47795037df63555ddc64f26f393ed17e742', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
trainer.save_model("/content/drive/MyDrive/receipt-ocr/Fine-tune-dbmdz/convbert-base-turkish-cased")

In [None]:
trainer.save_tokenizer("/content/drive/MyDrive/receipt-ocr/Fine-tune-dbmdz/convbert-base-turkish-cased")

In [None]:
predictions = trainer.predict(tokenized_datasets["val"])
print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
import evaluate

metric = evaluate.load("accuracy")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8907743319680125}

In [None]:
from sklearn.metrics import classification_report

def evaluate_metrics(predictions, labels):
    # Convert predictions and labels into lists
    preds_list = predictions.tolist()
    labels_list = labels.tolist()

    # Calculate classification report
    report = classification_report(labels_list, preds_list)

    return report

print(evaluate_metrics(predictions=preds, labels=predictions.label_ids))

              precision    recall  f1-score   support

           0       0.91      0.94      0.93     10502
           1       0.96      0.93      0.94      6584
           2       0.70      0.66      0.68      3422

    accuracy                           0.89     20508
   macro avg       0.86      0.84      0.85     20508
weighted avg       0.89      0.89      0.89     20508

