# Fine-Tuning BERT for Multi-Class Classification with Custom Datasets

In [None]:
import torch, os
from google.colab import drive

# Mount Google Drive so the chapter folder is reachable from the Colab VM.
drive.mount("/content/drive")

# Use a single absolute path for both the check and the chdir. The original
# compared against ".../My Drive/..." but changed into the relative
# "drive/MyDrive/...", so the guard never matched and re-running the cell
# from inside the folder raised FileNotFoundError.
WORK_DIR = "/content/drive/MyDrive/akademi/Packt NLP with Transformers/CH05"
if os.getcwd() != WORK_DIR:
    os.chdir(WORK_DIR)

In [None]:
!pip install transformers datasets

## Loading Pre-Trained Model

In [None]:
from torch import cuda

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
if cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

### Obtaining and Preparing downstream task data

In [None]:
if "TTC4900.csv" not in os.listdir():
    !wget  https://raw.githubusercontent.com/savasy/TurkishTextClassification/master/TTC4900.csv
else:
    print("Already there !")

In [None]:
import pandas as pd

# Read the corpus and shuffle all rows with a fixed seed so the later
# positional train/val/test split is reproducible.
data = pd.read_csv("TTC4900.csv").sample(frac=1.0, random_state=42)
data.head(5)

In [None]:
# Category names in the order that defines their integer class ids.
labels = ["teknoloji", "ekonomi", "saglik", "siyaset", "kultur", "spor", "dunya"]
NUM_LABELS = len(labels)
# id -> name, then the inverse name -> id built from it.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
label2id

In [None]:
# Add an integer "labels" column: each category string is stripped of
# surrounding whitespace and looked up in label2id (an unknown category
# raises KeyError).
data["labels"] = data.category.map(lambda x: label2id[x.strip()])

In [None]:
data.head()

In [None]:
data.category.value_counts().plot(kind="pie", figsize=(8, 8))

In [None]:
from transformers import BertTokenizerFast

# `model_max_length` is the tokenizer option that `truncation=True` falls back
# to when a call gives no explicit `max_length`. The original passed
# `max_length`, which is not a tokenizer constructor option, so the 512-token
# limit was not actually configured here.
tokenizer = BertTokenizerFast.from_pretrained(
    "dbmdz/bert-base-turkish-uncased", model_max_length=512
)

In [None]:
from transformers import BertForSequenceClassification

# Load the pre-trained Turkish BERT with a classification head sized for our
# label set, attach the id/name mappings to its config, and move it to the
# selected device.
model = BertForSequenceClassification.from_pretrained(
    "dbmdz/bert-base-turkish-uncased",
    num_labels=NUM_LABELS,
    label2id=label2id,
    id2label=id2label,
).to(device)
model

## Splitting data


In [None]:
# Positional 50% / 25% / 25% split of the rows into train / validation / test.
SIZE = data.shape[0]
train_end = SIZE // 2
val_end = (3 * SIZE) // 4

train_texts = list(data.text[:train_end])
train_labels = list(data.labels[:train_end])

val_texts = list(data.text[train_end:val_end])
val_labels = list(data.labels[train_end:val_end])

test_texts = list(data.text[val_end:])
test_labels = list(data.labels[val_end:])

In [None]:
len(train_texts), len(val_texts), len(test_texts)

In [None]:
# Tokenize each split. The length cap is stated explicitly (matching the
# value used at inference time in `predict`) so truncation does not depend on
# whatever limit the loaded tokenizer happens to carry.
train_encodings = tokenizer(
    train_texts, truncation=True, padding=True, max_length=512
)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(
    test_texts, truncation=True, padding=True, max_length=512
)

In [None]:
from torch.utils.data import Dataset


class MyDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from feature name to an indexable collection
    of per-example values; ``labels`` is an indexable collection with one
    label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under "labels"."""
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Wrap each (encodings, labels) pair in a MyDataset, in train/val/test order.
train_dataset, val_dataset, test_dataset = (
    MyDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

## Training with Trainer Class

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1.

    ``pred`` must expose ``label_ids`` (the true class ids) and
    ``predictions`` (per-class scores); the predicted class is the argmax
    over the last axis of ``predictions``.

    Returns a dict with the keys "Accuracy", "F1", "Precision" and "Recall".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Index 0/1/2 of the result are precision/recall/F1; the fourth
    # element (support) is not used.
    macro_scores = precision_recall_fscore_support(y_true, y_pred, average="macro")
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "F1": macro_scores[2],
        "Precision": macro_scores[0],
        "Recall": macro_scores[1],
    }

In [None]:
training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written
    output_dir="./TTC4900Model",
    do_train=True,
    do_eval=True,
    # The number of epochs, defaults to 3.0
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Number of steps used for a linear warmup
    warmup_steps=100,
    weight_decay=0.01,
    logging_strategy="steps",
    # TensorBoard log directory
    logging_dir="./multi-class-logs",
    logging_steps=50,
    # NOTE(review): recent transformers releases name this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=50,
    # load_best_model_at_end needs a checkpoint at every evaluation, so the
    # save schedule must match the evaluation schedule (the original mixed
    # step-based evaluation with epoch-based saving, which current
    # transformers rejects with a ValueError).
    save_strategy="steps",
    save_steps=50,
    # Saving every 50 steps writes many checkpoints; keep only the most recent
    # ones (the best checkpoint is retained for load_best_model_at_end).
    save_total_limit=2,
    # Mixed precision needs a GPU; stay in full precision on the CPU fallback.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
)

In [None]:
# Wire the model, the hyper-parameters and the data into a Trainer.
trainer = Trainer(
    # hyper-parameters and schedules defined in the previous cell
    args=training_args,
    # the pre-trained model whose weights will be fine-tuned
    model=model,
    # metric callback applied to every evaluation run
    compute_metrics=compute_metrics,
    # data used for optimisation and for periodic evaluation
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
trainer.train()

In [None]:
# Evaluate the fine-tuned model on every split and tabulate the first five
# reported columns, one row per split.
splits = {"train": train_dataset, "val": val_dataset, "test": test_dataset}
q = [trainer.evaluate(eval_dataset=split) for split in splits.values()]
pd.DataFrame(q, index=list(splits)).iloc[:, :5]

In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast

In [None]:
def predict(text):
    """Classify ``text`` with the module-level ``model`` and ``tokenizer``.

    The text is tokenized (truncated to 512 tokens), run through the model
    without gradient tracking, and the logits are converted to
    probabilities with a softmax over the class axis.

    Returns a tuple ``(probs, index, label)``: the probability tensor, the
    argmax over it as a tensor, and the label name looked up in
    ``model.config.id2label``. The argmax is taken over the whole tensor, so
    the index is only meaningful for a single input text.
    """
    # Follow the model's own device instead of hard-coding "cuda", so this
    # also works on CPU-only runtimes and after the model is reloaded.
    inputs = tokenizer(
        text, padding=True, truncation=True, max_length=512, return_tensors="pt"
    ).to(model.device)
    # Inference only: disable dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    best = probs.argmax()
    return probs, best, model.config.id2label[best.item()]

In [None]:
# Example #1
text = "Fenerbahçeli futbolcular kısa paslarla hazırlık çalışması yaptılar"
predict(text)

## Save and Re-Load saved model for inference

In [None]:
!pip install transformers

In [None]:
# Save the fine-tuned model and its tokenizer into the same directory so the
# pair can be reloaded together from `model_path`.
model_path = "turkish-text-classification-model"
trainer.save_model(model_path)
# Kept as the last expression: its return value is what the cell displays.
tokenizer.save_pretrained(model_path)

In [None]:
# Reload the saved model and tokenizer from disk and wrap them in an
# inference pipeline. This rebinds the module-level `model` and `tokenizer`.
model_path = "turkish-text-classification-model"
from transformers import pipeline, BertForSequenceClassification, BertTokenizerFast

model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizerFast.from_pretrained(model_path)
# This is a topic classifier, not a sentiment model: use the
# "text-classification" task name ("sentiment-analysis" is an alias for the
# same pipeline, but misleading here).
nlp = pipeline("text-classification", model=model, tokenizer=tokenizer)

In [None]:
nlp("Sinemada hangi filmler oynuyor bugün")

In [None]:
nlp("Dolar ve Euro bugün yurtiçi piyasalarda yükseldi")

In [None]:
nlp(
    "Bayern Münih ile Barcelona bugün karşı karşıya geliyor. Maçı İngiliz hakem James Watts yönetecek!"
)