<a href="https://colab.research.google.com/github/hselino/complaint_analysis/blob/main/class_bturk_fine_tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from, and
# write the fine-tuned model to, paths under /content/drive/MyDrive/prj.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers datasets scikit-learn evaluate

Collecting datasets
  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Using cached datasets-3.5.0-py3-none-any.whl (491 kB)
Using cached evaluate-0.4.3-py3-none-any.whl (84 kB)
Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Using cached fsspec-2024.12.0-py3-none-any.whl (183 kB)
Using cached multiprocess-0.70.16-py311-none-any.whl (143 kB)
Using cached xxhash-3.5.0-cp311-c

In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder


In [None]:
import pandas as pd

# Read the complaints CSV from Drive and discard the "Unnamed: 0" column if the
# file carries one (errors='ignore' makes the drop a no-op when it is absent).
df = (
    pd.read_csv("/content/drive/MyDrive/prj/segmented_class.csv")
    .drop(columns=["Unnamed: 0"], errors='ignore')
)


In [None]:
# Preview the first rows; the columns used later are `complaint_text` and `category`.
df.head()

Unnamed: 0,complaint_text,category
0,kendi homepoint finansal olarak kısmen ol cidd...,ödeme süreci sırasında sorun
1,equifax kredi raporum sahtekarlık uyarım var r...,raporunuzun uygunsuz kullanımı
2,son zaman aldatmaca işle para kimlik çal kişis...,borçlu borç alma denemeleri
3,dağ zirves financial tan kredi ön plan çık ilg...,borçlu servic iletişime geçilemiyor
4,arizona doğum tarih ss transunion llc tüket ci...,raporunuzdaki yanlış bilgiler


In [None]:
# Drop rows that have a missing value in any column, then turn the string
# categories into consecutive integer ids for the classifier.
df = df.dropna()
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['category'])

# Category name -> integer id. LabelEncoder assigns each class its position in
# `classes_`, so enumerate() yields the same ids as transform() would, but as
# plain Python ints (numpy.int64 values cannot be written with json.dump).
label_mapping = {label: idx for idx, label in enumerate(label_encoder.classes_)}
num_labels = len(label_mapping)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split is stratified on the label
# so both parts keep the same class proportions. Stratification needs at least
# two rows per class; if any class has fewer, fall back to a plain random split
# instead of raising.
label_counts = df['encoded_label'].value_counts()
stratify_labels = df['encoded_label'] if label_counts.min() >= 2 else None

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['complaint_text'].tolist(),
    df['encoded_label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)


In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier of the pretrained Turkish BERT model; the tokenizer is
# loaded from the same identifier.
model_name = "dbmdz/bert-base-turkish-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer from `model_name` (defined in the previous cell) instead of
# repeating the checkpoint string, so the tokenizer cannot drift from it.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Maximum number of tokens kept per complaint; longer inputs are truncated.
MAX_LENGTH = 128

# padding=True pads every sequence in a split to the longest one in that split.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LENGTH)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=MAX_LENGTH)

In [None]:
import torch

class TurkishDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with class labels.

    Args:
        encodings: Mapping from field name to a per-sample indexable sequence.
        labels: Indexable sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one sample: every encoding field as a tensor, plus its label
        # stored under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a dataset object for the Trainer.
train_dataset = TurkishDataset(encodings=train_encodings, labels=train_labels)
val_dataset = TurkishDataset(encodings=val_encodings, labels=val_labels)


In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Seed the RNGs before the model is built: the classification head is newly
# initialised (see the notice in this cell's output), so without a seed its
# starting weights differ on every run.
from transformers import set_seed

set_seed(42)

# Store the class names in the model config so that save_pretrained() persists
# them and a reloaded model can decode its own predictions.
id2label = {idx: str(label) for idx, label in enumerate(label_encoder.classes_)}
label2id = {label: idx for idx, label in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing `label_ids` (true class ids) and `predictions`
            (per-class scores; the argmax over axis 1 is the predicted class id).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)

    # zero_division=0 scores a class that is never predicted as 0 — the same
    # value the default produces — without emitting an UndefinedMetricWarning
    # on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Hyper-parameters for the fine-tuning run, collected in one place.
# Evaluation and checkpointing both happen once per epoch, and the best
# checkpoint is reloaded when training finishes.
training_config = dict(
    output_dir="./results",
    logging_dir='./logs',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

training_args = TrainingArguments(**training_config)

# Early stopping is configured with a patience of 2.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Tie together the model, the training arguments, both datasets, the metric
# function and the early-stopping callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run fine-tuning. Training is configured for up to 10 epochs, but the
# EarlyStoppingCallback (patience=2) registered on the trainer can end it sooner:
# in the recorded run below it stopped after epoch 5, two epochs after the
# lowest validation loss (epoch 3).
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,2.7258,2.549479,0.361,0.255136,0.252427,0.361
2,2.3063,2.326189,0.3915,0.311331,0.274518,0.3915
3,2.0253,2.263855,0.403,0.3347,0.32751,0.403
4,1.7543,2.276029,0.419,0.36035,0.34236,0.419
5,1.4758,2.375791,0.4215,0.363795,0.357676,0.4215


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=5000, training_loss=2.1230431030273436, metrics={'train_runtime': 299.8132, 'train_samples_per_second': 266.833, 'train_steps_per_second': 33.354, 'total_flos': 2634323374080000.0, 'train_loss': 2.1230431030273436, 'epoch': 5.0})

In [None]:
# Target directory on Drive for the fine-tuned artifacts.
model_path = "/content/drive/MyDrive/prj/fine_tuned_model-class_model"

# Write the model and then the tokenizer files into the same directory; the
# tokenizer call stays last so the cell displays the saved file paths.
model.save_pretrained(save_directory=model_path)
tokenizer.save_pretrained(save_directory=model_path)


('/content/drive/MyDrive/prj/fine_tuned_model-class_model/tokenizer_config.json',
 '/content/drive/MyDrive/prj/fine_tuned_model-class_model/special_tokens_map.json',
 '/content/drive/MyDrive/prj/fine_tuned_model-class_model/vocab.txt',
 '/content/drive/MyDrive/prj/fine_tuned_model-class_model/added_tokens.json',
 '/content/drive/MyDrive/prj/fine_tuned_model-class_model/tokenizer.json')

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Reload the model and tokenizer from the directory written by the save cell,
# rebinding `model` and `tokenizer` to the reloaded objects.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


In [None]:
def predict(texts, batch_size=32):
    """Predict the complaint category for one or more texts.

    Uses the module-level `model`, `tokenizer` and `label_encoder`.

    Args:
        texts: A single string or an iterable of strings to classify.
        batch_size: Number of texts sent through the model per forward pass,
            which bounds memory use on large inputs.

    Returns:
        Array of category names, one per input text (empty for empty input).
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to tokenize; return an empty result instead of failing.
        return label_encoder.inverse_transform([])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Make sure dropout is disabled even if the model was left in training mode.
    model.eval()

    batch_predictions = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]

            # Tokenize and send tensors to same device as model
            inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=128)
            inputs = {key: val.to(device) for key, val in inputs.items()}

            # Softmax does not change the ordering, so argmax on the logits
            # gives the same class ids as argmax on the probabilities.
            logits = model(**inputs).logits
            batch_predictions.append(torch.argmax(logits, dim=-1).cpu())

    # Predictions are already on the CPU; convert to numpy for the label encoder.
    predictions = torch.cat(batch_predictions).numpy()
    labels = label_encoder.inverse_transform(predictions)
    return labels



In [None]:
# Smoke-test predict() on a single raw Turkish complaint.
# NOTE(review): the training rows shown by df.head() look lowercased and stemmed,
# while this sample is unprocessed text — confirm whether the same preprocessing
# should be applied before calling predict().
sample_texts = [
    "15 Mart 2025 tarihinde Adana Barajyolu Ziraat Bankası şube ATM'sinde 13 bin para yatırma işlemi gerçekleştirdim, ancak ATM para haznesine nakiti koyduğumda para haznesi kapandı ve kartımı geri iade etti, (normalde para sayıp ardından ekrana tutarın yansıması onayla kısmı gelmesi gerekti) yani paramı yuttu. Ortak ATM olduğu için işlemi QNB kartım ile gerçekleştirdim, Ziraat Bankası aradığımda kendi bankanı ara dedi. QNB bankası ise talep oluşturduklarını söyledi. Tam 15 gün geçti sonuçlanmadı, dönüş yapılmıyor dendi. Bu sefer Ziraat Barajyolu şubesine gittim tekrar, sert ve kırıcı üsluplarıyla karşılaştım. Bana kendi bankanın şube müdürlüğüne ulaşması lazım! Yazılı dilekçe ver o bankadan! Dedi. Zaman ayırıp tekrar gittim böyle bir işlem olmadığını işleyişinde bu şekilde olmadığını söylediler. Bugün 23. Gün alenen parama el konulduğunu düşünüyorum. Bu kadar işlek bir yerde her gün para sayılır fazla tutar bellidir. Bir talebe bu kadar süre dönülmemesi ve iş bilmemeleri şok edici. Gelişme olursa yazımı güncelleyeceğim"
]

predict(sample_texts)


array(['hesabı yönetmek'], dtype=object)