In [19]:
!pip install transformers datasets scikit-learn pandas



In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

In [23]:
from google.colab import drive
drive.mount('/content/drive')

file_path = "/content/drive/MyDrive/normalized_dataset.csv"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
from huggingface_hub import login

login(token="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")

In [33]:
import csv

data = []
with open(file_path, "r", encoding="utf-8") as file:
    reader = csv.reader(file)
    for row in reader:
        if len(row) >= 3:
            text = ",".join(row[1:-1]).strip()
            label = row[-1].strip()
            data.append((text, label))

df = pd.DataFrame(data, columns=["text", "label"])

In [34]:
unique_labels = sorted(df["label"].unique())
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}
df["label"] = df["label"].map(label2id)

In [35]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

In [37]:
from torch.utils.data import Dataset, DataLoader
tokenizer = AutoTokenizer.from_pretrained("boun-tabi-LMG/TURNA")

class ComplaintDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

train_dataset = ComplaintDataset(train_texts, train_labels, tokenizer)
val_dataset = ComplaintDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.68k [00:00<?, ?B/s]

In [38]:
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup
from tqdm import tqdm
from torch.optim import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained(
    "boun-tabi-LMG/TURNA",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

model.train()
for batch in tqdm(train_loader, desc="Training"):
    optimizer.zero_grad()
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()


config.json:   0%|          | 0.00/824 [00:00<?, ?B/s]

You are using a model of type t5 to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/4.57G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at boun-tabi-LMG/TURNA and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.L

In [39]:
save_path = "/content/drive/MyDrive/turna_model"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('/content/drive/MyDrive/turna_model/tokenizer_config.json',
 '/content/drive/MyDrive/turna_model/special_tokens_map.json',
 '/content/drive/MyDrive/turna_model/tokenizer.json')

In [40]:
import json
with open(f"{save_path}/label_map.json", "w", encoding="utf-8") as f:
    json.dump({"label2id": label2id, "id2label": id2label}, f, ensure_ascii=False, indent=4)