In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
import torch

dataset = load_dataset("stefan-it/offenseval2020_tr")

def preprocess_function(examples):
    # The label is at the beginning of each text, we'll extract it
    labels = [label.split("__label__")[1][:3] for label in examples['text']]  # assuming the label is within the text
    texts = [text.split("__label__")[1][4:] for text in examples['text']]  # Remove the label part from the text

    # Map the labels to integers (adjust according to your labels)
    label_map = {"NOT": 0, "OFF": 1}  # Modify as needed
    labels = [label_map[label] for label in labels]

    encoding = tokenizer(texts,truncation=True, max_length=128)

    return {**encoding, 'labels': torch.tensor(labels), "text": texts}


tokenizer = AutoTokenizer.from_pretrained("dbmdz/distilbert-base-turkish-cased")


encoded_dataset = dataset.map(preprocess_function, batched=True)
print(encoded_dataset["train"][0])

In [None]:
import numpy as np
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score,balanced_accuracy_score
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

def classify_with_embeddings_batch(sentences, model, labels):
    
    label_embeddings = model.encode(labels, prompt_name="Classification")

    sentence_embeddings = model.encode(sentences, prompt_name="Classification")

    similarities = model.similarity(sentence_embeddings, label_embeddings)

    
    predicted_idx = similarities.argmax(1)
    return predicted_idx

In [None]:
def evaluate_model(dataset, model, labels, batch_size=32):
    all_predictions = []
    all_true_labels = []

    for i in tqdm(range(0, len(dataset), batch_size)):
        batch = dataset[i:i+batch_size]

        sentences = batch['text']
        true_labels = batch['labels']
        predicted_idx = classify_with_embeddings_batch(sentences, model, labels)


        predicted_labels = [labels[idx] for idx in predicted_idx]
        all_predictions.extend(predicted_labels)
        all_true_labels.extend([labels[true_label] for true_label in true_labels])

    accuracy = accuracy_score(all_true_labels, all_predictions)
    balanced_accuracy = balanced_accuracy_score(all_true_labels, all_predictions)
    f1 = f1_score(all_true_labels, all_predictions,pos_label=labels[0]) # f1 needs one of the labels yo understand which one is positive (when input is text)

    return accuracy, balanced_accuracy, f1

In [6]:
# Evaluate model on train, validation, and test splits

model = SentenceTransformer("unsloth/embeddinggemma-300m")
labels = ["Nefret söylemi, küfür, ofansif", "saygılı ifade"]

for split in ['train', 'validation']:
    accuracy, balanced_accuracy, f1 = evaluate_model(encoded_dataset[split], model,labels,batch_size=128)
    print(f'Accuracy on {split} set: {accuracy:.4f}')
    print(f'Balanced Accuracy on {split} set: {balanced_accuracy:.4f}')
    print(f'F1 Score on {split} set: {f1:.4f}')
    print()

100%|██████████| 235/235 [01:25<00:00,  2.75it/s]


Accuracy on train set: 0.6185
Balanced Accuracy on train set: 0.4666
F1 Score on train set: 0.7513



100%|██████████| 14/14 [00:05<00:00,  2.78it/s]

Accuracy on validation set: 0.6213
Balanced Accuracy on validation set: 0.4683
F1 Score on validation set: 0.7536






In [None]:
def embedding_extraction(dataset, model,batch_size=32):
    embedding_list = []

    for i in tqdm(range(0, len(dataset), batch_size)):
        batch = dataset[i:i+batch_size]
        sentence_embeddings = model.encode(batch['text'])

        embedding_list.append(sentence_embeddings)

    return np.concatenate(embedding_list)

In [39]:
train_X = embedding_extraction(encoded_dataset['train'],model,batch_size=128)
val_X = embedding_extraction(encoded_dataset['validation'],model,batch_size=128)
test_X = embedding_extraction(encoded_dataset['test'],model,batch_size=128)

100%|██████████| 235/235 [01:09<00:00,  3.37it/s]
100%|██████████| 14/14 [00:04<00:00,  3.37it/s]
100%|██████████| 28/28 [00:08<00:00,  3.36it/s]


In [28]:
from sklearn import svm
clf = svm.SVC()

In [40]:
clf.fit(train_X, np.array(encoded_dataset['train']["labels"]))

In [55]:
y_pred = clf.predict(train_X)
y_gold = np.array(encoded_dataset['train']["labels"])
print(accuracy_score(y_gold, y_pred))
print(balanced_accuracy_score(y_gold, y_pred))
print(f1_score(y_gold, y_pred))

0.8300666666666666
0.5628382865724161
0.2245208396714329


In [46]:
y_pred = clf.predict(val_X)
y_gold = np.array(encoded_dataset['validation']["labels"])
print(accuracy_score(y_gold, y_pred))
print(balanced_accuracy_score(y_gold, y_pred))
print(f1_score(y_gold, y_pred))

0.8149202733485194
0.5206287352820695
0.08450704225352113
