In [70]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score, classification_report
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from transformers import AdamW
from tqdm import tqdm

In [71]:
language="all"
modelname="TurkishBert"
input = "all"
test = "all"

In [72]:
df_tr_train = pd.read_csv('train_gpt4_tr.csv')
df_it_train = pd.read_csv('train_gpt4_it.csv')
df_en_train = pd.read_csv('train_gpt4_en.csv')
df_jp_train = pd.read_csv('train_gpt4_jp.csv',encoding='euc-jp')
# Concatenate all dataframes in the list into a single dataframe
df_train = pd.concat([df_tr_train, df_it_train, df_en_train, df_jp_train], ignore_index=True)

df_tr_eval = pd.read_csv('eval_gpt4_tr.csv')
df_it_eval = pd.read_csv('eval_gpt4_it.csv')
df_en_eval = pd.read_csv('eval_gpt4_en.csv')
df_jp_eval= pd.read_csv('eval_gpt4_jp.csv',encoding='euc-jp')
# Concatenate all dataframes in the list into a single dataframe
df_eval = pd.concat([df_tr_eval, df_it_eval, df_en_eval, df_jp_eval], ignore_index=True)

In [73]:
category_map = {
    'mecaz': 'idiom',
    'figurato': 'idiom',
    'figurative':'idiom',
    'gerçek': 'nonidiom',
    'letterale': 'nonidiom',
    'literal':'nonidiom'

}

df_train['category'] = df_train['category'].replace(category_map)
df_eval['category'] = df_eval['category'].replace(category_map)

In [74]:
df_tr_test = pd.read_csv('dodiom_tr_test.csv')
df_it_test = pd.read_csv('dodiom_it_test.csv')
df_en_test = pd.read_csv('cook_en_test.csv')
df_jp_test = pd.read_csv('jp_test.csv',encoding='euc-jp')

In [75]:
df_en_test['category'] = df_en_test['category'].replace(category_map)
df_jp_test['category'] = df_jp_test['category'].replace(category_map)

In [76]:
df_train['category'] = df_train['category'].str.lower()
df_eval['category'] = df_eval['category'].str.lower()
df_tr_test['category'] = df_tr_test['category'].str.lower()
df_it_test['category'] = df_it_test['category'].str.lower()
df_en_test['category'] = df_en_test['category'].str.lower()
df_jp_test['category'] = df_jp_test['category'].str.lower()

In [77]:
df_train.head()

Unnamed: 0,submission,category,idiom,type
0,Turnuvada rakiplerini teknik bilgisiyle alt ed...,idiom,Başa geçmek,Zero-shot
1,"Keşif gezisi sırasında, bilinmeyen birçok doğa...",idiom,Ortaya çıkmak,Enhanced-prompting
2,Yoğun çalışma programına ve meydan okumalara r...,idiom,Üste çıkmak,Zero-shot
3,"Toplantıda başkanın tutumları, aslında sorumlu...",idiom,Rol oynamak,Enhanced-prompting
4,"Ekip, yeni pazarlama stratejisini belirlemek i...",idiom,Kolları sıvamak,Zero-shot


In [78]:
df_eval.head()

Unnamed: 0,submission,category,idiom,type
0,"Sanatçı, eserlerindeki gerçek duygularını orta...",idiom,Ortaya koymak,Enhanced-prompting
1,Eleştirilerin ardından bir savunma duvarı meyd...,idiom,Meydana gelmek,Zero-shot
2,Şirketteki gizli toplantıyı kazara sızdırınca ...,idiom,Ayvayı yemek,Zero-shot
3,"Yazar, romanında yalnızlık temasını ele alarak...",idiom,Ele almak,Zero-shot
4,"Kariyer basamaklarını hızla tırmanırken, engeb...",idiom,Sıkı durmak,Zero-shot


In [79]:
# Encode the 'category' column
label_encoder = LabelEncoder()
df_train['category_encoded'] = label_encoder.fit_transform(df_train['category'])
df_eval['category_encoded'] = label_encoder.transform(df_eval['category'])
df_tr_test['category_encoded'] = label_encoder.transform(df_tr_test['category'])
df_it_test['category_encoded'] = label_encoder.transform(df_it_test['category'])
df_en_test['category_encoded'] = label_encoder.transform(df_en_test['category'])
df_jp_test['category_encoded'] = label_encoder.transform(df_jp_test['category'])

In [80]:
df_train

Unnamed: 0,submission,category,idiom,type,category_encoded
0,Turnuvada rakiplerini teknik bilgisiyle alt ed...,idiom,Başa geçmek,Zero-shot,0
1,"Keşif gezisi sırasında, bilinmeyen birçok doğa...",idiom,Ortaya çıkmak,Enhanced-prompting,0
2,Yoğun çalışma programına ve meydan okumalara r...,idiom,Üste çıkmak,Zero-shot,0
3,"Toplantıda başkanın tutumları, aslında sorumlu...",idiom,Rol oynamak,Enhanced-prompting,0
4,"Ekip, yeni pazarlama stratejisini belirlemek i...",idiom,Kolları sıvamak,Zero-shot,0
...,...,...,...,...,...
27675,彼は手に乗せた昆虫を慎重に観察した。,nonidiom,手に乗る,Enhanced-prompting,1
27676,植木を移植する際には、根を傷つけないように注意深く下ろす必要がある。,nonidiom,根を下ろす,Enhanced-prompting,1
27677,彼は古い本の箱を開けると、中から劇場用の古い幕を見つけた。,nonidiom,幕を開ける,Zero-shot,1
27678,子供が自転車から落ち、腕の骨が折れる事故が起きました。,nonidiom,骨が折れる,Zero-shot,1


In [81]:
df_eval

Unnamed: 0,submission,category,idiom,type,category_encoded
0,"Sanatçı, eserlerindeki gerçek duygularını orta...",idiom,Ortaya koymak,Enhanced-prompting,0
1,Eleştirilerin ardından bir savunma duvarı meyd...,idiom,Meydana gelmek,Zero-shot,0
2,Şirketteki gizli toplantıyı kazara sızdırınca ...,idiom,Ayvayı yemek,Zero-shot,0
3,"Yazar, romanında yalnızlık temasını ele alarak...",idiom,Ele almak,Zero-shot,0
4,"Kariyer basamaklarını hızla tırmanırken, engeb...",idiom,Sıkı durmak,Zero-shot,0
...,...,...,...,...,...
6915,彼はハロウィンの飾りとして、カボチャの実を色とりどりのリボンで結んでいた。,nonidiom,実を結ぶ,Enhanced-prompting,1
6916,手が塞がっていると、その重い荷物を別の場所に移動することは難しい。,nonidiom,手がない,Zero-shot,1
6917,バイクを運転中に転倒し、運転手の腕の骨が折れる事故が発生した。,nonidiom,骨が折れる,Enhanced-prompting,1
6918,子供たちは遊びながら小川に柳の枝を差し、水の流れを楽しんでいた。,nonidiom,水を差す,Zero-shot,1


In [82]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [83]:
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")

# Tokenize the data
train_encodings = tokenizer(list(df_train['submission']), truncation=True, padding=True, return_tensors="pt")
val_encodings = tokenizer(list(df_eval['submission']), truncation=True, padding=True, return_tensors="pt")

# Convert to torch tensors
train_labels = torch.tensor(df_train['category_encoded'].values)
val_labels = torch.tensor(df_eval['category_encoded'].values)


# Prepare to datasets
train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)


# DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]



In [None]:
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained("dbmdz/bert-base-turkish-uncased", num_labels=num_labels)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-6)

epochs = 4
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Average loss across all batches
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs} | Train Loss: {avg_train_loss}")

    # Evaluation step
    model.eval()
    total_eval_accuracy = 0
    for batch in tqdm(val_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        accuracy = (predictions == batch['labels']).cpu().numpy().mean() * 100
        total_eval_accuracy += accuracy

    avg_val_accuracy = total_eval_accuracy / len(val_loader)
    print(f"Epoch {epoch + 1}/{epochs} | Validation Accuracy: {avg_val_accuracy:.2f}%")


pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 13%|█▎        | 434/3460 [01:16<08:44,  5.77it/s]

In [None]:
test_encodings_tr = tokenizer(list(df_tr_test['submission']), truncation=True, padding=True, return_tensors="pt")
test_labels_tr = torch.tensor(df_tr_test['category_encoded'].values)
test_dataset_tr = TextDataset(test_encodings_tr, test_labels_tr)
test_loader_tr = DataLoader(test_dataset_tr, batch_size=8, shuffle=False)

In [None]:
# Ensure the model is in evaluation mode
model.eval()

# Move the model to the appropriate device
model.to(device)

# Initialize variables to track accuracy
total_correct = 0
total_predictions = 0

# Store true labels and predictions for F1 score calculation
all_labels = []
all_predictions = []

# No gradient updates needed for evaluation
with torch.no_grad():
    for batch in test_loader_tr:
        # Move batch to the same device as the model
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)

        # Get predictions
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # Update tracking variables
        total_correct += (predictions == batch['labels']).sum().item()
        total_predictions += predictions.size(0)

        # Store predictions and true labels
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

# Calculate accuracy
test_accuracy = total_correct / total_predictions
print(f'Test Accuracy: {test_accuracy:.4f}')

# Calculate F1 scores
f1_weighted = f1_score(all_labels, all_predictions, average='weighted')
f1_macro = f1_score(all_labels, all_predictions, average='macro')

print(f'Weighted F1 Score: {f1_weighted:.4f}')
print(f'Macro F1 Score: {f1_macro:.4f}')


In [None]:
# Generate classification report
report = classification_report(all_labels, all_predictions, digits=4)
print(f'Classification Report for {modelname}, language {language.upper()}, Train {input.upper()}, Test {test.upper()}:')
print(report)