In [18]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score, classification_report
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from transformers import AdamW
from tqdm import tqdm

In [2]:
language="all"
modelname="DistilBert"
input = "all"
test = "all"

In [3]:
df_tr_train = pd.read_csv('train_gpt4_tr.csv')
df_it_train = pd.read_csv('train_gpt4_it.csv')
df_en_train = pd.read_csv('train_gpt4_en.csv')
df_jp_train = pd.read_csv('train_gpt4_jp.csv',encoding='euc-jp')
# Concatenate all dataframes in the list into a single dataframe
df_train = pd.concat([df_tr_train, df_it_train, df_en_train, df_jp_train], ignore_index=True)

df_tr_eval = pd.read_csv('eval_gpt4_tr.csv')
df_it_eval = pd.read_csv('eval_gpt4_it.csv')
df_en_eval = pd.read_csv('eval_gpt4_en.csv')
df_jp_eval= pd.read_csv('eval_gpt4_jp.csv',encoding='euc-jp')
# Concatenate all dataframes in the list into a single dataframe
df_eval = pd.concat([df_tr_eval, df_it_eval, df_en_eval, df_jp_eval], ignore_index=True)

In [4]:
category_map = {
    'mecaz': 'idiom',
    'figurato': 'idiom',
    'figurative':'idiom',
    'gerçek': 'nonidiom',
    'letterale': 'nonidiom',
    'literal':'nonidiom'

}

df_train['category'] = df_train['category'].replace(category_map)
df_eval['category'] = df_eval['category'].replace(category_map)

In [5]:
df_tr_test = pd.read_csv('dodiom_tr_test.csv')
df_it_test = pd.read_csv('dodiom_it_test.csv')
df_en_test = pd.read_csv('cook_en_test.csv')
df_jp_test = pd.read_csv('jp_test.csv',encoding='euc-jp')

In [6]:
df_en_test['category'] = df_en_test['category'].replace(category_map)
df_jp_test['category'] = df_jp_test['category'].replace(category_map)

In [7]:
df_train['category'] = df_train['category'].str.lower()
df_eval['category'] = df_eval['category'].str.lower()
df_tr_test['category'] = df_tr_test['category'].str.lower()
df_it_test['category'] = df_it_test['category'].str.lower()
df_en_test['category'] = df_en_test['category'].str.lower()
df_jp_test['category'] = df_jp_test['category'].str.lower()

In [8]:
df_train.head()

Unnamed: 0,submission,category,idiom,type
0,Turnuvada rakiplerini teknik bilgisiyle alt ed...,idiom,Başa geçmek,Zero-shot
1,"Keşif gezisi sırasında, bilinmeyen birçok doğa...",idiom,Ortaya çıkmak,Enhanced-prompting
2,Yoğun çalışma programına ve meydan okumalara r...,idiom,Üste çıkmak,Zero-shot
3,"Toplantıda başkanın tutumları, aslında sorumlu...",idiom,Rol oynamak,Enhanced-prompting
4,"Ekip, yeni pazarlama stratejisini belirlemek i...",idiom,Kolları sıvamak,Zero-shot


In [9]:
df_eval.head()

Unnamed: 0,submission,category,idiom,type
0,"Sanatçı, eserlerindeki gerçek duygularını orta...",idiom,Ortaya koymak,Enhanced-prompting
1,Eleştirilerin ardından bir savunma duvarı meyd...,idiom,Meydana gelmek,Zero-shot
2,Şirketteki gizli toplantıyı kazara sızdırınca ...,idiom,Ayvayı yemek,Zero-shot
3,"Yazar, romanında yalnızlık temasını ele alarak...",idiom,Ele almak,Zero-shot
4,"Kariyer basamaklarını hızla tırmanırken, engeb...",idiom,Sıkı durmak,Zero-shot


In [10]:
# Encode the 'category' column
label_encoder = LabelEncoder()
df_train['category_encoded'] = label_encoder.fit_transform(df_train['category'])
df_eval['category_encoded'] = label_encoder.transform(df_eval['category'])
df_tr_test['category_encoded'] = label_encoder.transform(df_tr_test['category'])
df_it_test['category_encoded'] = label_encoder.transform(df_it_test['category'])
df_en_test['category_encoded'] = label_encoder.transform(df_en_test['category'])
df_jp_test['category_encoded'] = label_encoder.transform(df_jp_test['category'])

In [11]:
df_train

Unnamed: 0,submission,category,idiom,type,category_encoded
0,Turnuvada rakiplerini teknik bilgisiyle alt ed...,idiom,Başa geçmek,Zero-shot,0
1,"Keşif gezisi sırasında, bilinmeyen birçok doğa...",idiom,Ortaya çıkmak,Enhanced-prompting,0
2,Yoğun çalışma programına ve meydan okumalara r...,idiom,Üste çıkmak,Zero-shot,0
3,"Toplantıda başkanın tutumları, aslında sorumlu...",idiom,Rol oynamak,Enhanced-prompting,0
4,"Ekip, yeni pazarlama stratejisini belirlemek i...",idiom,Kolları sıvamak,Zero-shot,0
...,...,...,...,...,...
27675,彼は手に乗せた昆虫を慎重に観察した。,nonidiom,手に乗る,Enhanced-prompting,1
27676,植木を移植する際には、根を傷つけないように注意深く下ろす必要がある。,nonidiom,根を下ろす,Enhanced-prompting,1
27677,彼は古い本の箱を開けると、中から劇場用の古い幕を見つけた。,nonidiom,幕を開ける,Zero-shot,1
27678,子供が自転車から落ち、腕の骨が折れる事故が起きました。,nonidiom,骨が折れる,Zero-shot,1


In [12]:
df_eval

Unnamed: 0,submission,category,idiom,type,category_encoded
0,"Sanatçı, eserlerindeki gerçek duygularını orta...",idiom,Ortaya koymak,Enhanced-prompting,0
1,Eleştirilerin ardından bir savunma duvarı meyd...,idiom,Meydana gelmek,Zero-shot,0
2,Şirketteki gizli toplantıyı kazara sızdırınca ...,idiom,Ayvayı yemek,Zero-shot,0
3,"Yazar, romanında yalnızlık temasını ele alarak...",idiom,Ele almak,Zero-shot,0
4,"Kariyer basamaklarını hızla tırmanırken, engeb...",idiom,Sıkı durmak,Zero-shot,0
...,...,...,...,...,...
6915,彼はハロウィンの飾りとして、カボチャの実を色とりどりのリボンで結んでいた。,nonidiom,実を結ぶ,Enhanced-prompting,1
6916,手が塞がっていると、その重い荷物を別の場所に移動することは難しい。,nonidiom,手がない,Zero-shot,1
6917,バイクを運転中に転倒し、運転手の腕の骨が折れる事故が発生した。,nonidiom,骨が折れる,Enhanced-prompting,1
6918,子供たちは遊びながら小川に柳の枝を差し、水の流れを楽しんでいた。,nonidiom,水を差す,Zero-shot,1


In [13]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [14]:
# Initialize tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the data
train_encodings = tokenizer(list(df_train['submission']), truncation=True, padding=True, return_tensors="pt")
val_encodings = tokenizer(list(df_eval['submission']), truncation=True, padding=True, return_tensors="pt")

# Convert to torch tensors
train_labels = torch.tensor(df_train['category_encoded'].values)
val_labels = torch.tensor(df_eval['category_encoded'].values)


# Prepare to datasets
train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)


# DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



In [15]:
num_labels = len(label_encoder.classes_)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-6)

epochs = 4
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Average loss across all batches
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs} | Train Loss: {avg_train_loss}")

    # Evaluation step
    model.eval()
    total_eval_accuracy = 0
    for batch in tqdm(val_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        accuracy = (predictions == batch['labels']).cpu().numpy().mean() * 100
        total_eval_accuracy += accuracy

    avg_val_accuracy = total_eval_accuracy / len(val_loader)
    print(f"Epoch {epoch + 1}/{epochs} | Validation Accuracy: {avg_val_accuracy:.2f}%")


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
100%|██████████| 3460/3460 [06:17<00:00,  9.17it/s]


Epoch 1/4 | Train Loss: 0.37305194447032375


100%|██████████| 865/865 [00:20<00:00, 41.88it/s]


Epoch 1/4 | Validation Accuracy: 88.93%


100%|██████████| 3460/3460 [06:24<00:00,  9.00it/s]


Epoch 2/4 | Train Loss: 0.2043396956801436


100%|██████████| 865/865 [00:20<00:00, 41.71it/s]


Epoch 2/4 | Validation Accuracy: 91.78%


100%|██████████| 3460/3460 [06:24<00:00,  8.99it/s]


Epoch 3/4 | Train Loss: 0.14241811432290252


100%|██████████| 865/865 [00:20<00:00, 41.77it/s]


Epoch 3/4 | Validation Accuracy: 92.93%


100%|██████████| 3460/3460 [06:25<00:00,  8.98it/s]


Epoch 4/4 | Train Loss: 0.10679561979293142


100%|██████████| 865/865 [00:20<00:00, 41.73it/s]

Epoch 4/4 | Validation Accuracy: 93.42%





In [19]:
test_encodings_tr = tokenizer(list(df_tr_test['submission']), truncation=True, padding=True, return_tensors="pt")
test_labels_tr = torch.tensor(df_tr_test['category_encoded'].values)
test_dataset_tr = TextDataset(test_encodings_tr, test_labels_tr)
test_loader_tr = DataLoader(test_dataset_tr, batch_size=8, shuffle=False)

In [20]:
# Ensure the model is in evaluation mode
model.eval()

# Move the model to the appropriate device
model.to(device)

# Initialize variables to track accuracy
total_correct = 0
total_predictions = 0

# Store true labels and predictions for F1 score calculation
all_labels = []
all_predictions = []

# No gradient updates needed for evaluation
with torch.no_grad():
    for batch in test_loader_tr:
        # Move batch to the same device as the model
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)

        # Get predictions
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # Update tracking variables
        total_correct += (predictions == batch['labels']).sum().item()
        total_predictions += predictions.size(0)

        # Store predictions and true labels
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

# Calculate accuracy
test_accuracy = total_correct / total_predictions
print(f'Test Accuracy: {test_accuracy:.4f}')

# Calculate F1 scores
f1_weighted = f1_score(all_labels, all_predictions, average='weighted')
f1_macro = f1_score(all_labels, all_predictions, average='macro')

print(f'Weighted F1 Score: {f1_weighted:.4f}')
print(f'Macro F1 Score: {f1_macro:.4f}')


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Test Accuracy: 0.6406
Weighted F1 Score: 0.6402
Macro F1 Score: 0.6405


In [21]:
# Generate classification report
report = classification_report(all_labels, all_predictions, digits=4)
print(f'Classification Report for {modelname}, language {language.upper()}, Train {input.upper()}, Test {test.upper()}:')
print(report)

Classification Report for DistilBert, language ALL, Train ALL, Test ALL:
              precision    recall  f1-score   support

           0     0.5965    0.7018    0.6449       969
           1     0.6939    0.5874    0.6362      1115

    accuracy                         0.6406      2084
   macro avg     0.6452    0.6446    0.6405      2084
weighted avg     0.6486    0.6406    0.6402      2084



In [22]:
test_encodings_it = tokenizer(list(df_it_test['submission']), truncation=True, padding=True, return_tensors="pt")
test_labels_it = torch.tensor(df_it_test['category_encoded'].values)
test_dataset_it = TextDataset(test_encodings_it, test_labels_it)
test_loader_it = DataLoader(test_dataset_it, batch_size=8, shuffle=False)

In [23]:
# Ensure the model is in evaluation mode
model.eval()

# Move the model to the appropriate device
model.to(device)

# Initialize variables to track accuracy
total_correct = 0
total_predictions = 0

# Store true labels and predictions for F1 score calculation
all_labels = []
all_predictions = []

# No gradient updates needed for evaluation
with torch.no_grad():
    for batch in test_loader_it:
        # Move batch to the same device as the model
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)

        # Get predictions
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # Update tracking variables
        total_correct += (predictions == batch['labels']).sum().item()
        total_predictions += predictions.size(0)

        # Store predictions and true labels
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

# Calculate accuracy
test_accuracy = total_correct / total_predictions
print(f'Test Accuracy: {test_accuracy:.4f}')

# Calculate F1 scores
f1_weighted = f1_score(all_labels, all_predictions, average='weighted')
f1_macro = f1_score(all_labels, all_predictions, average='macro')

print(f'Weighted F1 Score: {f1_weighted:.4f}')
print(f'Macro F1 Score: {f1_macro:.4f}')


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Test Accuracy: 0.6567
Weighted F1 Score: 0.6594
Macro F1 Score: 0.6528


In [24]:
# Generate classification report
report = classification_report(all_labels, all_predictions, digits=4)
print(f'Classification Report for {modelname}, language {language.upper()}, Train {input.upper()}, Test {test.upper()}:')
print(report)

Classification Report for DistilBert, language ALL, Train ALL, Test ALL:
              precision    recall  f1-score   support

           0     0.7371    0.6483    0.6899      1345
           1     0.5704    0.6688    0.6157       939

    accuracy                         0.6567      2284
   macro avg     0.6537    0.6586    0.6528      2284
weighted avg     0.6686    0.6567    0.6594      2284



In [25]:
test_encodings_en = tokenizer(list(df_en_test['submission']), truncation=True, padding=True, return_tensors="pt")
test_labels_en = torch.tensor(df_en_test['category_encoded'].values)
test_dataset_en = TextDataset(test_encodings_en, test_labels_en)
test_loader_en = DataLoader(test_dataset_en, batch_size=8, shuffle=False)

In [26]:
# Ensure the model is in evaluation mode
model.eval()

# Move the model to the appropriate device
model.to(device)

# Initialize variables to track accuracy
total_correct = 0
total_predictions = 0

# Store true labels and predictions for F1 score calculation
all_labels = []
all_predictions = []

# No gradient updates needed for evaluation
with torch.no_grad():
    for batch in test_loader_en:
        # Move batch to the same device as the model
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)

        # Get predictions
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # Update tracking variables
        total_correct += (predictions == batch['labels']).sum().item()
        total_predictions += predictions.size(0)

        # Store predictions and true labels
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

# Calculate accuracy
test_accuracy = total_correct / total_predictions
print(f'Test Accuracy: {test_accuracy:.4f}')

# Calculate F1 scores
f1_weighted = f1_score(all_labels, all_predictions, average='weighted')
f1_macro = f1_score(all_labels, all_predictions, average='macro')

print(f'Weighted F1 Score: {f1_weighted:.4f}')
print(f'Macro F1 Score: {f1_macro:.4f}')


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Test Accuracy: 0.8587
Weighted F1 Score: 0.8528
Macro F1 Score: 0.7778


In [27]:
# Generate classification report
report = classification_report(all_labels, all_predictions, digits=4)
print(f'Classification Report for {modelname}, language {language.upper()}, Train {input.upper()}, Test {test.upper()}:')
print(report)

Classification Report for DistilBert, language ALL, Train ALL, Test ALL:
              precision    recall  f1-score   support

           0     0.8872    0.9380    0.9119       629
           1     0.7254    0.5787    0.6438       178

    accuracy                         0.8587       807
   macro avg     0.8063    0.7583    0.7778       807
weighted avg     0.8515    0.8587    0.8528       807



In [28]:
test_encodings_jp = tokenizer(list(df_jp_test['submission']), truncation=True, padding=True, return_tensors="pt")
test_labels_jp = torch.tensor(df_jp_test['category_encoded'].values)
test_dataset_jp = TextDataset(test_encodings_jp, test_labels_jp)
test_loader_jp = DataLoader(test_dataset_jp, batch_size=8, shuffle=False)

In [29]:
# Ensure the model is in evaluation mode
model.eval()

# Move the model to the appropriate device
model.to(device)

# Initialize variables to track accuracy
total_correct = 0
total_predictions = 0

# Store true labels and predictions for F1 score calculation
all_labels = []
all_predictions = []

# No gradient updates needed for evaluation
with torch.no_grad():
    for batch in test_loader_jp:
        # Move batch to the same device as the model
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)

        # Get predictions
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # Update tracking variables
        total_correct += (predictions == batch['labels']).sum().item()
        total_predictions += predictions.size(0)

        # Store predictions and true labels
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

# Calculate accuracy
test_accuracy = total_correct / total_predictions
print(f'Test Accuracy: {test_accuracy:.4f}')

# Calculate F1 scores
f1_weighted = f1_score(all_labels, all_predictions, average='weighted')
f1_macro = f1_score(all_labels, all_predictions, average='macro')

print(f'Weighted F1 Score: {f1_weighted:.4f}')
print(f'Macro F1 Score: {f1_macro:.4f}')


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Test Accuracy: 0.6794
Weighted F1 Score: 0.6789
Macro F1 Score: 0.6115


In [30]:
# Generate classification report
report = classification_report(all_labels, all_predictions, digits=4)
print(f'Classification Report for {modelname}, language {language.upper()}, Train {input.upper()}, Test {test.upper()}:')
print(report)

Classification Report for DistilBert, language ALL, Train ALL, Test ALL:
              precision    recall  f1-score   support

           0     0.7722    0.7757    0.7740     10781
           1     0.4516    0.4466    0.4491      4458

    accuracy                         0.6794     15239
   macro avg     0.6119    0.6112    0.6115     15239
weighted avg     0.6784    0.6794    0.6789     15239

