In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score, classification_report
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from transformers import AdamW
from tqdm import tqdm

In [2]:
language="en"
modelname="EnglishBert"
input = "gpt4"
test = "COOK"

In [3]:
if input == "gpt4":
  gpt4_flag=True
else:
  gpt4_flag=False

In [4]:
# JP encoding='euc-jp' other encoding normal CHECK ALWAYS DATASETS
df_train = pd.read_csv(f'train_gpt4_en.csv')
df_eval = pd.read_csv(f'eval_gpt4_en.csv')
df_test = pd.read_csv(f'cook_en_test.csv')

In [5]:
def get_category_mapping(language):
    if language in ['en', 'jp']:
        category_mapping = {
            'figurative': 'idiom',
            'literal': 'nonidiom'
        }
    elif language == 'tr':
        category_mapping = {
            'mecaz': 'idiom',
            'gerçek': 'nonidiom'
        }
    elif language == 'it':
        category_mapping = {
            'figurato': 'idiom',
            'letterale': 'nonidiom'
        }
    else:
        raise ValueError(f"Language '{language}' is not supported.")

    return category_mapping

In [6]:
category_mapping = get_category_mapping(language)
print(category_mapping)

{'figurative': 'idiom', 'literal': 'nonidiom'}


In [7]:
if gpt4_flag or language in ['en', 'jp']:
  df_train['category'] = df_train['category'].map(category_mapping)
df_train

Unnamed: 0,submission,category,idiom,type
0,"They had a fling with sustainable living, inst...",idiom,have fling,Zero-shot
1,The innovative approach to recycling in small ...,idiom,have future,Zero-shot
2,She lost her cool after discovering that the p...,idiom,lose cool,Enhanced-prompting
3,"Are they really prepared to give notice, or is...",idiom,give notice,Enhanced-prompting
4,The principle of fair trade started to take ro...,idiom,take root,Zero-shot
...,...,...,...,...
8475,"In preparation for the upcoming tournament, th...",nonidiom,move goalpost,Enhanced-prompting
8476,"When the lights flickered, it quickly caught t...",nonidiom,catch attention,Zero-shot
8477,"Before the rain could spoil the crop, they rus...",nonidiom,make hay,Zero-shot
8478,"During the tug of war, they had to pull a weig...",nonidiom,pull weight,Enhanced-prompting


In [8]:
if gpt4_flag or language in ['en', 'jp']:
  df_eval['category'] = df_eval['category'].map(category_mapping)
df_eval

Unnamed: 0,submission,category,idiom,type
0,Clumsily bumping into the lamp post while text...,idiom,see star,Zero-shot
1,"He feared the end of the fiscal year, as it of...",idiom,give sack,Zero-shot
2,"Concerned about the impact on the environment,...",idiom,blow whistle,Zero-shot
3,The small town's commitment to preserving its ...,idiom,take root,Zero-shot
4,Artificial intelligence has not taken root in ...,idiom,take root,Enhanced-prompting
...,...,...,...,...
2115,I had a smile on my face as I flung the frisbe...,nonidiom,have fling,Zero-shot
2116,The chef instructed the apprentice to hold the...,nonidiom,hold fire,Enhanced-prompting
2117,"In a tragic forest encounter, a bear managed t...",nonidiom,catch death,Enhanced-prompting
2118,The electrician cautiously avoided touching an...,nonidiom,touch nerve,Enhanced-prompting


In [9]:
if language in ['en', 'jp']:
  df_test['category'] = df_test['category'].map(category_mapping)

In [10]:
df_test

Unnamed: 0,category,submission,idiom
0,nonidiom,"He put his cigarette to his lips , drew in smo...",blow smoke
1,nonidiom,"She &apos;s lying on the bed , blowing smoke a...",blow smoke
2,nonidiom,The TV presenter sucked extravagantly on her c...,blow smoke
3,nonidiom,He blows cigarette smoke irritably across the ...,blow smoke
4,nonidiom,Test the alarm regularly by pressing the test ...,blow smoke
...,...,...,...
802,idiom,The death of Phyllis Henley touched these nerv...,touch nerve
803,idiom,Mikhail Gorbachev avoided touching sensitive n...,touch nerve
804,idiom,Jim Eggleton &apos;s murder touched a nerve th...,touch nerve
805,idiom,Salgado &apos;s photo-essay opens a window ont...,touch nerve


In [11]:
df_train['category'] = df_train['category'].str.lower()
df_eval['category'] = df_eval['category'].str.lower()
df_test['category'] = df_test['category'].str.lower()

In [12]:
df_train.head()

Unnamed: 0,submission,category,idiom,type
0,"They had a fling with sustainable living, inst...",idiom,have fling,Zero-shot
1,The innovative approach to recycling in small ...,idiom,have future,Zero-shot
2,She lost her cool after discovering that the p...,idiom,lose cool,Enhanced-prompting
3,"Are they really prepared to give notice, or is...",idiom,give notice,Enhanced-prompting
4,The principle of fair trade started to take ro...,idiom,take root,Zero-shot


In [13]:
df_eval.head()

Unnamed: 0,submission,category,idiom,type
0,Clumsily bumping into the lamp post while text...,idiom,see star,Zero-shot
1,"He feared the end of the fiscal year, as it of...",idiom,give sack,Zero-shot
2,"Concerned about the impact on the environment,...",idiom,blow whistle,Zero-shot
3,The small town's commitment to preserving its ...,idiom,take root,Zero-shot
4,Artificial intelligence has not taken root in ...,idiom,take root,Enhanced-prompting


In [14]:
df_test.head()

Unnamed: 0,category,submission,idiom
0,nonidiom,"He put his cigarette to his lips , drew in smo...",blow smoke
1,nonidiom,"She &apos;s lying on the bed , blowing smoke a...",blow smoke
2,nonidiom,The TV presenter sucked extravagantly on her c...,blow smoke
3,nonidiom,He blows cigarette smoke irritably across the ...,blow smoke
4,nonidiom,Test the alarm regularly by pressing the test ...,blow smoke


In [15]:
# Encode the 'category' column
label_encoder = LabelEncoder()
df_train['category_encoded'] = label_encoder.fit_transform(df_train['category'])
df_eval['category_encoded'] = label_encoder.transform(df_eval['category'])
df_test['category_encoded'] = label_encoder.transform(df_test['category'])

In [16]:
df_train

Unnamed: 0,submission,category,idiom,type,category_encoded
0,"They had a fling with sustainable living, inst...",idiom,have fling,Zero-shot,0
1,The innovative approach to recycling in small ...,idiom,have future,Zero-shot,0
2,She lost her cool after discovering that the p...,idiom,lose cool,Enhanced-prompting,0
3,"Are they really prepared to give notice, or is...",idiom,give notice,Enhanced-prompting,0
4,The principle of fair trade started to take ro...,idiom,take root,Zero-shot,0
...,...,...,...,...,...
8475,"In preparation for the upcoming tournament, th...",nonidiom,move goalpost,Enhanced-prompting,1
8476,"When the lights flickered, it quickly caught t...",nonidiom,catch attention,Zero-shot,1
8477,"Before the rain could spoil the crop, they rus...",nonidiom,make hay,Zero-shot,1
8478,"During the tug of war, they had to pull a weig...",nonidiom,pull weight,Enhanced-prompting,1


In [17]:
df_eval

Unnamed: 0,submission,category,idiom,type,category_encoded
0,Clumsily bumping into the lamp post while text...,idiom,see star,Zero-shot,0
1,"He feared the end of the fiscal year, as it of...",idiom,give sack,Zero-shot,0
2,"Concerned about the impact on the environment,...",idiom,blow whistle,Zero-shot,0
3,The small town's commitment to preserving its ...,idiom,take root,Zero-shot,0
4,Artificial intelligence has not taken root in ...,idiom,take root,Enhanced-prompting,0
...,...,...,...,...,...
2115,I had a smile on my face as I flung the frisbe...,nonidiom,have fling,Zero-shot,1
2116,The chef instructed the apprentice to hold the...,nonidiom,hold fire,Enhanced-prompting,1
2117,"In a tragic forest encounter, a bear managed t...",nonidiom,catch death,Enhanced-prompting,1
2118,The electrician cautiously avoided touching an...,nonidiom,touch nerve,Enhanced-prompting,1


In [18]:
df_test

Unnamed: 0,category,submission,idiom,category_encoded
0,nonidiom,"He put his cigarette to his lips , drew in smo...",blow smoke,1
1,nonidiom,"She &apos;s lying on the bed , blowing smoke a...",blow smoke,1
2,nonidiom,The TV presenter sucked extravagantly on her c...,blow smoke,1
3,nonidiom,He blows cigarette smoke irritably across the ...,blow smoke,1
4,nonidiom,Test the alarm regularly by pressing the test ...,blow smoke,1
...,...,...,...,...
802,idiom,The death of Phyllis Henley touched these nerv...,touch nerve,0
803,idiom,Mikhail Gorbachev avoided touching sensitive n...,touch nerve,0
804,idiom,Jim Eggleton &apos;s murder touched a nerve th...,touch nerve,0
805,idiom,Salgado &apos;s photo-essay opens a window ont...,touch nerve,0


In [19]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [20]:
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the data
train_encodings = tokenizer(list(df_train['submission']), truncation=True, padding=True, return_tensors="pt")
val_encodings = tokenizer(list(df_eval['submission']), truncation=True, padding=True, return_tensors="pt")
test_encodings = tokenizer(list(df_test['submission']), truncation=True, padding=True, return_tensors="pt")
# Convert to torch tensors
train_labels = torch.tensor(df_train['category_encoded'].values)
val_labels = torch.tensor(df_eval['category_encoded'].values)
test_labels = torch.tensor(df_test['category_encoded'].values)

# Prepare to datasets
train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)
test_dataset = TextDataset(test_encodings, test_labels)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [None]:
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-6)

epochs = 4
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Average loss across all batches
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs} | Train Loss: {avg_train_loss}")

    # Evaluation step
    model.eval()
    total_eval_accuracy = 0
    for batch in tqdm(val_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        accuracy = (predictions == batch['labels']).cpu().numpy().mean() * 100
        total_eval_accuracy += accuracy

    avg_val_accuracy = total_eval_accuracy / len(val_loader)
    print(f"Epoch {epoch + 1}/{epochs} | Validation Accuracy: {avg_val_accuracy:.2f}%")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
100%|██████████| 1060/1060 [01:29<00:00, 11.87it/s]


Epoch 1/4 | Train Loss: 0.17605543283236055


100%|██████████| 265/265 [00:04<00:00, 54.52it/s]


Epoch 1/4 | Validation Accuracy: 98.21%


100%|██████████| 1060/1060 [01:29<00:00, 11.80it/s]


Epoch 2/4 | Train Loss: 0.04159482886535906


100%|██████████| 265/265 [00:04<00:00, 53.12it/s]


Epoch 2/4 | Validation Accuracy: 98.44%


 12%|█▏        | 130/1060 [00:11<01:19, 11.71it/s]

In [None]:
# Ensure the model is in evaluation mode
model.eval()

# Move the model to the appropriate device
model.to(device)

# Initialize variables to track accuracy
total_correct = 0
total_predictions = 0

# Store true labels and predictions for F1 score calculation
all_labels = []
all_predictions = []

# No gradient updates needed for evaluation
with torch.no_grad():
    for batch in test_loader:
        # Move batch to the same device as the model
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)

        # Get predictions
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # Update tracking variables
        total_correct += (predictions == batch['labels']).sum().item()
        total_predictions += predictions.size(0)

        # Store predictions and true labels
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

# Calculate accuracy
test_accuracy = total_correct / total_predictions
print(f'Test Accuracy: {test_accuracy:.4f}')

# Calculate F1 scores
f1_weighted = f1_score(all_labels, all_predictions, average='weighted')
f1_macro = f1_score(all_labels, all_predictions, average='macro')

print(f'Weighted F1 Score: {f1_weighted:.4f}')
print(f'Macro F1 Score: {f1_macro:.4f}')


In [None]:
# Generate classification report
report = classification_report(all_labels, all_predictions, digits=4)
print(f'Classification Report for {modelname}, language {language.upper()}, Train {input.upper()}, Test {test.upper()}:')
print(report)