# **1. Setup and Imports**

In [1]:
!pip install transformers datasets torch sklearn
!pip install datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, pipeline
from datasets import load_dataset, Dataset
from google.colab import drive
# Mount Google Drive so the TSV stored there can be read from the Colab runtime.
drive.mount('/content/drive')

# Tab-separated input file, decoded as UTF-8.
file_path = '/content/drive/My Drive/trainset/orientation/orientation-tr-train.tsv'
data = pd.read_csv(
    file_path,
    sep='\t',
    encoding='utf-8',
)


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> See above for output.
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... error
error: metadata-generation-failed

× Encountered error while generating package metadata.
╰─> See above for output.

note: This is an issue with the package mentioned above, not pip.
hint: See above for details.
Collecting datasets
  Using cached datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from dat

# **2. Printing the Data Set**

In [2]:
# Plain-text preview of the first rows of the loaded frame (head() defaults to 5 rows).
print(data.head())

        id                           speaker sex  \
0  tr00000  ca2031caa4032c51980160359953d507   M   
1  tr00001  4cee0addb3c69f6866869b180f90d45f   M   
2  tr00002  b3d7f76d74ec268492f8190ca123a6b2   M   
3  tr00003  722efac7138c8197a9d1e97eed3a8b18   M   
4  tr00004  be82a4ade406ec6774a0a2e38f6957e3   M   

                                                text  \
0  Yeni yasama döneminin ülkemiz için, milletimiz...   
1  Sayın Başkan, değerli milletvekilleri; bugün, ...   
2  Sayın Başkanım, öncelikle yüce Meclisin Başkan...   
3  24’üncü Dönem Meclis Başkanlığına seçilmenizde...   
4  24’üncü Yasama Dönemimizin tüm milletvekilleri...   

                                             text_en  label  
0  Mr. President, dear lawmakers, I salute you, a...      1  
1  Mr. President, members of lawmakers, as I spea...      1  
2  Mr. President, I'm here to share with you the ...      1  
3  Mr. President, under the principles determined...      1  
4  Mr. President, dear lawmakers, I ask 

# **3. Tokenization and Dataset Preparation**

In [3]:
# Multilingual BERT tokenizer; tokenize_and_prepare() below reads it as a module-level global.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def tokenize_and_prepare(data, text_column):
    """Tokenize one text column of ``data`` and bundle the result with the labels.

    Missing texts are replaced by the empty string before tokenization. The
    module-level ``tokenizer`` is asked to truncate and pad every text to 512
    tokens and to return PyTorch tensors, which are then converted to NumPy.

    Args:
        data: Frame containing ``text_column`` and a ``'label'`` column.
        text_column: Name of the column whose texts are tokenized.

    Returns:
        dict with the keys ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'`` (in that order), each holding a NumPy array.
    """
    raw_texts = data[text_column].fillna("").tolist()  # NaN -> "" so the tokenizer only sees strings
    tokenized = tokenizer(
        raw_texts,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    )

    prepared = {}
    for field in ('input_ids', 'attention_mask'):
        prepared[field] = tokenized[field].detach().numpy()
    prepared['labels'] = data['label'].to_numpy()
    return prepared

# Tokenize every row once per language variant; the split below only selects rows.
encoded_data_en = tokenize_and_prepare(data, 'text_en')
encoded_data_orig = tokenize_and_prepare(data, 'text')

# Stratified 90/10 split on the label column. The fixed random_state makes the
# split reproducible, so a re-run trains and evaluates on the same rows.
train_data, test_data = train_test_split(
    data,
    test_size=0.1,
    stratify=data['label'],
    random_state=42,
)

def create_dataset(encoded_data, indices):
    """Build a ``Dataset`` from the rows of ``encoded_data`` picked by ``indices``.

    Args:
        encoded_data: Mapping with ``'input_ids'``, ``'attention_mask'`` and
            ``'labels'`` entries, each indexable by the elements of ``indices``.
        indices: Iterable of row indices; it is iterated once per field.

    Returns:
        The result of ``Dataset.from_dict`` over the selected rows, with the
        three fields in the order listed above.
    """
    selected = {}
    for field in ('input_ids', 'attention_mask', 'labels'):
        selected[field] = np.array([encoded_data[field][row] for row in indices])
    return Dataset.from_dict(selected)

# Materialise train/test datasets for each language variant from the split's row index.
train_dataset_en, test_dataset_en = (
    create_dataset(encoded_data_en, part.index) for part in (train_data, test_data)
)
train_dataset_orig, test_dataset_orig = (
    create_dataset(encoded_data_orig, part.index) for part in (train_data, test_data)
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

# **4. Model Training and Evaluation Function**

In [4]:

def compute_metrics(pred):
    """Turn a prediction object into accuracy / F1 / precision / recall.

    Args:
        pred: Object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the
            predicted class).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; the last three use ``average='binary'``.
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(-1)

    prec, rec, f_score, _support = precision_recall_fscore_support(
        gold, predicted, average='binary'
    )
    metrics = {'accuracy': accuracy_score(gold, predicted)}
    metrics['f1'] = f_score
    metrics['precision'] = prec
    metrics['recall'] = rec
    return metrics

def build_training_args(output_dir):
    """Return the fine-tuning configuration shared by both runs.

    Args:
        output_dir: Directory in which this run writes its checkpoints.

    Returns:
        A ``TrainingArguments`` instance that evaluates and saves once per
        epoch and reloads the best checkpoint when training ends.
    """
    return TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy='epoch',
        save_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,
        logging_dir='./logs',
        report_to='none'  # Disable external logging
    )


def build_trainer(train_dataset, eval_dataset, output_dir):
    """Create a fresh multilingual BERT classifier and a ``Trainer`` for it.

    Args:
        train_dataset: Dataset used for training.
        eval_dataset: Dataset evaluated at the end of every epoch.
        output_dir: Checkpoint directory for this run.

    Returns:
        ``(model, trainer)``; the trainer has not been started yet.
    """
    model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
    trainer = Trainer(
        model=model,
        args=build_training_args(output_dir),
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )
    return model, trainer


# Base configuration, kept under its original name for any cell that still reads it.
training_args = build_training_args('./results')

# Each run writes to its own directory so the second run cannot overwrite the
# first run's checkpoints.
# NOTE(review): the recorded runs report precision == accuracy and recall == 1.0
# in every epoch, i.e. the model predicts label 1 for every row; the
# hyperparameters above should be revisited.
model_en, trainer_en = build_trainer(train_dataset_en, test_dataset_en, './results/en')
trainer_en.train()

model_orig, trainer_orig = build_trainer(train_dataset_orig, test_dataset_orig, './results/orig')
trainer_orig.train()





model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6814,0.679899,0.581784,0.735605,0.581784,1.0
2,0.6839,0.679717,0.581784,0.735605,0.581784,1.0
3,0.681,0.679805,0.581784,0.735605,0.581784,1.0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6873,0.679624,0.581784,0.735605,0.581784,1.0
2,0.6823,0.680153,0.581784,0.735605,0.581784,1.0
3,0.6817,0.680075,0.581784,0.735605,0.581784,1.0


TrainOutput(global_step=5448, training_loss=0.6841923997965853, metrics={'train_runtime': 4665.9839, 'train_samples_per_second': 9.338, 'train_steps_per_second': 1.168, 'total_flos': 1.146427490414592e+16, 'train_loss': 0.6841923997965853, 'epoch': 3.0})

# **5. Zero-Shot Inference and Evaluation Function**

In [5]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import pipeline

# Zero-shot classification pipeline loaded from the facebook/bart-large-mnli checkpoint.
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Qualitative spot check: two randomly drawn, non-missing test texts per language variant.
# NOTE(review): sample() is called without a seed, so the chosen texts differ between runs.
sample_texts_en = test_data['text_en'].dropna().sample(2).tolist()
sample_texts_orig = test_data['text'].dropna().sample(2).tolist()

# Classify the samples against the two candidate labels and print the raw pipeline output.
predictions_en = zero_shot_classifier(sample_texts_en, candidate_labels=["left", "right"])
predictions_orig = zero_shot_classifier(sample_texts_orig, candidate_labels=["left", "right"])
print("Zero-shot classification results for English text:", predictions_en)
print("Zero-shot classification results for Original language text:", predictions_orig)

def evaluate_zero_shot_batch(model, texts, true_labels, candidate_labels):
    """Score a zero-shot classifier against binary gold labels.

    Rows whose text is missing are dropped together with their label, so
    texts and labels stay aligned. (Truncating the labels to the number of
    remaining texts would shift every label that follows a missing text.)

    Args:
        model: Callable invoked as
            ``model(texts, candidate_labels=..., batch_size=16)``. For each
            text it must yield a mapping whose ``'labels'`` list starts with
            the predicted label.
        texts: pandas Series of input texts; may contain missing values.
        true_labels: Gold labels, positionally aligned with ``texts``. Extra
            trailing labels beyond ``len(texts)`` are ignored.
        candidate_labels: Labels handed to the model. A predicted label of
            ``"left"`` is mapped to 0, any other label to 1.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; the last three use ``average='binary'``.
    """
    present = texts.notna().to_numpy()
    kept_texts = texts[present].tolist()
    # Apply the same mask to the labels so each text keeps its own label.
    gold = np.asarray(true_labels)[:len(present)][present]

    batch_predictions = model(kept_texts, candidate_labels=candidate_labels, batch_size=16)
    if isinstance(batch_predictions, dict):
        # A pipeline may hand back a bare dict for a single input; normalise to a list.
        batch_predictions = [batch_predictions]
    preds = [0 if pred['labels'][0] == "left" else 1 for pred in batch_predictions]

    acc = accuracy_score(gold, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(gold, preds, average='binary')
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


# Score the zero-shot classifier on the held-out split, once per language variant.
zero_shot_results_en = evaluate_zero_shot_batch(
    model=zero_shot_classifier,
    texts=test_data['text_en'],
    true_labels=test_data['label'],
    candidate_labels=["left", "right"],
)
zero_shot_results_orig = evaluate_zero_shot_batch(
    model=zero_shot_classifier,
    texts=test_data['text'],
    true_labels=test_data['label'],
    candidate_labels=["left", "right"],
)
print("Zero-shot evaluation results for English text:", zero_shot_results_en)
print("Zero-shot evaluation results for Original language text:", zero_shot_results_orig)

# Evaluate both fine-tuned models on their evaluation datasets for comparison.
eval_results_en = trainer_en.evaluate()
eval_results_orig = trainer_orig.evaluate()
print("Evaluation results for Fine-tuned English model:", eval_results_en)
print("Evaluation results for Fine-tuned Original language model:", eval_results_orig)


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


Zero-shot classification results for English text: [{'sequence': "Mr. President, valued lawmakers are actually the name of the history of intensive cultural, political, economic and social relations on the north-south-south-west axis as of Turkey's location. Caucasus, the Caspian region, the Middle East, North Africa, the Balkans are one of the main areas. There has been a period of concentration of relations with one of these regions, and other areas have been carefully maintained. <p> In this sense, the Balkans are a very important geography in Turkey's history, and in this sense, they have always been a front for social, cultural, political and economic relations, especially after the collapse of the Soviets, one of Turkey's major agendas. Participating in the new global system of 90 post-90s has also made it easier for Turkey to develop relations with these countries. <p> Romania is one of the Balkan countries where bilateral relations have developed rapidly over the past 20 years.

Evaluation results for Fine-tuned English model: {'eval_loss': 0.6797170639038086, 'eval_accuracy': 0.5817843866171004, 'eval_f1': 0.7356051703877791, 'eval_precision': 0.5817843866171004, 'eval_recall': 1.0, 'eval_runtime': 44.9199, 'eval_samples_per_second': 35.931, 'eval_steps_per_second': 4.497, 'epoch': 3.0}
Evaluation results for Fine-tuned Original language model: {'eval_loss': 0.679623544216156, 'eval_accuracy': 0.5817843866171004, 'eval_f1': 0.7356051703877791, 'eval_precision': 0.5817843866171004, 'eval_recall': 1.0, 'eval_runtime': 45.0639, 'eval_samples_per_second': 35.816, 'eval_steps_per_second': 4.483, 'epoch': 3.0}
