# **1. Setup and Imports**

In [2]:
!pip install transformers datasets torch sklearn
!pip install datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, pipeline
from datasets import Dataset
from google.colab import drive

# Mount Google Drive so the training TSV stored there can be read from Colab.
drive.mount('/content/drive')
file_path = '/content/drive/My Drive/trainset/power/power-tr-train.tsv'
data = pd.read_csv(file_path, delimiter='\t', encoding='utf-8')

# Drop incomplete rows, then renumber the index 0..n-1. Later cells use
# `train_data.index` / `test_data.index` as *positions* into the tokenised
# arrays, which is only correct when index labels equal row positions; without
# the reset, any dropped row would shift or overflow those positions.
data = data.dropna(subset=['text', 'text_en', 'label']).reset_index(drop=True)




Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **2. Printing the Data Set**

In [3]:
# Quick sanity check: show the first five rows of the loaded frame.
print(data.head(5))

        id                           speaker sex  \
0  tr18146  ca2031caa4032c51980160359953d507   M   
1  tr18147  4cee0addb3c69f6866869b180f90d45f   M   
2  tr18148  b3d7f76d74ec268492f8190ca123a6b2   M   
3  tr18149  722efac7138c8197a9d1e97eed3a8b18   M   
4  tr18150  fcc61122f3553c57ae207adeb1a1af84   M   

                                                text  \
0  Yeni yasama döneminin ülkemiz için, milletimiz...   
1  Sayın Başkan, değerli milletvekilleri; bugün, ...   
2  Sayın Başkanım, öncelikle yüce Meclisin Başkan...   
3  24’üncü Dönem Meclis Başkanlığına seçilmenizde...   
4  Usul tartışmasında 2 kişi lehte 2 kişi aleyhte...   

                                             text_en  label  
0  Mr. President, dear lawmakers, I salute you, a...      0  
1  Mr. President, members of lawmakers, as I spea...      0  
2  Mr. President, I'm here to share with you the ...      0  
3  Mr. President, under the principles determined...      0  
4  Two in favour of two in the legal deb

# **3. Tokenization and Dataset Preparation**

In [4]:
# One shared tokenizer; `tokenize_and_prepare` below reads it as a global for
# both the `text` and `text_en` columns.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-cased'
)

def tokenize_and_prepare(data, text_column):
    """Tokenise one text column and bundle it with the labels as NumPy arrays.

    Args:
        data: DataFrame holding ``text_column`` and a ``'label'`` column.
        text_column: name of the column whose strings are tokenised.

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` (converted from the
        tokenizer's tensors) and ``'labels'`` (``data['label']`` as an array),
        in that key order.

    Relies on the module-level ``tokenizer``. Every text is truncated/padded
    to exactly 512 tokens.
    """
    batch = tokenizer(
        data[text_column].tolist(),
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    )
    # Convert the two tensors the model consumes into plain arrays.
    prepared = {
        key: batch[key].detach().numpy()
        for key in ('input_ids', 'attention_mask')
    }
    prepared['labels'] = data['label'].to_numpy()
    return prepared


# Encode the full frame once per language (English translation first, then the
# original text), then make a stratified 90/10 split of the rows.
encoded_data_en, encoded_data_orig = (
    tokenize_and_prepare(data, column) for column in ('text_en', 'text')
)
train_data, test_data = train_test_split(
    data,
    test_size=0.1,
    random_state=42,
    stratify=data['label'],
)

def create_dataset(encoded_data, indices):
    """Build a ``Dataset`` from the selected rows of the encoded arrays.

    Args:
        encoded_data: mapping with ``'input_ids'``, ``'attention_mask'`` and
            ``'labels'`` entries, each indexable by an integer.
        indices: iterable of integers used directly as positions into those
            entries (callers pass a DataFrame index, so its labels must
            coincide with row positions).

    Returns:
        ``Dataset.from_dict`` over the three selected columns.
    """
    column_names = ('input_ids', 'attention_mask', 'labels')
    selected = {
        name: np.array([encoded_data[name][row] for row in indices])
        for name in column_names
    }
    return Dataset.from_dict(selected)

# Materialise train/test datasets for each language from the shared split.
train_dataset_en, test_dataset_en = (
    create_dataset(encoded_data_en, part.index) for part in (train_data, test_data)
)
train_dataset_orig, test_dataset_orig = (
    create_dataset(encoded_data_orig, part.index) for part in (train_data, test_data)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

# **4. Model Training and Evaluation Function**

In [5]:
def compute_metrics(pred):
    """Convert a Trainer evaluation result into scalar classification metrics.

    Args:
        pred: object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        dict with ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``
        (binary averaging).
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same 0.0 the default produces when no positive
    # class is predicted, but without an UndefinedMetricWarning on every
    # evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


# Hyper-parameters shared by both fine-tuning runs below.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results_task2',
    logging_dir='./logs_task2',
    report_to='none',  # Disable external logging
    # Optimisation schedule.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint.
    evaluation_strategy='epoch',
    save_strategy="epoch",
    load_best_model_at_end=True,
)

def _train_and_evaluate(train_dataset, eval_dataset, description):
    """Fine-tune a fresh multilingual BERT classifier and evaluate it.

    Args:
        train_dataset: dataset passed to the ``Trainer`` for training.
        eval_dataset: dataset used for per-epoch and final evaluation.
        description: human-readable name inserted into the printed summary.

    Returns:
        ``(model, trainer, eval_results)`` so callers can keep all three.

    Uses the module-level ``training_args`` and ``compute_metrics``.
    """
    model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )
    trainer.train()
    eval_results = trainer.evaluate()
    print(f"Evaluation results for {description} model:", eval_results)
    return model, trainer, eval_results


# Train and evaluate the model for English text
model_en, trainer_en, eval_results_en = _train_and_evaluate(
    train_dataset_en, test_dataset_en, "English"
)

# Train and evaluate the model for Original Language text
model_orig, trainer_orig, eval_results_orig = _train_and_evaluate(
    train_dataset_orig, test_dataset_orig, "Original Language"
)




model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7023,0.698293,0.485911,0.0,0.0,0.0
2,0.6987,0.693227,0.485911,0.0,0.0,0.0
3,0.6937,0.692752,0.514089,0.679073,0.514089,1.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluation results for English model: {'eval_loss': 0.692752480506897, 'eval_accuracy': 0.5140885566417481, 'eval_f1': 0.6790733004177744, 'eval_precision': 0.5140885566417481, 'eval_recall': 1.0, 'eval_runtime': 52.0286, 'eval_samples_per_second': 33.424, 'eval_steps_per_second': 4.19, 'epoch': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6985,0.699093,0.485911,0.0,0.0,0.0
2,0.6961,0.692826,0.514089,0.679073,0.514089,1.0
3,0.693,0.67985,0.638298,0.585912,0.712,0.497763


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluation results for Original Language model: {'eval_loss': 0.6798498034477234, 'eval_accuracy': 0.6382978723404256, 'eval_f1': 0.5859117840684661, 'eval_precision': 0.712, 'eval_recall': 0.49776286353467564, 'eval_runtime': 52.1595, 'eval_samples_per_second': 33.34, 'eval_steps_per_second': 4.179, 'epoch': 3.0}


# **5. Zero-Shot Inference and Evaluation Function**

In [7]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# NOTE(review): facebook/bart-large-mnli is presumably an English NLI model;
# treat its output on the original-language text with care — confirm.
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Fixed seed (same value as the train/test split) so the five spot-check
# examples are identical on every Restart & Run All.
sample_texts_en = test_data['text_en'].dropna().sample(5, random_state=42).tolist()
sample_texts_orig = test_data['text'].dropna().sample(5, random_state=42).tolist()

predictions_en = zero_shot_classifier(sample_texts_en, candidate_labels=["coalition", "opposition"])
predictions_orig = zero_shot_classifier(sample_texts_orig, candidate_labels=["coalition", "opposition"])
print("Zero-shot classification results for English text:", predictions_en)
print("Zero-shot classification results for original language text:", predictions_orig)
def evaluate_zero_shot_batch(model, texts, true_labels, candidate_labels):
    """Score a zero-shot classifier against gold labels.

    Args:
        model: callable invoked as
            ``model(texts, candidate_labels=..., batch_size=16)``; each result
            is a mapping whose ``'labels'`` list has the predicted label first.
        texts: pandas Series of input texts; missing entries are skipped.
        true_labels: gold labels, positionally aligned with ``texts`` (same
            length).
        candidate_labels: label names forwarded to ``model``.

    Returns:
        dict with ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``
        (binary averaging). A top label of ``"coalition"`` is mapped to class
        0, anything else to class 1.

    Raises:
        ValueError: if ``texts`` and ``true_labels`` differ in length.
    """
    keep = texts.notna().to_numpy()
    gold = np.asarray(true_labels)
    if len(gold) != len(keep):
        raise ValueError(
            f"texts and true_labels must have the same length, got {len(keep)} and {len(gold)}"
        )

    # Drop the gold label of every skipped text with the same mask. Truncating
    # the labels to the new length would pair surviving texts with the wrong
    # labels whenever a missing text is not at the very end.
    texts = texts[keep].tolist()
    true_labels = gold[keep]

    batch_predictions = model(texts, candidate_labels=candidate_labels, batch_size=16)
    # Some pipeline versions may return a bare dict for a single sequence.
    if isinstance(batch_predictions, dict):
        batch_predictions = [batch_predictions]
    preds = [0 if pred['labels'][0] == "coalition" else 1 for pred in batch_predictions]

    acc = accuracy_score(true_labels, preds)
    # zero_division=0 keeps the default's 0.0 result but suppresses the warning
    # when one class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, preds, average='binary', zero_division=0
    )
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Score the zero-shot classifier on the full held-out split, once per language.
zero_shot_results_en = evaluate_zero_shot_batch(
    model=zero_shot_classifier,
    texts=test_data['text_en'],
    true_labels=test_data['label'],
    candidate_labels=["coalition", "opposition"],
)

zero_shot_results_orig = evaluate_zero_shot_batch(
    model=zero_shot_classifier,
    texts=test_data['text'],
    true_labels=test_data['label'],
    candidate_labels=["coalition", "opposition"],
)

print("Zero-shot evaluation results for English text:", zero_shot_results_en)
print("Zero-shot evaluation results for Original Language text:", zero_shot_results_orig)


Device set to use cuda:0


Zero-shot classification results for original language text: [{'sequence': 'Ben Ticaret Odası Başkanıyken Sayın Bakanımla birkaç sefer… Bakanım işaret ediyor, Mânia Planı; anladı ne soracağımı, gördüm. Yıllardır Ankara’nın çözülmeyen bir Mânia Planı sorunu var. Nedir Mânia Planı, kısaca arz edeyim: Birçoğunuz Çukurambar’da oturuyorsunuz, birçoğunuz Ümitköy’de, Batıkent’te, Çayyolu’nda oturuyorsunuz; Sayın Komutanım da çok iyi bilirler konuyu. Hepimizin üzerinden sabah 7, 8, 9, keyiflerine göre helikopterler geçiyor, “pat pat pat” duyarsınız. Sabah yataktayken 8-10 tane helikopter geçer. Ben araştırdım, dedim ki: “Ya, bunlar niye geçiyor?” Dediler ki: “Eğitim yapıyorlar.” Acemi pilotlar benim evimin üzerinde eğitim yapıyor, Armada’nın üzerinde eğitim yapıyor, alışveriş merkezlerinin üstünde eğitim yapıyor. Ya, böyle bir şey olur mu yani acemi pilotların bizim üzerimizde eğitim yapması? <p> Ben o zaman Oda Başkanıyım. Sayın Bakanla defalarca konuştuk, cevap yok. Allah nasip etti, milletv