In [21]:
!pip install transformers datasets torch scikit-learn gdown evaluate



In [22]:
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset
import numpy as np
import os
import torch
import evaluate

In [23]:
target_dir = "/kaggle/working/st2data"
os.makedirs(target_dir, exist_ok=True)
!gdown "https://drive.google.com/uc?id=1u5C4o_fmjL5nQ_RtgLDShuG97Ix6_KGK" -O "/kaggle/working/st2data/train.jsonl"
!gdown "https://drive.google.com/uc?id=1rNQTkhkVG9nzcT97Nk_WyJd80ZaacT0-" -O "/kaggle/working/st2data/val.jsonl"

Downloading...
From (original): https://drive.google.com/uc?id=1u5C4o_fmjL5nQ_RtgLDShuG97Ix6_KGK
From (redirected): https://drive.google.com/uc?id=1u5C4o_fmjL5nQ_RtgLDShuG97Ix6_KGK&confirm=t&uuid=100e2b22-3a6f-469c-8173-99e05ecb217f
To: /kaggle/working/st2data/train.jsonl
100%|█████████████████████████████████████████| 662M/662M [00:02<00:00, 233MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1rNQTkhkVG9nzcT97Nk_WyJd80ZaacT0-
From (redirected): https://drive.google.com/uc?id=1rNQTkhkVG9nzcT97Nk_WyJd80ZaacT0-&confirm=t&uuid=55867de3-5056-4a17-a17f-8522b16a933e
To: /kaggle/working/st2data/val.jsonl
100%|████████████████████████████████████████| 140M/140M [00:02<00:00, 58.4MB/s]


In [24]:
def load_data(file_path):
    """Load a JSON Lines file and keep only the fields used for training.

    Args:
        file_path: Path to a UTF-8 encoded ``.jsonl`` file holding one JSON
            object per line. Every object must provide the keys ``text``,
            ``language`` and ``label``; any other keys are ignored.

    Returns:
        pandas.DataFrame with the columns ``text``, ``language`` and ``label``
        (in that order), one row per record. A file without records yields an
        empty frame that still has those three columns.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required fields.
    """
    fields = ["text", "language", "label"]
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (typically a trailing newline at the end of the file)
            # are not records; json.loads would raise on them.
            if not line.strip():
                continue
            entry = json.loads(line)
            records.append({name: entry[name] for name in fields})
    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(records, columns=fields)

# Load training and validation data into DataFrames.
# The paths are built from target_dir (the download destination) rather than a
# path relative to the current working directory, so loading does not depend
# on the notebook being run from /kaggle/working.
train_df = load_data(os.path.join(target_dir, "train.jsonl"))
val_df = load_data(os.path.join(target_dir, "val.jsonl"))

# Display first few rows
print("Training Data:")
print(train_df.head())

print("\nValidation Data:")
print(val_df.head())

# NOTE(review): presumably the full list of fields present in each JSONL record
# (only the first three are kept by load_data) - verify against the data:
# text, language, label, source_dataset, model, label_text

Training Data:
                                                text language  label
0  Have you ever had to wait for something for a ...  English      4
1  But now, things were not so simple._SEP_The gi...  English      3
2  Dear Editor,  I am writing to express my opini...  English      4
3  Humans once wielded formidable magical power. ...  English      4
4  Here is a way that I had to be patient, and we...  English      4

Validation Data:
                                                text language  label
0  In a brief speech with a main theme of comprom...  English      0
1  HOW TO MAKE: Fall Squash Vegetarian Brioche\nI...  English      0
2  The paper models the relation extraction probl...  English      3
3  This paper considers the problem of model-base...  English      3
4  Paper Summary:\n\nAuthors investigate identity...  English      3


In [25]:
# Wrap both pandas frames in Hugging Face `Dataset` objects (same conversion
# for the training and the validation split).
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

## **Tokenization**

In [26]:
# Human-readable names of the six classes, keyed by class id.
id2label = {
    0: "fully human-written",
    1: "human-written, then machine-polished",
    2: "machine-written, then machine-humanized",
    3: "human-initiated, then machine-continued",
    4: "deeply-mixed text; where some parts are written by a human and some are generated by a machine",
    5: "machine-written, then human-edited",
}
# Reverse lookup (name -> class id), derived from id2label.
label2id = {name: class_id for class_id, name in id2label.items()}

# Pretrained checkpoint used as the starting point for fine-tuning.
model_name = "tabularisai/multilingual-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    id2label=id2label,
    label2id=label2id,
    num_labels=len(id2label),
    # The checkpoint's classifier head has a different number of outputs than
    # the six classes here; this lets loading succeed with a freshly
    # initialised head instead of failing on the shape mismatch.
    ignore_mismatched_sizes=True,
)


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are truncated and padded to the tokenizer's maximum length, so
    every encoded example has the same length.
    """
    return tokenizer(examples["text"], truncation=True, padding="max_length")


# Encode both splits in batches, then drop the raw text column, which is not
# needed once the token ids exist.
train_dataset = train_dataset.map(tokenize_function, batched=True).remove_columns(["text"])
val_dataset = val_dataset.map(tokenize_function, batched=True).remove_columns(["text"])

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at tabularisai/multilingual-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/288918 [00:00<?, ? examples/s]

Map:   0%|          | 0/72661 [00:00<?, ? examples/s]

## **Training Arguments**

In [27]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes are per device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)



## **Metrics**

In [28]:
# Metric objects are loaded once and reused. compute_metrics runs at every
# evaluation, and calling evaluate.load each time repeats the metric lookup
# for no benefit.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use only."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute classification metrics for the Trainer.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores with the class axis at position 1 (or a tuple
            whose first element is that array); ``labels`` holds the reference
            class ids.

    Returns:
        dict with the keys ``f1`` (micro-averaged F1), ``macro_f1``,
        ``macro_recall`` and ``accuracy``.
    """
    predictions, labels = eval_pred
    # Some models return several outputs; the class scores come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # Turn per-class scores into the predicted class id.
    predictions = np.argmax(predictions, axis=1)

    f1_metric = _get_metric("f1")
    recall_metric = _get_metric("recall")
    accuracy_metric = _get_metric("accuracy")

    results = {}
    # Micro F1-score (stored under the metric's own key, "f1")
    results.update(f1_metric.compute(predictions=predictions, references=labels, average="micro"))
    # Macro F1-score
    results["macro_f1"] = f1_metric.compute(predictions=predictions, references=labels, average="macro")["f1"]
    # Macro Recall
    results["macro_recall"] = recall_metric.compute(predictions=predictions, references=labels, average="macro")["recall"]
    # Accuracy
    results["accuracy"] = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]

    return results

## **Trainer**

In [29]:
# Assemble the Trainer: the model to fine-tune, its hyper-parameters, the two
# tokenized splits, and the metric callback used at every evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

## **Train the Model**

In [30]:
!wandb login d263ae15255e15e9e2e1943f80a700ad7d7a2c6c

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [31]:
# Fine-tune the model with the Trainer configured above. This is the
# long-running step of the notebook; it returns a TrainOutput with the global
# step count and the training loss.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33msamiyaalizaidi[0m ([33msamiyaalizaidi-habib-university[0m). Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,F1,Macro F1,Macro Recall,Accuracy
1,0.1802,3.921772,0.542974,0.548655,0.624768,0.542974
2,0.1025,4.117655,0.568297,0.594018,0.668964,0.568297


Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]



TrainOutput(global_step=36116, training_loss=0.17898570940679115, metrics={'train_runtime': 23468.5976, 'train_samples_per_second': 24.622, 'train_steps_per_second': 1.539, 'total_flos': 7.6549892014891e+16, 'train_loss': 0.17898570940679115, 'epoch': 2.0})

## **Evaluate the Model**

In [32]:
# Evaluate on the validation set (the eval_dataset given to the Trainer).
results = trainer.evaluate()
# The printed dict holds the loss plus the values returned by compute_metrics,
# each reported with an "eval_" prefix (eval_f1, eval_macro_f1, ...).
print(results)  # Will now include F1-score



{'eval_loss': 4.117654800415039, 'eval_f1': 0.5682966102861232, 'eval_macro_f1': 0.5940179596862004, 'eval_macro_recall': 0.6689635307150489, 'eval_accuracy': 0.5682966102861232, 'eval_runtime': 857.2672, 'eval_samples_per_second': 84.759, 'eval_steps_per_second': 5.298, 'epoch': 2.0}


## **Saving the Model**

In [33]:
# Persist the fine-tuned weights and the tokenizer files.
# NOTE: they are written to two different directories, so both paths are needed
# to reload the pair later.
model.save_pretrained("./st2modelv1")
tokenizer.save_pretrained("./st2tokenizerv1")

('./st2tokenizerv1/tokenizer_config.json',
 './st2tokenizerv1/special_tokens_map.json',
 './st2tokenizerv1/vocab.txt',
 './st2tokenizerv1/added_tokens.json',
 './st2tokenizerv1/tokenizer.json')