<a href="https://colab.research.google.com/github/jonathanaveo/PRESA/blob/main/SEMANTIC_FRAMING_POC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score
import warnings


In [None]:
# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

def normalize_label_studio_data(file_path, annotator_name):
    """
    Parses different Label Studio JSON formats into a standardized list of dicts.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    cleaned_data = []

    # Define the list of specific labels we want to extract
    target_labels = [
        "threat_danger", "economic_impact", "political_conflict",
        "nationalism_patriotism", "conspiracy_hidden_agenda",
        "victimization", "heroism", "scandal_corruption"
    ]

    for item in data:
        entry = {}

        # --- HANDLING TEXT (The Join Key) ---
        # Try to find the article text. Format 1/2 puts it in 'data', Format 3 puts it at top level.
        if 'data' in item and 'article' in item['data']:
            text = item['data']['article'] # Format 1 & 2
        elif 'article' in item:
            text = item['article'] # Format 3
        else:
            continue # Skip if no text found

        # Normalize text to ensure matching (trim whitespace)
        entry['article_hash'] = text.strip()
        entry['full_text'] = text
        entry['annotator'] = annotator_name

        # --- HANDLING LABELS ---

        # Check if Format 1/2 (Nested 'annotations' -> 'result')
        if 'annotations' in item and len(item['annotations']) > 0:
            results = item['annotations'][0]['result']
            for res in results:
                label_name = res['from_name']
                # Extract choice value (usually a list like ['1'] or ['0'])
                try:
                    val = int(res['value']['choices'][0])
                except (KeyError, IndexError, ValueError):
                    val = 0 # Default to 0 if missing or error

                if label_name in target_labels:
                    entry[label_name] = val

        # Check if Format 3 (Flat keys)
        else:
            for label in target_labels:
                if label in item:
                    try:
                        entry[label] = int(item[label])
                    except ValueError:
                        entry[label] = 0

        # Ensure all target labels exist in the entry, default to 0 if missing
        for label in target_labels:
            if label not in entry:
                entry[label] = 0

        cleaned_data.append(entry)

    return pd.DataFrame(cleaned_data)

In [None]:
# --- 1. Load and Normalize Data ---
print("Loading and normalizing files...")
df1 = normalize_label_studio_data('annotator_1.json', 'A1')
df2 = normalize_label_studio_data('annotator_2.json', 'A2')
df3 = normalize_label_studio_data('annotator_3.json', 'A3')

print(f"Raw Counts -> A1: {len(df1)}, A2: {len(df2)}, A3: {len(df3)}")

Loading and normalizing files...
Raw Counts -> A1: 53, A2: 50, A3: 73


In [None]:
# --- 2. Find Intersection (Common Articles) ---
# We use the article text as the unique identifier.
# We perform an inner join on all three.

# Rename columns to distinguish annotators
cols_to_rename = [
    "threat_danger", "economic_impact", "political_conflict",
    "nationalism_patriotism", "conspiracy_hidden_agenda",
    "victimization", "heroism", "scandal_corruption"
]

df1_renamed = df1.rename(columns={c: f"{c}_A1" for c in cols_to_rename})
df2_renamed = df2.rename(columns={c: f"{c}_A2" for c in cols_to_rename})
df3_renamed = df3.rename(columns={c: f"{c}_A3" for c in cols_to_rename})

# Merge A1 and A2
merged_df = pd.merge(
    df1_renamed,
    df2_renamed[['article_hash'] + [f"{c}_A2" for c in cols_to_rename]],
    on='article_hash',
    how='inner'
)

# Merge result with A3
final_df = pd.merge(
    merged_df,
    df3_renamed[['article_hash'] + [f"{c}_A3" for c in cols_to_rename]],
    on='article_hash',
    how='inner'
)

print(f"✅ Cleaned Dataset Size (Intersection): {len(final_df)} articles")

✅ Cleaned Dataset Size (Intersection): 55 articles


In [None]:
# --- 3. Calculate Inter-Annotator Agreement (Cohen's Kappa) ---
# Since we have 3 annotators, we usually calculate Fleiss' Kappa,
# or the average Pairwise Cohen's Kappa. Here we calculate Average Pairwise Cohen's Kappa per label.

agreement_report = []

for label in cols_to_rename:
    # Get vectors
    a1_scores = final_df[f"{label}_A1"]
    a2_scores = final_df[f"{label}_A2"]
    a3_scores = final_df[f"{label}_A3"]

    # Calculate pairwise kappas
    k12 = cohen_kappa_score(a1_scores, a2_scores)
    k23 = cohen_kappa_score(a2_scores, a3_scores)
    k13 = cohen_kappa_score(a1_scores, a3_scores)

    # Average
    avg_kappa = np.mean([k12, k23, k13])

    agreement_report.append({
        "Label": label,
        "Avg Kappa": round(avg_kappa, 3),
        "A1-A2": round(k12, 3),
        "A2-A3": round(k23, 3),
        "A1-A3": round(k13, 3)
    })

In [None]:
# --- 4. Generate Majority Vote (Ground Truth for Training) ---
# For training mBERT, you usually want the "correct" label.
# We calculate the mode (majority vote). If there is a tie (e.g., 0, 1, ? - rare in binary 3 annotators),
# we usually take the max or handle specifically. With 3 binary annotators, ties are impossible (sum is 0, 1, 2, or 3).

for label in cols_to_rename:
    # Sum the rows for this label
    total_score = final_df[f"{label}_A1"] + final_df[f"{label}_A2"] + final_df[f"{label}_A3"]
    # If sum >= 2, then majority is 1. Else 0.
    final_df[f"{label}_final"] = (total_score >= 2).astype(int)

In [None]:
# --- 5. Export ---
# Display Report
results_df = pd.DataFrame(agreement_report)
print("\n--- Inter-Annotator Agreement (Cohen's Kappa) ---")
print("Interpretation: <0: Poor, 0-0.2: Slight, 0.2-0.4: Fair, 0.4-0.6: Moderate, 0.6-0.8: Substantial, 0.8-1: Perfect")
print(results_df)

# Save the final aligned JSON for mBERT training
# We keep only the text and the final majority-voted labels
output_columns = ['full_text'] + [f"{c}_final" for c in cols_to_rename]
export_df = final_df[output_columns]

# Rename columns back to original names for cleaner JSON
export_df.columns = ['text'] + cols_to_rename

output_filename = "aligned_annotations_for_mbert.json"
export_df.to_json(output_filename, orient='records', indent=4, force_ascii=False)
print(f"\n✅ Automatically saved aligned file: {output_filename}")

#View first few rows
print("\nPreview of aligned data:")
print(export_df.head())


--- Inter-Annotator Agreement (Cohen's Kappa) ---
Interpretation: <0: Poor, 0-0.2: Slight, 0.2-0.4: Fair, 0.4-0.6: Moderate, 0.6-0.8: Substantial, 0.8-1: Perfect
                      Label  Avg Kappa  A1-A2  A2-A3  A1-A3
0             threat_danger      0.583  0.549  0.617  0.581
1           economic_impact      0.676  0.711  0.697  0.621
2        political_conflict      0.515  0.507  0.363  0.673
3    nationalism_patriotism      0.616  0.484  0.854  0.511
4  conspiracy_hidden_agenda      0.407  0.331  0.497  0.394
5             victimization      0.152  0.341 -0.138  0.254
6                   heroism      0.180  0.410  0.110  0.021
7        scandal_corruption      0.326  0.300  0.374  0.305

✅ Automatically saved aligned file: aligned_annotations_for_mbert.json

Preview of aligned data:
                                                text  threat_danger  \
0  Sa isang screenshot na ibinahagi ni Davao City...              0   
1  Usap-usapan ngayon sa social media ang umano'y...     

In [None]:
# !pip install transformers datasets torch
!pip install transformers datasets torch



In [None]:
#Import necessary libraries

import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset, DatasetDict
from transformers import TrainingArguments, Trainer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

In [None]:
# --- Step 1: Load Your Aligned Data ---
# Load the clean JSON we created in the previous step
df = pd.read_json("aligned_annotations_for_mbert.json")

# Define the columns you want to train on (Target Labels)
label_cols = [
    "threat_danger", "economic_impact", "political_conflict",
    "nationalism_patriotism", "conspiracy_hidden_agenda",
    "victimization", "heroism", "scandal_corruption"
]

# Create a 'labels' column containing a list of floats [0.0, 1.0, ...]
# This format is required for HuggingFace's multi-label classification
df['labels'] = df[label_cols].values.tolist()
df['labels'] = df['labels'].apply(lambda x: [float(i) for i in x])

# Split the data: 80% Train, 20% Validation
# We use a random_state for reproducibility
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert pandas DataFrames to HuggingFace Dataset objects
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df[['text', 'labels']]),
    'validation': Dataset.from_pandas(val_df[['text', 'labels']])
})

print(f"Dataset Loaded. Train size: {len(dataset['train'])}, Val size: {len(dataset['validation'])}")

Dataset Loaded. Train size: 44, Val size: 11


In [None]:
# --- Step 2: Tokenization ---
# We use 'bert-base-multilingual-cased' which is excellent for Tagalog/English mix
model_checkpoint = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    # Truncation=True ensures texts longer than 512 tokens are cut off
    # Padding='max_length' ensures all sequences are the same length (optional but recommended for batches)
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

print("\nTokenizing data (this may take a moment)...")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Remove the raw 'text' column as the model only needs 'input_ids', 'attention_mask', and 'labels'
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets.set_format("torch")

print("Tokenization complete.")
print("Sample keys:", tokenized_datasets['train'][0].keys())

# --- Step 3: Model Setup ---
# We initialize mBERT with the exact number of labels we have (8)
# problem_type="multi_label_classification" automatically sets up the correct loss function (BCEWithLogitsLoss)

num_labels = len(label_cols)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    problem_type="multi_label_classification"
)

# Label mapping (Optional: helps when predicting later)
id2label = {idx: label for idx, label in enumerate(label_cols)}
label2id = {label: idx for idx, label in enumerate(label_cols)}
model.config.id2label = id2label
model.config.label2id = label2id

print(f"\nModel initialized successfully with {num_labels} labels.")
print("Ready for training!")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]


Tokenizing data (this may take a moment)...


Map:   0%|          | 0/44 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Tokenization complete.
Sample keys: dict_keys(['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'])


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model initialized successfully with 8 labels.
Ready for training!


In [None]:
# --- Step 4: Define Metrics for Multi-Label ---
# We need a custom function because standard accuracy doesn't work well for multi-label
def multi_label_metrics(predictions, labels, threshold=0.5):
    # Apply Sigmoid to get probabilities (0 to 1) because model outputs raw logits
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))

    # Convert probabilities to binary 0 or 1 based on threshold
    y_pred = np.zeros(probs.shape)
    y_pred[probs >= threshold] = 1

    # Calculate metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    f1_macro_average = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)

    metrics = {
        'f1_micro': f1_micro_average,
        'f1_macro': f1_macro_average,
        'accuracy': accuracy
    }
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)
    return result

In [None]:
# --- Step 5: Define Training Arguments ---
# These control how the model learns
batch_size = 8  # Lower this to 4 if you get "Out of Memory" errors
metric_name = "f1_micro"

args = TrainingArguments(
    output_dir=f"mbert-finetuned-news",
    eval_strategy = "epoch",   # Evaluate every epoch
    save_strategy = "epoch",         # Save model every epoch
    learning_rate=2e-5,              # Standard for BERT (1e-5 to 5e-5)
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,              # 5 loops over the data
    weight_decay=0.01,
    load_best_model_at_end=True,     # Load the best model when finished
    metric_for_best_model=metric_name,
    save_total_limit=2,              # Only keep the last 2 models to save space
    seed=42
)

In [None]:
# --- Step 6: Initialize Trainer ---
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


In [None]:
# --- Step 7: Start Training ---
print("Starting training...")
trainer.train()

Starting training...


[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjaveo46[0m ([33mjaveo46-university-of-the-cordilleras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,Accuracy
1,No log,0.600153,0.0,0.0,0.181818
2,No log,0.532911,0.0,0.0,0.181818
3,No log,0.505683,0.0,0.0,0.181818
4,No log,0.502706,0.0,0.0,0.181818
5,No log,0.505306,0.0,0.0,0.181818


TrainOutput(global_step=30, training_loss=0.5457217534383138, metrics={'train_runtime': 390.4577, 'train_samples_per_second': 0.563, 'train_steps_per_second': 0.077, 'total_flos': 57887550504960.0, 'train_loss': 0.5457217534383138, 'epoch': 5.0})

In [None]:
# --- Step 8: Evaluate Final Model ---
print("\nFinal Evaluation on Validation Set:")
trainer.evaluate()


Final Evaluation on Validation Set:


{'eval_loss': 0.6001532077789307,
 'eval_f1_micro': 0.0,
 'eval_f1_macro': 0.0,
 'eval_accuracy': 0.18181818181818182,
 'eval_runtime': 0.3764,
 'eval_samples_per_second': 29.221,
 'eval_steps_per_second': 5.313,
 'epoch': 5.0}

In [None]:
# --- Step 9: Save the Model ---
trainer.save_model("my_final_mbert_model")
print("\nModel saved to 'my_final_mbert_model'")


Model saved to 'my_final_mbert_model'


In [None]:
#Predict text

def predict_custom(text, model, tokenizer, threshold=0.3): # Lowered threshold to 0.3
    # 1. Tokenize text
    encoding = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    encoding = {k: v.to(model.device) for k,v in encoding.items()}

    # 2. Run Inference
    model.eval()
    with torch.no_grad():
        outputs = model(**encoding)

    # 3. Get Probabilities (Sigmoid)
    logits = outputs.logits
    probs = torch.sigmoid(logits).cpu().numpy()[0]

    # 4. Print Results
    print(f"\nText: '{text}'")
    print("-" * 40)
    for idx, score in enumerate(probs):
        label_name = model.config.id2label[idx]
        # Visual bar for confidence
        bar = "█" * int(score * 20)
        status = "" if score > threshold else " "
        print(f"{status} {label_name:<25}: {score:.4f}  {bar}")

# --- TEST IT ---
# Try a sentence that clearly fits one of your categories
test_text = "Duterte lihim na nag-confess sa ICC na fake lang ang drug war para maging hero siya sa Pilipino masses!"
predict_custom(test_text, model, tokenizer)


Text: 'Duterte lihim na nag-confess sa ICC na fake lang ang drug war para maging hero siya sa Pilipino masses!'
----------------------------------------
 threat_danger            : 0.3678  ███████
 economic_impact          : 0.4081  ████████
 political_conflict       : 0.4611  █████████
 nationalism_patriotism   : 0.4156  ████████
 conspiracy_hidden_agenda : 0.4292  ████████
 victimization            : 0.4318  ████████
 heroism                  : 0.4498  ████████
 scandal_corruption       : 0.4469  ████████
