In [1]:
!pip install numpy torch pandas scikit-learn transformers tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [3]:
import os
import json

def create_all_labels_from_files(narratives_file, subnarratives_file):
    """Read the narrative and subnarrative label lists from two text files.

    Each file is read as UTF-8 with one label per line. Surrounding
    whitespace is stripped and lines that are empty after stripping are
    dropped.

    Args:
        narratives_file: Path of the file listing narrative labels.
        subnarratives_file: Path of the file listing subnarrative labels.

    Returns:
        A tuple ``(narratives, subnarratives)`` of two sorted lists of strings.
    """
    label_lists = []
    for path in (narratives_file, subnarratives_file):
        with open(path, "r", encoding="utf-8") as handle:
            stripped_lines = (raw.strip() for raw in handle)
            entries = [text for text in stripped_lines if text]
        entries.sort()
        label_lists.append(entries)
    return label_lists[0], label_lists[1]

def _split_labels(field, excluded=()):
    """Split a ";"-separated field into a sorted list of unique, non-empty labels."""
    cleaned = (label.strip() for label in field.split(";"))
    # sorted() instead of list(set(...)): set iteration order is arbitrary, which made the
    # saved dataset differ between runs.
    return sorted({label for label in cleaned if label and label not in excluded})


def process_annotations(annotations_file):
    """Parse the tab-separated annotation file into a per-article label mapping.

    A usable line has at least three tab-separated columns: the article id, the
    ";"-separated narratives and the ";"-separated subnarratives. Columns beyond
    the third are ignored. Blank lines are skipped silently; non-blank lines
    with fewer than three columns are reported with a printed message and
    skipped.

    Args:
        annotations_file: Path of the UTF-8 annotation file.

    Returns:
        A dict mapping article id to ``{"narratives": [...], "subnarratives": [...]}``.
        Both lists are de-duplicated and sorted; the subnarrative "Other" is dropped.
        If an article id occurs on several lines, the last line wins.
    """
    annotations = {}
    with open(annotations_file, "r", encoding="utf-8") as af:
        for line in af:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line is not an annotation error worth reporting.
                continue
            parts = stripped.split("\t")
            if len(parts) < 3:
                print(f"Skipped invalid line: {stripped}")
                continue
            # Slice before unpacking: a line with extra columns used to pass the length
            # check above and then crash with ValueError on the 3-name unpack.
            article_id, narratives, subnarratives = parts[:3]

            annotations[article_id] = {
                "narratives": _split_labels(narratives),
                "subnarratives": _split_labels(subnarratives, excluded=("Other",)),
            }
    return annotations

def load_raw_data(raw_folder):
    """Read every ``.txt`` file directly inside ``raw_folder``.

    Args:
        raw_folder: Directory containing the raw article files.

    Returns:
        A dict mapping file name (including the ``.txt`` suffix) to the file's
        full text, decoded as UTF-8. Keys are inserted in sorted file-name order.
    """
    raw_data = {}
    # os.listdir returns entries in arbitrary order; sorting makes the dict's insertion
    # order, and therefore the order of any dataset built from it, reproducible.
    for filename in sorted(os.listdir(raw_folder)):
        if filename.endswith(".txt"):
            with open(os.path.join(raw_folder, filename), "r", encoding="utf-8") as f:
                raw_data[filename] = f.read()
    return raw_data

def create_dataset(raw_folder, annotations):
    """Join raw article texts with their annotations.

    Args:
        raw_folder: Directory passed on to ``load_raw_data``.
        annotations: Mapping from article id to a dict that may hold
            ``"narratives"`` and ``"subnarratives"`` lists.

    Returns:
        A list with one record per loaded article, each a dict with the keys
        ``article_id``, ``content``, ``narratives`` and ``subnarratives``.
        Articles without an annotation entry get empty label lists.
    """
    documents = load_raw_data(raw_folder)
    return [
        {
            "article_id": doc_id,
            "content": text,
            "narratives": annotations.get(doc_id, {}).get("narratives", []),
            "subnarratives": annotations.get(doc_id, {}).get("subnarratives", []),
        }
        for doc_id, text in documents.items()
    ]

def save_all_labels_to_json(narratives, subnarratives, output_file):
    """Write the combined label inventory to a JSON file.

    The file holds ``{"labels": [...]}`` where each entry records the label
    text, its type ("N" for narratives, "S" for subnarratives) and its index
    within its own list. Narratives come first, followed by subnarratives.

    Args:
        narratives: Iterable of narrative label strings.
        subnarratives: Iterable of subnarrative label strings.
        output_file: Destination path; written as UTF-8 with non-ASCII
            characters kept unescaped and a 4-space indent.
    """
    tagged_groups = (("N", narratives), ("S", subnarratives))
    entries = [
        {"label": name, "type": tag, "idx": position}
        for tag, group in tagged_groups
        for position, name in enumerate(group)
    ]
    payload = {"labels": entries}
    with open(output_file, "w", encoding="utf-8") as out:
        json.dump(payload, out, ensure_ascii=False, indent=4)

def save_dataset_to_json(dataset, output_file):
    """Write ``dataset`` to ``output_file`` as indented UTF-8 JSON.

    Args:
        dataset: JSON-serializable object (here, the list of article records).
        output_file: Destination path; non-ASCII characters are kept unescaped
            and a 4-space indent is used.

    Raises:
        TypeError: If ``dataset`` contains a value JSON cannot encode. The
            output file is not opened in that case, so existing content is
            left untouched.
    """
    # Serialize before opening the file: opening with "w" truncates immediately, so
    # dumping straight into the handle would leave an empty or half-written file behind
    # whenever serialization fails part-way.
    serialized = json.dumps(dataset, ensure_ascii=False, indent=4)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(serialized)

def main():
    """Build the label inventory and the training dataset under ``<cwd>/data``.

    Reads the narrative/subnarrative label files, the annotation file and the
    raw documents from the ``data`` directory of the current working
    directory, then writes ``all_labels.json`` and ``training_dataset.json``
    back into that same directory, printing progress along the way.
    """
    data_dir = os.path.join(os.getcwd(), "data")

    def in_data(name):
        # Resolve a file or folder name inside the data directory.
        return os.path.join(data_dir, name)

    # Inputs
    raw_folder = in_data("raw-documents")
    annotations_file = in_data("subtask-2-annotations.txt")
    narratives_file = in_data("subtask2_narratives.txt")
    subnarratives_file = in_data("subtask2_subnarratives.txt")
    # Outputs
    all_labels_file = in_data("all_labels.json")
    output_dataset_file = in_data("training_dataset.json")

    print("Loading labels...")
    label_lists = create_all_labels_from_files(narratives_file, subnarratives_file)
    narratives, subnarratives = label_lists
    save_all_labels_to_json(narratives, subnarratives, all_labels_file)
    print(f"Saved all labels to {all_labels_file}")

    print("Processing annotations...")
    annotations = process_annotations(annotations_file)

    print("Creating dataset...")
    save_dataset_to_json(create_dataset(raw_folder, annotations), output_dataset_file)
    print(f"Saved dataset to {output_dataset_file}")

# Run the preprocessing pipeline when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()


Loading labels...
Saved all labels to /content/data/all_labels.json
Processing annotations...
Creating dataset...
Saved dataset to /content/data/training_dataset.json


In [4]:
import json
import random
import pandas as pd
from transformers import pipeline
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm

# training_dataset.json is written as UTF-8 with ensure_ascii=False, so it must be read back
# with the same explicit encoding; without it open() falls back to the platform default,
# which can fail or garble non-ASCII article text.
with open("data/training_dataset.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Keep only the articles that carry at least one narrative label.
dataset = [x for x in dataset if x["narratives"]]

# building prompt for the LLM
def build_prompt(k_examples, test_text):
    """Assemble a few-shot prompt for narrative labelling.

    The prompt starts with a fixed instruction, followed by one
    ``Text: ... / Labels: ...`` pair per example, and ends with the test text
    and an open ``Labels:`` line for the model to complete.

    Args:
        k_examples: Iterable of dicts with a ``content`` string and a
            ``narratives`` list; an empty list is rendered as "Other".
        test_text: Text of the article to classify.

    Returns:
        The complete prompt string.
    """
    segments = [
        "You are a media analyst. Based on the news text, identify all applicable narratives.\n\n"
    ]
    for shot in k_examples:
        body = shot['content'].strip()
        tags = shot['narratives']
        rendered = ", ".join(tags) if tags else "Other"
        segments.append(f"Text: {body}\nLabels: {rendered}\n\n")
    segments.append(f"Text: {test_text.strip()}\nLabels:")
    return "".join(segments)


In [None]:

# Load the text-generation pipeline once; each call generates at most 100 new tokens.
classifier = pipeline("text-generation", model="tiiuae/falcon-7b-instruct", max_new_tokens=100)

# 3-shot learning

# Seed Python's RNG so the sampled test articles and shots repeat across runs
# (for a dataset loaded in the same order).
random.seed(42)

k = 3
# random.sample raises ValueError when asked for more items than exist, so cap the sizes.
test_samples = random.sample(dataset, min(20, len(dataset)))
y_true = []
y_pred = []

for test in tqdm(test_samples):
    # Never offer the article under evaluation as one of its own examples.
    support_pool = [x for x in dataset if x['article_id'] != test['article_id']]
    shots = random.sample(support_pool, min(k, len(support_pool)))
    prompt = build_prompt(shots, test['content'])
    # Request the completion only. By default the pipeline returns prompt + completion, and
    # taking the text after the LAST "Labels:" picks the wrong span whenever the model
    # keeps going and writes another "Text: ... Labels: ..." pair of its own.
    output = classifier(prompt, return_full_text=False)[0]['generated_text']

    # The answer is the first line of the completion, read as comma-separated labels.
    prediction = output.strip().split("\n")[0]
    predicted_labels = [p.strip() for p in prediction.split(",") if p.strip()] or ["Other"]

    y_true.append(test['narratives'])
    y_pred.append(predicted_labels)

# Scoring vocabulary: every label that appears in either the gold or the predicted sets.
all_labels = sorted(list({l for sub in y_true + y_pred for l in sub}))
label_map = {label: i for i, label in enumerate(all_labels)}

def binarize(y, labels=None):
    """Turn lists of label names into rows of 0/1 indicators.

    Args:
        y: Iterable of label collections, one per sample.
        labels: Optional ordered list of label names defining the columns.
            When omitted, the notebook-level ``all_labels`` / ``label_map``
            are used, which is the original behavior.

    Returns:
        A list of rows; each row is a list of ints with a 1 in the column of
        every label present for that sample. Labels that are not in the
        column vocabulary are ignored.
    """
    if labels is None:
        # Fall back to the vocabulary built by the evaluation cell.
        labels = all_labels
        index_of = label_map
    else:
        index_of = {label: i for i, label in enumerate(labels)}

    bin_mat = []
    for row in y:
        row_vec = [0] * len(labels)
        for lbl in row:
            col = index_of.get(lbl)
            if col is not None:
                row_vec[col] = 1
        bin_mat.append(row_vec)
    return bin_mat

# Indicator matrices for the gold and predicted label sets (same column order).
Y_true = binarize(y_true)
Y_pred = binarize(y_pred)

# Per-label precision/recall/F1, followed by the overall accuracy score.
print("\nClassification Report:")
report_text = classification_report(
    Y_true,
    Y_pred,
    target_names=all_labels,
    zero_division=0,
)
print(report_text)
accuracy = accuracy_score(Y_true, Y_pred)
print("Accuracy:", accuracy)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]