In [2]:
!pip install numpy torch pandas scikit-learn transformers tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [3]:
import os
import json

def create_all_labels_from_files(narratives_file, subnarratives_file):
    narratives = []
    subnarratives = []

    with open(narratives_file, "r", encoding="utf-8") as nf:
        narratives = [line.strip() for line in nf if line.strip()]

    with open(subnarratives_file, "r", encoding="utf-8") as sf:
        subnarratives = [line.strip() for line in sf if line.strip()]

    return sorted(narratives), sorted(subnarratives)

def process_annotations(annotations_file):
    annotations = {}
    with open(annotations_file, "r", encoding="utf-8") as af:
        for line in af:
            parts = line.strip().split("\t")
            if len(parts) < 3:
                print(f"Skipped invalid line: {line.strip()}")
                continue
            article_id, narratives, subnarratives = parts

            narratives_list = list(set(
                narrative.strip() for narrative in narratives.split(";") if narrative.strip()
            ))
            subnarratives_list = list(set(
                subnarrative.strip() for subnarrative in subnarratives.split(";")
                if subnarrative.strip() and subnarrative.strip() != "Other"
            ))

            annotations[article_id] = {
                "narratives": narratives_list,
                "subnarratives": subnarratives_list
            }
    return annotations

def load_raw_data(raw_folder):
    raw_data = {}
    for filename in os.listdir(raw_folder):
        if filename.endswith(".txt"):
            with open(os.path.join(raw_folder, filename), "r", encoding="utf-8") as f:
                raw_data[filename] = f.read()
    return raw_data

def create_dataset(raw_folder, annotations):
    raw_data = load_raw_data(raw_folder)
    dataset = []
    for article_id, content in raw_data.items():
        labels = annotations.get(article_id, {})
        dataset.append({
            "article_id": article_id,
            "content": content,
            "narratives": labels.get("narratives", []),
            "subnarratives": labels.get("subnarratives", [])
        })
    return dataset

def save_all_labels_to_json(narratives, subnarratives, output_file):
    all_labels = []
    for idx, narrative in enumerate(narratives):
        all_labels.append({"label": narrative, "type": "N", "idx": idx})
    for idx, subnarrative in enumerate(subnarratives):
        all_labels.append({"label": subnarrative, "type": "S", "idx": idx})
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump({"labels": all_labels}, f, ensure_ascii=False, indent=4)

def save_dataset_to_json(dataset, output_file):
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)

def main():
    current_dir = os.getcwd()


    raw_folder = os.path.join(current_dir, "data", "raw-documents")
    annotations_file = os.path.join(current_dir, "data", "subtask-2-annotations.txt")
    narratives_file = os.path.join(current_dir, "data", "subtask2_narratives.txt")
    subnarratives_file = os.path.join(current_dir, "data", "subtask2_subnarratives.txt")
    all_labels_file = os.path.join(current_dir, "data", "all_labels.json")
    output_dataset_file = os.path.join(current_dir, "data", "training_dataset.json")

    print("Loading labels...")
    narratives, subnarratives = create_all_labels_from_files(narratives_file, subnarratives_file)
    save_all_labels_to_json(narratives, subnarratives, all_labels_file)
    print(f"Saved all labels to {all_labels_file}")

    print("Processing annotations...")
    annotations = process_annotations(annotations_file)

    print("Creating dataset...")
    dataset = create_dataset(raw_folder, annotations)
    save_dataset_to_json(dataset, output_dataset_file)
    print(f"Saved dataset to {output_dataset_file}")

if __name__ == "__main__":
    main()


Loading labels...
Saved all labels to /content/data/all_labels.json
Processing annotations...
Creating dataset...
Saved dataset to /content/data/training_dataset.json


In [7]:
import json
import random
import pandas as pd
from transformers import pipeline
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm

with open("data/training_dataset.json", "r") as f:
    dataset = json.load(f)


dataset = [x for x in dataset if x["narratives"]]


all_labels = sorted(list({label for x in dataset for label in x["narratives"]}))
label_map = {label: i for i, label in enumerate(all_labels)}



def build_prompt(k_examples, test_text):
    prompt = (
        "You are a narrative classification assistant. "
        "Given a news article, your task is to assign one or more of the following narrative categories:\n"
        f"{', '.join(all_labels)}.\n"
        "If none of them apply, return: Other.\n"
        "Respond with a comma-separated list of labels.\n\n"
    )
    for example in k_examples:
        prompt += f"Text: {example['content'].strip()}\n"
        labels = ", ".join(example['narratives']) if example['narratives'] else "Other"
        prompt += f"Labels: {labels}\n\n"
    prompt += f"Text: {test_text.strip()}\nLabels:"
    return prompt

In [5]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
The token `panosk007` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-au

In [8]:

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

llm = pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
    max_new_tokens=100
)

#3-shot learning

k = 3
test_samples = random.sample(dataset, 100)
y_true = []
y_pred = []

for test in tqdm(test_samples):
    support_pool = [x for x in dataset if x['article_id'] != test['article_id']]
    shots = random.sample(support_pool, k=min(k, len(support_pool)))
    prompt = build_prompt(shots, test['content'])
    output = llm(prompt)[0]['generated_text']

    prediction_line = output.split("Labels:")[-1].strip().split("\n")[0]
    predicted_labels = [p.strip() for p in prediction_line.split(",") if p.strip() in all_labels]
    predicted_labels = predicted_labels if predicted_labels else ["Other"]

    y_true.append(test['narratives'])
    y_pred.append(predicted_labels)


def binarize(y):
    bin_mat = []
    for row in y:
        row_vec = [0] * len(all_labels)
        for lbl in row:
            if lbl in label_map:
                row_vec[label_map[lbl]] = 1
        bin_mat.append(row_vec)
    return bin_mat

Y_true = binarize(y_true)
Y_pred = binarize(y_pred)


print("\nClassification Report:")
print(classification_report(Y_true, Y_pred, target_names=all_labels, zero_division=0))
print("Accuracy:", accuracy_score(Y_true, Y_pred))


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Device set to use cuda:0
  0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 1/100 [00:04<07:52,  4.77s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 2/100 [00:09<07:20,  4.49s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 3/100 [00:13<07:07,  4.40s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▍         | 4/100 [00:17<06:58,  4.36s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  5%|▌         | 5/100 [00:21<06:50,  4.32s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  6%|▌         | 6/100 [00:26<06:47,  4.34s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  7%|▋         | 7/100 [00:30<06:40,  4.31s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  8%|▊         | 8/100 


Classification Report:
                                                        precision    recall  f1-score   support

                          CC: Amplifying Climate Fears       0.00      0.00      0.00         0
                      CC: Climate change is beneficial       0.00      0.00      0.00         1
              CC: Controversy about green technologies       0.00      0.00      0.00         4
                     CC: Criticism of climate movement       0.36      0.33      0.35        12
                     CC: Criticism of climate policies       0.17      0.22      0.19         9
         CC: Criticism of institutions and authorities       0.29      0.12      0.17        17
                        CC: Downplaying climate change       0.44      0.44      0.44         9
       CC: Green policies are geopolitical instruments       0.00      0.00      0.00         1
 CC: Hidden plots by secret schemes of powerful groups       0.67      0.40      0.50        10
          CC: Q




In [1]:
!jupyter nbconvert --ClearMetadataPreprocessor.enabled=True \
  --ClearOutputPreprocessor.enabled=True \
  --to notebook --inplace fewshot_narrative_classification.ipynb


This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr