In [1]:
# Cell 1: Imports
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [2]:
# Hugging Face login
from huggingface_hub import login
login("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")

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_name = "meta-llama/Llama-2-7b-hf"
peft_model_name = "FinGPT/fingpt-mt_llama2-7b_lora"

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # <--- ADD THIS LINE

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(base_model, peft_model_name)

2025-05-24 21:03:07.449205: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748120587.661680      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748120587.719315      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/12.6M [00:00<?, ?B/s]

In [4]:
# Cell 3: Load the ChanceFocus/flare-ectsum dataset (limit to 20)
dataset = load_dataset("ChanceFocus/flare-ectsum", split="test")

print(dataset.features)  # Shows available columns

texts = dataset["text"][:495]      # Take only the first 20 documents
references = dataset["answer"][:495]   # Take only the first 20 summaries

README.md:   0%|          | 0.00/488 [00:00<?, ?B/s]

(…)-00000-of-00001-8ef60b4155c29bac.parquet:   0%|          | 0.00/3.36M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/495 [00:00<?, ? examples/s]

{'id': Value(dtype='string', id=None), 'query': Value(dtype='string', id=None), 'answer': Value(dtype='string', id=None), 'label': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'text': Value(dtype='string', id=None)}


In [5]:
# Cell 4: Generate Summaries with prompt engineering

def generate_summary(text, max_new_tokens=64):
    # Add your prompt here
    prompt = f"Summarize the following text in 1-2 sentences:\n\n{text}\n\nSummary:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    summary_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, num_beams=4)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
    
generated_summaries = []
for text in tqdm(texts, desc="Generating summaries"):
    generated_summaries.append(generate_summary(text))

Generating summaries: 100%|██████████| 495/495 [24:19<00:00,  2.95s/it]


In [7]:
!pip install rouge_score --quiet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [8]:
# Install the evaluate library if not present
!pip install evaluate --quiet

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
import evaluate

# Tokenize for F1 (token-based)
def tokenize(s):
    return s.lower().split()

true_token_lists = [tokenize(s) for s in references]
pred_token_lists = [tokenize(s) for s in generated_summaries]

mlb = MultiLabelBinarizer()
mlb.fit(true_token_lists + pred_token_lists)

y_true_bin = mlb.transform(true_token_lists)
y_pred_bin = mlb.transform(pred_token_lists)

micro = f1_score(y_true_bin, y_pred_bin, average='micro', zero_division=0) * 100
macro = f1_score(y_true_bin, y_pred_bin, average='macro', zero_division=0) * 100
weighted = f1_score(y_true_bin, y_pred_bin, average='weighted', zero_division=0) * 100

print("Evaluation (in pts / ECTs):")
print(f"- Micro F1:    {micro:.2f} pts")
print(f"- Macro F1:    {macro:.2f} pts")
print(f"- Weighted F1: {weighted:.2f} pts")

# Compute ROUGE
rouge = evaluate.load("rouge")
rouge_scores = rouge.compute(predictions=generated_summaries, references=references)

print("\nROUGE (bullet-point results):")
print(f"- ROUGE-1: {rouge_scores['rouge1']*100:.2f} pts")
print(f"- ROUGE-2: {rouge_scores['rouge2']*100:.2f} pts")
print(f"- ROUGE-L: {rouge_scores['rougeL']*100:.2f} pts")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.9.0.13 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cudnn-cu12 9.3.0.75 which is incompatible.
torch 2.6.0+cu124 requires nvidi

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]


ROUGE (bullet-point results):
- ROUGE-1: 1.86 pts
- ROUGE-2: 0.02 pts
- ROUGE-L: 1.83 pts
