# M+ Model - HotpotQA Multi-hop QA Evaluation

논문 코드(longbench_pred.py) 기반으로 M+ 모델 지원 추가  
L4 GPU (24GB) 메모리에 맞게 최적화

## 1. Setup & Imports

In [3]:
!pip cache purge
!pip uninstall pyarrow pandas datasets -y
!pip install pyarrow==14.0.0 pandas==2.0.3 datasets==2.18.0

Files removed: 1039 (6389.8 MB)
Found existing installation: pyarrow 14.0.0
Can't uninstall 'pyarrow'. No files were found to uninstall.
Found existing installation: pandas 2.0.3
Uninstalling pandas-2.0.3:
  Successfully uninstalled pandas-2.0.3
Found existing installation: datasets 2.18.0
Uninstalling datasets-2.18.0:
  Successfully uninstalled datasets-2.18.0
Collecting pandas==2.0.3
  Downloading pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting datasets==2.18.0
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Downloading pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m88.1 MB/s[0m  [33m0:00:00[0m
[?25hDownloading datasets-2.18.0-py3-none-any.whl (510 kB)
[0mInstalling collected packages: pandas, datasets
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [datasets]1/2[0m [datasets]

In [1]:
import os
import sys
import json
import torch
import numpy as np
import random
import re
import string
from collections import Counter
from tqdm.notebook import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer
import gc

# Put the current working directory at the front of sys.path so that the
# local modeling_mplus module can be imported.
sys.path.insert(0, os.getcwd())

  import pynvml  # type: ignore[import]


AttributeError: module 'pyarrow' has no attribute '__version__'

## 2. Configuration

In [None]:
# Run settings shared by every cell below.
CONFIG = dict(
    model_path='YuWangX/mplus-8b',
    num_samples=100,
    max_gen=32,
    split_model=True,
    seed=42,
    output_dir='./results_hotpotqa',
    chunk_size=256,  # set to 256 to save memory
    dataset='hotpotqa',
)

# Echo the settings, one "key: value" pair per line, in definition order.
print("Configuration:")
print("\n".join(f"  {k}: {v}" for k, v in CONFIG.items()))

## 3. Helper Functions

In [None]:
def seed_everything(seed):
    """Seed every random number generator used here, for reproducibility.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (via
    ``manual_seed``, ``cuda.manual_seed`` and ``cuda.manual_seed_all``), then
    disables cuDNN benchmarking and enables its deterministic flag.

    Args:
        seed: integer seed passed unchanged to every generator.
    """
    # Host-side generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators.
    for seed_fn in (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    # cuDNN flags.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def normalize_answer(s):
    """Normalize an answer string (SQuAD style).

    Steps, in order: lowercase, drop ASCII punctuation, replace the standalone
    words "a", "an" and "the" with a space, then collapse all whitespace runs
    to single spaces and trim the ends.

    Args:
        s: raw answer text.

    Returns:
        The normalized text.
    """
    lowered = s.lower()

    punctuation = set(string.punctuation)
    without_punctuation = ''.join(ch for ch in lowered if ch not in punctuation)

    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)

    return ' '.join(without_articles.split())


def f1_score(prediction, ground_truth):
    """Token-level F1 between a prediction and a single reference answer.

    Both strings are passed through ``normalize_answer`` and split on
    whitespace; the overlap is counted as a multiset intersection.

    Args:
        prediction: predicted answer text.
        ground_truth: reference answer text.

    Returns:
        0 when either side has no tokens or nothing overlaps, otherwise the
        harmonic mean of token precision and recall.
    """
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()

    if not pred_tokens or not gold_tokens:
        return 0

    # Multiset intersection: each shared token counts min(pred, gold) times.
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if overlap == 0:
        return 0

    precision = 1.0 * overlap / len(pred_tokens)
    recall = 1.0 * overlap / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Return True when both strings are equal after ``normalize_answer``.

    Args:
        prediction: predicted answer text.
        ground_truth: reference answer text.
    """
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth


def compute_metrics(predictions, ground_truths):
    """Aggregate exact-match and F1 over paired predictions and references.

    Args:
        predictions: sequence of predicted answer strings.
        ground_truths: one entry per prediction; each entry is either a single
            reference string or a list/tuple of acceptable references, in
            which case the best score across the references is used.

    Returns:
        dict with:
            'exact_match': mean exact-match as a percentage (0-100),
            'f1': mean F1 as a percentage (0-100),
            'total_samples': ``len(predictions)``.
        Both percentages are 0.0 when there is nothing to score, so the
        result never contains NaN and stays serialisable as strict JSON.
    """
    em_scores = []
    f1_scores = []

    for pred, truths in zip(predictions, ground_truths):
        # Treat a lone reference string as a one-element list of references.
        candidates = truths if isinstance(truths, (list, tuple)) else [truths]

        # `default` keeps an empty reference list from raising in max();
        # such a sample simply scores zero.
        em_scores.append(
            max((exact_match_score(pred, truth) for truth in candidates), default=False)
        )
        f1_scores.append(
            max((f1_score(pred, truth) for truth in candidates), default=0)
        )

    def _percentage(scores):
        # np.mean([]) returns NaN with a RuntimeWarning; report 0.0 instead.
        return float(np.mean(scores)) * 100 if scores else 0.0

    return {
        'exact_match': _percentage(em_scores),
        'f1': _percentage(f1_scores),
        'total_samples': len(predictions)
    }


def get_prompt_format(dataset_name):
    """Return the prompt template for ``dataset_name``.

    The template contains ``{context}`` and ``{input}`` placeholders for
    ``str.format``. The three known datasets share the same text, and any
    unknown name falls back to the "hotpotqa" entry.

    Args:
        dataset_name: dataset key, e.g. "hotpotqa", "2wikimqa" or "musique".
    """
    shared_template = "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:"
    prompts = dict.fromkeys(("hotpotqa", "2wikimqa", "musique"), shared_template)
    return prompts.get(dataset_name, prompts["hotpotqa"])

# Printed after all helper definitions in this cell have been executed.
print("Helper functions loaded!")

## 4. Set Seed

In [3]:
# Seed all RNGs with the configured seed. seed_everything and CONFIG come from
# the "Helper Functions" and "Configuration" cells, which must be run first;
# otherwise this cell raises NameError.
seed_everything(CONFIG['seed'])
print(f"Seed set to {CONFIG['seed']}")

NameError: name 'seed_everything' is not defined

## 5. Load Dataset

In [None]:
print(f"Loading {CONFIG['dataset']} dataset from LongBench...")

# Fetch the test split of the configured LongBench subset.
full_dataset = load_dataset(
    'THUDM/LongBench', CONFIG['dataset'], split='test', trust_remote_code=True
)

# Keep only the first num_samples rows; an unset/zero limit, or one that is
# not smaller than the split, keeps the whole split.
if not CONFIG['num_samples'] or CONFIG['num_samples'] >= len(full_dataset):
    dataset = full_dataset
else:
    indices = list(range(CONFIG['num_samples']))
    dataset = full_dataset.select(indices)

# Summary plus a preview of the first row (question cut to 100 characters).
print(f"Loaded {len(dataset)} samples")
print(f"\nSample data:")
print(f"  Question: {dataset[0]['input'][:100]}...")
print(f"  Answers: {dataset[0]['answers']}")

## 6. Load Model

In [None]:
from modeling_mplus import MPlus

print(f"Loading M+ model from {CONFIG['model_path']}...")
print(f"Split model: {CONFIG['split_model']}")

# Drop unreferenced Python objects and release cached GPU memory before loading.
gc.collect()
torch.cuda.empty_cache()

if not CONFIG['split_model']:
    # Load in bfloat16, then move the whole model with .cuda().
    model = MPlus.from_pretrained(
        CONFIG['model_path'], torch_dtype=torch.bfloat16
    ).cuda()
else:
    # Load in bfloat16 with device_map='auto' (placement is left to the loader).
    model = MPlus.from_pretrained(
        CONFIG['model_path'], device_map='auto', torch_dtype=torch.bfloat16
    )

model.eval()

# Report GPU memory usage in GiB.
if torch.cuda.is_available():
    allocated, reserved = (
        query() / 1024**3
        for query in (torch.cuda.memory_allocated, torch.cuda.memory_reserved)
    )
    print(f"GPU Memory: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")

## 7. Load Tokenizer

In [None]:
# The tokenizer is loaded from a fixed repository id, independent of
# CONFIG['model_path'].
tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b")

# Fall back to the EOS token for padding when no pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Tokenizer loaded: {type(tokenizer).__name__}")
print(f"Vocab size: {tokenizer.vocab_size}")

## 8. Run Evaluation

In [None]:
# Evaluation loop. For every sample: restore the model memory to its initial
# state, build and tokenize the prompt, feed all but the last chunk through
# model.inject_memory, generate an answer from the last chunk, and record it.
# Failed samples are recorded too (empty prediction plus an 'error' field), so
# predictions, ground_truths and results_detail always have one entry per sample.
prompt_format = get_prompt_format(CONFIG['dataset'])

# Resolve the device reported by the model, falling back to its first parameter.
# NOTE(review): the tensors below are still placed with .cuda() rather than on
# `device`; the variable is kept because other cells may read it — confirm.
if hasattr(model, 'device'):
    device = model.device
else:
    device = next(model.parameters()).device

# Snapshot of the initial memory, kept on CPU (following the paper's code).
backup_memory = model.memory.clone().detach().cpu()

predictions = []
ground_truths = []
results_detail = []

print(f"\nEvaluating {len(dataset)} samples...")
print(f"Chunk size: {CONFIG['chunk_size']}")

for idx, sample in enumerate(tqdm(dataset, desc="Evaluating")):
    # Defined before the try block so the failure path can still report the
    # prompt length when tokenization succeeded but a later step failed.
    prompt_ids = None

    try:
        # Restore the memory so every sample starts from the same state.
        model.memory.data = backup_memory.clone().detach().to(model.memory.device)

        # Build the prompt.
        prompt = prompt_format.format(context=sample['context'], input=sample['input'])

        # Tokenize without special tokens and without truncation.
        prompt_ids = tokenizer(prompt, add_special_tokens=False, truncation=False).input_ids

        # Cut the prompt into chunk_size pieces; whatever remains (at most
        # chunk_size tokens) is held back as the generation input.
        contexts_ids = []
        remaining_ids = prompt_ids.copy()

        while len(remaining_ids) > CONFIG['chunk_size']:
            contexts_ids.append(remaining_ids[:CONFIG['chunk_size']])
            remaining_ids = remaining_ids[CONFIG['chunk_size']:]

        # The trailing part is the generation input. The fallback only applies
        # to an empty prompt, where pop() raises and the sample is recorded as
        # failed by the handler below.
        sentence_ids = remaining_ids if remaining_ids else contexts_ids.pop()

        with torch.no_grad():
            # Inject the context chunks into the model memory, in order.
            for context_chunk in contexts_ids:
                context_tensor = torch.tensor(context_chunk).unsqueeze(0).cuda()
                # Mask length is chunk length + model.num_tokens
                # (presumably the extra positions cover memory tokens — TODO confirm).
                attention_mask = torch.ones(len(context_chunk) + model.num_tokens).long().unsqueeze(0).cuda()

                model.inject_memory(
                    context_tensor,
                    attention_mask,
                    update_memory=True
                )

            # Greedy generation from the trailing part of the prompt.
            sentence_tensor = torch.tensor(sentence_ids).unsqueeze(0).cuda()
            gen_attention_mask = torch.ones(
                len(sentence_ids) + model.num_blocks * model.num_tokens
            ).unsqueeze(0).long().cuda()

            output = model.generate(
                input_ids=sentence_tensor,
                attention_mask=gen_attention_mask,
                max_new_tokens=CONFIG['max_gen'],
                num_beams=1,
                do_sample=False,
                temperature=1.0,
                pad_token_id=tokenizer.pad_token_id
            )[0]

            # Decode only the tokens that follow the input part.
            pred = tokenizer.decode(output[len(sentence_ids):], skip_special_tokens=True)

        # Reference answers, always stored as a list.
        answers = sample['answers']
        if isinstance(answers, str):
            answers = [answers]

        # Build the record first so that a failure here cannot leave the
        # three result lists with different lengths.
        detail = {
            'idx': idx,
            'question': sample['input'],
            'prediction': pred.strip(),
            'ground_truth': answers,
            'context_length': len(prompt_ids)
        }

        predictions.append(pred.strip())
        ground_truths.append(answers)
        results_detail.append(detail)

    except Exception as e:
        # Best-effort evaluation: report the failure, score the sample as an
        # empty prediction, and move on to the next sample.
        print(f"\nError at sample {idx}: {e}")

        failed_answers = sample.get('answers', [""])
        if isinstance(failed_answers, str):
            failed_answers = [failed_answers]

        predictions.append("")
        ground_truths.append(failed_answers)
        results_detail.append({
            'idx': idx,
            'question': sample.get('input', ''),
            'prediction': "",
            'ground_truth': failed_answers,
            'context_length': len(prompt_ids) if prompt_ids is not None else None,
            'error': str(e)
        })

    finally:
        # Release cached GPU memory and collect garbage after every sample,
        # whether it succeeded or failed.
        torch.cuda.empty_cache()
        gc.collect()

print("\nEvaluation complete!")

## 9. Compute Metrics

In [None]:
# Score all predictions against their references.
metrics = compute_metrics(predictions, ground_truths)

# Summary banner: one item per line between 60-character rules.
print(
    "=" * 60,
    "RESULTS",
    "=" * 60,
    f"Dataset: {CONFIG['dataset']}",
    f"Total Samples: {metrics['total_samples']}",
    f"Exact Match: {metrics['exact_match']:.2f}%",
    f"F1 Score: {metrics['f1']:.2f}%",
    "=" * 60,
    sep="\n",
)

## 10. Save Results

In [None]:
# Bundle the run configuration, aggregate metrics and per-sample records.
results = {
    'config': CONFIG,
    'metrics': metrics,
    'predictions': results_detail
}

# The file name encodes dataset, configured sample count and seed.
output_file = os.path.join(
    CONFIG['output_dir'],
    f"mplus_{CONFIG['dataset']}_n{CONFIG['num_samples']}_seed{CONFIG['seed']}.json"
)

# Create the output directory if needed, then write UTF-8 JSON without
# escaping non-ASCII characters.
os.makedirs(CONFIG['output_dir'], exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"Results saved to: {output_file}")

## 11. Sample Predictions

In [None]:
print("=" * 60, "SAMPLE PREDICTIONS (first 10)", "=" * 60, sep="\n")

# Show the first ten per-sample records with their individual scores.
for i, detail in enumerate(results_detail[:10]):
    print(
        f"\n[{i+1}] Question: {detail['question'][:100]}...",
        f"    Prediction: {detail['prediction']}",
        f"    Ground Truth: {detail['ground_truth']}",
        sep="\n",
    )

    # Best EM / F1 across all references of this sample.
    em = max(exact_match_score(detail['prediction'], gt) for gt in detail['ground_truth'])
    f1 = max(f1_score(detail['prediction'], gt) for gt in detail['ground_truth'])
    print(f"    EM: {em}, F1: {f1:.3f}")