# 05: Evaluate Local LLMs for Systematic Review Screening

## Summary
This notebook evaluates **local LLMs only** on the systematic review screening task. No proprietary data is sent to external APIs - all inference runs locally using Ollama.

**Models Evaluated:**
- Llama 3.2 (Meta)
- Mistral (Mistral AI)
- Other local models available via Ollama

**Pipeline Position:** Sixth notebook - evaluates model performance on ground truth data.

**What this notebook does:**
1. Loads ground truth validation set
2. Constructs prompts for screening decision
3. Runs inference using local Ollama models
4. Evaluates predictions against ground truth
5. Computes metrics: accuracy, precision, recall, F1

**Input:** `Data/ground_truth_validation_set.csv`

**Output:** `Data/results/eval_*.csv` (per-run predictions) and `Data/results/model_comparison.csv` (model comparison metrics)

**Requirements:**
- Ollama installed and running locally
- Models pulled: `ollama pull llama3.2`, `ollama pull mistral`

**IMPORTANT:** No external API calls are made. All processing is local to protect proprietary Cochrane content.

In [None]:
# Install required packages for local LLM inference
%pip install -q ollama pandas scikit-learn tqdm

In [None]:
# Set up paths and load ground truth data
import os
from pathlib import Path
import pandas as pd
import ollama
from tqdm.notebook import tqdm
from datetime import datetime

# Resolve the project root: use the current working directory when it
# contains a "Data" folder, otherwise fall back to its parent directory.
notebook_dir = Path.cwd()
project_root = notebook_dir if (notebook_dir / "Data").exists() else notebook_dir.parent
DATA_DIR = project_root / "Data"
RESULTS_DIR = DATA_DIR / "results"
GROUND_TRUTH_CSV = DATA_DIR / "ground_truth_validation_set.csv"

# Fail fast with an actionable message *before* creating any output folder,
# so a wrong working directory does not leave a stray results tree behind.
if not GROUND_TRUTH_CSV.is_file():
    raise FileNotFoundError(
        f"Ground truth file not found: {GROUND_TRUTH_CSV} "
        f"(resolved project root: {project_root})"
    )

RESULTS_DIR.mkdir(parents=True, exist_ok=True)

ground_truth = pd.read_csv(GROUND_TRUTH_CSV)

# Columns this notebook reads: 'category' just below, and 'abstract',
# 'study_pmid', 'label' during evaluation. Checking them here reports a
# schema problem up front instead of as a bare KeyError later on.
_required_columns = ['abstract', 'study_pmid', 'label', 'category']
_missing_columns = [col for col in _required_columns if col not in ground_truth.columns]
if _missing_columns:
    raise KeyError(f"Ground truth CSV is missing required columns: {_missing_columns}")

print(f"Loaded {len(ground_truth):,} examples")
print(ground_truth['category'].value_counts())

In [None]:
# Check available Ollama models
try:
    models = ollama.list()
    print("Available local models:")
    for model in models.get('models', []):
        # The model tag is exposed under 'model' by newer ollama-python
        # releases and under 'name' by older ones; accept either so a client
        # upgrade does not surface as a misleading connection error below.
        model_tag = model.get('model') or model.get('name') or '<unknown>'
        print(f"  - {model_tag}")
except Exception as e:
    # Best-effort diagnostic only: the notebook continues so the user can
    # start Ollama and re-run this cell.
    print(f"Error connecting to Ollama: {e}")
    print("Make sure Ollama is running: ollama serve")

In [None]:
# Define prompt templates for screening task

# Zero-shot template: asks for a single-word INCLUDED / EXCLUDED answer.
ZERO_SHOT_PROMPT = """You are a systematic review screening assistant. Your task is to determine whether a study should be INCLUDED or EXCLUDED from a systematic review based on its abstract.

Study Abstract:
{abstract}

Based on the abstract above, should this study be INCLUDED or EXCLUDED from a systematic review?
Respond with only one word: INCLUDED or EXCLUDED"""

# Chain-of-thought template: asks the model to reason through four questions
# before giving its final INCLUDED / EXCLUDED decision.
COT_PROMPT = """You are a systematic review screening assistant. Your task is to determine whether a study should be INCLUDED or EXCLUDED from a systematic review based on its abstract.

Study Abstract:
{abstract}

Think step by step:
1. What is the study design?
2. What is the intervention or topic?
3. What outcomes are measured?
4. Is this likely to be relevant for inclusion?

Based on your analysis, respond with your final decision: INCLUDED or EXCLUDED"""

def create_prompt(abstract: str, use_cot: bool = False, max_chars: int = 3000) -> str:
    """Build the screening prompt for a single abstract.

    Args:
        abstract: Abstract text to embed in the prompt.
        use_cot: When True use the chain-of-thought template, otherwise the
            zero-shot template.
        max_chars: Maximum number of characters of ``abstract`` to keep;
            longer abstracts are truncated to this length.

    Returns:
        The selected template with the (possibly truncated) abstract filled in.

    Raises:
        TypeError: If ``abstract`` is not a string (for example None, or the
            NaN float pandas uses for a missing CSV value).
    """
    if not isinstance(abstract, str):
        # Slicing a non-string would fail with an opaque "not subscriptable"
        # TypeError; raise the same exception type with a usable message.
        raise TypeError(
            f"abstract must be a string, got {type(abstract).__name__}: {abstract!r}"
        )
    template = COT_PROMPT if use_cot else ZERO_SHOT_PROMPT
    return template.format(abstract=abstract[:max_chars])  # Truncate long abstracts

In [None]:
# Define function to run local LLM inference
import re

def extract_decision(response: str) -> int:
    """Extract INCLUDED (1) or EXCLUDED (0) from LLM response.

    Matching is a case-insensitive substring search for "included" and
    "excluded". When both words occur (typical for chain-of-thought answers
    that discuss both options), the one mentioned last is taken as the final
    decision.

    Args:
        response: Raw text returned by the model.

    Returns:
        1 for INCLUDED, 0 for EXCLUDED, -1 when the response is empty or
        contains neither word.
    """
    if not response:
        return -1

    response_lower = response.lower()
    included_pos = response_lower.rfind('included')
    excluded_pos = response_lower.rfind('excluded')

    # The two words cannot start at the same index, so equal positions can
    # only mean that both searches returned -1 (neither word present).
    if included_pos == excluded_pos:
        return -1  # Could not determine

    # A missing word has position -1, so the comparison also covers the case
    # where only one of the two words appears.
    return 1 if included_pos > excluded_pos else 0

def run_evaluation(model_name: str, data: pd.DataFrame, use_cot: bool = False) -> pd.DataFrame:
    """Run evaluation on a dataset using a local Ollama model.

    Args:
        model_name: Name of the Ollama model passed to ``ollama.generate``.
        data: Rows to evaluate; the 'abstract', 'study_pmid' and 'label'
            columns are read from each row.
        use_cot: Forwarded to ``create_prompt`` to select the prompt template.

    Returns:
        One row per input row with columns 'study_pmid', 'label',
        'prediction' (-1 when the row failed or no decision could be
        extracted), 'response' (first 500 characters of the model output or
        of the error message) and 'correct'.
    """
    result_columns = ['study_pmid', 'label', 'prediction', 'response', 'correct']
    results = []

    for _, row in tqdm(data.iterrows(), total=len(data), desc=f"{model_name}"):
        try:
            # Prompt construction sits inside the try so that a bad abstract
            # (e.g. a missing value) is recorded as a failed row instead of
            # aborting the whole run and discarding earlier results.
            prompt = create_prompt(row['abstract'], use_cot=use_cot)
            response = ollama.generate(model=model_name, prompt=prompt)
            # Fall back to '' so the slice below never sees None.
            response_text = response.get('response', '') or ''
            prediction = extract_decision(response_text)
        except Exception as e:
            response_text = f"ERROR: {e}"
            prediction = -1

        results.append({
            'study_pmid': row['study_pmid'],
            'label': row['label'],
            'prediction': prediction,
            'response': response_text[:500],  # Truncate for storage
            'correct': prediction == row['label']
        })

    # Explicit columns keep the schema stable even when `data` is empty, so
    # callers can still filter on 'prediction'.
    return pd.DataFrame(results, columns=result_columns)

In [None]:
# Run evaluation on local models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Models to evaluate (adjust based on what you have installed).
# Each entry is (ollama model name, use chain-of-thought prompt?).
MODELS = [
    ('llama3.2', False),  # Zero-shot
    ('llama3.2', True),   # Chain-of-thought
    ('mistral', False),   # Zero-shot
    ('mistral', True),    # Chain-of-thought
]

# Sample for faster testing (set to None for full evaluation)
SAMPLE_SIZE = 100
if SAMPLE_SIZE:
    # DataFrame.sample raises ValueError when n exceeds the number of rows
    # (sampling without replacement), so clamp to the dataset size.
    eval_data = ground_truth.sample(n=min(SAMPLE_SIZE, len(ground_truth)), random_state=42)
else:
    eval_data = ground_truth

# Maps run name ("<model>_<prompt type>") to its per-example results frame.
all_results = {}

for model_name, use_cot in MODELS:
    prompt_type = 'cot' if use_cot else 'zero_shot'
    run_name = f"{model_name}_{prompt_type}"

    print(f"\nEvaluating: {run_name}")

    try:
        results = run_evaluation(model_name, eval_data, use_cot=use_cot)
        all_results[run_name] = results

        # Save results. Model tags may contain ':' or '/', which are not
        # safe in file names on every platform, so normalise them as well.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_run_name = run_name.replace('.', '_').replace(':', '_').replace('/', '_')
        output_file = RESULTS_DIR / f"eval_{safe_run_name}_{timestamp}.csv"
        results.to_csv(output_file, index=False)
        print(f"  Saved to {output_file.name}")

        # Compute metrics on rows with a usable prediction only
        # (-1 marks an inference error or an unparseable response).
        valid = results[results['prediction'] != -1]
        if len(valid) > 0:
            acc = accuracy_score(valid['label'], valid['prediction'])
            prec = precision_score(valid['label'], valid['prediction'], zero_division=0)
            rec = recall_score(valid['label'], valid['prediction'], zero_division=0)
            f1 = f1_score(valid['label'], valid['prediction'], zero_division=0)
            print(f"  Accuracy: {acc:.3f} | Precision: {prec:.3f} | Recall: {rec:.3f} | F1: {f1:.3f}")
    except Exception as e:
        # Best-effort: report the failure and continue with the next model.
        print(f"  Error: {e}")

In [None]:
# Create model comparison summary
comparison_columns = ['model', 'n_evaluated', 'n_errors', 'accuracy', 'precision', 'recall', 'f1']
comparison_rows = []

for run_name, results in all_results.items():
    # Rows with prediction == -1 are inference errors or unparseable
    # responses; they are counted in n_errors and excluded from the metrics.
    valid = results[results['prediction'] != -1]
    if len(valid) == 0:
        continue

    comparison_rows.append({
        'model': run_name,
        'n_evaluated': len(valid),
        'n_errors': len(results) - len(valid),
        'accuracy': accuracy_score(valid['label'], valid['prediction']),
        'precision': precision_score(valid['label'], valid['prediction'], zero_division=0),
        'recall': recall_score(valid['label'], valid['prediction'], zero_division=0),
        'f1': f1_score(valid['label'], valid['prediction'], zero_division=0)
    })

# Explicit columns keep the 'f1' column present even when no run produced a
# valid prediction; otherwise sort_values('f1') raises KeyError.
comparison = pd.DataFrame(comparison_rows, columns=comparison_columns)
comparison = comparison.sort_values('f1', ascending=False)

print("\n" + "="*80)
print("MODEL COMPARISON (sorted by F1 score)")
print("="*80)

if comparison.empty:
    # Skip the save so an earlier, meaningful comparison file is not
    # overwritten with a header-only CSV.
    print("No runs produced valid predictions - nothing to compare.")
else:
    print(comparison.to_string(index=False))

    # Save comparison
    comparison.to_csv(RESULTS_DIR / "model_comparison.csv", index=False)

In [None]:
# Display confusion matrices for up to the first four evaluated runs
import matplotlib.pyplot as plt
import seaborn as sns

if all_results:
    # Plot at most four runs, in the order they were evaluated.
    shown_runs = list(all_results.items())[:4]
    n_plots = len(shown_runs)

    # squeeze=False always returns a 2-D array of axes, so a single run
    # needs no special-casing.
    fig, axes = plt.subplots(1, n_plots, figsize=(4 * n_plots, 4), squeeze=False)

    for ax, (run_name, results) in zip(axes.ravel(), shown_runs):
        valid = results[results['prediction'] != -1]
        if len(valid) > 0:
            # Fix the label order to [0, 1] so the matrix is always 2x2,
            # even when only one class appears among the valid rows.
            cm = confusion_matrix(valid['label'], valid['prediction'], labels=[0, 1])
            sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap='Blues')
            ax.set_title(run_name)
            ax.set_ylabel('True')
            ax.set_xlabel('Predicted')
        else:
            # Label the panel rather than leaving an unexplained blank axes.
            ax.set_title(f"{run_name} (no valid predictions)")
            ax.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# Summary: final recap of how many runs completed and where the outputs live
banner = "=" * 60
summary_lines = [
    f"\n{banner}",
    "EVALUATION COMPLETE",
    banner,
    f"Models evaluated: {len(all_results)}",
    f"Examples per model: {len(eval_data)}",
    f"Results saved to: {RESULTS_DIR}",
    "\nNote: All inference was performed locally - no data sent to external APIs.",
]
# One print of the joined lines emits the same text as one print per line.
print("\n".join(summary_lines))