# Libraries

In [3]:
from dataclasses import dataclass
import time
import numpy as np
from transformers import pipeline
from src.engine.engine import Engine, EngineConfig
from src.ml.inference.master import MasterConfig, GenerationConfig
from pathlib import Path
from typing import List, Dict, Any, Tuple, Set, Callable
import matplotlib.pyplot as plt

# Key metrics

In [2]:
@dataclass
class EvaluationMetrics:
    """One record holding the six metrics this notebook reports.

    The field names are the same as the metric keys produced by
    ``AugmentedLMComparator`` and listed in the report cell. The visible
    notebook code works with plain dicts and does not instantiate this class.
    """

    coherence: float            # key "coherence" in the per-response metrics
    context_relevance: float    # key "context_relevance"
    entity_consistency: float   # key "entity_consistency"
    dnd_style: float            # key "dnd_style"
    memory_utilization: float   # key "memory_utilization" (augmented runs only)
    response_time: float        # key "response_time", elapsed seconds

# A class for comparative testing

In [4]:
class AugmentedLMComparator:
    """Run the same prompts through a plain LM and the augmented engine.

    For every test case the base text-generation pipeline and the augmented
    ``Engine`` each produce a reply; both replies are scored with the same
    automatic metrics and timed, so the two systems can be compared.
    """

    # Hugging Face model ids of the scorers used by the metric helpers.
    ZERO_SHOT_MODEL = "facebook/bart-large-mnli"
    CROSS_ENCODER_MODEL = "cross-encoder/stsb-roberta-large"

    def __init__(self, base_model_path: str, augmented_config: "EngineConfig"):
        """Load the base generation pipeline and build the augmented engine.

        Args:
            base_model_path: Model id or path handed to the
                ``text-generation`` pipeline.
            augmented_config: Configuration for the augmented ``Engine``; its
                ``master_config.preambular`` text (stripped) is also prepended
                to every base-model prompt.
        """
        self.base_model = pipeline(
            "text-generation",
            model=base_model_path,
            device_map="auto",
        )
        self.augmented_engine = Engine(augmented_config)
        self.preambular = augmented_config.master_config.preambular.strip()
        # Scoring pipelines are loaded lazily and reused across all calls;
        # keyed by (task, model id).
        self._pipeline_cache: Dict[Tuple[str, str], Any] = {}

    def compare_responses(self, test_cases: List[Dict[str, str]]) -> Dict[str, List[Dict[str, float]]]:
        """Generate and score a reply from both systems for each test case.

        Args:
            test_cases: Dicts with a required ``"prompt"`` and an optional
                ``"context"`` (defaults to ``""``).

        Returns:
            ``{"base": [...], "augmented": [...]}`` with one metrics dict per
            test case. Both carry the four quality metrics plus
            ``"response_time"``; augmented entries also carry
            ``"memory_utilization"``.
        """
        results = {"base": [], "augmented": []}

        for case in test_cases:
            prompt = case["prompt"]
            context = case.get("context", "")

            # The base model gets the preamble and the context inlined into
            # its prompt; the augmented engine only receives the statement.
            full_base_prompt = f"{self.preambular}\n\nContext: {context}\n\nPlayer: {prompt}\nDM:"
            base_time, base_response = self._time_execution(
                lambda: self.base_model(full_base_prompt, max_new_tokens=128, temperature=0.7)[0]['generated_text']
            )
            # Drop the echoed prompt so that only the continuation is scored.
            base_response = base_response.replace(full_base_prompt, "")

            augmented_time, augmented_response = self._time_execution(
                lambda: self.augmented_engine.dialog(statement=prompt))
            augmented_text = augmented_response.text

            base_metrics = self._evaluate_response(prompt, context, base_response)
            base_metrics["response_time"] = base_time

            augmented_metrics = self._evaluate_response(prompt, context, augmented_text)
            augmented_metrics["response_time"] = augmented_time
            augmented_metrics["memory_utilization"] = self._calc_memory_utilization()

            results["base"].append(base_metrics)
            results["augmented"].append(augmented_metrics)

        return results

    def _time_execution(self, func: Callable) -> Tuple[float, Any]:
        """Call ``func()`` and return ``(elapsed_seconds, result)``."""
        # perf_counter is monotonic, so system clock adjustments cannot skew
        # the measured duration.
        start_time = time.perf_counter()
        result = func()
        return time.perf_counter() - start_time, result

    def _get_pipeline(self, task: str, model: str) -> Any:
        """Return the scoring pipeline for ``(task, model)``, loading it once."""
        key = (task, model)
        if key not in self._pipeline_cache:
            self._pipeline_cache[key] = pipeline(task, model=model)
        return self._pipeline_cache[key]

    @staticmethod
    def _label_score(result: Dict[str, Any], label: str) -> float:
        """Pick the score of ``label`` from a zero-shot classification result.

        The pipeline returns labels ordered by score, so the position of a
        given label is not fixed and has to be looked up by name.
        """
        return dict(zip(result["labels"], result["scores"]))[label]

    def _evaluate_response(self, prompt: str, context: str, response: str) -> Dict[str, float]:
        """Compute the four quality metrics for a single response."""
        return {
            "coherence": self._calc_coherence(prompt, response),
            "context_relevance": self._calc_context_relevance(context, response),
            "entity_consistency": self._calc_entity_consistency(context, response),
            "dnd_style": self._calc_dnd_style(response)
        }

    def _calc_coherence(self, prompt: str, response: str) -> float:
        """Return the zero-shot score of the ``"coherent"`` label.

        NOTE(review): ``prompt`` is accepted but not shown to the classifier;
        only ``response`` is scored against the hypothesis template.
        """
        classifier = self._get_pipeline("zero-shot-classification", self.ZERO_SHOT_MODEL)
        result = classifier(
            response,
            candidate_labels=["coherent", "incoherent"],
            hypothesis_template="This response is {} to the prompt."
        )
        # scores[0] would be whichever label ranked first, which can be
        # "incoherent"; read the "coherent" score explicitly.
        return self._label_score(result, "coherent")

    def _calc_context_relevance(self, context: str, response: str) -> float:
        """Return the cross-encoder score for the (context, response) pair."""
        cross_encoder = self._get_pipeline("text-classification", self.CROSS_ENCODER_MODEL)
        # A dict with text/text_pair is the documented way to submit a
        # sentence pair to a text-classification pipeline.
        result = cross_encoder({"text": context, "text_pair": response})
        # Depending on input form and version the pipeline yields either a
        # single dict or a one-element list; accept both.
        if isinstance(result, list):
            result = result[0]
        return result['score']

    def _calc_entity_consistency(self, context: str, response: str) -> float:
        """Return the share of response entities that also occur in the context.

        Yields 1.0 when the context or the response contains no entities.
        """
        context_entities = self._extract_entities(context)
        if not context_entities:
            return 1.0

        response_entities = self._extract_entities(response)
        if not response_entities:
            return 1.0

        consistent = sum(1 for ent in response_entities if ent in context_entities)
        return consistent / len(response_entities)

    def _extract_entities(self, text: str) -> Set[str]:
        """Return the distinct entity texts the engine's NER finds in ``text``."""
        entities = self.augmented_engine.ner.extract(text)
        return {e.text for e in entities}

    def _calc_dnd_style(self, response: str) -> float:
        """Return the multi-label zero-shot score of the ``"D&D fantasy"`` label."""
        classifier = self._get_pipeline("zero-shot-classification", self.ZERO_SHOT_MODEL)
        result = classifier(
            response,
            candidate_labels=["D&D fantasy", "modern", "sci-fi", "historical"],
            multi_label=True
        )
        return self._label_score(result, "D&D fantasy")

    def _calc_memory_utilization(self) -> float:
        """Return accessed / stored entries of the engine database, capped at 1.0.

        Yields 0.0 when the store is empty or does not expose
        ``accessed_entries``.
        """
        memory = self.augmented_engine.db
        if not memory.entries:
            return 0.0
        accessed = getattr(memory, 'accessed_entries', None)
        if accessed is None:
            return 0.0
        return min(1.0, len(accessed) / len(memory.entries))

    def aggregate_results(self, results: Dict[str, List[Dict[str, float]]]) -> Dict[str, Dict[str, float]]:
        """Average every metric over the test cases, per model type.

        Args:
            results: Output of ``compare_responses``.

        Returns:
            ``{"base": {metric: mean}, "augmented": {metric: mean}}``. A model
            type with no results maps to an empty dict. Metric names are taken
            from the first result; a metric missing in a later result counts
            as 0.
        """
        aggregated = {}
        for model_type in ("base", "augmented"):
            model_results = results[model_type]
            if not model_results:
                # Nothing to average; indexing [0] below would fail.
                aggregated[model_type] = {}
                continue
            aggregated[model_type] = {
                metric: float(np.mean([r.get(metric, 0) for r in model_results]))
                for metric in model_results[0]
            }
        return aggregated

In [5]:
# Campaign preamble (Markdown). It is passed to MasterConfig(preambular=...)
# when the engine is configured, and AugmentedLMComparator prepends the
# stripped text to every base-model prompt. The text is runtime data: the
# trailing double spaces are part of the string and are kept as-is.
preambular = '''
**The Tale of the Stolen Star**  

### **Player Character**  
- **Race:**Human  
- **Class:** Rogue (Thief)  
- **Inventory:** Lockpicks, a silver dagger (engraved with a crescent moon), a hooded cloak lined with hidden pockets, a stolen signet ring (unbeknownst to him, cursed), and a small pouch of glowing blue sand.  
- **Abilities:** Expert lockpicker, nimble fingers, stealth mastery, and an uncanny ability to sense traps.  
- **Backstory:** Born in the slums of **Hollow’s End**, **Riley Quickfingers** learned early that survival meant taking what others wouldn’t miss—or wouldn’t notice. Though he steals, he has a code: never from those who can’t afford to lose it. His latest job, however, might have crossed a line he didn’t see coming.  

---

### **Key NPCs**  

1. **Booster "The Barrel" Durnan**  
   - **Role:** Tavernkeeper of **Booster’s Tavern**, retired thief.  
   - **Appearance:** A burly human with a shaved head, a thick black beard, and a permanent smirk. His arms are covered in faded jailhouse tattoos.  
   - **Personality:** Gruff but fair, with a soft spot for fellow rogues who show potential. Hates cheats and bullies.  
   - **Motive:** Runs a neutral ground for thieves and informants but keeps a tight leash on trouble. Knows more than he lets on.  

2. **Seraphine Duskwhisper**  
   - **Role:** A mysterious elven scholar searching for lost artifacts.  
   - **Appearance:** Tall, pale, with silver hair tied in intricate braids. Wears a long, dark-blue coat lined with arcane symbols.  
   - **Personality:** Coldly polite, always calculating. Speaks in riddles when annoyed.  
   - **Motive:** Hired Riley to retrieve an artifact—**The Star of Luminis**—but didn’t mention its dangerous nature.  

3. **Gristle the Snitch**  
   - **Role:** A weaselly street informant.  
   - **Appearance:** A scrawny half-elf with a lazy eye and a habit of chewing his nails.  
   - **Personality:** Nervous, twitchy, and always looking over his shoulder.  
   - **Motive:** Will sell anyone out for the right price but knows every secret in town.  

---

### **Enemies**  

1. **The Black Hand Syndicate**  
   - **Type:** Organized thieves' guild.  
   - **Appearance:** Black leather armor, red scarves, and silver daggers.  
   - **Combat Behavior:** Fight dirty—ambushes, poison, and overwhelming numbers.  
   - **Objective:** Retrieve the stolen **Star of Luminis** and silence Riley.  

2. **The Hollow Specter**  
   - **Type:** A shadowy entity bound to the cursed ring Riley carries.  
   - **Appearance:** A shifting, humanoid figure made of smoke and whispers.  
   - **Combat Behavior:** Phases through walls, drains strength with a touch.  
   - **Objective:** Reclaim the ring—or claim Riley’s soul instead.  

---

### **Main Location: Hollow’s End**  
A crumbling port city built atop ancient ruins, where the wealthy live in marble towers while the poor scurry through sewers and alleyways. The city thrives on secrets, and the **Star of Luminis**—a relic said to reveal hidden truths—has just been stolen from the Syndicate’s vault. Now, the underworld is in chaos.  

---

### **Sub-Locations**  

1. **Booster’s Tavern**  
   - A dimly lit den of ale and intrigue. The air smells of roasted meat and spilled mead. The walls are covered in wanted posters, and the floorboards creak with hidden compartments beneath.  

2. **The Shivering Market**  
   - A black-market bazaar where stolen goods change hands. The scent of exotic spices mixes with the metallic tang of smuggled weapons. Shadows move unnaturally here.  

3. **The Syndicate’s Den**  
   - A fortified gambling hall with red velvet curtains and rigged games. Guards lurk in every corner, watching for intruders.  

4. **The Whispering Catacombs**  
   - Beneath the city, these tunnels are lined with skulls that seem to murmur secrets to those who listen.  

5. **The Clocktower of Old Veyne**  
   - A rusted, ancient tower where Seraphine conducts her research. The gears inside hum with latent magic.  

---

### **Artifacts**  

1. **The Star of Luminis**  
   - A palm-sized crystal shard that glows faintly blue. Reveals hidden messages when held under moonlight.  

2. **The Cursed Signet Ring**  
   - Silver, with a black onyx stone. Whispers names of the dead at midnight.  

3. **The Gauntlet of Shadows**  
   - A blackened steel glove that lets the wearer phase through objects briefly.  

4. **The Tome of Whispers**  
   - A book that writes itself with secrets overheard nearby.  

5. **The Lantern of the Lost**  
   - When lit, reveals invisible creatures—but also attracts them.  

6. **The Dagger of Echoes**  
   - Strikes silently but leaves behind phantom sounds of past killings.  

7. **The Mask of a Thousand Faces**  
   - Changes the wearer’s appearance—but sometimes, the faces linger too long.  

---

### **The Story Unfolds…**  
Riley thought stealing the **Star of Luminis** would be just another job. But now, the Syndicate wants it back, the Hollow Specter stalks him, and Seraphine’s true motives are unclear. The only safe place left is **Booster’s Tavern**—but even there, trust is a currency spent quickly.  

Will Riley uncover the Star’s secret before the city’s shadows swallow him whole?
'''

In [None]:
# Build the engine configuration bottom-up: generation settings first, then
# the master model config that carries the campaign preamble, then the engine.
generation_config = GenerationConfig(temperature=0.7, max_new_tokens=128)
master_config = MasterConfig(
    path=Path('Qwen/Qwen3-0.6B'),
    preambular=preambular,
    generation_config=generation_config,
)
config = EngineConfig(
    vector_db_path=Path('tmp/db'),
    number_of_remind_items=5,
    master_config=master_config,
    ner_model_path=Path('models/ner'),
    embedding_model_path=Path('sentence-transformers/all-MiniLM-L6-v2'),
)

# The baseline uses the same Qwen checkpoint as the engine's master model.
comparator = AugmentedLMComparator(
    augmented_config=config,
    base_model_path="Qwen/Qwen3-0.6B",
)

Device set to use cuda:0


# Test dataset of D&D scenarios

In [None]:
# Four hand-written scenarios. "context" is the scene description inlined into
# the base-model prompt and used by the context/entity metrics; "prompt" is
# the player statement sent to both systems.
DND_TEST_CASES = [
    dict(
        context=(
            "You are in the 'Drunken Dragon' tavern. The bartender is cleaning mugs. "
            "There's a map on the wall marking a dragon's cave. "
            "Your companion Garrick mentioned you could find a guide here."
        ),
        prompt="Ask the bartender about the dragon's cave",
    ),
    dict(
        context=(
            "You found an ancient artifact - the Amulet of Moonlight. "
            "A wood elf warned you that the amulet is cursed. "
            "Your party includes a wizard who studies ancient relics."
        ),
        prompt="Show the amulet to the wizard and ask about the curse",
    ),
    dict(
        context=(
            "After defeating goblins, you found a key with an eagle symbol. "
            "You previously saw the same symbol on the gates of an abandoned fortress. "
            "Your mercenary Karsten specializes in lock picking."
        ),
        prompt="Ask Karsten to examine the key and the fortress gates",
    ),
    dict(
        context=(
            "The village elder gave you a quest to find the Crystal of Truth. "
            "Local rumors say it's guarded by a sphinx in the Whispering Desert. "
            "You notice suspicious footprints leading east."
        ),
        prompt="Follow the footprints toward the desert",
    ),
]

# Evaluation

In [None]:
# Generate and score a reply from both systems for every scenario, then
# average each metric per system ("base" / "augmented").
results = comparator.compare_responses(DND_TEST_CASES)
aggregated = comparator.aggregate_results(results)

In [None]:
# Print a fixed-width table of the averaged metrics for both systems,
# followed by a short summary.
print("\n" + "="*60)
print("Augmented D&D System Evaluation".center(60))
print("="*60)

metrics = ["coherence", "context_relevance", "entity_consistency", 
           "dnd_style", "memory_utilization", "response_time"]

print(f"{'Metric':<20} | {'Base LLM':<10} | {'Augmented':<10} | {'Delta':<8}")
for metric in metrics:
    # A metric missing for one system (memory_utilization is only recorded
    # for the augmented run) is shown as 0.
    base_val = aggregated["base"].get(metric, 0)
    aug_val = aggregated["augmented"].get(metric, 0)
    delta = aug_val - base_val
    
    print(f"{metric:<20} | {base_val:.4f}    | {aug_val:.4f}    | {delta:>+7.4f}")

# After the loop base_val/aug_val hold the values of the last metric
# (response_time), so the D&D-style delta is looked up explicitly.
dnd_style_delta = aggregated["augmented"]["dnd_style"] - aggregated["base"]["dnd_style"]

print("\nKey Insights:")
print(f"- Average response time: Base={aggregated['base']['response_time']:.2f}s, "
      f"Augmented={aggregated['augmented']['response_time']:.2f}s")
print(f"- Memory utilization: {aggregated['augmented']['memory_utilization']:.1%}")
print(f"- D&D style consistency improvement: {100 * dnd_style_delta:.1f}%")

In [None]:
def plot_results(aggregated):
    """Draw a grouped bar chart of the averaged metrics and save it as a PNG.

    Args:
        aggregated: Mapping with ``"base"`` and ``"augmented"`` entries, each
            a dict of metric name to mean value. Only the metrics present
            under ``"base"`` are plotted.

    Side effects:
        Writes ``model_comparison.png`` to the current working directory.
    """
    metrics = list(aggregated["base"].keys())
    base_vals = [aggregated["base"][m] for m in metrics]
    # The aggregated dict stores the second system under "augmented";
    # there is no "enhanced" key.
    aug_vals = [aggregated["augmented"][m] for m in metrics]
    
    x = np.arange(len(metrics))
    width = 0.35
    
    fig, ax = plt.subplots(figsize=(12, 7))
    # Two bars per metric, placed side by side around each tick position.
    ax.bar(x - width/2, base_vals, width, label='Base')
    ax.bar(x + width/2, aug_vals, width, label='Enhanced')
    
    ax.set_ylabel('Evaluation')
    ax.set_title('Model performance comparison')
    ax.set_xticks(x)
    ax.set_xticklabels(metrics, rotation=45)
    ax.legend()
    
    plt.tight_layout()
    plt.savefig("model_comparison.png")

In [None]:
# Chart the averaged metrics; plot_results saves the figure as model_comparison.png.
plot_results(aggregated)