In [4]:
import sys

!pip install numpy pandas sentence-transformers umap-learn plotly scikit-learn
print(f"Python version: {sys.version}")
print("‚úÖ Pakker installert!")

Collecting numpy
  Using cached numpy-2.3.5-cp314-cp314-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pandas
  Using cached pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting pandas
  Using cached pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting umap-learn
  Using cached umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB)
Collecting umap-learn
  Using cached umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB)
Collecting plotly
  Using cached plotly-6.5.0-py3-none-any.whl.metadata (8.5 kB)
Collecting plotly
  Using cached plotly-6.5.0-py3-none-any.whl.metadata (8.5 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp314-cp314-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scikit-learn
  Using 

# 🧠 Embedding-modeller for RAG

**Mål:** Forstå hvordan valg av embedding-modell påvirker retrieval-kvaliteten i et RAG-system.

## Hva er en embedding?

En embedding er en **numerisk representasjon** av tekst som fanger opp semantisk mening. 
Tekst som betyr lignende ting vil ha vektorer som ligger nær hverandre i vektorrommet.

```
"Hva er dokumentavgift?" → [0.12, -0.34, 0.56, ..., 0.78]  (768 dimensjoner)
"Dokumentavgift forklart" → [0.11, -0.32, 0.55, ..., 0.79]  (lignende vektor!)
"Oppskrift på pizza"     → [0.89, 0.23, -0.45, ..., -0.12] (helt annen vektor)
```

## Hvorfor er valg av modell viktig?

| Aspekt | Påvirkning |
|--------|------------|
| **Dimensjonalitet** | Høyere kan gi mer presise treff, men gir tregere søk og krever mer lagring |
| **Språkstøtte** | Kritisk for norske dokumenter! |
| **Treningsdata** | Modeller trent på juridisk tekst forstår juss bedre |
| **Modellstørrelse** | Større modeller er ofte bedre, men tregere |

## 📦 Installer pakker

In [7]:
# Kj√∏r dette f√∏rst for √• installere n√∏dvendige pakker
# UMAP krever Python <3.14, s√• vi bruker PCA som alternativ
!pip install sentence-transformers plotly pandas numpy scikit-learn

Collecting sentence-transformers
  Using cached sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting umap-learn
  Using cached umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB)
Collecting umap-learn
  Using cached umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB)
Collecting plotly
  Using cached plotly-6.5.0-py3-none-any.whl.metadata (8.5 kB)
Collecting plotly
  Using cached plotly-6.5.0-py3-none-any.whl.metadata (8.5 kB)
Collecting pandas
  Using cached pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting pandas
  Using cached pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting numpy
  Using cached numpy-2.3.5-cp314-cp314-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting numpy
  Using cached numpy-2.3.5-cp314-cp314-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp314-cp314-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scikit-learn
  Using cached scikit_l

In [8]:
import json
import time
from pathlib import Path
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'numpy'

## 📄 Last inn chunks fra chunking-steget

Vi bruker chunks som allerede er laget av chunking-notebooken.

In [None]:
# Load the metadata-enriched chunks produced by the chunking step
# (JSON Lines: one JSON object per line).
chunks_path = Path("../output/chunks_metadata.jsonl")

# Skip blank lines (e.g. an extra empty line at the end of the file) so that
# json.loads is never handed an empty string.
with chunks_path.open("r", encoding="utf-8") as f:
    chunks = [json.loads(line) for line in f if line.strip()]

print(f"üìä Lastet {len(chunks)} chunks")

# Show chunk 20 as the example when it exists; otherwise fall back to the last
# chunk so that a small input file does not raise IndexError.
if chunks:
    example_chunk = chunks[min(20, len(chunks) - 1)]
    print(f"\nüìù Eksempel chunk:")
    print(f"   Text: {example_chunk['text'][:100]}...")
    print(f"   Context: {example_chunk.get('context', 'N/A')}")

## 🤖 Definer embedding-modeller

Vi sammenligner **tre forskjellige open source-modeller** med ulike egenskaper:

| Modell | Dimensjoner | Språk | Størrelse | Bruksområde |
|--------|-------------|-------|-----------|-------------|
| `all-MiniLM-L6-v2` | 384 | Engelsk | 80 MB | Rask, generell |
| `paraphrase-multilingual-MiniLM-L12-v2` | 384 | 50+ språk | 420 MB | Flerspråklig |
| `multilingual-e5-large` | 1024 | 100 språk | 2.2 GB | State-of-the-art |

In [None]:
# Embedding models under comparison, keyed by a short display label.
# Every entry carries the same four fields, in this order:
#   name  - model identifier handed to SentenceTransformer when loading
#   dim   - embedding dimensionality
#   desc  - short human-readable description
#   color - hex colour used for this model in the plots
_MODEL_FIELDS = ("name", "dim", "desc", "color")

MODELS = {
    label: dict(zip(_MODEL_FIELDS, values))
    for label, values in (
        ("MiniLM-EN", ("all-MiniLM-L6-v2", 384, "Rask engelsk modell", "#FF6B6B")),
        ("MiniLM-Multi", ("paraphrase-multilingual-MiniLM-L12-v2", 384, "Flerspr√•klig (50+ spr√•k)", "#4ECDC4")),
        ("E5-Large", ("intfloat/multilingual-e5-large", 1024, "State-of-the-art multilingual", "#45B7D1")),
    )
}

# One bullet line per model: label, description and dimensionality.
print("ü§ñ Modeller som skal testes:")
print("\n".join(
    f"   ‚Ä¢ {label}: {spec['desc']} ({spec['dim']} dim)"
    for label, spec in MODELS.items()
))

In [None]:
def _load_with_progress(label, model_name):
    """Instantiate one SentenceTransformer and print how long loading took."""
    print(f"‚è≥ Laster {label}...")
    t0 = time.time()
    loaded = SentenceTransformer(model_name)
    seconds = time.time() - t0
    print(f"   ‚úÖ Lastet p√• {seconds:.1f}s")
    return loaded


# Load every configured model (may take a while the first time).
models = {}
for label, spec in MODELS.items():
    models[label] = _load_with_progress(label, spec["name"])

print("\nüéâ Alle modeller lastet!")

## 🔬 Generer embeddings

Vi genererer embeddings for alle chunks med hver modell og måler tiden.

In [None]:
# Extract the raw text of every chunk.
texts = [c["text"] for c in chunks]

# The E5 model expects a "passage: " prefix on documents.
texts_e5 = [f"passage: {t}" for t in texts]

# Per model label: the embedding matrix and the seconds spent encoding.
embeddings = {}
timing = {}

for key, model in models.items():
    print(f"\n‚è≥ Genererer embeddings med {key}...")

    # Only the E5 model gets the prefixed documents.
    input_texts = texts_e5 if "E5" in key else texts

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock time and can jump when the system
    # clock is adjusted.
    start = time.perf_counter()
    emb = model.encode(input_texts, show_progress_bar=True, normalize_embeddings=True)
    elapsed = time.perf_counter() - start

    embeddings[key] = emb
    timing[key] = elapsed

    # Guard the throughput division against a zero-length interval.
    rate = len(texts) / elapsed if elapsed > 0 else float("inf")
    print(f"   ‚úÖ {len(texts)} chunks p√• {elapsed:.1f}s ({rate:.1f} chunks/sek)")
    print(f"   üìê Shape: {emb.shape}")

## 📊 Sammenlign ytelse

In [None]:
# Build one summary row per model: size, encoding time, throughput and the
# in-memory footprint of its embedding matrix. Numeric values are pre-formatted
# as strings for display.
comparison_data = [
    {
        "Modell": label,
        "Dimensjoner": spec["dim"],
        "Tid (sek)": f"{timing[label]:.1f}",
        "Chunks/sek": f"{len(texts)/timing[label]:.1f}",
        "Minne (MB)": f"{embeddings[label].nbytes / 1024 / 1024:.1f}",
        "Beskrivelse": spec["desc"],
    }
    for label, spec in MODELS.items()
]

df_comparison = pd.DataFrame(comparison_data)

print("üìä Ytelsessammenligning:\n")
print(df_comparison.to_string(index=False))

In [None]:
# Visualise performance: encoding time per model (bars, left) and embedding
# dimensionality against throughput (labelled points, right).
model_labels = list(MODELS.keys())
colors = [MODELS[k]["color"] for k in model_labels]
seconds_per_model = [timing[k] for k in model_labels]
chunks_per_second = [len(texts)/timing[k] for k in model_labels]

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Tid per modell", "Dimensjoner vs Hastighet"),
    specs=[[{"type": "bar"}, {"type": "scatter"}]],
)

# Left panel: time per model.
time_bars = go.Bar(
    x=model_labels,
    y=seconds_per_model,
    marker_color=colors,
    text=[f"{timing[k]:.1f}s" for k in model_labels],
    textposition="outside",
)
fig.add_trace(time_bars, row=1, col=1)

# Right panel: dimensions vs. speed.
speed_points = go.Scatter(
    x=[MODELS[k]["dim"] for k in model_labels],
    y=chunks_per_second,
    mode="markers+text",
    marker=dict(size=20, color=colors),
    text=model_labels,
    textposition="top center",
)
fig.add_trace(speed_points, row=1, col=2)

fig.update_layout(
    title="‚ö° Ytelsessammenligning av Embedding-modeller",
    showlegend=False,
    height=400,
)

# Axis titles, one (x, y) pair per subplot column.
axis_titles = [("Modell", "Tid (sekunder)"), ("Dimensjoner", "Chunks per sekund")]
for col, (x_title, y_title) in enumerate(axis_titles, start=1):
    fig.update_xaxes(title_text=x_title, row=1, col=col)
    fig.update_yaxes(title_text=y_title, row=1, col=col)

fig.show()

## 🎨 Visualiser embedding-rom med PCA

PCA (Principal Component Analysis) lar oss visualisere høydimensjonale embeddings i 2D.

**Hva ser vi etter?**
- Chunks om samme tema skal klynge seg sammen
- Forskjellige modeller kan gruppere tekst ulikt

In [None]:
from sklearn.decomposition import PCA

# Project each model's embeddings down to two principal components.
pca_results = {}

for label in MODELS:
    print(f"‚è≥ PCA for {label}...")
    projector = PCA(n_components=2, random_state=42)
    pca_results[label] = projector.fit_transform(embeddings[label])
    # Share of the total variance retained by the two kept components.
    explained = sum(projector.explained_variance_ratio_)
    print(f"   ‚úÖ Ferdig (forklart varians: {explained:.1%})")

In [None]:
from collections import Counter

# How many of the most frequent categories keep their own label; every other
# category is merged into "Andre".
TOP_N_CATEGORIES = 8


def _chunk_category(chunk):
    """Return a short category label for a chunk, taken from its headers.

    The most specific header wins: h3, then h2, then h1, falling back to
    "Ukjent" when none is set. Markdown bold markers are removed and the
    label is capped at 30 characters.
    """
    meta = chunk.get("metadata") or {}
    # `or` (instead of nested .get defaults) also falls through when a header
    # key is present but empty or None, which would otherwise yield an empty
    # label or crash on None.replace.
    header = meta.get("h3") or meta.get("h2") or meta.get("h1") or "Ukjent"
    return str(header).replace("**", "").strip()[:30]


# Category of every chunk, used to colour the plots.
categories = [_chunk_category(c) for c in chunks]

# Keep the most common categories and merge the rest.
cat_counts = Counter(categories)
top_cats = [name for name, _count in cat_counts.most_common(TOP_N_CATEGORIES)]
categories_simplified = [c if c in top_cats else "Andre" for c in categories]

print(f"üìÇ Kategorier funnet: {len(set(categories_simplified))}")

In [None]:
# Draw one PCA scatter plot per model, side by side.
model_keys = list(MODELS.keys())
fig = make_subplots(
    rows=1, cols=len(model_keys),
    subplot_titles=[f"{k} ({MODELS[k]['dim']}d)" for k in model_keys]
)

# Map every category to a fixed integer for colouring. The built-in hash() of a
# str is salted per interpreter process, so `hash(c) % 10` gave different
# colours after every kernel restart and could give two categories the same
# colour. Indexing the sorted unique labels is stable and collision-free.
category_index = {
    cat: idx for idx, cat in enumerate(sorted(set(categories_simplified)))
}
point_colors = [category_index[c] for c in categories_simplified]

# Plotly hover labels need <br> for a line break; "\n" is not rendered as one.
hover_texts = [
    f"{c}<br>{chunks[j]['text'][:50]}..."
    for j, c in enumerate(categories_simplified)
]

for i, key in enumerate(model_keys, 1):
    pca_2d = pca_results[key]

    fig.add_trace(
        go.Scatter(
            x=pca_2d[:, 0],
            y=pca_2d[:, 1],
            mode="markers",
            marker=dict(
                size=6,
                opacity=0.7,
                color=point_colors,
                colorscale="Viridis"
            ),
            text=hover_texts,
            hovertemplate="%{text}<extra></extra>"
        ),
        row=1, col=i
    )

fig.update_layout(
    title="üé® PCA: Hvordan ser modellene p√• dokumentet?",
    showlegend=False,
    height=500,
    # 400 px per subplot (1200 px for the three configured models).
    width=400 * len(model_keys)
)

fig.show()

## 🔍 Test retrieval: Hvilken modell finner best svar?

Vi tester modellene med noen norske spørsmål om dokumentavgift.

In [None]:
# Test questions in Norwegian about "dokumentavgift" (document tax); they are
# used as retrieval queries when comparing the models.
test_queries = [
    "Hva er dokumentavgift?",
    "Hvor mye er dokumentavgiften i prosent?",
    "N√•r slipper man √• betale dokumentavgift?",
    "Hva skjer ved arv av eiendom?",
    "Hvordan beregnes avgiftsgrunnlaget?"
]

def search(query: str, model_key: str, top_k: int = 3) -> list:
    """Find the chunks most similar to a query for one model.

    Args:
        query: The question text to search for.
        model_key: Label of the model; selects both the encoder in `models`
            and the matching matrix in `embeddings`.
        top_k: Maximum number of hits to return.

    Returns:
        A list of dicts ordered by descending similarity, each with the keys
        "chunk_id", "score" (float), "text" (first 200 characters) and
        "context" (empty string when the chunk has none).
    """
    # E5 models expect a "query: " prefix on queries.
    prompt = f"query: {query}" if "E5" in model_key else query
    query_emb = models[model_key].encode([prompt], normalize_embeddings=True)

    # Similarity between the query and every stored chunk embedding.
    similarities = cosine_similarity(query_emb, embeddings[model_key])[0]

    # Indices ordered from most to least similar, cut off at top_k.
    ranked = np.argsort(similarities)[::-1][:top_k]

    return [
        {
            "chunk_id": chunks[idx]["chunk_id"],
            "score": float(similarities[idx]),
            "text": chunks[idx]["text"][:200],
            "context": chunks[idx].get("context", ""),
        }
        for idx in ranked
    ]

In [None]:
# Run the first test query through every model and list each model's top hits.
query = test_queries[0]
print(f"üîç Query: '{query}'\n")
print("=" * 80)

for model_key, spec in MODELS.items():
    print(f"\nüìå {model_key} ({spec['desc']})")
    print("-" * 40)

    results = search(query, model_key, top_k=3)
    rank = 0
    for hit in results:
        rank += 1
        preview = hit['text'][:100]
        print(f"  {rank}. [Score: {hit['score']:.3f}] {preview}...")

In [None]:
def _short_label(text, max_len=40):
    """Shorten text to max_len characters, appending '...' only if it was cut."""
    return text if len(text) <= max_len else text[:max_len] + "..."


# Compare the best hit of every model for every test query.
all_results = []

for query in test_queries:
    for model_key in MODELS.keys():
        results = search(query, model_key, top_k=1)
        all_results.append({
            # Previously "..." was appended even to queries shorter than the
            # limit, which mislabelled complete questions as truncated.
            "Query": _short_label(query),
            "Modell": model_key,
            "Top Score": results[0]["score"],
            "Chunk ID": results[0]["chunk_id"]
        })

df_results = pd.DataFrame(all_results)

# Pivot for the heatmap: one row per query, one column per model.
pivot = df_results.pivot(index="Query", columns="Modell", values="Top Score")
# pivot() returns rows and columns in sorted label order; select them back into
# the order of test_queries and MODELS so the heatmap reads like the other
# tables and plots.
pivot = pivot.loc[[_short_label(q) for q in test_queries], list(MODELS.keys())]

fig = px.imshow(
    pivot,
    labels=dict(x="Modell", y="Sp√∏rsm√•l", color="Similarity Score"),
    color_continuous_scale="RdYlGn",
    aspect="auto",
    text_auto=".2f"
)

fig.update_layout(
    title="üéØ Retrieval Scores per Modell og Sp√∏rsm√•l",
    height=400
)

fig.show()

## üá≥üá¥ Spr√•ktest: Hvordan h√•ndterer modellene norsk?

Vi tester hvordan modellene matcher norske vs engelske queries mot norsk tekst.

In [None]:
# Pairs of (Norwegian query, English query) asking the same thing.
language_pairs = [
    ("Hva er dokumentavgift?", "What is document tax?"),
    ("Fritak ved arv", "Exemption for inheritance"),
    ("Beregning av avgift", "Calculation of tax")
]


def _top_score(query_text, model_key):
    """Similarity score of the single best hit for a query and model."""
    return search(query_text, model_key, top_k=1)[0]["score"]


print("üá≥üá¥ Norsk vs üá¨üáß Engelsk queries p√• norsk dokument\n")
print("=" * 80)

for no_query, en_query in language_pairs:
    print(f"\nüìù NO: '{no_query}' | EN: '{en_query}'")
    print("-" * 60)

    for model_key in MODELS:
        no_score = _top_score(no_query, model_key)
        en_score = _top_score(en_query, model_key)
        diff = no_score - en_score

        # Check mark when the Norwegian query scores strictly higher,
        # warning sign otherwise.
        if diff > 0:
            emoji = "‚úÖ"
        else:
            emoji = "‚ö†Ô∏è"
        print(f"  {model_key:15} | NO: {no_score:.3f} | EN: {en_score:.3f} | Diff: {diff:+.3f} {emoji}")

## 📈 Intra-cluster similarity: Hvor konsistente er embeddings?

Vi måler hvor like chunks innenfor samme kategori er – høyere tyder på bedre semantisk gruppering.

In [None]:
from collections import defaultdict


def _pairwise_similarities(matrix):
    """Cosine similarity of every unordered pair of rows (diagonal excluded)."""
    sims = cosine_similarity(matrix)
    return sims[np.triu_indices_from(sims, k=1)]


# Chunk positions grouped by category; the merged "Andre" bucket is left out.
cat_to_indices = defaultdict(list)
for position, category in enumerate(categories_simplified):
    if category == "Andre":
        continue
    cat_to_indices[category].append(position)

# Per model: mean similarity over all same-category chunk pairs, pooled across
# categories.
cluster_scores = []

for model_key in MODELS:
    model_emb = embeddings[model_key]

    pooled = []
    for members in cat_to_indices.values():
        # A category needs at least two chunks to form a pair.
        if len(members) < 2:
            continue
        pooled.extend(_pairwise_similarities(model_emb[members]))

    cluster_scores.append({
        "Modell": model_key,
        "Avg Intra-Cluster Similarity": np.mean(pooled),
    })

df_cluster = pd.DataFrame(cluster_scores)

fig = px.bar(
    df_cluster,
    x="Modell",
    y="Avg Intra-Cluster Similarity",
    color="Modell",
    color_discrete_sequence=[spec["color"] for spec in MODELS.values()],
    text_auto=".3f",
)

fig.update_layout(
    title="üìä Intra-Cluster Similarity (h√∏yere = bedre gruppering)",
    showlegend=False,
    yaxis_range=[0, 1],
)

fig.show()

## 💾 Lagre embeddings for senere bruk

Vi lagrer embeddings slik at de kan brukes i retrieval-steget.

In [None]:
# Write each model's embedding matrix to its own NumPy .npy file.
# NOTE(review): the chunks are read from "../output" but the embeddings are
# written to "./output" - confirm that the different directory is intended.
output_dir = Path("./output")
output_dir.mkdir(exist_ok=True)

for model_key in MODELS:
    matrix = embeddings[model_key]
    filename = f"embeddings_{model_key}.npy"
    np.save(output_dir / filename, matrix)
    print(f"‚úÖ Lagret {filename} ({matrix.shape})")

print(f"\nüìÇ Filer lagret i {output_dir.absolute()}")

## 🎓 Oppsummering

### Hva lærte vi?

1. **Språkstøtte er kritisk** - Engelske modeller sliter med norsk tekst
2. **Dimensjonalitet ≠ Kvalitet** - Flere dimensjoner betyr ikke alltid bedre resultater
3. **Trade-offs finnes** - Raskere modeller kan være "good enough" for mange use cases
4. **Test på dine data** - Benchmark-resultater gjelder ikke alltid for din use case

### Anbefalinger for norske RAG-systemer

| Scenario | Anbefalt modell |
|----------|----------------|
| Produksjon med norsk tekst | `multilingual-e5-large` |
| Rask prototyping | `paraphrase-multilingual-MiniLM-L12-v2` |
| Kun engelsk tekst | `all-MiniLM-L6-v2` |

### Neste steg

- **Hybridsøk** - Kombiner med BM25 for bedre resultater
- **Re-ranking** - Bruk cross-encoder for å forbedre ranking
- **Evaluering** - Mål MRR, Recall@k på reelle test-sett