In [1]:
%pip install voyageai numpy

Collecting voyageai
  Downloading voyageai-0.2.4-py3-none-any.whl.metadata (2.6 kB)
Collecting numpy
  Downloading numpy-2.4.2-cp314-cp314-macosx_10_15_x86_64.whl.metadata (6.6 kB)
Collecting aiohttp<4.0,>=3.5 (from voyageai)
  Downloading aiohttp-3.13.3-cp314-cp314-macosx_10_13_x86_64.whl.metadata (8.1 kB)
Collecting aiolimiter<2.0.0,>=1.1.0 (from voyageai)
  Downloading aiolimiter-1.2.1-py3-none-any.whl.metadata (4.5 kB)
Collecting tenacity>=8.0.1 (from voyageai)
  Downloading tenacity-9.1.4-py3-none-any.whl.metadata (1.2 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp<4.0,>=3.5->voyageai)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.4.0 (from aiohttp<4.0,>=3.5->voyageai)
  Downloading aiosignal-1.4.0-py3-none-any.whl.metadata (3.7 kB)
Collecting frozenlist>=1.1.1 (from aiohttp<4.0,>=3.5->voyageai)
  Downloading frozenlist-1.8.0-cp314-cp314-macosx_10_13_x86_64.whl.metadata (20 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp<4.0

In [2]:
import os
import json
import voyageai
import numpy as np
from dotenv import load_dotenv

load_dotenv()

voyage = voyageai.Client(api_key=os.getenv("VOYAGE_API_KEY"))

# ‚îÄ‚îÄ‚îÄ 1. Convertir une station en texte lisible (le "chunk") ‚îÄ‚îÄ‚îÄ

def station_to_text(station: dict) -> str:
    return f"""
Station : {station['name']} ({station['region']})
Altitude : {station['altitude_min']}m - {station['altitude_max']}m
Domaine : {station['ski_area']} ‚Äî {station['km_slopes']} km de pistes
Pistes : {station['slopes_detail']['green']} vertes, {station['slopes_detail']['blue']} bleues,
         {station['slopes_detail']['red']} rouges, {station['slopes_detail']['black']} noires
Niveaux : {', '.join(station['level'])}
Forfait journ√©e adulte : {station['passes']['full_day']['adult']}‚Ç¨
H√©bergement moyen : {station['avg_accommodation_price']}‚Ç¨/nuit
Activit√©s : {', '.join(station['activities'])}
Services : {', '.join(station['services'])}
Description : {station['description']}
A√©roport le plus proche : {station['access']['nearest_airport']} ({station['access']['distance_from_airport_km']} km)
""".strip()

# Charger les stations
with open("../data/stations/stations.json", "r") as f:
    stations = json.load(f)

# Visualiser un chunk
les_orres = next(s for s in stations if s['id'] == 'les-orres')
print(station_to_text(les_orres))

Station : Les Orres (Hautes-Alpes)
Altitude : 1500m - 2500m
Domaine : Les Orres ‚Äî 100 km de pistes
Pistes : 15 vertes, 30 bleues,
         35 rouges, 20 noires
Niveaux : beginner, intermediate, advanced
Forfait journ√©e adulte : 39‚Ç¨
H√©bergement moyen : 60‚Ç¨/nuit
Activit√©s : snowshoeing, snow_scooter
Services : ski_school, equipment_rental
Description : Ski dans les Alpes du Sud, ambiance conviviale
A√©roport le plus proche : Marseille (230 km)


In [3]:
# ‚îÄ‚îÄ‚îÄ 2. Cr√©er un embedding et l'explorer ‚îÄ‚îÄ‚îÄ

chunk = station_to_text(les_orres)
result = voyage.embed([chunk], model="voyage-3")
embedding = result.embeddings[0]

print(f"Type    : {type(embedding)}")
print(f"Taille  : {len(embedding)} dimensions")  # ‚Üí 1024
print(f"Extrait : {embedding[:5]}")              # ‚Üí liste de floats

Type    : <class 'list'>
Taille  : 1024 dimensions
Extrait : [-0.03828684240579605, -0.02504226751625538, -0.05431389436125755, -0.052310511469841, -0.03984503075480461]


In [4]:
# ‚îÄ‚îÄ‚îÄ 3. La vraie magie : comparer deux stations ‚îÄ‚îÄ‚îÄ
# La similarit√© cosinus mesure si deux vecteurs "pointent dans la m√™me direction"
# ‚Üí 1.0 = identiques, 0.0 = sans rapport, -1.0 = oppos√©s

def cosine_similarity(a, b):
    a, b = np.array(a), np.array(b)
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Embeddings de 3 stations
val_thorens  = next(s for s in stations if s['id'] == 'val-thorens')
les_menuires = next(s for s in stations if s['id'] == 'les-menuires')  # aussi dans les 3 Vall√©es

results = voyage.embed(
    [station_to_text(val_thorens), 
     station_to_text(les_menuires), 
     station_to_text(les_orres)],
    model="voyage-3"
)
emb_vt, emb_lm, emb_lo = results.embeddings

print("Val Thorens ‚Üî Les Menuires (m√™me domaine) :", 
      round(cosine_similarity(emb_vt, emb_lm), 4))  # ‚Üí score √©lev√©
print("Val Thorens ‚Üî Les Orres (domaines diff√©rents) :", 
      round(cosine_similarity(emb_vt, emb_lo), 4))  # ‚Üí score plus bas

Val Thorens ‚Üî Les Menuires (m√™me domaine) : 0.828
Val Thorens ‚Üî Les Orres (domaines diff√©rents) : 0.6966


In [6]:
# ‚îÄ‚îÄ‚îÄ 4. Recherche s√©mantique manuelle (avant ChromaDB) ‚îÄ‚îÄ‚îÄ
# C'est exactement ce que fait un vector store, mais √† la main

# Embedder toutes les stations
all_texts = [station_to_text(s) for s in stations]
all_results = voyage.embed(all_texts, model="voyage-3")
all_embeddings = all_results.embeddings

def search(query: str, top_k: int = 3):
    # Embedder la question
    query_result = voyage.embed([query], model="voyage-3", input_type="query")
    query_embedding = query_result.embeddings[0]
    
    # Calculer la similarit√© avec toutes les stations
    scores = [
        (stations[i]['name'], cosine_similarity(query_embedding, emb))
        for i, emb in enumerate(all_embeddings)
    ]
    
    # Trier et retourner les meilleurs r√©sultats
    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:top_k]

# Test !
print("üîç 'station pour famille d√©butante pas ch√®re'")
for name, score in search("station pour famille d√©butante pas ch√®re"):
    print(f"  {name} ‚Üí {round(score, 4)}")

print("\nüîç 'ski expert haute altitude glacier'")
for name, score in search("ski expert haute altitude glacier"):
    print(f"  {name} ‚Üí {round(score, 4)}")

üîç 'station pour famille d√©butante pas ch√®re'
  Superd√©voluy ‚Üí 0.5284
  Montgen√®vre ‚Üí 0.5084
  Valberg ‚Üí 0.5044

üîç 'ski expert haute altitude glacier'
  Tignes ‚Üí 0.5686
  Les 2 Alpes ‚Üí 0.5252
  L'Alpe d'Huez ‚Üí 0.4998
