In [None]:
import pandas as pd

In [None]:
# Load the processed video-games dataset from parquet (fastparquet engine)
# and display its (rows, columns) shape.
df = pd.read_parquet(
    path="../data/video-games-db-processed.parquet",
    engine="fastparquet",
)
df.shape

In [None]:
# Inspect the dtype of every column in the loaded frame.
df.dtypes

In [None]:
# Preview the first five rows of the dataset.
df.head(n=5)

In [None]:
import requests

In [None]:
# Check that the Elasticsearch node answers before doing any indexing work.
# The timeout keeps the cell from hanging when nothing responds on the port,
# and raise_for_status() turns an HTTP error answer into an exception instead
# of letting the cell finish silently.
res = requests.get("http://localhost:9200", timeout=10)
res.raise_for_status()
print("Elasticsearch is up and running!")

In [None]:
# Show the column dtypes again (presumably as a reference for choosing the
# field types of the index mapping defined next; TODO confirm).
df.dtypes

In [None]:
# Elasticsearch index definition for the video-game documents.

# Vector field: 384-dimensional dense vector, indexed, compared with cosine
# similarity.
embedding_field = {
    "type": "dense_vector",
    "dims": 384,
    "index": True,
    "similarity": "cosine",
}

# Field name -> field definition, in the order the fields are declared.
field_definitions = dict(
    id={"type": "integer"},
    name={"type": "text"},
    released={"type": "date", "format": "yyyy-MM-dd"},
    rating={"type": "float"},
    description={"type": "text"},
    playtime={"type": "integer"},
    genres={"type": "keyword"},
    publisher={"type": "keyword"},
    image={"type": "keyword"},
    embedding=embedding_field,
)

# Request body sent when the index is created.
mapping = {"mappings": {"properties": field_definitions}}

In [None]:
# Connection settings used by the requests below:
# base URL of the Elasticsearch node and the name of the index.
db_url, index_name = "http://localhost:9200", "video_games"

In [None]:
# Create the index.
# The cell is safe to re-run: when the index is already there, Elasticsearch
# answers HTTP 400 with error type "resource_already_exists_exception", and
# that single case is tolerated. Every other failure still raises.
# NOTE: an existing index keeps its current mapping; delete the index first if
# the mapping has been changed.
res = requests.put(f"{db_url}/{index_name}", json=mapping)
error = res.json().get("error") if res.status_code == 400 else None
index_already_exists = (
    isinstance(error, dict)
    and error.get("type") == "resource_already_exists_exception"
)
if not index_already_exists:
    res.raise_for_status()
# Display the server's answer (the acknowledgement, or the "already exists" error).
res.json()

In [None]:
# Upload the records to the database.
# Each document is written with PUT /<index>/_doc/<id>, using the game id as
# the Elasticsearch document id. Re-running the cell therefore overwrites the
# existing documents instead of adding duplicates under auto-generated ids.
# (For much larger datasets the _bulk endpoint would be the faster option.)
doc_fields = [
    "id",
    "name",
    "released",
    "rating",
    "description",
    "playtime",
    "genres",
    "publisher",
    "image",
    "embedding",
]

indexed_count = 0
# A single Session reuses the HTTP connection across all requests.
with requests.Session() as session:
    for _, row in df.iterrows():
        # Values read from parquet may be NumPy arrays/scalars, which the JSON
        # encoder rejects; .tolist() converts them to plain Python values.
        # Anything without .tolist() (str, list, ...) is passed through as is.
        doc = {
            field: row[field].tolist() if hasattr(row[field], "tolist") else row[field]
            for field in doc_fields
        }
        res = session.put(f"{db_url}/{index_name}/_doc/{doc['id']}", json=doc)
        res.raise_for_status()
        indexed_count += 1

# One summary line instead of one printed response per document.
print(f"Indexed {indexed_count} documents into '{index_name}'")

In [None]:
# Sanity check: search for the document whose "id" field is exactly 3498
# and display the raw search response.
search_url = f"{db_url}/{index_name}/_search"
term_query = {"query": {"term": {"id": 3498}}}
res = requests.get(search_url, json=term_query)
res.raise_for_status()
res.json()

In [None]:
# Try out vector search: take the stored embedding of the second row of df as
# the query vector and retrieve its nearest neighbours from the index.
top_k = 10  # number of hits to return; used for both "size" and "k"

# Select the column by name rather than by position, so the cell keeps working
# if the column order of the parquet file changes.
query_embedding = df["embedding"].iloc[1]
# The value may be a NumPy array depending on the parquet reader; the JSON
# encoder needs a plain list.
if hasattr(query_embedding, "tolist"):
    query_embedding = query_embedding.tolist()

query = {
    "size": top_k,
    "query": {
        "knn": {
            "field": "embedding",
            "query_vector": query_embedding,
            "k": top_k,
            "num_candidates": 100,
        }
    },
}
res = requests.get(f"{db_url}/{index_name}/_search", json=query)
res.raise_for_status()
data = res.json()

In [None]:
# Names of the games in the search response, in the order the hits were returned
# (None for a hit whose source has no "name").
hits = data["hits"]["hits"]
[hit["_source"].get("name") for hit in hits]