In [79]:
import duckdb
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [80]:
views_df = duckdb.sql("SELECT * FROM 'data/content_views.parquet'").df()
content_df = duckdb.sql("SELECT * FROM 'data/content_metadata.parquet'").df()
adventurers_df = duckdb.sql("SELECT * FROM 'data/adventurer_metadata.parquet'").df()

df = views_df.merge(content_df, on='content_id').merge(adventurers_df, on='adventurer_id')

In [83]:
content_features = ["genre_id", "language_code"]
adventurer_features = ["primary_language", "favorite_genre"]

features = content_features + adventurer_features

categorical = ["genre_id", "language_code", "primary_language", "favorite_genre"]
numerical = []

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
])

adventurer_X = preprocessor.fit_transform(df[features])
df_features = pd.DataFrame.sparse.from_spmatrix(adventurer_X, index=df.index)

In [84]:
adventurer_profiles = df_features.groupby(df["adventurer_id"]).mean()

In [85]:
# content_X = preprocessor.transform(
#     content_df[content_features].assign(
#         honorific="", gender="", age=0, region="", favorite_genre="", primary_language=""  # placeholders for missing cols
#     )
# )

adventurer_seen = (
    views_df.groupby("adventurer_id")["content_id"]
    .apply(set)
    .to_dict()
)

content_X = preprocessor.transform(
    content_df[content_features].assign(
        favorite_genre="", primary_language=""
    )
)

# content_features_df = pd.DataFrame.sparse.from_spmatrix(
#     content_X, index=content_df["content_id"]
# )

In [86]:
knn = NearestNeighbors(algorithm="auto")
knn.fit(content_X)

In [87]:
def recommend_for_adventurer(adventurer_id, k=5):
    # Get this adventurer’s feature vector
    idx = df.index[df["adventurer_id"] == adventurer_id][0]
    adventurer_vec = adventurer_X[idx]

    # Find nearest content items
    distances, indices = knn.kneighbors(adventurer_vec, n_neighbors=content_X.shape[0])

    seen = adventurer_seen.get(adventurer_id, set())
    recommendations = []

    for i in indices[0]:
        content_id = content_df.iloc[i]["content_id"]
        if content_id not in seen:   # ✅ filter out already seen
            recommendations.append(content_df.iloc[i])
        if len(recommendations) >= k:
            break

    return pd.DataFrame(recommendations)


In [88]:
print(recommend_for_adventurer("ih3j"))

    content_id studio                                      title genre_id  \
390       5oeu   e3o7        This Earth Tale Changes Everything!      FNT   
769       w7u1   hi7j  EXPLORE: Enigmatic Tale from Soggy Hollow      FNT   
688       r8fp   hi7j      Jamshire Heritage: Radiant Compendium      FNT   
166       12go   8mdi          Local Elegance from Soggy Hollow!      FNT   
304       zlfr   e3o7                             Riddles Manual      FNT   

     minutes language_code      month        day  day_of_month   year  
390     6.12            FF  Frostmere    Soulday            24  10234  
769     3.81            FF  Mysthaven    Soulday            18  10234  
688     6.30            FF   Aurorath  ArcanaDay             3  10234  
166     4.17            FF  Frostmere    Edgeday             1  10234  
304     5.53            FF  Frostmere    Edgeday             1  10234  


In [89]:
q = f"""
SELECT genre_id, COUNT(genre_id)
FROM 'data/content_views.parquet' views
LEFT JOIN 'data/adventurer_metadata.parquet' adv USING(adventurer_id)
LEFT JOIN 'data/content_metadata.parquet' meta USING(content_id)
WHERE adventurer_id = 'ih3j'
GROUP BY genre_id
ORDER BY COUNT(genre_id) DESC
"""

print(duckdb.sql(q).df())


q = f"""
SELECT content_id, title, genre_id, meta.language_code
FROM 'data/content_views.parquet' views
LEFT JOIN 'data/adventurer_metadata.parquet' adv USING(adventurer_id)
LEFT JOIN 'data/content_metadata.parquet' meta USING(content_id)
WHERE adventurer_id = 'ih3j'
"""

print(duckdb.sql(q).df())

  genre_id  count(genre_id)
0      KID               12
1      HOR                6
2      FNT                6
3      DOC                5
4      ROM                5
5      COM                4
6      RLG                3
7      ACT                2
   content_id                                              title genre_id  \
0        r8yv                  The Ancient Wisdom that Revealed!      FNT   
1        cib3                The Extensive Strategies of Fortune      DOC   
2        wfmv                  Greycliffs Heritage: Hidden Codex      HOR   
3        cf4i            Starting Your Tactics Tradition Journey      HOR   
4        s5o5                     Local Freedom from Threadbare!      KID   
5        cp66              Local Spirituality from Boggy Bottom!      KID   
6        fabt       Slurpington Revered Contemplations REVEALED!      COM   
7        al2w                  AMAZING: Veiled Enigmas Revealed!      RLG   
8        blf4     INCREDIBLE: Sublime Resilience Will S