In [64]:
import duckdb
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
# Read the three parquet tables into pandas DataFrames via DuckDB.
views_df = duckdb.sql("SELECT * FROM 'data/content_views.parquet'").df()
content_df = duckdb.sql("SELECT * FROM 'data/content_metadata.parquet'").df()
adventurers_df = duckdb.sql("SELECT * FROM 'data/adventurer_metadata.parquet'").df()

# Join views to content (on content_id) and then to adventurers (on
# adventurer_id) using pandas' default inner merge, and discard the columns
# that are not used as model features. The `_x` / `_y` names are the default
# suffixes pandas gives to column names that exist on both sides of a merge.
df = (
    views_df
    .merge(content_df, on='content_id')
    .merge(adventurers_df, on='adventurer_id')
    .drop(
        columns=[
            'rating',
            'playlist_id',
            'month_x',
            'day_x',
            'day_of_month_x',
            'year_x',
            'minutes',
            'title',
            'age',
            'month_y',
            'day_y',
            'day_of_month_y',
            'year_y',
            'name',
            'seconds_viewed',
            'publisher_id',
        ]
    )
)

In [66]:
# Feature columns, grouped by which side of a view they describe.
content_features = ["genre_id", "language_code", "studio"]
adventurer_features = [
    "primary_language",
    "favorite_genre",
    "region",
    "gender",
    "honorific",
]

features = [*content_features, *adventurer_features]

# All eight features are one-hot encoded. The order of this list fixes the
# column-group order of the encoded matrix, so it is kept exactly as is.
categorical = [
    "genre_id",
    "language_code",
    "primary_language",
    "favorite_genre",
    "region",
    "gender",
    "honorific",
    "studio",
]
# No numerical features yet; this list is not passed to the transformer.
numerical = []

# handle_unknown="ignore": categories not seen during fit encode as all zeros
# instead of raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), categorical)]
)

X = preprocessor.fit_transform(df[features])

In [67]:
# Build two frames with the same eight columns so one encoder can transform
# both: each frame keeps its own side's real values and has the other side's
# columns set to None.
#
# NOTE(review): with this padding a content row is active only in the content
# categories plus the "None" category of each adventurer column, and an
# adventurer row only in the adventurer categories plus the "None" category of
# each content column. Unless the real data itself contains missing values,
# the two kinds of vectors appear to share no active column, which would make
# every content/adventurer cosine distance equal to 1 and the neighbour ranking
# arbitrary. Confirm, and consider mapping related columns (e.g. favorite_genre
# and genre_id) into a shared feature space.
content_X = df[content_features].assign(**dict.fromkeys(adventurer_features))
adventurer_X = df[adventurer_features].assign(**dict.fromkeys(content_features))

# Stack both frames so the encoder sees every category of every column.
all_X = pd.concat([content_X, adventurer_X], axis=0)

# Re-fit the shared preprocessor once on the stacked frame ...
preprocessor.fit(all_X)

# ... then encode each side separately (rows stay aligned with `df`).
content_encoded, adventurer_encoded = (
    preprocessor.transform(frame) for frame in (content_X, adventurer_X)
)

In [68]:
# Cosine-distance neighbour index over the encoded content rows; row i of the
# index corresponds to row i of `df`. `fit` returns the estimator itself.
knn = NearestNeighbors(metric="cosine").fit(content_encoded)
knn

In [73]:
def recommend_for_adventurer(adventurer_id, k=2):
    """Return up to ``k`` not-yet-viewed content rows for one adventurer.

    The adventurer's encoded feature row is used to query the fitted ``knn``
    index. The neighbours are mapped back to content ids through ``df``,
    content the adventurer already has in ``views_df`` is removed, and the
    first ``k`` remaining ids are looked up in ``content_df``.

    Parameters
    ----------
    adventurer_id
        Value compared against ``df['adventurer_id']``.
    k : int, default 2
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Matching rows of ``content_df`` with their original index labels,
        ordered by neighbour rank (closest first). Fewer than ``k`` rows are
        returned when the candidate pool holds fewer unseen items. The frame
        is empty when ``k <= 0`` or the adventurer has no row in ``df``.
    """
    row_positions = np.flatnonzero((df['adventurer_id'] == adventurer_id).to_numpy())
    if k <= 0 or row_positions.size == 0:
        # Nothing to query with: return an empty frame with the usual columns
        # instead of letting kneighbors fail on an empty / invalid request.
        return content_df.iloc[0:0]

    # Only the first matching row's neighbours are used, so query that single
    # row rather than every view the adventurer has.
    first = int(row_positions[0])
    a_vec = adventurer_encoded[first:first + 1]

    # Over-fetch so enough candidates survive de-duplication and the
    # "already viewed" filter, but never ask for more than were fitted.
    n_neighbors = min(k * 50, knn.n_samples_fit_)
    _, indices = knn.kneighbors(a_vec, n_neighbors=n_neighbors)

    # unique() keeps first-appearance order, i.e. the neighbour ranking.
    candidate_content_ids = df['content_id'].iloc[indices[0]].unique()

    viewed = set(views_df.loc[views_df['adventurer_id'] == adventurer_id, 'content_id'])
    recommended_ids = [c for c in candidate_content_ids if c not in viewed][:k]

    recommended_contents = content_df[content_df['content_id'].isin(recommended_ids)]

    # isin() yields rows in content_df order; restore the ranking order.
    rank = {content_id: position for position, content_id in enumerate(recommended_ids)}
    order = recommended_contents['content_id'].map(rank).to_numpy().argsort(kind='stable')
    return recommended_contents.iloc[order]

# Spot-check the recommender for a few hand-picked adventurer ids.
advs = ["ih3j", "utgz", "2nxf"]
display_columns = ['content_id', 'title', 'genre_id', 'language_code', 'studio']

for adv in advs:
    recs = recommend_for_adventurer(adv)
    print(recs[display_columns])


    content_id                                        title genre_id  \
406       swhg  Chronicles of Greycliffs: Holy Perseverance      ROM   
490       o8sq               This Ancient Memoirs Revealed!      ROM   

    language_code studio  
406            FF   hi7j  
490            RP   supg  
    content_id                                           title genre_id  \
460       blf4  INCREDIBLE: Sublime Resilience Will Shock You!      KID   
490       o8sq                  This Ancient Memoirs Revealed!      ROM   

    language_code studio  
460            FF   hi7j  
490            RP   supg  
    content_id                                title genre_id language_code  \
103       rr7g  Exploring Puddington: Mystic Quests      ROM            FF   
490       o8sq       This Ancient Memoirs Revealed!      ROM            RP   

    studio  
103   hi7j  
490   supg  
