# HBM
## Reference
- [Efficient bayesian hierarchical user modeling for recommendation system](https://doi.org/10.1145/1277741.1277752)

In [4]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import seaborn as sns
import pymc as pm
import numpy as np
import pytensor.tensor as pt
from PMF.LoadData import load_rating_data

## Data Analysis

In [13]:
dataset_path = r'data\ml-latest-small\ml-latest-small'
ratings = pd.read_csv(dataset_path + r'\ratings.csv')
movies = pd.read_csv(dataset_path + r'\movies.csv')
tags = pd.read_csv(dataset_path + r'\tags.csv')

In [14]:
#ratings = load_rating_data(r'data\ml-100k\ml-100k\u.data')
#ratings = pd.DataFrame(ratings, columns=['user_id', 'item_id', 'rating'])
#movies = pd.read_csv(r'data\ml-100k\ml-100k\u.item', sep='|', encoding='latin-1', header=None)
#movies.columns = ['item_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown',
#                  'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary',
#                  'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
#                  'Sci-Fi', 'Thriller', 'War', 'Western']

In [15]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [16]:
num_movies = movies.shape[0]
num_users = ratings['userId'].nunique()


In [17]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
movies['genres'] = movies['genres'].str.split('|')
movies['genres']

0       [Adventure, Animation, Children, Comedy, Fantasy]
1                          [Adventure, Children, Fantasy]
2                                       [Comedy, Romance]
3                                [Comedy, Drama, Romance]
4                                                [Comedy]
                              ...                        
9737                 [Action, Animation, Comedy, Fantasy]
9738                         [Animation, Comedy, Fantasy]
9739                                              [Drama]
9740                                  [Action, Animation]
9741                                             [Comedy]
Name: genres, Length: 9742, dtype: object

In [19]:
#genre_df

In [20]:
#sns.barplot(data=genre_df, orient='h')

## Model

In [33]:
class GenreHBMRecommender:
    def __init__(self, ratings_df, movies_df, latent_dim=10):
        """
        ratings_df: DataFrame with columns ['userId', 'movieId', 'rating']
        movies_df:  DataFrame with columns ['movieId', 'genres']

        """
        self.ratings_df = ratings_df.copy()
        self.movies_df = movies_df.copy()
        self.latent_dim = latent_dim
        self.model = None
        self.trace = None
        self.mlb = MultiLabelBinarizer()

        self._preprocess()

    def _preprocess(self):
        """Map userId and movieId to 0-based indices"""
        self.user_map = {uid: i for i, uid in enumerate(self.ratings_df['userId'].unique())}
        self.movie_map = {mid: i for i, mid in enumerate(self.movies_df['movieId'].unique())}

        self.ratings_df['uidx'] = self.ratings_df['userId'].map(self.user_map)
        self.ratings_df['midx'] = self.ratings_df['movieId'].map(self.movie_map)

        self.num_users = len(self.user_map)
        self.num_movies = len(self.movie_map)
        
        genre_onehot = self.mlb.fit_transform(movies['genres'])

        # 建立對應欄位名稱
        genre_df = pd.DataFrame(genre_onehot, columns=self.mlb.classes_)
        self.genre_map = {genre: i for i, genre in enumerate(self.mlb.classes_)}
        self.genre_idx_list = genre_df.apply(lambda x: [self.genre_map[g] for g in x.index[x == 1].tolist()], axis=1).to_list()

    def _build_mu_i(self, mu_g):
        """Build per-movie prior mean vector mu_i based on genre index list"""
        mu_i_list = []
        for genre_idxs in self.genre_idx_list:
            mu_mean = pm.math.mean(mu_g[genre_idxs], axis=0)
            mu_i_list.append(mu_mean)
        return pm.math.stack(mu_i_list, axis=0)

    def build_model(self):
        """Construct the PyMC model"""
        print("Building model...")
        user_idx = self.ratings_df['uidx'].values
        movie_idx = self.ratings_df['midx'].values
        ratings = self.ratings_df['rating'].values
        num_genres = len(self.mlb.classes_)
        print(f"Number of genres: {num_genres}")

        with pm.Model() as model:
            mu_g = pm.Normal("mu_g", mu=0, sigma=1, shape=(num_genres, self.latent_dim))
            print(f"mu_g shape: {mu_g.shape}")
            sigma = pm.Exponential("sigma", 1.0)
            

            mu_i = self._build_mu_i(mu_g)
            x_movie = pm.Normal("x_movie", mu=mu_i, sigma=sigma, shape=(self.num_movies, self.latent_dim))
            print(f"x_movie shape: {x_movie.shape}")
            w_user = pm.Normal("w_user", mu=0, sigma=1, shape=(self.num_users, self.latent_dim))
            print(f"w_user shape: {w_user.shape}")

            pred = pm.math.sum(w_user[user_idx] * x_movie[movie_idx], axis=1)
            print(f"pred shape: {pred.shape}")
            r = pm.Normal("r", mu=pred, sigma=0.5, observed=ratings)

            self.model = model
        print("Model built successfully.")
        
    def fit(self, draws=1000, tune=500, target_accept=0.9):
        """Run MCMC sampling"""
        if self.model is None:
            self.build_model()
        print("Fitting model...")
        with self.model:
            self.trace = pm.sample(draws=draws, tune=tune, target_accept=target_accept, progressbar=True)

            #approx = pm.fit(method='advi', n=10000)
            #trace = approx.sample(500)

            
    def predict(self, user_id, movie_id):
        """
        根據 user_id 與 movie_id 預測評分（使用後驗平均潛在向量）
        """
        if self.trace is None:
            raise ValueError("Model not fitted yet. Call fit() first.")
        if user_id not in self.user_map or movie_id not in self.movie_map:
            raise ValueError("user_id 或 movie_id 不在訓練資料中")

        # 對應 index
        u_idx = self.user_map[user_id]
        m_idx = self.movie_map[movie_id]

        # 後驗平均向量
        w_user = self.trace.posterior['w_user'].mean(dim=("chain", "draw")).values
        x_movie = self.trace.posterior['x_movie'].mean(dim=("chain", "draw")).values

        # 預測值 = 內積
        return float(np.dot(w_user[u_idx], x_movie[m_idx]))

    def get_posterior_means(self):
        """Extract MAP estimate of user and movie vectors"""
        assert self.trace is not None, "You must call fit() first."
        w_user = self.trace.posterior['w_user'].mean(dim=("chain", "draw")).values
        x_movie = self.trace.posterior['x_movie'].mean(dim=("chain", "draw")).values
        return w_user, x_movie


In [36]:
model = GenreHBMRecommender(ratings, movies, latent_dim=10)

In [37]:
model.fit(100, 50, 0.9)

Building model...
Number of genres: 20
mu_g shape: Shape.0
x_movie shape: Shape.0
w_user shape: Shape.0
pred shape: Shape.0
Model built successfully.
Fitting model...


KeyboardInterrupt: 