In [9]:
import os
from typing import Dict, List

import numpy as np
import pandas as pd
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

In [10]:
config = {
    "rating_address_template": "dataset/raw/binge/rating_x.csv",
    "number_of_rating_addresses": 4,
    "movie_titles_address": "dataset/raw/binge/movie_titles.csv",
}

model_save_address = "./models/als_recommender_model.npz"

In [11]:
class BingeDataset:
    def __init__(self, rating_address_template: str, number_of_rating_addresses: int, movie_titles_address: str):
        self.rating_address_template = rating_address_template
        self.NUMBER_OF_RATING_ADDRESSES = number_of_rating_addresses
        self.movie_titles_address = movie_titles_address

    def get_df(self) -> pd.DataFrame:
        ratings_df = self.get_cleaned_ratings()
        ratings_with_median = self.replace_ratings_with_median_ratings(ratings_df)
        ratings_with_count = self.get_rating_counts(ratings_with_median)
        rating_final_df = self.merge_median_and_count_ratings(ratings_with_median, ratings_with_count)

        rating_final_df["score"] = (
            rating_final_df["median_rating"] * rating_final_df["count_of_review"]
        )

        movie_titles_final = self.get_movie_titles_df()

        df = rating_final_df.merge(
            movie_titles_final,
            how="inner",
            left_on="movie_id",
            right_on="movie_id"
        )

        df = self.change_types(df)

        return df

    def change_types(self, df: pd.DataFrame):
        return (
            df
            .astype({"year_of_publication": "int32"})
            .astype({col: "int32" for col in df.select_dtypes("int64").columns})
            .astype({col: "float32" for col in df.select_dtypes("float64").columns})
        )

    def merge_median_and_count_ratings(
        self,
        ratings_with_median: pd.DataFrame,
        ratings_with_count: pd.DataFrame
    ) -> pd.DataFrame:
        columns_mapping = {
            "median_rating_x": "median_rating",
            "median_rating_y": "count_of_review",
        }

        return (
            ratings_with_median
            .merge(ratings_with_count, how="inner", left_on="movie_id", right_on="movie_id")
            .rename(columns=columns_mapping)
        )

    def get_movie_titles_df(self) -> pd.DataFrame:

        movie_titles_columns = ("movie_id", "year_of_publication", "movie_name")

        movie_titles = pd.read_csv(
            self.movie_titles_address,
            encoding="ISO-8859-1",
            usecols=[0, 1, 2],
            names=movie_titles_columns,
        )

        return movie_titles[~(movie_titles["year_of_publication"].isna())]

    def get_rating_counts(self, ratings_with_median: pd.DataFrame) -> pd.DataFrame:
        return (
            ratings_with_median.groupby(by="movie_id")["median_rating"]
            .count()
            .reset_index()
        )

    def replace_ratings_with_median_ratings(
        self, ratings_df: pd.DataFrame
    ) -> pd.DataFrame:
        return (
            ratings_df.groupby(by="movie_id")[["rating"]]
            .median()
            .reset_index()
            .merge(ratings_df, how="inner", left_on="movie_id", right_on="movie_id")
            .drop(columns="rating_y")
            .rename(columns={"rating_x": "median_rating"})
        )

    def get_cleaned_ratings(self) -> pd.DataFrame:
        rating_columns = ["movie_id", "user_id", "datetime", "rating"]

        rating_dfs = [
            pd.read_csv(address, names=rating_columns)
            for address in (
                self.rating_address_template.replace("x", str(i))
                for i in range(1, self.NUMBER_OF_RATING_ADDRESSES)
            )
        ]

        return pd.concat(rating_dfs).drop(columns="datetime")

In [12]:
class RecommenderSystem:
    def __init__(self, binge_dataset: BingeDataset, model_save_address: str):
        self.binge_dataset = binge_dataset
        self.df = binge_dataset.get_df()
        self.sparse_matrix = self.get_sparse()

        self.model_save_address = model_save_address
        self.model = self.load_model(model_save_address)

    def load_model(self, model_save_address):
        if os.path.exists(model_save_address):
            return AlternatingLeastSquares().load(model_save_address)
        else:
            return None

    def fit(
        self, test_size: int = 0.2, shuffle=True, random_state: int = None, **kwargs
    ):
        X_train, X_test, *_ = train_test_split(
            self.sparse_matrix,
            test_size=test_size,
            shuffle=shuffle,
            random_state=random_state,
        )
        self.model = AlternatingLeastSquares(**kwargs)
        self.model.fit(X_train)
        self.test_score = self.evaluate_model(X_train, X_test)
        self.model.save(self.model_save_address)
        return self.model, self.test_score

    #gerçekten csr matrix mi
    def evaluate_model(self, X_train: csr_matrix, X_test: csr_matrix) -> float:
        return mean_average_precision_at_k(self.model, X_train, X_test)

    def get_sparse(self) -> csr_matrix:
        df = self.binge_dataset.get_cleaned_ratings()
        return csr_matrix(
            (
                df["rating"].values,
                (df["user_id"].values, df["movie_id"].values)
            ),
        ).astype("int32")

    def recommend_for_new_users(self, n_movies=5, size_of_pool_to_select_from=100) -> List[int]:
        '''
            Used for to generating recommendations for new users.

            Returns n_movie number of movie_ids from a pool generated from general movie scores desc.

        '''
        movies_to_select = np.random.randint(0, size_of_pool_to_select_from, n_movies)

        return (
            self.df
            .sort_values(by="score", ascending=False)["movie_id"]
            .unique()
            [:size_of_pool_to_select_from]
            [movies_to_select]
        )

    def recommend_for_user(
        self,
        user_id: int, 
        number_of_movies_to_recommend: int = 10
    ) -> Dict[int, float]:
        '''
        Returns a dictionary that keys are movie_ids and 
        values are the score that which selected movie he/she most likely to watch.
        Higher score is better.
        '''
        recommendation_list = self.model.recommend(
            user_id,
            self.sparse_matrix[user_id],
            N=number_of_movies_to_recommend
        )
        return {movie_id: score for (movie_id, score) in zip(*recommendation_list)}
    
    def compare_movies(self, movie_one_id: int, movie_two_id: int, threshold: float = 5.0) -> bool:
        '''
        Compares two movies based on their general score(also used for cold start recommendations).
        Threshold can't be more then 100 and less then 0.

        Returns True if their difference in percentage is greater then threshold.
        Returns False if their difference in percentage is lesser or equal to the threshold.
        '''
        if threshold > 100 or threshold < 0:
            raise ValueError("Threshold can't be more then 100")

        movie_one_score, movie_two_score = (
            self.df.loc[self.df["movie_id"] == movie_one_id, "score"].iloc[0],
            self.df.loc[self.df["movie_id"] == movie_two_id, "score"].iloc[0]
        )

        difference_by_percentage = abs(
            ((movie_one_score - movie_two_score) / movie_one_score) * 100
        )

        return difference_by_percentage > threshold 

In [13]:
binge_dataset = BingeDataset(**config)

In [14]:
recommender_system = RecommenderSystem(binge_dataset, model_save_address)

  check_blas_config()


In [17]:
# hyperparams = {
#     "iterations": 20,
#     "factors": 256,
#     "regularization": 0.1
# }
# model, score = recommender_system.fit(random_state=11, **hyperparams)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/95893 [00:00<?, ?it/s]

In [22]:
#print(f"Mean Precision @10 Score Of The Model: {score}")

Mean Precision @10 Score Of The Model: 0.02756972098691428


In [19]:
recommender_system.recommend_for_user(6, 5)

{6117: 1.186178,
 10607: 1.0608814,
 8387: 1.0059323,
 571: 0.9900291,
 8596: 0.98567605}

In [20]:
recommender_system.recommend_for_new_users(6)

array([11677,  6206, 10550,  5085,  3825,   886], dtype=int32)

In [21]:
recommender_system.compare_movies(6117, 10607, 5)

True