# Packages

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import random as rd
from surprise import AlgoBase
from surprise.prediction_algorithms.predictions import PredictionImpossible
from sklearn.linear_model import LinearRegression

from loaders import load_ratings, load_items, load_visuals
from constants import Constant as C

           action  adventure  animation  children    comedy    crime  \
movieId                                                                
1        0.000000   0.409953   0.532782  0.495334  0.267377  0.00000   
2        0.000000   0.510589   0.000000  0.616929  0.000000  0.00000   
3        0.000000   0.000000   0.000000  0.000000  0.587560  0.00000   
4        0.000000   0.000000   0.000000  0.000000  0.523855  0.00000   
5        0.000000   0.000000   0.000000  0.000000  1.000000  0.00000   
...           ...        ...        ...       ...       ...      ...   
161582   0.000000   0.000000   0.000000  0.000000  0.000000  0.87227   
161594   0.324133   0.361472   0.469775  0.000000  0.000000  0.00000   
161918   0.384347   0.428622   0.000000  0.000000  0.000000  0.00000   
163056   0.377365   0.420836   0.000000  0.000000  0.000000  0.00000   
163949   0.000000   0.000000   0.000000  0.000000  0.000000  0.00000   

         documentary     drama   fantasy        fi  ...  listed

# Explore and select content features

In [2]:
df_items = load_items()
df_ratings = load_ratings()

display(df_items)
display(df_ratings)

# Example 1 : create title_length features
df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
#display(df_features.head())

# (explore here other features)

df_avg_ratings = df_items.merge(df_ratings, how = 'inner', on = C.ITEM_ID_COL)
grouped = df_avg_ratings.groupby(by = C.LABEL_COL)[C.RATING_COL].mean()
#display(grouped)

df_visuals = load_visuals(mode = 'log')

df_name_visuals = df_items.merge(df_visuals, how = 'inner', left_on = C.ITEM_ID_COL, right_on = C.VISUAL_MOVIE_ID)
df_visuals_ratings = df_name_visuals.merge(df_ratings, how = 'inner', left_index = True , right_index = True)
display(df_visuals_ratings)


Unnamed: 0_level_0,title,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995
2,Jumanji (1995),"[Adventure, Children, Fantasy]",1995
3,Grumpier Old Men (1995),"[Comedy, Romance]",1995
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",1995
5,Father of the Bride Part II (1995),[Comedy],1995
...,...,...,...
161582,Hell or High Water (2016),"[Crime, Drama]",2016
161594,Kingsglaive: Final Fantasy XV (2016),"[Action, Adventure, Animation, Drama, Fantasy,...",2016
161918,Sharknado 4: The 4th Awakens (2016),"[Action, Adventure, Horror, Sci-Fi]",2016
163056,Shin Godzilla (2016),"[Action, Adventure, Fantasy, Sci-Fi]",2016


Unnamed: 0,userId,movieId,rating
0,277,6,5.0
1,277,7,5.0
2,277,10,3.0
3,277,16,5.0
4,277,18,3.0
...,...,...,...
381176,283184,5553,2.0
381177,283184,5673,5.0
381178,283184,5689,3.0
381179,283184,5902,5.0


data\hackathon\content\visuals\LLVisualFeatures13K_Log.csv


KeyError: 'ML_Id'

# Build a content-based model
When ready, move the following class in the *models.py* script

In [None]:
class ContentBased(AlgoBase):
    def __init__(self, features_method, regressor_method):
        AlgoBase.__init__(self)
        self.regressor_method = regressor_method
        self.content_features = self.create_content_features(features_method)

    def create_content_features(self, features_method):
        """Content Analyzer"""
        df_items = load_items()
        df_ratings = load_ratings()
        if features_method is None:
            df_features = None

        elif features_method == "title_length": # a naive method that creates only 1 feature based on title length
            df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')

        elif features_method == "visual" :
            df_visuals = load_visuals(mode = 'log')
            df_visuals_ratings = df_items.merge(df_visuals, how = 'inner', left_index = True, right_index = True)
            df_features = df_visuals_ratings
            df_features = df_features.select_dtypes(include=[np.number])
            
        else: # (implement other feature creations here)
            raise NotImplementedError(f'Feature method {features_method} not yet implemented')
        return df_features
    

    def fit(self, trainset):
        """Profile Learner"""
        AlgoBase.fit(self, trainset)
        
        # Preallocate user profiles
        self.user_profile = {u: None for u in trainset.all_users()}

        if self.regressor_method == 'random_score':
            pass
        
        elif self.regressor_method == 'random_sample':
            for u in self.user_profile:
                self.user_profile[u] = [rating for _, rating in self.trainset.ur[u]]

            # (implement here the regressor fitting)  
        elif self.regressor_method == 'linear':
            for u in self.user_profile:
                ratings = self.trainset.ur[u]
                df_user = pd.DataFrame(ratings, columns=['inner_item_id', 'user_ratings'])
                df_user["item_id"] = df_user["inner_item_id"].map(self.trainset.to_raw_iid)
                df_user = df_user.merge(self.content_features, how='left', left_on='item_id', right_index=True )
                
                df_user = df_user.dropna()

                if len(df_user) == 0:
                    self.user_profile[u] = None
                    continue

                feature_names = list(self.content_features.columns)
                X = df_user[feature_names].values
                y = df_user["user_ratings"].values

                reg = LinearRegression(fit_intercept=False)
                reg.fit(X, y)

                self.user_profile[u] = reg
        else:
                pass
        
    def estimate(self, u, i):
        """Scoring component used for item filtering"""
        # First, handle cases for unknown users and items
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')


        if self.regressor_method == 'random_score':
            rd.seed()
            score = rd.uniform(0.5,5)

        elif self.regressor_method == 'random_sample':
            rd.seed()
            score = rd.choice(self.user_profile[u])

        elif self.regressor_method == 'linear':
            iid_raw = self.trainset.to_raw_iid(i)
            iid_int = int(iid_raw)
            if iid_int not in self.content_features.index:
                raise PredictionImpossible("Pas de features pour cet item")

            x = self.content_features.loc[iid_int].values.reshape(1, -1)
            score = self.model.predict(x)[0]
        else:
            score=None
            # (implement here the regressor prediction)

        return score

'''
The most up-to-date content-based model has already been implemented in models.py
'''

The following script test the ContentBased class

In [None]:
def test_contentbased_class(feature_method, regressor_method):
    """Test the ContentBased class.
    Tries to make a prediction on the first (user,item ) tuple of the anti_test_set
    """
    sp_ratings = load_ratings(surprise_format=True)
    train_set = sp_ratings.build_full_trainset()
    content_algo = ContentBased(feature_method, regressor_method)
    content_algo.fit(train_set)
    anti_test_set_first = train_set.build_anti_testset()[0]
    prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])
    print(prediction)

# (call here the test functions with different regressor methods)
test_contentbased_class(feature_method='multi_visual', regressor_method = 'svm')


NameError: name 'load_ratings' is not defined