# Packages

In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import random as rd
from surprise import AlgoBase
from surprise.prediction_algorithms.predictions import PredictionImpossible
from sklearn.linear_model import LinearRegression

from loaders import load_ratings, load_items, load_visuals
from constants import Constant as C

# Explore and select content features

In [3]:
df_items = load_items()
df_ratings = load_ratings()

display(df_items)
display(df_ratings)

# Example 1 : create title_length features
df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
#display(df_features.head())

# (explore here other features)

df_avg_ratings = df_items.merge(df_ratings, how = 'inner', on = C.ITEM_ID_COL)
grouped = df_avg_ratings.groupby(by = C.LABEL_COL)[C.RATING_COL].mean()
#display(grouped)

df_visuals = load_visuals(mode = 'log')

df_name_visuals = df_items.merge(df_visuals, how = 'inner', left_on = C.ITEM_ID_COL, right_on = C.VISUAL_MOVIE_ID)
df_visuals_ratings = df_name_visuals.merge(df_ratings, how = 'inner', left_index = True , right_index = True)
display(df_visuals_ratings)


Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
161582,Hell or High Water (2016),Crime|Drama
161594,Kingsglaive: Final Fantasy XV (2016),Action|Adventure|Animation|Drama|Fantasy|Sci-Fi
161918,Sharknado 4: The 4th Awakens (2016),Action|Adventure|Horror|Sci-Fi
163056,Shin Godzilla (2016),Action|Adventure|Fantasy|Sci-Fi


Unnamed: 0,userId,movieId,rating
0,277,6,5.0
1,277,7,5.0
2,277,10,3.0
3,277,16,5.0
4,277,18,3.0
...,...,...,...
381176,283184,5553,2.0
381177,283184,5673,5.0
381178,283184,5689,3.0
381179,283184,5902,5.0


data\hackathon\content\visuals\LLVisualFeatures13K_Log.csv
Index(['ML_Id', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7'], dtype='object')


Unnamed: 0,title,genres,f1,f2,f3,f4,f5,f6,f7,userId,movieId,rating
0,Nick of Time (1995),Action|Thriller,0.440830,0.763504,0.784965,0.132239,0.176285,0.275521,0.707383,277,6,5.0
1,Vampire in Brooklyn (1995),Comedy|Horror|Romance,0.467434,0.657441,0.651940,0.024859,0.061322,0.226915,0.687485,277,7,5.0
2,Beautiful Girls (1996),Comedy|Drama|Romance,0.700268,0.652688,0.653051,0.020993,0.050809,0.201239,0.498546,277,10,3.0
3,Broken Arrow (1996),Action|Adventure|Thriller,0.522593,0.720691,0.725353,0.017811,0.040945,0.240973,0.660679,277,16,5.0
4,In the Bleak Midwinter (1995),Comedy|Drama,0.782697,0.658655,0.648308,0.138313,0.172365,0.191381,0.353793,277,18,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4598,Kidnapping Mr. Heineken (2015),Action|Crime|Drama|Thriller,0.380148,0.554954,0.592917,0.021228,0.057797,0.162178,0.721846,2764,3763,4.0
4599,The Cobbler (2015),Comedy|Drama|Fantasy,0.536049,0.568985,0.547581,0.023061,0.056075,0.201187,0.648675,2764,3769,2.0
4600,Insurgent (2015),Action|Sci-Fi|Thriller,0.523038,0.630923,0.640514,0.037381,0.079298,0.262880,0.656804,2764,3789,4.5
4601,Home (2015),Adventure|Animation|Children|Comedy|Fantasy|Sc...,0.536234,0.752874,0.768606,0.058113,0.130669,0.257451,0.648675,2764,3809,3.0


# Build a content-based model
When ready, move the following class in the *models.py* script

In [4]:
class ContentBased(AlgoBase):
    def __init__(self, features_method, regressor_method):
        AlgoBase.__init__(self)
        self.regressor_method = regressor_method
        self.content_features = self.create_content_features(features_method)

    def create_content_features(self, features_method):
        """Content Analyzer"""
        df_items = load_items()
        if features_method is None:
            df_features = None
        elif features_method == "title_length": # a naive method that creates only 1 feature based on title length
            df_features = df_items[C.LABEL_COL].apply(lambda x: len(x)).to_frame('n_character_title')
        else: # (implement other feature creations here)
            raise NotImplementedError(f'Feature method {features_method} not yet implemented')
        return df_features
    

    def fit(self, trainset):
        """Profile Learner"""
        AlgoBase.fit(self, trainset)
        
        # Preallocate user profiles
        self.user_profile = {u: None for u in trainset.all_users()}

        if self.regressor_method == 'random_score':
            pass
        
        elif self.regressor_method == 'random_sample':
            for u in self.user_profile:
                self.user_profile[u] = [rating for _, rating in self.trainset.ur[u]]
        else:
            pass
            # (implement here the regressor fitting)  
        
    def estimate(self, u, i):
        """Scoring component used for item filtering"""
        # First, handle cases for unknown users and items
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')


        if self.regressor_method == 'random_score':
            rd.seed()
            score = rd.uniform(0.5,5)

        elif self.regressor_method == 'random_sample':
            rd.seed()
            score = rd.choice(self.user_profile[u])
        
        else:
            score=None
            # (implement here the regressor prediction)

        return score


The following script test the ContentBased class

In [5]:
def test_contentbased_class(feature_method, regressor_method):
    """Test the ContentBased class.
    Tries to make a prediction on the first (user,item ) tuple of the anti_test_set
    """
    sp_ratings = load_ratings(surprise_format=True)
    train_set = sp_ratings.build_full_trainset()
    content_algo = ContentBased(feature_method, regressor_method)
    content_algo.fit(train_set)
    anti_test_set_first = train_set.build_anti_testset()[0]
    prediction = content_algo.predict(anti_test_set_first[0], anti_test_set_first[1])
    print(prediction)

# (call here the test functions with different regressor methods)
test_contentbased_class(feature_method='title_length', regressor_method = 'random_score')


user: 277        item: 3          r_ui = None   est = 1.06   {'was_impossible': False}
