# 0. Configuration

In [1]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
RATINGS_SMALL_URL = 'https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link'
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'

# 1. Modules and functions

In [2]:
import numpy as np
import pandas as pd

from typing import Tuple, Dict
from itertools import islice, cycle, product

import warnings
warnings.filterwarnings('ignore')

## 1. 1. Helper functions to avoid copy paste

In [4]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

In [11]:
def compute_popularity(df: pd.DataFrame, item_id: str, max_candidates: int):
    """
    calculates median rating to define popular titles
    """
    popular_titles = df.groupby(item_id).agg({'rating': np.mean})\
                     .sort_values(['rating'], ascending=False).head(max_candidates).index.values

    return popular_titles

# 2. Data

## 2. 1. Load data

In [6]:
# interactions data
interactions = read_csv_from_gdrive(RATINGS_SMALL_URL)
interactions.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [7]:
# information about films etc
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


## 2.2 Data preparation

In [8]:
# align data in both dataframes to merge
interactions['movieId'] = interactions['movieId'].astype(str)
movies_metadata.rename(columns = {'id': 'movieId'}, inplace = True)

In [9]:
# leave only those films that intersect with each other
interactions_filtered = interactions.loc[interactions['movieId'].isin(movies_metadata['movieId'])]
print(interactions.shape, interactions_filtered.shape)

(100004, 4) (44989, 4)


In [10]:
# crate mapper for movieId and title names
item_name_mapper = dict(zip(movies_metadata['movieId'], movies_metadata['original_title']))

In [38]:
# create users input
users = interactions[['userId']].drop_duplicates().reset_index(drop = True)

# 3. Model

Let's define our baseline popularity recommender BaselineRecommender - top rated titles based on average rating with possibility to get by any group(s)

The pipeline will be similar to most python ML modules -- it will have two methods in the end: fit() and recommend()
1. The logic of fit() as follow:
- Initiate recommendation based on median rating from all observations recomm_common;
- Prepare list of interacted items by users
- If we set groups - we get recommendations i.e. calculate movie ratings by groups:
    - If we get NaN, we fill with base recommendations 
    - If we get less than required number of candidates, we populate from base recommendations

2. The logic of recommend():
- Return base recommendations if users data is not set;
- In case of category wise requirement -- we get results of our fit

## 3.1. Fit

In [18]:
# first, we define how many candidates we want to get
MAX_CANDIDATES = 20
ITEM_COLUMN = 'movieId'
USER_COLUMN = 'userId'

In [19]:
# then, we calculate avg rating and sort by this value
base_recommendations = compute_popularity(interactions_filtered, ITEM_COLUMN, MAX_CANDIDATES)
base_recommendations

array(['74727', '128846', '702', '127728', '65216', '43267', '8675',
       '80717', '86817', '8699', '872', '27724', '26791', '876', '64278',
       '301', '59392', '3021', '3112', '1933'], dtype=object)

Thus, we got 20 films with highest average rating

Now, as we discussed earlier, in movies recommendations there is no need to recommend the same film which use has already watched. Let's implement it as well

In [20]:
# we get all interacted items for each user and save it in dictionary {'userId': [items list]}
known_items = interactions_filtered.groupby(USER_COLUMN)[ITEM_COLUMN].apply(list).to_dict()
len(known_items)


671

In [26]:
# let's check it for one userId = 1
known_items[1]

['1371', '1405', '2105', '2193', '2294', '2455']

Now we have all necessary components: base recommendations without groups with possibility to filter already watched items

Also, if we want to get recommendations based on some user groups we can easily do the same with groupby() method and same approach

In [41]:
# lets add artifical binary group to check BaselineRecommender
group = [np.random.random_integers(2) for x in range(len(users))]
users['group'] = group

In [49]:
data = pd.merge(interactions_filtered, users, how='left', on = USER_COLUMN)
group_recommendations = data.groupby('group').apply(compute_popularity, ITEM_COLUMN, MAX_CANDIDATES)
group_recommendations.head()

group
1    [2982, 25801, 1933, 1936, 1941, 5056, 50641, 6...
2    [2620, 25852, 5473, 5062, 36931, 54328, 8908, ...
dtype: object

In the output we have two rows with a list of film ids for each binary group 

Next, we have to implement recommned() method which will use 

## 3. 2. Recommend

In [33]:
# if we do not have groups, then it means we give the same recommendations for all users i.e. base_recommendations
recs = list(islice(cycle([base_recommendations]), len(users['userId'])))
users['rekkos'] = recs
users.head()

Unnamed: 0,userId,group,rekkos
0,1,2,"[74727, 128846, 702, 127728, 65216, 43267, 867..."
1,2,2,"[74727, 128846, 702, 127728, 65216, 43267, 867..."
2,3,1,"[74727, 128846, 702, 127728, 65216, 43267, 867..."
3,4,2,"[74727, 128846, 702, 127728, 65216, 43267, 867..."
4,5,1,"[74727, 128846, 702, 127728, 65216, 43267, 867..."


In [50]:
# and let's have an example with groups we created earlier
group_recommendations = group_recommendations.reset_index()
group_rekkos = pd.merge(users, group_recommendations, how = 'left', on = 'group')
group_rekkos.rename(columns = {0: 'rekkos'}, inplace = True)
group_rekkos.head()

Unnamed: 0,userId,group,rekkos
0,1,2,"[2620, 25852, 5473, 5062, 36931, 54328, 8908, ..."
1,2,1,"[2982, 25801, 1933, 1936, 1941, 5056, 50641, 6..."
2,3,2,"[2620, 25852, 5473, 5062, 36931, 54328, 8908, ..."
3,4,2,"[2620, 25852, 5473, 5062, 36931, 54328, 8908, ..."
4,5,2,"[2620, 25852, 5473, 5062, 36931, 54328, 8908, ..."


We got our groupwise recommendations from 3.1. part and just joined them by group of users are assigned to

## 3.3. Wrap everything into pretty functions

Well, in this section we discussed how basic recommendations based on heuristic can be done
- We took top-rated films and recommended to users
- Added filter to remove already watched films
- Wrapped all steps into functions

# Appendix

In [6]:
class BaselineRecommender:
    def __init__(self, max_candidates = 100,
                user_column = 'userId',
                item_column = 'movieId',
                groups = None):
        self.recommendations = []
        self.max_candidates = max_candidates
        self.user_column = user_column
        self.item_column = item_column
        self.groups = groups
        self.known_items = {}
        
    def fit(self, data: pd.DataFrame, df_users: pd.DataFrame = None):
        base_recommendations = compute_popularity(data, self.item_column, self.max_candidates)
        self.base_recommendations = base_recommendations
        self.df_users = df_users
        self.known_items = data.groupby(self.user_column)[self.item_column].apply(list).to_dict()
        
        if self.groups is not None:
            if df_users is None:
                raise ValueError('No users data is specified!')
            
            data = pd.merge(data, df_users, how='left', on = self.user_column)
            self.recommendations = data.groupby(self.groups).apply(compute_popularity, self.item_column, self.max_candidates)
            na_mask = self.recommendations.isna()
            self.recommendations.loc[na_mask] = self.recommendations[na_mask].apply(lambda x: base_recommendations)
            self.recommendations = self.recommendations.apply(lambda x: np.concatenate((x, base_recommendations)))

        else:
            self.recommendations = base_recommendations
        
    def recommend(self, users: list = None, N: int = 10, drop_known: bool = False):
        recs = self.recommendations.tolist()
        
        if users is None:
            return recs[:N]

        else:
            if self.groups is not None:
                rekko = self.recommendations.apply(lambda x: x[:N])
                rekko.name = 'rekkos'
                rekko = rekko.reset_index()
                rekko.loc[:, self.groups] = rekko[self.groups].astype('category')
                data = users.to_frame().merge(self.df_users, on = self.user_column, how = 'left')
                data = data.merge(rekko, on = self.groups, how = 'left')
                na_mask = data.iloc[:, -1].isna()
                data.loc[na_mask, 'rekkos'] = data.loc[na_mask, 'rekkos'].apply(lambda x: self.base_recommendations[:N])
                recs = data.iloc[:, -1].apply(lambda x: x.tolist()).values
            
            else:
                recs = list(islice(cycle([recs]), len(users)))
            
            if drop_known:
                data = users.to_frame()
                for u in users:
                    if u not in self.known_items.keys():
                        self.known_items[u] = []
                data[self.item_column] = recs
                data[self.item_column] = data.apply(lambda x: [f for f in x[self.item_column] \
                                                        if f not in self.known_items[x[self.user_column]]][:N], axis = 1)
                return data[self.item_column]
            
            else:
                return recs

In [11]:
# check the model - fit and predict
baseline_model = BaselineRecommender(
    max_candidates = 100, user_column = 'userId',
    item_column = 'movieId', groups = ['group'])

baseline_model.fit(interactions_filtered, users)
users['movieId'] = baseline_model.recommend(users['userId'], drop_known = True)

In [12]:
# let's explore recommendations we got
recs = users.explode('movieId')
recs['title'] = recs['movieId'].map(item_name_mapper)

In [13]:
recs.loc[recs['userId'].isin(recs['userId'].sample(2))]

Unnamed: 0,userId,group,movieId,title
199,200,1,850,A Christmas Story
199,200,1,40226,The Era of Vampires
199,200,1,635,Angel Heart
199,200,1,2981,The Lost World
199,200,1,178,Blown Away
199,200,1,876,Frank Herbert's Dune
199,200,1,872,Singin' in the Rain
199,200,1,127728,8:46
199,200,1,8699,Anchorman: The Legend of Ron Burgundy
199,200,1,1819,"You, Me and Dupree"


Congrats! Your first popularity based recommender is ready!

In the next chapter we will talk about a bit more advanced technique Collaborative Filtering