# 0. Configuration

In [1]:
# Google Drive share links (file -> share -> copy link) for the two input CSV files.
# The file id is the path segment that follows '/file/d/'.
ratings_small_url = (
    'https://drive.google.com/file/d/'
    '1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4'
    '/view?usp=share_link'
)
movies_metadata_url = (
    'https://drive.google.com/file/d/'
    '19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb'
    '/view?usp=share_link'
)

# 1. Modules and functions

In [2]:
import numpy as np
import pandas as pd

from typing import Tuple, Dict
from itertools import islice, cycle, product

import warnings
warnings.filterwarnings('ignore')

def read_csv_from_gdrive(url):
    """
    Download a CSV file shared through Google Drive and load it into a DataFrame.

    :url: share link (taken from file -> share -> copy link), for example
        https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
        Links without the trailing ``/view`` part and links of the form
        ``...?id=<file id>`` are accepted as well.
    :return: the DataFrame produced by ``pd.read_csv`` for the direct-download link
    :raises ValueError: when no file id can be found in ``url``
    """
    import re  # local import keeps this helper self-contained

    link = str(url)
    # The id is looked up by pattern instead of by position in the path, so the
    # result does not depend on what follows the id in the link.
    found = re.search(r'/d/([\w-]+)', link) or re.search(r'[?&]id=([\w-]+)', link)
    if found is None:
        raise ValueError(f'Cannot extract a Google Drive file id from url: {url!r}')

    file_id = found.group(1)
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data


In [3]:
def get_mappings(
        df: pd.DataFrame,
        user_colname: str,
        item_id_colname: str,
        item_name_colname: str,
) -> Tuple[Dict, Dict, Dict]:
    """
    Placeholder for building lookup dictionaries from ``df``.

    TODO: not implemented yet. The body does no work and the call evaluates to
    None, although the annotation announces a tuple of three dictionaries.

    :df: source DataFrame (currently unused)
    :user_colname: name of the user id column (currently unused)
    :item_id_colname: name of the item id column (currently unused)
    :item_name_colname: name of the item title column (currently unused)
    """
    return None


In [4]:
def compute_popularity(
    df: pd.DataFrame,
    item_id: str,
    max_K: int,
    rating_col: str = 'rating',
    agg: str = 'median'):
    """
    Rank items by an aggregate of their ratings and return the best ``max_K`` ids.

    :df: interactions with at least the ``item_id`` and ``rating_col`` columns
    :item_id: name of the item id column to group by
    :max_K: maximum number of item ids to return
    :rating_col: name of the rating column (default 'rating')
    :agg: aggregation applied to the ratings of each item, given as a pandas
        aggregation name (default 'median')
    :return: numpy array of item ids, best score first; items with equal scores
        keep the order of the grouped index
    """
    # A string alias is used instead of a NumPy callable: passing np.median to
    # groupby aggregation is deprecated in pandas 2.x.
    item_scores = df.groupby(item_id)[rating_col].agg(agg)
    # Many items share the same score, so a stable sort keeps the order of ties
    # well defined.
    best_first = item_scores.sort_values(ascending=False, kind='stable')
    return best_first.head(max_K).index.values

# 2. Main

In [5]:
# Ratings table: downloaded from the Google Drive link set in the configuration cell.
interactions = read_csv_from_gdrive(url=ratings_small_url)
interactions.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [6]:
# Movie metadata table: downloaded from the Google Drive link set in the configuration cell.
movies_metadata = read_csv_from_gdrive(url=movies_metadata_url)
movies_metadata.head(n=3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


## 2.1. Baseline

Let's define our baseline popularity recommender: it recommends the top-rated titles, ranked by their median rating (see `compute_popularity`), either over all users or separately within any user group(s) passed via `groupby`.

In [82]:
class BaselineRecommender:
    """
    Popularity baseline: recommends the items ranked highest by
    ``compute_popularity``, either over all interactions or separately inside
    each user group.

    :max_K: number of top items kept per ranking at fit time
    :user_column: name of the user id column
    :item_column: name of the item id column
    :groupby: column name or list of column names from ``df_users`` that define
        the user groups; None means one ranking for everybody
    :fit_na_as_common: when True, groups whose key equals the string 'nan' in
        any ``groupby`` column receive the overall ranking instead of their own
    """

    def __init__(self, max_K = 100,
                user_column = 'userId',
                item_column = 'movieId',
                groupby = None,
                fit_na_as_common = False):
        self.max_K = max_K
        self.user_column = user_column
        self.item_column = item_column
        self.groupby = groupby
        self.fit_na_as_common = fit_na_as_common
        self.recommendations = []
        self.recomm_common = np.array([], dtype=object)
        self.df_users = None
        self.known_items = {}

    def _group_columns(self):
        """Return ``groupby`` as a list of column names, or None when grouping is off."""
        if self.groupby is None:
            return None
        if isinstance(self.groupby, str):
            return [self.groupby]
        return list(self.groupby)

    def fit(self, data: pd.DataFrame, df_users: pd.DataFrame = None):
        """
        Compute the overall ranking and, when ``groupby`` is set, one ranking per group.

        :data: interactions containing ``user_column``, ``item_column`` and the
            rating column read by ``compute_popularity``
        :df_users: per-user table with ``user_column`` and the ``groupby``
            columns; required only when ``groupby`` is set
        :raises ValueError: when ``groupby`` is set but ``df_users`` is missing
        """
        group_cols = self._group_columns()
        if group_cols is not None and df_users is None:
            # Fail before touching any state, so the model is never left half fitted.
            raise ValueError('No df_users: it is required when groupby is set')

        recomm_common = compute_popularity(data, self.item_column, self.max_K)
        self.recomm_common = recomm_common
        self.df_users = df_users
        self.known_items = data.groupby(self.user_column)[self.item_column].apply(list).to_dict()

        if group_cols is None:
            self.recommendations = recomm_common
            return

        with_groups = data.merge(df_users, on = self.user_column, how='left')
        group_recs = with_groups.groupby(group_cols).apply(compute_popularity,
                                                           self.item_column, self.max_K)

        if self.fit_na_as_common:
            # groups whose key is the string 'nan' in any grouping column
            group_keys = group_recs.index.to_frame(index=False)
            is_na_group = (group_keys == 'nan').any(axis=1).to_numpy()
        else:
            is_na_group = np.zeros(len(group_recs), dtype=bool)

        per_group = []
        for ranking, na_group in zip(group_recs, is_na_group):
            if na_group or np.ndim(ranking) == 0:
                # nothing to recommend for this group (or it is an NA group):
                # use the overall ranking
                per_group.append(recomm_common)
            else:
                # A group ranking may be short (for example only 2 items), so the
                # overall ranking is appended; pd.unique keeps first occurrences
                # only, so no item is listed twice.
                per_group.append(pd.unique(np.concatenate((ranking, recomm_common))))
        self.recommendations = pd.Series(per_group, index=group_recs.index, dtype=object)

    def recommend(self, users = None, N = 10, drop_known = False):
        """
        Return up to ``N`` recommended items.

        :users: Series of user ids. Without ``groupby`` it may be None, in which
            case the overall top ``N`` is returned as a single list. With
            ``groupby`` it is mandatory and must be named like ``user_column``,
            because it is merged with ``df_users``.
        :N: number of items per user
        :drop_known: when True, items the user interacted with in the fit data
            are removed before the list is cut to ``N``
        :return: a list of item ids when ``users`` is None; otherwise one list
            per user, as a Series named 'item_id' indexed like ``users`` when
            ``drop_known`` is True, else as a plain sequence of lists in the
            order of ``users``
        :raises ValueError: when ``groupby`` is set and ``users`` is None, or
            when ``df_users`` holds more than one row for a requested user
        """
        group_cols = self._group_columns()

        if users is None:
            if group_cols is not None:
                raise ValueError('Recommendations based on groupby need users (a Series of user ids)')
            return np.asarray(self.recommendations).tolist()[:N]

        # When known items are dropped afterwards the whole ranking is kept for
        # now; cutting to N first would leave some users with fewer than N items.
        limit = None if drop_known else N

        if group_cols is not None:
            recoms = self.recommendations.rename('recoms').reset_index()
            # attach the user attributes that define the group
            data = users.to_frame().merge(self.df_users, on=self.user_column, how='left')
            # attach the ranking of the matching group
            data = data.merge(recoms, on=group_cols, how='left')
            if len(data) != len(users):
                raise ValueError('df_users must contain at most one row per user')

            recs = np.empty(len(data), dtype=object)
            for pos, ranking in enumerate(data['recoms']):
                # A group that was not seen during fit has no ranking (missing
                # value after the merge): use the overall ranking instead.
                chosen = self.recomm_common if np.ndim(ranking) == 0 else ranking
                recs[pos] = np.asarray(chosen)[:limit].tolist()
        else:
            common = np.asarray(self.recommendations).tolist()[:limit]
            recs = [list(common) for _ in range(len(users))]

        if not drop_known:
            return recs

        filtered = []
        for user, candidates in zip(users, recs):
            # users unseen during fit have no known items; the model state is not modified
            seen = set(self.known_items.get(user, ()))
            filtered.append([item for item in candidates if item not in seen][:N])
        return pd.Series(filtered, index=users.index, name='item_id', dtype=object)

In [8]:
# Bring the movie ids of both tables to strings so they can be compared and merged.
# New frames are bound instead of editing in place, so re-running the cell is safe.
interactions = interactions.assign(movieId=interactions['movieId'].astype(str))
movies_metadata = (
    movies_metadata
    .rename(columns={'id': 'movieId'})
    .assign(movieId=lambda meta: meta['movieId'].astype(str))
)
# NOTE(review): this matches the ratings' movieId directly against the metadata id;
# presumably the two files use the same id space - TODO confirm, since less than
# half of the interactions survive the filter.
interactions_filtered = interactions.loc[
    interactions['movieId'].isin(movies_metadata['movieId'])
].copy()  # independent copy, not a view of `interactions`
print(interactions.shape, interactions_filtered.shape)

(100004, 4) (44989, 4)


In [83]:
# Non-grouped popularity baseline; max_K is the length of the ranking stored at fit time.
baseline_model = BaselineRecommender(
    max_K=100,
    user_column='userId',
    item_column='movieId',
)

In [84]:
# The model was created without `groupby`, so no per-user table is needed:
# `df_users` is only read by the grouped branches of fit/recommend.
baseline_model.fit(interactions_filtered)

In [88]:
# One row per distinct user, in order of first appearance in the ratings table.
recs = (
    interactions[['userId']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [89]:
# Start from the distinct users, so re-running this cell on the already
# exploded `recs` gives the same result.
rec_users = recs[['userId']].drop_duplicates().reset_index(drop=True)
rec_lists = baseline_model.recommend(rec_users['userId'], drop_known = True)

# One title per metadata id, otherwise the merge would multiply recommendation rows.
movie_titles = movies_metadata[['movieId', 'original_title']].drop_duplicates(subset='movieId')

recs = (
    rec_users
    .assign(movieId=rec_lists)   # one list of recommended ids per user
    .explode('movieId')          # one row per (user, recommended id)
    .merge(movie_titles, how = 'left', on = 'movieId')
)

671


In [90]:
# Number of (user, recommended title) rows and number of columns.
n_rows, n_cols = recs.shape
print((n_rows, n_cols))

(6710, 3)


In [91]:
# Smallest number of recommended titles any user received.
recs_per_user = recs.groupby('userId')['movieId'].count()
recs_per_user.min()

10

In [93]:
# Show the complete recommendation lists of two users. The sample is drawn from
# distinct user ids (not from the exploded rows, where both draws can hit the
# same user) and is seeded so the cell output is reproducible.
distinct_users = recs['userId'].drop_duplicates()
sampled_users = distinct_users.sample(min(2, len(distinct_users)), random_state=42)
recs.loc[recs['userId'].isin(sampled_users)]

Unnamed: 0,userId,movieId,original_title
4510,452,31973,Soldier of God
4511,452,876,Frank Herbert's Dune
4512,452,1819,"You, Me and Dupree"
4513,452,1420,Breakfast on Pluto
4514,452,91690,Les neiges du Kilimandjaro
4515,452,36931,Hak bak do
4516,452,91673,Волк и телёнок
4517,452,845,Strangers on a Train
4518,452,1428,Once Upon a Time in Mexico
4519,452,31903,History Is Made at Night
