In [2]:
import pandas as pd
df = pd.read_csv("demographic.csv")
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,Toy Story,Animation; Comedy; Family,81.0,7.7,5415.0,1995,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,Jumanji,Adventure; Fantasy; Family,104.0,6.9,2413.0,1995,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Grumpier Old Men,Romance; Comedy,101.0,6.5,92.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,Waiting to Exhale,Comedy; Drama; Romance,127.0,6.1,34.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,Father of the Bride Part II,Comedy,106.0,5.7,173.0,1995,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# step 1 : filtering

In [6]:
genre = ['Animation', 'Comedy']
duration = (60, 150)
year = (2000, 2019)
topk = 10

In [7]:
df = df[df.release_year.between(year[0], year[1]) &
        df.runtime.between(duration[0], duration[1]) &
        df[genre].all(axis=1)]
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
3354,The Road to El Dorado,Adventure; Animation; Comedy; Family,89.0,7.0,892.0,2000,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3619,Chicken Run,Animation; Comedy; Family,84.0,6.5,1190.0,2000,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3622,The Adventures of Rocky & Bullwinkle,Action; Adventure; Animation; Comedy; Family,88.0,3.9,89.0,2000,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3855,Rugrats in Paris: The Movie,Adventure; Animation; Comedy; Family,78.0,6.0,101.0,2000,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3881,The Emperor's New Groove,Adventure; Animation; Comedy; Family; Fantasy,78.0,7.2,1544.0,2000,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


# step 2 : scoring

menggunakan vote_average

# step 3 : sorting

In [11]:
recommendation = df.loc[:, 'title':'release_year']
recommendation = recommendation.sort_values('vote_average', ascending=False).head(topk)
recommendation

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
25671,Rocks in my Pockets,Comedy; Animation; Drama,88.0,9.4,5.0,2014
26636,Lotte from Gadgetville,Adventure; Animation; Comedy; Family,81.0,9.0,4.0,2006
43884,Revengeance,Comedy; Action; Animation,71.0,8.0,2.0,2017
28655,Cardcaptor Sakura: The Sealed Card,Comedy; Animation; Adventure; Fantasy; Romance,80.0,8.0,24.0,2000
30208,Inside Out,Drama; Comedy; Animation; Family,94.0,7.9,6737.0,2015
15433,South Park: Imaginationland,Animation; Comedy,67.0,7.9,75.0,2008
13710,Up,Animation; Comedy; Family; Adventure,96.0,7.8,7048.0,2009
14294,Mary and Max,Animation; Comedy; Drama,92.0,7.8,596.0,2009
20176,It's Such a Beautiful Day,Fantasy; Animation; Comedy; Drama,62.0,7.8,94.0,2012
24383,Big Hero 6,Adventure; Family; Animation; Action; Comedy,102.0,7.8,6289.0,2014


# IMDB weighted rating

In [14]:
def imdb_score(df, q=0.9):
    df = df.copy()

    m = df.vote_count.quantile(q)
    C = (df.vote_average * df.vote_count).sum() / df.vote_count.sum()

    df = df[df.vote_count >= m]
    df['score'] = df.apply(lambda x: (x.vote_average * x.vote_count + C*m) / (x.vote_count + m), axis=1)
    return df

In [15]:
df = imdb_score(df)

In [18]:
recommendation = df.loc[:, 'title':'release_year']
recommendation = recommendation.sort_values('vote_average', ascending=False).head(topk)
recommendation

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
30208,Inside Out,Drama; Comedy; Animation; Family,94.0,7.9,6737.0,2015
13710,Up,Animation; Comedy; Family; Adventure,96.0,7.8,7048.0,2009
24383,Big Hero 6,Adventure; Family; Animation; Action; Comedy,102.0,7.8,6289.0,2014
36082,Zootopia,Animation; Adventure; Family; Comedy,108.0,7.7,4961.0,2016
23489,How to Train Your Dragon 2,Fantasy; Action; Adventure; Animation; Comedy;...,102.0,7.6,3163.0,2014
15328,Toy Story 3,Animation; Family; Comedy,103.0,7.6,4710.0,2010
11556,Ratatouille,Animation; Comedy; Family; Fantasy,111.0,7.5,4510.0,2007
4746,"Monsters, Inc.",Animation; Comedy; Family,92.0,7.5,6150.0,2001
22656,The Lego Movie,Adventure; Animation; Comedy; Family; Fantasy,100.0,7.5,3127.0,2014
4168,Shrek,Adventure; Animation; Comedy; Family; Fantasy,90.0,7.3,4183.0,2001


# ML engineering

In [20]:
class RecommenderSystem:
    def __init__(self, data):
        self.df = pd.read_csv(data)
    
    def recommend(self, genre=None, duration=None, year=None, topk=10):
        df = self.df.copy()
        df = self.demographic_filter(df, genre=genre, duration=duration, year=year)
        df = self.compute_imdb_score(df)
        result = df.loc[:, 'title':'release_year']
        result = result.sort_values('vote_average', ascending=False).head(topk)
        return result

    @staticmethod
    def demographic_filter(df, genre=None, duration=None, year=None):
        df = df.copy()

        if genre is not None:
            df = df[df[genre].all(axis=1)]
        if duration is not None:
            df = df[df.runtime.between(duration[0], duration[1])]
        if year is not None:
            df = df[df.release_year.between(year[0], year[1])]
        return df

    @staticmethod
    def compute_imdb_score(df, q=0.9):
        df = df.copy()

        m = df.vote_count.quantile(q)
        C = (df.vote_average * df.vote_count).sum() / df.vote_count.sum()

        df = df[df.vote_count >= m]
        df['score'] = df.apply(lambda x: (x.vote_average * x.vote_count + C*m) / (x.vote_count + m), axis=1)
        return df

In [21]:
recsys = RecommenderSystem(data="demographic.csv")

In [22]:
recsys.recommend(genre=['Animation', 'Family'], duration=(60,150), year=(2015,2020))

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
30208,Inside Out,Drama; Comedy; Animation; Family,94.0,7.9,6737.0,2015
36082,Zootopia,Animation; Adventure; Family; Comedy,108.0,7.7,4961.0,2016
41203,Moana,Adventure; Animation; Family,107.0,7.3,3471.0,2016
37969,Finding Dory,Adventure; Animation; Comedy; Family,97.0,6.8,4333.0,2016
41433,Sing,Animation; Comedy; Drama; Family; Music,108.0,6.8,2363.0,2016
30588,Minions,Family; Animation; Adventure; Comedy,91.0,6.4,4729.0,2015
30388,The Secret Life of Pets,Animation; Family,87.0,5.9,3536.0,2016
