In [36]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory,
# one path per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
def str_to_set(s):
    """Parse a stringified list of dicts and return the set of their 'name' values.

    Parameters
    ----------
    s : str or None or float
        Text holding a Python literal list of dicts, each with a 'name' key,
        e.g. "[{'id': 16, 'name': 'Animation'}]". ``None``, NaN and blank
        strings are treated as "no items".

    Returns
    -------
    set
        The 'name' value of every dict in the parsed list.

    Raises
    ------
    ValueError, SyntaxError
        If ``s`` is text that is not a valid Python literal.
    """
    import ast  # local import so this cell does not depend on another cell's imports

    # Missing cells show up as None/NaN rather than text; `s != s` is the
    # dependency-free NaN check.
    if s is None or (isinstance(s, float) and s != s):
        return set()
    if isinstance(s, str) and not s.strip():
        return set()

    # literal_eval only accepts Python literals, so a crafted cell in the CSV
    # cannot run arbitrary code the way eval() would.
    return {item['name'] for item in ast.literal_eval(s)}

# Load the movie metadata, keep only the four columns used below, and give
# them the shorter names the rest of the notebook refers to.
meta = (
    pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv')
    [['id', 'original_title', 'original_language', 'genres']]
    .rename(columns={'id' : 'movieId', 'original_title' : 'title',
                     'original_language' : 'language'})
)

# Keep only the rows whose language is 'en'. The explicit .copy() makes the
# filtered frame independent of its parent, so the column assignments below
# do not trigger pandas' SettingWithCopyWarning.
meta = meta.loc[meta['language'] == 'en', :].copy()
meta['movieId'] = pd.to_numeric(meta['movieId'])
meta['genres'] = meta['genres'].apply(str_to_set)

meta


In [37]:
# Load the keyword table, rename its key column to `movieId`, make that column
# numeric, and parse each stringified keyword list into a set of keyword names.
keywords = (
    pd.read_csv('/kaggle/input/the-movies-dataset/keywords.csv')
    .rename(columns={'id' : 'movieId'})
    .assign(
        movieId=lambda frame: pd.to_numeric(frame['movieId']),
        keywords=lambda frame: frame['keywords'].apply(str_to_set),
    )
)

keywords
    

In [38]:
# Inner join on movieId: only movies present in both `meta` and `keywords` remain.
merged = meta.merge(keywords, how='inner', on='movieId')

merged

In [40]:
# First matching row of `merged` for each of the three reference titles.
dk = merged[merged['title'] == 'The Dark Knight'].iloc[0]
dkr = merged[merged['title'] == 'The Dark Knight Rises'].iloc[0]
toy = merged[merged['title'] == 'Toy Story'].iloc[0]

# Each movie's tag set is the union of its genre names and keyword names.
dk_set = dk['genres'] | dk['keywords']
dkr_set = dkr['genres'] | dkr['keywords']
toy_set = toy['genres'] | toy['keywords']

def jaccard_similarity(set1, set2):
    """Return |set1 & set2| / |set1 | set2|, or 0 when both sets are empty.

    Parameters
    ----------
    set1, set2 : set
        The two sets to compare.

    Returns
    -------
    float or int
        The intersection size divided by the union size; the integer 0 when
        the union is empty (avoids dividing by zero).
    """
    union_size = len(set1 | set2)
    if union_size == 0:
        return 0

    shared_size = len(set1 & set2)
    return shared_size / union_size

# A notebook cell only renders its last expression, so both scores are
# returned together; as two separate statements the first was never shown.
{
    'The Dark Knight vs The Dark Knight Rises': jaccard_similarity(dk_set, dkr_set),
    'The Dark Knight vs Toy Story': jaccard_similarity(dk_set, toy_set),
}

In [41]:
# Load the user ratings, make movieId numeric, and attach each movie's title
# through an inner join with `meta` (ratings without a matching row in `meta`
# are dropped).
ratings = (
    pd.read_csv('/kaggle/input/the-movies-dataset/ratings_small.csv')
    .assign(movieId=lambda frame: pd.to_numeric(frame['movieId']))
    .loc[:, ['userId', 'movieId', 'rating']]
    .merge(meta.loc[:, ['movieId', 'title']], how='inner', on='movieId')
)

ratings

In [43]:
# One row per userId, one column per title, cells hold the rating value.
matrix = pd.pivot_table(ratings, values='rating', index='userId', columns='title')

def pearson_similarity(a, b):
    """Return the normalized dot product of the mean-centered inputs.

    Each input is centered on its own mean; the result is the sum of the
    element-wise products divided by the product of the two centered norms.

    Parameters
    ----------
    a, b : array-like supporting ``.mean()``, ``.sum()`` and arithmetic
        The notebook passes columns of the user-by-title rating matrix.

    Returns
    -------
    float or int
        The similarity value, or the integer 0 when the denominator is zero.

    Notes
    -----
    With pandas Series the default ``mean``/``sum`` skip NaN. The numerator
    therefore only covers positions where both inputs have a value, while
    each norm covers every non-missing value of its own input.
    """
    centered_a = a - a.mean()
    centered_b = b - b.mean()

    norm_a = (centered_a ** 2).sum() ** 0.5
    norm_b = (centered_b ** 2).sum() ** 0.5
    denom = norm_a * norm_b

    # Guard against dividing by zero (e.g. an input with no variation).
    if denom == 0:
        return 0

    return (centered_a * centered_b).sum() / denom
    

# Rating columns for the two titles being compared.
# NOTE: `dk` is rebound here to a ratings column; the Jaccard cell used the
# same name for a metadata row of `merged`.
pn, dk = matrix['Prom Night'], matrix['The Dark Knight']

pearson_similarity(pn, dk)


In [44]:
def find_similar_movies(input_title, matrix, merged, alpha = 0.5, n = 10):
    """Rank the other titles in ``matrix`` by a blend of tag and rating similarity.

    For every column of ``matrix`` other than ``input_title`` the score is
    ``alpha * jaccard + (1 - alpha) * pearson``, where ``jaccard`` compares
    the genre/keyword sets of the two titles and ``pearson`` compares their
    rating columns.

    Parameters
    ----------
    input_title : str
        Title to find neighbours for; must be a column of ``matrix``.
    matrix : pandas.DataFrame
        Rating table with one column per title.
    merged : pandas.DataFrame
        Table with 'title', 'genres' and 'keywords' columns, where the last
        two hold sets. When a title occurs more than once, its first row is used.
    alpha : float, default 0.5
        Weight of the Jaccard term; the Pearson term gets ``1 - alpha``.
    n : int, default 10
        Number of top-scoring rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``n`` rows with columns 'title', 'jaccard', 'pearson', 'score',
        sorted by 'score' in descending order.

    Raises
    ------
    KeyError
        If ``input_title`` is not a column of ``matrix``.
    """
    input_vector = matrix[input_title]

    # Build the title -> tag-set lookup once instead of scanning `merged` with
    # a boolean mask for every candidate title. keep='first' mirrors taking
    # the first matching row.
    first_rows = merged.drop_duplicates(subset='title', keep='first')
    tags_by_title = {
        title: genres | kws
        for title, genres, kws in zip(
            first_rows['title'], first_rows['genres'], first_rows['keywords']
        )
    }

    # A title can be in `matrix` without having a row in `merged`; it then
    # gets an empty tag set (Jaccard 0) instead of raising IndexError.
    no_tags = set()
    input_set = tags_by_title.get(input_title, no_tags)

    results = []

    for this_title in matrix.columns:
        if this_title == input_title:
            continue

        this_set = tags_by_title.get(this_title, no_tags)

        jaccard = jaccard_similarity(input_set, this_set)
        pearson = pearson_similarity(input_vector, matrix[this_title])

        score = alpha * jaccard + (1 - alpha) * pearson

        results.append((this_title, jaccard, pearson, score))

    # Highest blended score first; the sort is stable, so ties keep column order.
    results.sort(key = lambda item: item[3], reverse = True)
    return pd.DataFrame(results[:n], columns=['title', 'jaccard', 'pearson', 'score'])
    

In [45]:
# Top 10 neighbours of The Dark Knight, weighting tags 0.3 and ratings 0.7.
find_similar_movies('The Dark Knight', matrix, merged, alpha=0.3, n=10)