In [1]:
import csv
import pandas as pd
import os
import numpy as np

In [2]:
data_path = './movie_len_small/'
movie_path = os.path.join(data_path, "movies.csv")
rating_path = os.path.join(data_path, "ratings.csv")

movie_data = pd.read_csv(movie_path)
rating_data = pd.read_csv(rating_path)

In [3]:
movie_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
rating_data = rating_data[['userId', 'movieId', 'rating']]
rating_data.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
rating_data.describe()

Unnamed: 0,userId,movieId,rating
count,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557
std,182.618491,35530.987199,1.042529
min,1.0,1.0,0.5
25%,177.0,1199.0,3.0
50%,325.0,2991.0,3.5
75%,477.0,8122.0,4.0
max,610.0,193609.0,5.0


In [6]:
def parse_genre(genre_str):
    genre_list = genre_str.split('|')
    
    return genre_list

movie_data['genres'] = movie_data['genres'].apply(parse_genre)

In [7]:
movie_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [8]:
data = pd.merge(rating_data, movie_data, on="movieId", how="inner")
data.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,5,1,4.0,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
2,7,1,4.5,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
3,15,1,2.5,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
4,17,1,4.5,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"


In [9]:
matrix = data.pivot_table(index = "userId", columns = "title", values = "rating")
matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [10]:
def pearsonR(s1, s2):
    s1_c = s1 - np.nanmean(s1)
    s2_c = s2 - np.nanmean(s2)
    
    s1_c[np.isnan(s1_c)] = 0
    s2_c[np.isnan(s2_c)] = 0
    
    cor =  np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))

    return cor

In [13]:
def recommend(input_movie, matrix, n, similar_genre = True):
    input_genre = movie_data.loc[movie_data['title'] == input_movie, 'genres'].iloc[0]
    
    print("Title : {}".format(input_movie))
    print("Genre : {}".format(input_genre))
    
    result = []
    for title in matrix.columns:
        if title == input_movie:
            continue
        # print("Searching ", title)
        cor = pearsonR(np.array(matrix[input_movie]), np.array(matrix[title]))
        
        if similar_genre and len(input_genre) > 0:
            temp_genre = movie_data.loc[movie_data['title'] == title, 'genres'].iloc[0]
            # print(temp_genre)
            same_count = np.sum(np.isin(input_genre, temp_genre))
            cor += 0.1 * same_count
        
        if np.isnan(cor):
            continue
        else:
            result.append((title, '{:.2f}'.format(cor), temp_genre))

    result.sort(key = lambda r:r[1], reverse=True)
        
    return result[:n]        

In [15]:
recommend_result = recommend('Aladdin (1992)', matrix, 10, similar_genre=True)

pd.DataFrame(recommend_result, columns=['Title', 'Correlation', 'Genre'])

Title : Aladdin (1992)
Genre : ['Adventure', 'Animation', 'Children', 'Comedy', 'Musical']


  cor =  np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))


Unnamed: 0,Title,Correlation,Genre
0,"Lion King, The (1994)",0.76,"[Adventure, Animation, Children, Drama, Musica..."
1,Toy Story (1995),0.72,"[Adventure, Animation, Children, Comedy, Fantasy]"
2,Mulan (1998),0.71,"[Adventure, Animation, Children, Comedy, Drama..."
3,Beauty and the Beast (1991),0.7,"[Animation, Children, Fantasy, Musical, Romanc..."
4,Toy Story 2 (1999),0.65,"[Adventure, Animation, Children, Comedy, Fantasy]"
5,"Bug's Life, A (1998)",0.64,"[Adventure, Animation, Children, Comedy]"
6,Hercules (1997),0.62,"[Adventure, Animation, Children, Comedy, Musical]"
7,Finding Nemo (2003),0.61,"[Adventure, Animation, Children, Comedy]"
8,Enchanted (2007),0.59,"[Adventure, Animation, Children, Comedy, Fanta..."
9,Robin Hood (1973),0.59,"[Adventure, Animation, Children, Comedy, Musical]"
