In [11]:
import pandas as pd  
import numpy as np  
movies_df = pd.read_table('ml-20m/movies.csv', sep=',')

movies_df.head() 

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [13]:
# we convert the movie genres to a set of dummy variables 
movies_df = pd.concat([movies_df, movies_df['genres'].str.get_dummies(sep='|')], axis=1)  
movies_df.head() 

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
movie_categories = movies_df.columns[4:]  
movie_categories

Index([u'Action', u'Adventure', u'Animation', u'Children', u'Comedy', u'Crime',
       u'Documentary', u'Drama', u'Fantasy', u'Film-Noir', u'Horror', u'IMAX',
       u'Musical', u'Mystery', u'Romance', u'Sci-Fi', u'Thriller', u'War',
       u'Western'],
      dtype='object')

In [42]:
#create a user references for each genre 
from collections import OrderedDict


user_preferences = OrderedDict(zip(movie_categories, []))

user_preferences['IMAX'] = 3
user_preferences['Action'] = 5  
user_preferences['Adventure'] = 5  
user_preferences['Animation'] = 1  
user_preferences["Children"] = 1  
user_preferences["Comedy"] = 3  
user_preferences['Crime'] = 2  
user_preferences['Documentary'] = 1  
user_preferences['Drama'] = 1  
user_preferences['Fantasy'] = 5  
user_preferences['Film-Noir'] = 1  
user_preferences['Horror'] = 2  
user_preferences['Musical'] = 1  
user_preferences['Mystery'] = 3  
user_preferences['Romance'] = 1  
user_preferences['Sci-Fi'] = 5  
user_preferences['War'] = 3  
user_preferences['Thriller'] = 2  
user_preferences['Western'] =1  

# Content filtering


In [43]:

def dot_product(vector_1, vector_2):  
    return np.dot(vector_1,vector_2)

def get_movie_score(movie_features, user_preferences):  
    return dot_product(movie_features, user_preferences)

In [44]:
#compute the score for "Toy Story"
toy_story_features = movies_df.loc[0][movie_categories]  
toy_story_features

Action         0
Adventure      1
Animation      1
Children       1
Comedy         1
Crime          0
Documentary    0
Drama          0
Fantasy        1
Film-Noir      0
Horror         0
IMAX           0
Musical        0
Mystery        0
Romance        0
Sci-Fi         0
Thriller       0
War            0
Western        0
Name: 0, dtype: object

In [45]:
toy_story_user_predicted_score = dot_product(toy_story_features, user_preferences.values())  
toy_story_user_predicted_score 

13

In [49]:
def get_movie_recommendations(user_preferences, n_recommendations):  
    #we add a column to the movies_df dataset with the calculated score for each movie for the given user
    movies_df['score'] = movies_df[movie_categories].apply(get_movie_score, 
                                                           args=([user_preferences.values()]), axis=1)
    return movies_df.sort_values(by=['score'], ascending=False)['title'][:n_recommendations]

get_movie_recommendations(user_preferences, 10) 

8996     Patlabor: The Movie (Kidô keisatsu patorebâ: T...
16024                                        Rubber (2010)
11794    Aqua Teen Hunger Force Colon Movie Film for Th...
4922                                       Motorama (1991)
16851                               Mars Needs Moms (2011)
24371                                Whip Hand, The (1951)
22776                                  Cowboy Bebop (1998)
10955                                   Renaissance (2006)
19144    Dragon Ball Z the Movie: The World's Strongest...
18388    Professor Layton and the Eternal Diva (Eiga Re...
Name: title, dtype: object

# Collaborative Filtering

get recommendationg based on other users with similar taste

In [71]:
ratings_df = pd.read_csv('ml-20m/ratings.csv', names=['user_id', 'movieId', 'rating', 'timestamp'])

#we dont care about the time the rating was given
del ratings_df['timestamp']

#replace movie_id with movie_title for legibility
ratings_df = pd.merge(ratings_df, movies_df, on='movieId')[['user_id', 'title', 'movieId','rating']]

ratings_df.head()

MemoryError: 

In [70]:

ratings_df = ratings_df.iloc[:5000,:]
print ratings_df.shape
ratings_df.head()

(1000, 4)


Unnamed: 0,user_id,title,movieId,rating
0,893,Stranger than Fiction (2006),46976,5.0
1,902,Stranger than Fiction (2006),46976,4.5
2,910,Stranger than Fiction (2006),46976,3.5
3,948,Stranger than Fiction (2006),46976,5.0
4,975,Stranger than Fiction (2006),46976,4.0


In [67]:
ratings_df['rating'] = ratings_df['rating'].apply(lambda x: float(x))
ratings_mtx_df = ratings_df.pivot_table(values='rating', index='user_id', columns='title')  
ratings_mtx_df.fillna(0, inplace=True)

movie_index = ratings_mtx_df.columns

ratings_mtx_df.head()

title,Stranger than Fiction (2006)
user_id,Unnamed: 1_level_1
893,5.0
902,4.5
910,3.5
948,5.0
975,4.0


In [68]:
ratings_mtx_df.shape

(1000, 1)