In [1]:
import pandas as pd
import numpy as np

import graphlab

<br>
<br>
<br>
<hr style='background-color: #fff; border-top: 2px dashed #8c8b8b;'>
# Load Data

In [2]:
df_users = pd.read_csv('data/users.dat', sep='::')
df_users.columns = ['user_id', 'gender', 'age', 'occupation', 'zip']

df_movies = pd.read_csv('data/movies.dat', sep='::')
df_movies.columns = ['movie_id', 'title', 'genre']

df_ratings = pd.read_csv('data/training_ratings.csv')
df_submission = pd.read_csv('data/sample_submission.csv')

  if __name__ == '__main__':


<br>
<br>
<br>
<hr style='background-color: #fff; border-top: 2px dashed #8c8b8b;'>
# Users DataFrame

In [3]:
df_users['occupation'] = df_users['occupation'].astype(str)
df_users['gender'] = df_users['gender'].astype(bool)
df_users.rename(columns={'user_id': 'user'}, inplace=True)

In [4]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6039 entries, 0 to 6038
Data columns (total 5 columns):
user          6039 non-null int64
gender        6039 non-null bool
age           6039 non-null int64
occupation    6039 non-null object
zip           6039 non-null object
dtypes: bool(1), int64(2), object(2)
memory usage: 241.8+ KB


# Movies DataFrame

In [5]:
genres = set()
for m in df_movies.genre:
   genres.update(g for g in m.split('|'))

genres = sorted(genres)

#make a column for each genre
for genre in genres:
   df_movies[genre] = [genre in movie.split('|') for movie in df_movies.genre]

In [6]:
df_movies.rename(columns={'movie_id': 'movie'}, inplace=True)
df_movies['year'] = df_movies['title'].str[-5:-1].astype(int)
df_movies['title'] = df_movies['title'].str[:-7]

In [7]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3882 entries, 0 to 3881
Data columns (total 22 columns):
movie          3882 non-null int64
title          3882 non-null object
genre          3882 non-null object
Action         3882 non-null bool
Adventure      3882 non-null bool
Animation      3882 non-null bool
Children's     3882 non-null bool
Comedy         3882 non-null bool
Crime          3882 non-null bool
Documentary    3882 non-null bool
Drama          3882 non-null bool
Fantasy        3882 non-null bool
Film-Noir      3882 non-null bool
Horror         3882 non-null bool
Musical        3882 non-null bool
Mystery        3882 non-null bool
Romance        3882 non-null bool
Sci-Fi         3882 non-null bool
Thriller       3882 non-null bool
War            3882 non-null bool
Western        3882 non-null bool
year           3882 non-null int64
dtypes: bool(18), int64(2), object(2)
memory usage: 219.9+ KB


# Ratings

In [8]:
df_ratings['rating'] = df_ratings['rating'].astype(float)

In [9]:
print df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500100 entries, 0 to 500099
Data columns (total 4 columns):
user      500100 non-null int64
movie     500100 non-null int64
rating    500100 non-null float64
id        500100 non-null object
dtypes: float64(1), int64(2), object(1)
memory usage: 19.1+ MB
None


<br>
<br>
<br>
<hr style='background-color: #fff; border-top: 2px dashed #8c8b8b;'>
# Train-Test Split

In [None]:
gdf_ratings = graphlab.SFrame(df_ratings.drop(['id'], axis=1))
gdf_ratings['rating'] = gdf_ratings['rating'].astype(float)

gdf_train_ratings, gdf_test_ratings = gdf_ratings.random_split(0.8)

gdf_users   = graphlab.SFrame(df_users.drop(['zip'], axis=1))
gdf_movies  = graphlab.SFrame(df_movies.drop(['genre', 'title'], axis=1))


In [None]:
recommender = graphlab.recommender.item_similarity_recommender.create(gdf_train_ratings,
                                                                      user_id='user',
                                                                      item_id='movie',
                                                                      target='rating',
                                                                      user_data=gdf_users,
                                                                      item_data=gdf_movies,
                                                                      verbose=False)

gdf_combined = gdf_train_ratings.append(gdf_test_ratings)
movie_movie = gdf_combined.copy()
movie_movie['rating'] = recommender.predict(gdf_combined)

In [None]:
recommender = graphlab.recommender.item_similarity_recommender.create(gdf_train_ratings,
                                                                      user_id='movie',
                                                                      item_id='user',
                                                                      target='rating',
                                                                      user_data=gdf_movies,
                                                                      item_data=gdf_users,
                                                                      verbose=False)


gdf_combined = gdf_train_ratings.append(gdf_test_ratings)
user_user = gdf_combined.copy()
user_user['rating'] = recommender.predict(gdf_combined)

In [None]:
recommender = graphlab.recommender.factorization_recommender.create(gdf_train_ratings,
                                                                    user_id='user',
                                                                    item_id='movie',
                                                                    target='rating',
#                                                                     user_data=gdf_users,
#                                                                     item_data=gdf_movies,
                                                                    user_data=user_user,
                                                                    item_data=movie_movie,
                                                                    side_data_factorization=True,
                                                                    max_iterations=100,
#                                                                     sgd_step_size=5,
#                                                                     num_factors = 30,
                                                                    solver='auto',
                                                                    verbose=False)

In [None]:
print 'Training RMSE:', recommender.training_rmse

gdf_predict = recommender.predict(gdf_test_ratings)
print 'Testing RMSE:', np.sqrt(np.sum((gdf_test_ratings['rating'].to_numpy() - gdf_predict.to_numpy())**2)/len(gdf_test_ratings))

<br>
<br>
<br>
<hr style='background-color: #fff; border-top: 2px dashed #8c8b8b;'>

# Moment of Truth... (dum dum dum)

In [37]:
gdf_ratings = graphlab.SFrame(df_ratings.drop(['id'], axis=1))

gdf_users   = graphlab.SFrame(df_users.drop(['zip'], axis=1))
gdf_movies  = graphlab.SFrame(df_movies.drop(['genre', 'title'], axis=1))

In [38]:
df_submission['movie'] = df_submission.id.str.split('_', expand=True)[1].astype(int)
df_submission['rating'] = df_submission['rating'].astype(float)

gdf_submission = graphlab.SFrame(df_submission.drop('id', axis=1))


In [108]:
recommender = graphlab.recommender.popularity_recommender.create(gdf_ratings,
                                                                  user_id='user',
                                                                  item_id='movie',
                                                                  target='rating',
#                                                                   user_data=gdf_users,
#                                                                   item_data=gdf_movies,
                                                                  verbose=False)
popularity = recommender.predict(gdf_submission)

PROGRESS: Recsys training: model = popularity


In [90]:
recommender = graphlab.recommender.item_similarity_recommender.create(gdf_ratings, 
                                                                      user_id='user',
                                                                      item_id='movie',
                                                                      target='rating',
                                                                      user_data=gdf_users,
                                                                      item_data=gdf_movies,
                                                                      verbose=False)
movie_movie = recommender.predict(gdf_submission)

PROGRESS: Recsys training: model = item_similarity
PROGRESS: Finished prediction in 0.988146s


In [91]:
recommender = graphlab.recommender.item_similarity_recommender.create(gdf_ratings, 
                                                                      user_id='movie',
                                                                      item_id='user',
                                                                      target='rating',
                                                                      user_data=gdf_movies,
                                                                      item_data=gdf_users,
                                                                      verbose=False)
user_user = recommender.predict(gdf_submission)

PROGRESS: Recsys training: model = item_similarity
PROGRESS: Finished prediction in 0.390101s


In [105]:
recommender = graphlab.recommender.factorization_recommender.create(gdf_ratings,
                                                                    user_id='user',
                                                                    item_id='movie',
                                                                    target='rating',
#                                                                     user_data=gdf_users,
#                                                                     item_data=gdf_movies,
                                                                    side_data_factorization=True,
                                                                    max_iterations=200,
                                                                    solver='auto',
                                                                    verbose=False,
                                                                     )
factorization = recommender.predict(gdf_submission)

PROGRESS: Recsys training: model = factorization_recommender


In [109]:
df_submission['rating'] = popularity #(popularity + factorization) / 2
# df_submission['rating'] = recommender.predict(gdf_submission).to_numpy()
df_submission.drop('movie', axis=1).to_csv('data/pred_ratings.csv', index=False)

In [None]:
# python code/slack_poster.py data/pred_ratings.csv

<br>
<br>
<br>
<hr style='background-color: #fff; border-top: 2px dashed #8c8b8b;'>

# Quick Evaluation

In [110]:
g = df_submission.groupby('user')

top_5 = g.rating.transform(
    lambda x: x >= x.quantile(.95)
)

test=pd.read_csv('data/dont_use.csv')
test.rating.name='test_rating'
test.rating[top_5==1].mean()

4.3145297224407235