In [29]:
import pandas as pd
import numpy as np

import graphlab
from graphlab import grid_search, SFrame, cross_validation, random_search
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 36) #AWS 36 cpu cores

## Load data

In [2]:
df_users = pd.read_csv('data/users.dat', sep='::', engine='python')
df_users.columns = ['user_id', 'gender', 'age', 'occupation', 'zip']

df_movies = pd.read_csv('data/movies.dat', sep='::', engine='python')
df_movies.columns = ['movie_id', 'title', 'genre']

df_ratings = pd.read_csv('data/training_ratings.csv')
df_test = pd.read_csv('data/sample_submission.csv')

In [3]:
df_test['movie'] = df_test.id.str.split('_', expand=True)[1]

## Clean data

In [4]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6039 entries, 0 to 6038
Data columns (total 5 columns):
user_id       6039 non-null int64
gender        6039 non-null object
age           6039 non-null int64
occupation    6039 non-null int64
zip           6039 non-null object
dtypes: int64(3), object(2)
memory usage: 283.1+ KB


In [5]:
df_users.loc[df_users['gender'] == "M", "gender"] = 1
df_users.loc[df_users['gender'] == "F", "gender"] = 0

In [6]:
df_users['occupation'] = df_users['occupation'].astype(str)
df_users['gender'] = df_users['gender'].astype(bool)

In [7]:
# occupations = pd.Series({ \
#                         0: "other",
#                         1:  "academic/educator",
#                         2:  "artist",
#                         3:  "clerical/admin",
#                         4:  "college/grad student",
#                         5:  "customer service",
#                         6:  "doctor/health care",
#                         7:  "executive/managerial",
#                         8:  "farmer",
#                         9:  "homemaker",
#                         10:  "K-12 student",
#                         11:  "lawyer",
#                         12:  "programmer",
#                         13:  "retired",
#                         14:  "sales/marketing",
#                         15:  "scientist",
#                         16:  "self-employed",
#                         17:  "technician/engineer",
#                         18:  "tradesman/craftsman",
#                         19:  "unemployed",
#                         20:  "writer",
#                         })

In [8]:
df_users.rename(columns={'user_id': 'user'}, inplace=True)
# df_users['occupation'] = occupations[df_users.occupation].values

In [9]:
df_users.head()

Unnamed: 0,user,gender,age,occupation,zip
0,2,True,56,16,70072
1,3,True,25,15,55117
2,4,True,45,7,2460
3,5,True,25,20,55455
4,6,False,50,9,55117


In [10]:
genres = set()
for m in df_movies.genre:
   genres.update(g for g in m.split('|'))

genres = sorted(genres)

#make a column for each genre
for genre in genres:
   df_movies[genre] = [genre in movie.split('|') for movie in df_movies.genre]

In [11]:
df_movies.rename(columns={'movie_id': 'movie'}, inplace=True)
df_movies['year'] = df_movies['title'].str[-5:-1].astype(int)
df_movies['title'] = df_movies['title'].str[:-7]

In [12]:
df_movies.head()

Unnamed: 0,movie,title,genre,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,2,Jumanji,Adventure|Children's|Fantasy,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,1995
1,3,Grumpier Old Men,Comedy|Romance,False,False,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,1995
2,4,Waiting to Exhale,Comedy|Drama,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,1995
3,5,Father of the Bride Part II,Comedy,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,1995
4,6,Heat,Action|Crime|Thriller,True,False,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,1995


In [13]:
df_ratings['rating'] = df_ratings['rating'].astype(float)

In [14]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500100 entries, 0 to 500099
Data columns (total 4 columns):
user      500100 non-null int64
movie     500100 non-null int64
rating    500100 non-null float64
id        500100 non-null object
dtypes: float64(1), int64(2), object(1)
memory usage: 19.1+ MB


## Train-test split

In [41]:
gdf_ratings = graphlab.SFrame(df_ratings.drop(['id'], axis=1))
gdf_users   = graphlab.SFrame(df_users.drop(['zip'], axis=1))
gdf_movies  = graphlab.SFrame(df_movies.drop(['genre', 'title'], axis=1))

In [16]:
training, validation = graphlab.recommender.util.random_split_by_user(gdf_ratings,
                                                                       user_id="user", 
                                                                       item_id="movie",
                                                                       item_test_proportion=0.3) 

In [17]:
gdf_movies.head()

movie,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror
2,0,1,0,1,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,0
5,0,0,0,0,1,0,0,0,0,0,0
6,1,0,0,0,0,1,0,0,0,0,0
7,0,0,0,0,1,0,0,0,0,0,0
8,0,1,0,1,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,0
10,1,1,0,0,0,0,0,0,0,0,0
11,0,0,0,0,1,0,0,1,0,0,0

Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,0,0,0,0,0,0,1995
0,0,1,0,0,0,0,1995
0,0,0,0,0,0,0,1995
0,0,0,0,0,0,0,1995
0,0,0,0,1,0,0,1995
0,0,1,0,0,0,0,1995
0,0,0,0,0,0,0,1995
0,0,0,0,0,0,0,1995
0,0,0,0,1,0,0,1995
0,0,1,0,0,0,0,1995


In [18]:
gdf_users.head()

user,gender,age,occupation
2,1,56,16
3,1,25,15
4,1,45,7
5,1,25,20
6,0,50,9
7,1,35,1
8,1,25,12
9,1,25,17
10,0,35,1
11,0,25,1


In [19]:
gdf_ratings.head()

user,movie,rating
2783,1253,5.0
2783,589,5.0
2783,1270,4.0
2783,1274,4.0
2783,741,5.0
2783,750,5.0
2783,924,5.0
2783,2407,4.0
2783,3070,3.0
2783,208,1.0


## Recommender Model

In [20]:
recommender = graphlab.recommender.factorization_recommender.create(training,
                                                                    user_id='user',
                                                                    item_id='movie',
                                                                    target='rating',
                                                                    user_data=gdf_users,
                                                                    item_data=gdf_movies,
                                                                    side_data_factorization=True,
                                                                    max_iterations=100,
                                                                    solver='auto',
                                                                    verbose=False)

In [38]:
rmse = recommender.evaluate(validation,verbose=False)
rmse['rmse_overall']

0.8844163967600109

In [22]:
rank_recommender = graphlab.recommender.ranking_factorization_recommender.create(training,
                                                                    user_id='user',
                                                                    item_id='movie',
                                                                    target='rating',
                                                                    user_data=gdf_users,
                                                                    item_data=gdf_movies,
                                                                    side_data_factorization=True,
                                                                    max_iterations=100,
                                                                    solver='auto',
                                                                    verbose=False)

In [23]:
rank_rmse = rank_recommender.evaluate(validation,verbose=False)
rank_rmse['rmse_overall']

1.0324534677338029

## Evaluation

In [24]:
df_test['movie'] = df_test.id.str.split('_', expand=True)[1].astype(int)
df_test['rating'] = df_test['rating'].astype(float)

gdf_test = graphlab.SFrame(df_test.drop('id', axis=1))

In [25]:
test=pd.read_csv('data/dont_use.csv')

In [26]:
# score factorization model
df_test['rating'] = recommender.predict(gdf_test)
g = df_test.groupby('user')
top_5 = g.rating.transform(lambda x: x >= x.quantile(.95))
test.rating[top_5==1].mean()

4.279149956898167

In [27]:
# score ranking factorization model
df_test['rating'] = rank_recommender.predict(gdf_test)
g = df_test.groupby('user')
top_5 = g.rating.transform(lambda x: x >= x.quantile(.95))
test.rating[top_5==1].mean()

4.237247479479779

In [28]:
test[['user','rating']][top_5 == 1].sort_values('user').head(10)

Unnamed: 0,user,rating
23,1,5
38,1,4
48,1,5
93,2,5
105,2,4
107,2,4
127,2,5
131,2,5
135,2,4
137,2,4


## Parameter Tuning

In [30]:
# folds = cross_validation.KFold(gdf_ratings, 5)
params = {'user_id': 'user', 
          'item_id': 'movie', 
          'target': 'rating',
          'user_data' : [gdf_users],
          'item_data': [gdf_movies],
          'regularization':[.00000001 , .0000000001, .000000000001],
          'side_data_factorization': [True, False],
          'max_iterations': [50, 100],
          'num_factors': [6, 10, 18]}

In [31]:
# job = random_search.create(folds, graphlab.recommender.factorization_recommender.create, params)

job = grid_search.create((training, validation), graphlab.recommender.factorization_recommender.create, params)

2016-04-24 14:54:41,011 [INFO] graphlab.deploy.job, 22: Validating job.
2016-04-24 14:54:41,034 [INFO] graphlab.deploy.job, 36: Creating a LocalAsync environment called 'async'.
2016-04-24 14:54:41,039 [INFO] graphlab.deploy.map_job, 186: Validation complete. Job: 'Model-Parameter-Search-Apr-24-2016-14-54-4100000' ready for execution
2016-04-24 14:54:41,187 [INFO] graphlab.deploy.map_job, 192: Job: 'Model-Parameter-Search-Apr-24-2016-14-54-4100000' scheduled.
2016-04-24 14:55:31,312 [INFO] graphlab.deploy.job, 22: Validating job.
2016-04-24 14:55:31,315 [INFO] graphlab.deploy.map_job, 220: A job with name 'Model-Parameter-Search-Apr-24-2016-14-54-4100000' already exists. Renaming the job to 'Model-Parameter-Search-Apr-24-2016-14-54-4100000-841f0'.
2016-04-24 14:55:31,319 [INFO] graphlab.deploy.map_job, 186: Validation complete. Job: 'Model-Parameter-Search-Apr-24-2016-14-54-4100000-841f0' ready for execution
2016-04-24 14:55:31,455 [INFO] graphlab.deploy.map_job, 192: Job: 'Model-Param

In [32]:
results = job.get_results()
results.column_names()

['model_id',
 'item_data',
 'item_id',
 'max_iterations',
 'num_factors',
 'regularization',
 'side_data_factorization',
 'target',
 'user_data',
 'user_id',
 'training_precision@5',
 'training_recall@5',
 'training_rmse',
 'validation_precision@5',
 'validation_recall@5',
 'validation_rmse']

In [34]:
results[['model_id','regularization','num_factors','max_iterations', 'validation_rmse', 'side_data_factorization']].print_rows(num_rows=100)

+----------+----------------+-------------+----------------+-----------------+-------------------------+
| model_id | regularization | num_factors | max_iterations | validation_rmse | side_data_factorization |
+----------+----------------+-------------+----------------+-----------------+-------------------------+
|    11    |     1e-08      |      18     |      100       |  0.920826685422 |            0            |
|    10    |     1e-08      |      18     |       50       |  0.875497623035 |            0            |
|    9     |     1e-08      |      18     |      100       |  0.91716345895  |            1            |
|    8     |     1e-08      |      18     |       50       |  0.96958960467  |            1            |
|    1     |     1e-08      |      6      |      100       |  0.87811811441  |            1            |
|    0     |     1e-08      |      6      |       50       |  0.877233060007 |            1            |
|    3     |     1e-08      |      6      |      100   

In [35]:
best_RMSE = results['validation_rmse'].min()
best_model_id = results[results['validation_rmse'] == best_RMSE]['model_id'][0]

In [36]:
best_RMSE, best_model_id

(0.859389375869293, 23)

In [37]:
job.get_models()[best_model_id]

Class                           : FactorizationRecommender

Schema
------
User ID                         : user
Item ID                         : movie
Target                          : rating
Additional observation features : 0
Number of user side features    : 4
Number of item side features    : 20

Statistics
----------
Number of observations          : 456484
Number of users                 : 6039
Number of items                 : 3883

Training summary
----------------
Training time                   : 29.3169

Model Parameters
----------------
Model class                     : FactorizationRecommender
num_factors                     : 18
binary_target                   : 0
side_data_factorization         : 0
solver                          : auto
nmf                             : 0
max_iterations                  : 100

Regularization Settings
-----------------------
regularization                  : 0.0
regularization_type             : normal
linear_regularization           : 

## Final movie recommender using all training data

In [39]:
final_recommender = graphlab.recommender.factorization_recommender.create(gdf_ratings,
                                                                    user_id='user',
                                                                    item_id='movie',
                                                                    target='rating',
                                                                    user_data=gdf_users,
                                                                    item_data=gdf_movies,
                                                                    side_data_factorization=False,
                                                                    max_iterations=100,
                                                                    solver='auto',
                                                                    num_factors=18,
                                                                    verbose=False)

In [40]:
df_test['rating'] = final_recommender.predict(gdf_test)
g = df_test.groupby('user')
top_5 = g.rating.transform(lambda x: x >= x.quantile(.95))
test.rating[top_5==1].mean()

4.322101870244744