In [1]:
import pandas as pd
import numpy as np

import graphlab
from graphlab import model_parameter_search, SFrame

## Load data

In [22]:
df_users = pd.read_csv('data/users.dat', sep='::', engine='python')
df_users.columns = ['user_id', 'gender', 'age', 'occupation', 'zip']

df_movies = pd.read_csv('data/movies.dat', sep='::', engine='python')
df_movies.columns = ['movie_id', 'title', 'genre']

df_ratings = pd.read_csv('data/training_ratings.csv')
df_test = pd.read_csv('data/sample_submission.csv')

In [23]:
df_test['movie'] = df_test.id.str.split('_', expand=True)[1]

## Clean data

In [24]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6039 entries, 0 to 6038
Data columns (total 5 columns):
user_id       6039 non-null int64
gender        6039 non-null object
age           6039 non-null int64
occupation    6039 non-null int64
zip           6039 non-null object
dtypes: int64(3), object(2)
memory usage: 236.0+ KB


In [25]:
df_users.loc[df_users['gender'] == "M", "gender"] = 1
df_users.loc[df_users['gender'] == "F", "gender"] = 0

In [26]:
df_users['occupation'] = df_users['occupation'].astype(str)
df_users['gender'] = df_users['gender'].astype(bool)

In [27]:
# occupations = pd.Series({ \
#                         0: "other",
#                         1:  "academic/educator",
#                         2:  "artist",
#                         3:  "clerical/admin",
#                         4:  "college/grad student",
#                         5:  "customer service",
#                         6:  "doctor/health care",
#                         7:  "executive/managerial",
#                         8:  "farmer",
#                         9:  "homemaker",
#                         10:  "K-12 student",
#                         11:  "lawyer",
#                         12:  "programmer",
#                         13:  "retired",
#                         14:  "sales/marketing",
#                         15:  "scientist",
#                         16:  "self-employed",
#                         17:  "technician/engineer",
#                         18:  "tradesman/craftsman",
#                         19:  "unemployed",
#                         20:  "writer",
#                         })

In [28]:
df_users.rename(columns={'user_id': 'user'}, inplace=True)
# df_users['occupation'] = occupations[df_users.occupation].values

In [29]:
df_users.head()

Unnamed: 0,user,gender,age,occupation,zip
0,2,True,56,16,70072
1,3,True,25,15,55117
2,4,True,45,7,2460
3,5,True,25,20,55455
4,6,False,50,9,55117


In [30]:
genres = set()
for m in df_movies.genre:
   genres.update(g for g in m.split('|'))

genres = sorted(genres)

#make a column for each genre
for genre in genres:
   df_movies[genre] = [genre in movie.split('|') for movie in df_movies.genre]

In [31]:
df_movies.rename(columns={'movie_id': 'movie'}, inplace=True)
df_movies['year'] = df_movies['title'].str[-5:-1].astype(int)
df_movies['title'] = df_movies['title'].str[:-7]

In [32]:
df_movies.head()

Unnamed: 0,movie,title,genre,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,2,Jumanji,Adventure|Children's|Fantasy,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,1995
1,3,Grumpier Old Men,Comedy|Romance,False,False,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,1995
2,4,Waiting to Exhale,Comedy|Drama,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,1995
3,5,Father of the Bride Part II,Comedy,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,1995
4,6,Heat,Action|Crime|Thriller,True,False,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,1995


In [33]:
df_ratings['rating'] = df_ratings['rating'].astype(float)

In [34]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500100 entries, 0 to 500099
Data columns (total 4 columns):
user      500100 non-null int64
movie     500100 non-null int64
rating    500100 non-null float64
id        500100 non-null object
dtypes: float64(1), int64(2), object(1)
memory usage: 15.3+ MB


## Factorization Recommender Model

In [41]:
gdf_ratings = graphlab.SFrame(df_ratings.drop(['id'], axis=1))
gdf_users   = graphlab.SFrame(df_users.drop(['zip'], axis=1))
gdf_movies  = graphlab.SFrame(df_movies.drop(['genre', 'title'], axis=1))

# gdf_test = graphlab.SFrame(df_test)

In [42]:
gdf_movies.head()

movie,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror
2,0,1,0,1,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,0
5,0,0,0,0,1,0,0,0,0,0,0
6,1,0,0,0,0,1,0,0,0,0,0
7,0,0,0,0,1,0,0,0,0,0,0
8,0,1,0,1,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,0
10,1,1,0,0,0,0,0,0,0,0,0
11,0,0,0,0,1,0,0,1,0,0,0

Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,0,0,0,0,0,0,1995
0,0,1,0,0,0,0,1995
0,0,0,0,0,0,0,1995
0,0,0,0,0,0,0,1995
0,0,0,0,1,0,0,1995
0,0,1,0,0,0,0,1995
0,0,0,0,0,0,0,1995
0,0,0,0,0,0,0,1995
0,0,0,0,1,0,0,1995
0,0,1,0,0,0,0,1995


In [43]:
gdf_users.head()

user,gender,age,occupation
2,1,56,16
3,1,25,15
4,1,45,7
5,1,25,20
6,0,50,9
7,1,35,1
8,1,25,12
9,1,25,17
10,0,35,1
11,0,25,1


In [44]:
gdf_ratings.head()

user,movie,rating
2783,1253,5.0
2783,589,5.0
2783,1270,4.0
2783,1274,4.0
2783,741,5.0
2783,750,5.0
2783,924,5.0
2783,2407,4.0
2783,3070,3.0
2783,208,1.0


## Train-test split

In [39]:
training, validation = gdf_ratings.random_split(0.8)

In [45]:
gdf_ratings.head()

user,movie,rating
2783,1253,5.0
2783,589,5.0
2783,1270,4.0
2783,1274,4.0
2783,741,5.0
2783,750,5.0
2783,924,5.0
2783,2407,4.0
2783,3070,3.0
2783,208,1.0


In [54]:
recommender = graphlab.recommender.ranking_factorization_recommender.create(training,
                                                                    user_id='user',
                                                                    item_id='movie',
                                                                    target='rating',
                                                                    user_data=gdf_users,
                                                                    item_data=gdf_movies,
                                                                    side_data_factorization=True,
                                                                    max_iterations=100,
#                                                                     sgd_step_size=5,
#                                                                     num_factors = 30,
                                                                    solver='auto',
                                                                    verbose=False)

In [55]:
print 'Training RMSE:', recommender.training_rmse

gdf_predict = recommender.predict(validation)
print 'Testing RMSE:', np.sqrt(np.sum((validation['rating'].to_numpy() - gdf_predict.to_numpy())**2)/len(validation))

Training RMSE: 1.22121704926
Testing RMSE: 1.23527470741


## Evaluation

In [60]:
df_test = pd.read_csv('data/sample_submission.csv')

In [61]:
df_test['movie'] = df_test.id.str.split('_', expand=True)[1].astype(int)
df_test['rating'] = df_test['rating'].astype(float)

gdf_test = graphlab.SFrame(df_test.drop('id', axis=1))

In [62]:
test=pd.read_csv('data/dont_use.csv')

In [63]:
df_test['rating'] = recommender.predict(gdf_test)
g = df_test.groupby('user')
top_5 = g.rating.transform(lambda x: x >= x.quantile(.95))
test.rating[top_5==1].mean()

4.133878040553203

## Parameter Tuning

In [None]:
params = {'user_id': 'user', 
          'item_id': 'movie', 
          'target': 'rating',
          'user_data' : [gdf_users], 
          'item_data': [gdf_movies],
          'regularization':[.00000001 , .0000001, .0001],
          'side_data_factorization': True, 
          'max_iterations': [50],
          'num_factors': [12, 20, 30]}

job = model_parameter_search.create((training, validation),
                            graphlab.recommender.ranking_factorization_recommender.create, params)

In [None]:
job.get_results()

In [None]:
results = job.get_results()
results.column_names()

In [None]:
results[['regularization','num_factors','max_iterations', 'validation_rmse']].print_rows(num_rows=100)

In [None]:
best_RMSE = results['validation_rmse'].min()
best_model = search_summary[search_summary['validation_rmse'] == best_RMSE]['model_id'][0]

In [None]:
df_test['rating'] = best_model.predict(gdf_test)
g = df_test.groupby('user')
top_5 = g.rating.transform(lambda x: x >= x.quantile(.95))
test.rating[top_5==1].mean()