In [1]:
import graphlab
import pandas as pd
import numpy as np


## Get train and test data

Get train and test data from movielens. Each set has >3000 unique users and >3000 unique movies. Most movies are present in both sets, but users have few overlap.

In [2]:
train = pd.read_csv('data/training_ratings.csv')
train.shape

(500100, 4)

In [3]:
train.head()

Unnamed: 0,user,movie,rating,id
0,2783,1253,5,2783_1253
1,2783,589,5,2783_589
2,2783,1270,4,2783_1270
3,2783,1274,4,2783_1274
4,2783,741,5,2783_741


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500100 entries, 0 to 500099
Data columns (total 4 columns):
user      500100 non-null int64
movie     500100 non-null int64
rating    500100 non-null int64
id        500100 non-null object
dtypes: int64(3), object(1)
memory usage: 19.1+ MB


In [5]:
train.describe()

Unnamed: 0,user,movie,rating
count,500100.0,500100.0,500100.0
mean,4408.620518,1834.281214,3.602224
std,926.274862,1076.388887,1.114688
min,2783.0,1.0,1.0
25%,3622.0,1022.0,3.0
50%,4371.0,1767.0,4.0
75%,5220.0,2724.0,4.0
max,6040.0,3952.0,5.0


In [6]:
test = pd.read_csv('data/sample_submission.csv')
test.shape

(500109, 3)

In [7]:
test.head()

Unnamed: 0,user,rating,id
0,1,3.5,1_1193
1,1,3.5,1_661
2,1,3.5,1_914
3,1,3.5,1_3408
4,1,3.5,1_2355


In [8]:
test['movie'] = test.id.apply(lambda x: str(x).split('_')[1]).astype(int)

In [9]:
test.head()

Unnamed: 0,user,rating,id,movie
0,1,3.5,1_1193,1193
1,1,3.5,1_661,661
2,1,3.5,1_914,914
3,1,3.5,1_3408,3408
4,1,3.5,1_2355,2355


In [10]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500109 entries, 0 to 500108
Data columns (total 4 columns):
user      500109 non-null int64
rating    500109 non-null float64
id        500109 non-null object
movie     500109 non-null int64
dtypes: float64(1), int64(2), object(1)
memory usage: 19.1+ MB


In [11]:
print len(test.movie.unique())
print len(test.user.unique())

3643
3415


In [12]:
print len(train.movie.unique())
print len(train.user.unique())

3551
3255


### How much overlap of users and movies between train and test

The users are quite different between train and test, both train and test have 3200-3400 unique users and less than 800 users are present in both sets. This is a cold start problem.

In [13]:
print len(set(train.user.unique()) - set(test.user.unique()))
print len(set(test.user.unique()) - set(train.user.unique()))

2625
2785


In [14]:
print len(set(train.movie.unique()) - set(test.movie.unique()))
print len(set(test.movie.unique()) - set(train.movie.unique()))

63
155


## Side data - users

In [15]:
df_users = pd.read_csv('data/users.dat', sep='::')
df_users.columns = ['user', 'gender', 'age', 'occupation', 'zip']

  if __name__ == '__main__':


In [16]:
df_users.head()

Unnamed: 0,user,gender,age,occupation,zip
0,2,M,56,16,70072
1,3,M,25,15,55117
2,4,M,45,7,2460
3,5,M,25,20,55455
4,6,F,50,9,55117


In [17]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6039 entries, 0 to 6038
Data columns (total 5 columns):
user          6039 non-null int64
gender        6039 non-null object
age           6039 non-null int64
occupation    6039 non-null int64
zip           6039 non-null object
dtypes: int64(3), object(2)
memory usage: 283.1+ KB


In [18]:
#change data type of 'occupation'
df_users['occupation'] = df_users['occupation'].astype(str)

## Side data - movies

In [19]:
df_movies = pd.read_csv('data/movies.dat', sep='::')
df_movies.columns = ['movie_id', 'title', 'genre']
df_movies.head()

  if __name__ == '__main__':


Unnamed: 0,movie_id,title,genre
0,2,Jumanji (1995),Adventure|Children's|Fantasy
1,3,Grumpier Old Men (1995),Comedy|Romance
2,4,Waiting to Exhale (1995),Comedy|Drama
3,5,Father of the Bride Part II (1995),Comedy
4,6,Heat (1995),Action|Crime|Thriller


In [20]:
genres = set()
for m in df_movies.genre:
    genres.update(g for g in m.split('|'))
genres = sorted(genres)

#make a column for each genre
for genre in genres:
    df_movies[genre] = [genre in movie.split('|') for movie in df_movies.genre]

In [21]:
len(genres)

18

In [22]:
df_movies.rename(columns={'movie_id': 'movie'}, inplace=True)
df_movies['year'] = df_movies['title'].str[-5:-1].astype(int)
df_movies['title'] = df_movies['title'].str[:-7]

In [23]:
df_movies.head()

Unnamed: 0,movie,title,genre,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,2,Jumanji,Adventure|Children's|Fantasy,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,1995
1,3,Grumpier Old Men,Comedy|Romance,False,False,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,1995
2,4,Waiting to Exhale,Comedy|Drama,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,1995
3,5,Father of the Bride Part II,Comedy,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,1995
4,6,Heat,Action|Crime|Thriller,True,False,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,1995


In [24]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3882 entries, 0 to 3881
Data columns (total 22 columns):
movie          3882 non-null int64
title          3882 non-null object
genre          3882 non-null object
Action         3882 non-null bool
Adventure      3882 non-null bool
Animation      3882 non-null bool
Children's     3882 non-null bool
Comedy         3882 non-null bool
Crime          3882 non-null bool
Documentary    3882 non-null bool
Drama          3882 non-null bool
Fantasy        3882 non-null bool
Film-Noir      3882 non-null bool
Horror         3882 non-null bool
Musical        3882 non-null bool
Mystery        3882 non-null bool
Romance        3882 non-null bool
Sci-Fi         3882 non-null bool
Thriller       3882 non-null bool
War            3882 non-null bool
Western        3882 non-null bool
year           3882 non-null int64
dtypes: bool(18), int64(2), object(2)
memory usage: 219.9+ KB


# Modeling with factorization_recommender

In [25]:
train['rating'] = train['rating'].astype(float)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500100 entries, 0 to 500099
Data columns (total 4 columns):
user      500100 non-null int64
movie     500100 non-null int64
rating    500100 non-null float64
id        500100 non-null object
dtypes: float64(1), int64(2), object(1)
memory usage: 19.1+ MB


In [27]:
gdf_ratings = graphlab.SFrame(train.drop(['id'], axis=1))

gdf_users   = graphlab.SFrame(df_users.drop(['zip'], axis=1))
gdf_movies  = graphlab.SFrame(df_movies.drop(['genre', 'title'], axis=1))

## Use deafult setting for factorization_recommender

In [28]:
recommender = graphlab.recommender.factorization_recommender.create(gdf_ratings,
                                                                    user_id='user',
                                                                    item_id='movie',
                                                                    target='rating',
                                                                    user_data=gdf_users,
                                                                    item_data=gdf_movies,
                                                                    side_data_factorization=True,
                                                                    verbose=False)

PROGRESS: Recsys training: model = factorization_recommender


In [29]:
recommender.num_factors

8

In [30]:
recommender.regularization

1e-08

In [31]:
recommender.solver

'auto'

In [32]:
recommender.max_iterations

50

## Train/test split for cross-validation

Use util.random_split_by_user to make training and validation subset.
Check various parameters based on RMSE. Suprisingly, the default setting, which results in num_factor=8, max_iteractions=50, solver=auto, gave the lowest RMSE. When I tried to increase num_factor, max_iteractions, or solver, RMSE went up.

I also tried different solver method. The default 'sgd', and 'adagrad' are the best, while 'als' has higher RMSE.

In [33]:
training_subset, validation_subset = \
graphlab.recommender.util.random_split_by_user(gdf_ratings,
                                               user_id="user", 
                                               item_id="movie",
                                               item_test_proportion=0.3) 

In [34]:
model_1 = graphlab.recommender.factorization_recommender.create(training_subset,
                                                                    user_id='user',
                                                                    item_id='movie',
                                                                    target='rating',
                                                                    user_data=gdf_users,
                                                                    item_data=gdf_movies,
                                                                    side_data_factorization=True,
                                                                    verbose=False)

PROGRESS: Recsys training: model = factorization_recommender


In [35]:
rmse_1 = model_1.evaluate(validation_subset,verbose=False)
rmse_1['rmse_overall']

0.8863974757135484

In [36]:
#Increase num_factors and max_iterations, and solver='sgd'
model_2 = graphlab.recommender.factorization_recommender.create(training_subset,
                                                                    user_id='user',
                                                                    item_id='movie',
                                                                    target='rating',
                                                                    user_data=gdf_users,
                                                                    item_data=gdf_movies,
                                                                    side_data_factorization=True,
                                                                    max_iterations = 200,
                                                                    num_factors = 20,
                                                                    solver='sgd',
                                                                    verbose=False)

PROGRESS: Recsys training: model = factorization_recommender


In [37]:
#RMSE is worse, most likely because num_factors
rmse_2 = model_2.evaluate(validation_subset,verbose=False)
rmse_2['rmse_overall']

1.1087815373222412

In [38]:
#reduce max_iterations
model_3 = graphlab.recommender.factorization_recommender.create(training_subset,
                                                                    user_id='user',
                                                                    item_id='movie',
                                                                    target='rating',
                                                                    user_data=gdf_users,
                                                                    item_data=gdf_movies,
                                                                    side_data_factorization=True,
                                                                    num_factors = 10,
                                                                    max_iterations=50,
                                                                    solver='sgd',
                                                                    verbose=False)

PROGRESS: Recsys training: model = factorization_recommender


In [39]:
#RMSE is still worse
rmse_3 = model_3.evaluate(validation_subset,verbose=False)
rmse_3['rmse_overall']

1.1166233711156477

In [40]:
#reduce num_factors to 8, still not as good as default model, 'sgd' vs 'auto'
model_4 = graphlab.recommender.factorization_recommender.create(training_subset,
                                                                    user_id='user',
                                                                    item_id='movie',
                                                                    target='rating',
                                                                    user_data=gdf_users,
                                                                    item_data=gdf_movies,
                                                                    side_data_factorization=True,
                                                                    num_factors = 8,
                                                                    max_iterations=50,
                                                                    solver='sgd',
                                                                    verbose=False)

PROGRESS: Recsys training: model = factorization_recommender


In [42]:
rmse_4 = model_4.evaluate(validation_subset,verbose=False)
rmse_4['rmse_overall']

1.1075197907073209

In [43]:
#try a different solver, adagrad, pretty close to default model
model_5 = graphlab.recommender.factorization_recommender.create(training_subset,
                                                                    user_id='user',
                                                                    item_id='movie',
                                                                    target='rating',
                                                                    user_data=gdf_users,
                                                                    item_data=gdf_movies,
                                                                    solver='adagrad',
                                                                    side_data_factorization=True,
                                                                    verbose=False)

PROGRESS: Recsys training: model = factorization_recommender


In [44]:
rmse_5 = model_5.evaluate(validation_subset,verbose=False)
rmse_5['rmse_overall']

0.8915285482300347

In [45]:
#try another solver, als, not as good as 'auto' or 'adagrad'
model_6 = graphlab.recommender.factorization_recommender.create(training_subset,
                                                                    user_id='user',
                                                                    item_id='movie',
                                                                    target='rating',
                                                                    user_data=gdf_users,
                                                                    item_data=gdf_movies,
                                                                    solver='als',
                                                                    side_data_factorization=True,
                                                                    verbose=False)

PROGRESS: Recsys training: model = factorization_recommender


In [46]:
rmse_6 = model_6.evaluate(validation_subset,verbose=False)
rmse_6['rmse_overall']

1.1194155389175386

## Try models other than factorization_recommender

Use item_similarity_recommender and popularity_recommender. Use both user and movie for item in item_similarity_recommender. Judging by RMSE, popularity is slightly better than model_movie, and both are better than model_user, but none is as good as factorization_recommender.

In [47]:
model_user = graphlab.recommender.item_similarity_recommender.create(training_subset, 
                                                                      user_id='movie',
                                                                      item_id='user',
                                                                      target='rating',
                                                                      user_data=gdf_movies,
                                                                      item_data=gdf_users,
                                                                      verbose=False)

PROGRESS: Recsys training: model = item_similarity


In [48]:
rmse_user = model_user.evaluate(validation_subset,verbose=False)
rmse_user['rmse_overall']

PROGRESS: Finished prediction in 0.127693s


1.2763729028774933

In [49]:
#use item_similarity_recommender, with 'movie' as item
model_movie = graphlab.recommender.item_similarity_recommender.create(training_subset, 
                                                                      user_id='user',
                                                                      item_id='movie',
                                                                      target='rating',
                                                                      user_data=gdf_users,
                                                                      item_data=gdf_movies,
                                                                      verbose=False)

PROGRESS: Recsys training: model = item_similarity


In [50]:
rmse_movie = model_movie.evaluate(validation_subset,verbose=False)
rmse_movie['rmse_overall']

PROGRESS: Finished prediction in 0.065216s


1.0295674849348777

In [51]:
model_pop = graphlab.recommender.popularity_recommender.create(training_subset, 
                                                                    user_id='user',
                                                                    item_id='movie',
                                                                    target='rating',
                                                                    user_data=gdf_users,
                                                                    item_data=gdf_movies,
                                                                    verbose=False)

PROGRESS: Recsys training: model = popularity


In [52]:
rmse_pop = model_pop.evaluate(validation_subset,verbose=False)
rmse_pop['rmse_overall']

1.0012302918674774

# Predict test data

For each user, our scoring metric will select the 5% of movies you thought would be most highly rated by that user. It then looks at the actual ratings (in the test data) that the user gave those movies. Your score is the average of those ratings.

Using this metrics, factorization_recommender with adagrad > popularity_recommender > default factorization_recommender model > item_user or item_movie. It seems that RMSE is somewhat indicative of model performance.

In [53]:
gdf_test = graphlab.SFrame(test.drop('id', axis=1))

In [54]:
#the data used for scoring, same group of users as the test dataset
test2 = pd.read_csv('data/dont_use.csv')
test2.head()

Unnamed: 0,user,id,rating
0,1,1_1193,5
1,1,1_661,3
2,1,1_914,3
3,1,1_3408,4
4,1,1_2355,5


In [56]:
print len(test.user.unique())
print len(test2.user.unique())
print sum(test.user.unique() == test2.user.unique())

3415
3415
3415


In [57]:
#the default factorization_recommender model
test['rating'] = model_1.predict(gdf_test)
g = test.groupby('user')
top_5 = g.rating.transform(lambda x: x >= x.quantile(.95))
test2.rating[top_5==1].mean()

4.30399910048349

In [58]:
#use popularity_recommender model
test['rating'] = model_pop.predict(gdf_test)
g = test.groupby('user')
top_5 = g.rating.transform(lambda x: x >= x.quantile(.95))
test2.rating[top_5==1].mean()

4.313160458667466

In [60]:
#use item_user model
test['rating'] = model_user.predict(gdf_test)
g = test.groupby('user')
top_5 = g.rating.transform(lambda x: x >= x.quantile(.95))
test2.rating[top_5==1].mean()

PROGRESS: Finished prediction in 0.199819s


3.570325577319136

In [61]:
#use item_movie model
test['rating'] = model_movie.predict(gdf_test)
g = test.groupby('user')
top_5 = g.rating.transform(lambda x: x >= x.quantile(.95))
test2.rating[top_5==1].mean()

PROGRESS: Finished prediction in 0.53491s


3.7860111013376203

In [62]:
#use adagrad factorization_recommender model
test['rating'] = model_5.predict(gdf_test)
g = test.groupby('user')
top_5 = g.rating.transform(lambda x: x >= x.quantile(.95))
test2.rating[top_5==1].mean()

4.3245380607923245

In [63]:
#retrain model with whole dataset
model_factor1 = graphlab.recommender.factorization_recommender.create(gdf_ratings,
                                                                    user_id='user',
                                                                    item_id='movie',
                                                                    target='rating',
                                                                    user_data=gdf_users,
                                                                    item_data=gdf_movies,
                                                                    side_data_factorization=True,
                                                                    verbose=False)

PROGRESS: Recsys training: model = factorization_recommender


In [64]:
test['rating'] = model_factor1.predict(gdf_test)
g = test.groupby('user')
top_5 = g.rating.transform(lambda x: x >= x.quantile(.95))
test2.rating[top_5==1].mean()

4.305835613357821

In [65]:
#remove side data from factorization_recommender
model_factor2 = graphlab.recommender.factorization_recommender.create(gdf_ratings,
                                                                    user_id='user',
                                                                    item_id='movie',
                                                                    target='rating',
                                                                    #user_data=gdf_users,
                                                                    #item_data=gdf_movies,
                                                                    side_data_factorization=False,
                                                                    verbose=False)

PROGRESS: Recsys training: model = factorization_recommender


In [66]:
test['rating'] = model_factor2.predict(gdf_test)
g = test.groupby('user')
top_5 = g.rating.transform(lambda x: x >= x.quantile(.95))
test2.rating[top_5==1].mean()

4.3197781192608975