# Exercises for "Hands-on with Pydata: How to Build a Minimal Recommendation Engine"

# Systems check: imports and files

In [1]:
import numpy as np
import pandas as pd

# Pandas questions: Series and DataFrames
## 1. Adding a column in a DataFrame

In [2]:
# given the following DataFrame, add a new column to it
df = pd.DataFrame({'col1': [1,2,3,4]})
df

Unnamed: 0,col1
0,1
1,2
2,3
3,4


In [3]:
df['col2'] = [2,4,6,8]
df

Unnamed: 0,col1,col2
0,1,2
1,2,4
2,3,6
3,4,8


## 2. Deleting a row in a DataFrame

In [4]:
# given the following DataFrame, delete row 'd' from it
df = pd.DataFrame({'col1': [1,2,3,4]}, index = ['a','b','c','d'])
df

Unnamed: 0,col1
a,1
b,2
c,3
d,4


In [5]:
df.drop('d', axis=0, inplace=True)
df

Unnamed: 0,col1
a,1
b,2
c,3


## 3. Creating a DataFrame from a few Series

In [6]:
# given the following three Series, create a DataFrame such that it holds them as its columns
ser_1 = pd.Series(np.random.randn(6))
ser_2 = pd.Series(np.random.randn(6))
ser_3 = pd.Series(np.random.randn(6))
df = pd.DataFrame({'a': ser_1, 'b': ser_2, 'c': ser_3})
df

Unnamed: 0,a,b,c
0,0.225839,-0.12998,1.092328
1,-2.111709,0.429975,-0.018653
2,0.288541,0.124234,0.209816
3,0.568558,0.39243,1.00432
4,1.135196,-0.090413,0.446329
5,1.45601,-0.70044,-0.350539


# Pandas questions: Indexing

## 1. Indexing into a specific column

In [7]:
# given the following DataFrame, try to index into the 'col_2' column
df = pd.DataFrame(data={'col_1': [0.12, 7, 45, 10], 'col_2': [0.9, 9, 34, 11]},
                  columns=['col_1', 'col_2', 'col_3'],
                  index=['obs1', 'obs2', 'obs3', 'obs4'])
df

Unnamed: 0,col_1,col_2,col_3
obs1,0.12,0.9,
obs2,7.0,9.0,
obs3,45.0,34.0,
obs4,10.0,11.0,


In [8]:
df.col_2

obs1     0.9
obs2     9.0
obs3    34.0
obs4    11.0
Name: col_2, dtype: float64

## 2. Label-based indexing

In [9]:
# using the same DataFrame, index into the row whose index is 'obs3'
df.ix['obs3']

col_1     45
col_2     34
col_3    NaN
Name: obs3, dtype: object

In [10]:
df.loc['obs3']

col_1     45
col_2     34
col_3    NaN
Name: obs3, dtype: object

## 2. Position-based indexing

In [11]:
# using the same DataFrame, index into into its first row
df.ix[0]

col_1    0.12
col_2     0.9
col_3     NaN
Name: obs1, dtype: object

In [12]:
df.iloc[0]

col_1    0.12
col_2     0.9
col_3     NaN
Name: obs1, dtype: object

# Mini-Challenge prep: data loading

## 1. How to load the `users` and `movies` portions of MovieLens

In [13]:
import pandas as pd

users = pd.read_table('data/ml-1m/users.dat',
                      sep='::', header=None,
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'], engine='python')

movies = pd.read_table('data/ml-1m/movies.dat',
                       sep='::', header=None,
                       names=['movie_id', 'title', 'genres'], engine='python')

In [14]:
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [15]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## 2. How to load the training and testing subsets

In [16]:
movielens_train = pd.read_csv('data/movielens-train.csv', index_col=0)
movielens_test = pd.read_csv('data/movielens-test.csv', index_col=0)
movielens = pd.concat([movielens_train, movielens_test])

In [17]:
movielens.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
593263,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),Thriller,False
235597,1051,3793,4,974958593,F,25,0,60513,X-Men (2000),Action|Sci-Fi,False
219003,3727,2366,3,966309522,M,35,7,74401,King Kong (1933),Action|Adventure|Horror,False
685090,4666,1094,3,963843918,M,35,1,53704,"Crying Game, The (1992)",Drama|Romance|War,False
312377,3261,1095,4,968251750,M,45,20,87505,Glengarry Glen Ross (1992),Drama,False


# Mini-Challenge prep: evaluation functions

These are the two functions that you will need to test your `estimate` method.

In [18]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [19]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = movielens_test.rating.values
    return compute_rmse(estimated, real)

Test a dummy solution!

In [20]:
def my_estimate_func(user_id, movie_id):
    return 3.0

You can test for performance with the following line, which assumes that your function is called `my_estimate_func`:

In [21]:
print 'RMSE for my estimate function: %s' % evaluate(my_estimate_func)

RMSE for my estimate function: 1.23237195265


# Reco systems questions: Minimal reco engine v1.0

## 1. Simple collaborative filtering using mean ratings

In [22]:
# write an 'estimate' function that computes the mean rating of a particular user
def collab_mean(user_id, movie_id):
    # first, index into all ratings of this movie
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    
    # second, compute the mean of those ratings
    if ratings_by_others.empty:
        return 3.0
    else:
        return ratings_by_others.rating.mean()
    
# try it out for a user_id, movie_id pair
collab_mean(4653, 2648)

4.0

In [23]:
print 'RMSE for collab_mean: %s' % evaluate(collab_mean)

RMSE for collab_mean: 1.1234279896


# Mini-Challenge: first round
Implement an `estimate` function of your own using other similarity notions, eg.:

- collaborative filter based on age similarities
- collaborative filter based on zip code similarities
- collaborative filter based on occupation similarities
- content filter based on movie genre

In [24]:
def collab_age(user_id, movie_id):
    # first, index into all ratings of this movie
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    
    if ratings_by_others.empty:
        return 3.0
    
    means_by_age = ratings_by_others.pivot_table('rating', index='movie_id', columns='age')
    user_age = users.ix[user_id, 'age']
    
    if user_age in means_by_age.columns:
        return means_by_age.ix[movie_id, user_age]
    else:
        return means_by_age.ix[movie_id].mean()

print 'RMSE for collab_age: %s' % evaluate(collab_age)

RMSE for collab_age: 1.2097983189


In [25]:
def collab_age2(user_id, movie_id):
    # first, index into all ratings of this movie
    user_condition = movielens_train.user_id != user_id
    age = np.unique(movielens_train.loc[~user_condition].age)[0]
    age_condition = movielens_train.age == age
    ratings_by_others = movielens_train.loc[user_condition & age_condition]
    
    # second, compute the mean of those ratings
    if ratings_by_others.empty:
        return 3.0
    else:
        return ratings_by_others.rating.mean()

print 'RMSE for collab_age2: %s' % evaluate(collab_age2)

RMSE for collab_age2: 1.09093791784


In [26]:
def collab_zip(user_id, movie_id):
    # first, index into all ratings of this movie
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    
    if ratings_by_others.empty:
        return 3.0
    
    means_by_zip = ratings_by_others.pivot_table('rating', index='movie_id', columns='zip')
    user_zip = users.ix[user_id, 'zip']
    
    if user_zip in means_by_zip.columns:
        return means_by_zip.ix[movie_id, user_zip]
    else:
        return means_by_zip.ix[movie_id].mean()

print 'RMSE for collab_zip: %s' % evaluate(collab_zip)

RMSE for collab_zip: 1.12436927507


In [27]:
def collab_job(user_id, movie_id):
    # first, index into all ratings of this movie
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    
    if ratings_by_others.empty:
        return 3.0
    
    means_by_job = ratings_by_others.pivot_table('rating', index='movie_id', columns='occupation')
    user_job = users.ix[user_id, 'occupation']
    
    if user_job in means_by_job.columns:
        return means_by_job.ix[movie_id, user_job]
    else:
        return means_by_job.ix[movie_id].mean()

print 'RMSE for collab_job: %s' % evaluate(collab_job)

RMSE for collab_job: 1.18662369595


# Mini-Challenge: second round
Implement an `estimate` function of your own using other custom similarity notions, eg.:

- euclidean
- cosine

In [28]:
def euclidean(s1, s2):
    """Take two pd.Series objects and return their euclidean 'similarity'."""
    diff = s1 - s2
    return 1 / (1 + np.sqrt(np.sum(diff ** 2)))

In [29]:
def cosine(s1, s2):
    """Take two pd.Series objects and return their cosine similarity."""
    return np.sum(s1 * s2) / np.sqrt(np.sum(s1 ** 2) * np.sum(s2 ** 2))

In [30]:
def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))

In [31]:
def jaccard(s1, s2):
    dotp = np.sum(s1 * s2)
    return dotp / (np.sum(s1 ** 2) + np.sum(s2 ** 2) - dotp)

In [32]:
class CollabSimFun:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens.pivot_table('rating', index='movie_id', columns='user_id')

    def similarity(self, simfun):
        self.simfun = simfun

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        
        if ratings_by_others.empty: 
            return 3.0
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: self.simfun(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)

In [33]:
reco = CollabSimFun()
reco.learn()

In [34]:
reco.similarity(euclidean)
print 'RMSE for CollabSimFun w/ euclidean: %s' % evaluate(reco.estimate)

RMSE for CollabSimFun w/ euclidean: 0.993180683911


In [35]:
reco.similarity(cosine)
print 'RMSE for CollabSimFun w/ cosine: %s' % evaluate(reco.estimate)

RMSE for CollabSimFun w/ cosine: 1.12037146911


In [36]:
reco.similarity(pearson)
print 'RMSE for CollabSimFun w/ pearson: %s' % evaluate(reco.estimate)



RMSE for CollabSimFun w/ pearson: 1.06037523514


In [37]:
reco.similarity(jaccard)
print 'RMSE for CollabSimFun w/ jaccard: %s' % evaluate(reco.estimate)

RMSE for CollabSimFun w/ jaccard: 1.11772188908


In [38]:
overall_mean = movielens_train['rating'].mean()

def hybrid_fun(user_id, movie_id):
    measures = [
        (0.6, movielens_train.loc[movielens_train.movie_id == movie_id]),
        (0.4, movielens_train.loc[reduce(np.logical_and,
                                        [movielens_train.gender == users.ix[user_id, 'gender'],
                                         movielens_train.occupation == users.ix[user_id, 'occupation'],
                                         abs(movielens_train.age - users.ix[user_id, 'age']) <= 10])])
    ]
    
    return sum((weight * (measure.empty and overall_mean or measure['rating'].mean())
                for weight, measure in measures))
    
print 'RMSE for hybrid_fun: %s' % evaluate(hybrid_fun)

RMSE for hybrid_fun: 1.05782017012
