# CS579: Lecture 19  

**Recommendation Systems**

*[Dr. Aron Culotta](http://cs.iit.edu/~culotta)*  
*[Illinois Institute of Technology](http://iit.edu)*

## Recommendation Systems, continued.

Let's try out some of the ideas from last lecture on the [MovieLens](http://grouplens.org/datasets/movielens/) dataset.

In [1]:
import matplotlib.pyplot as plt
from IPython.display import display
import numpy as np
import os
import pandas as pd
import urllib
import zipfile
%matplotlib inline

In [22]:
# Download the data.
def download_data():
    """ Download and unzip data.
    DONE ALREADY.
    """
    url = 'https://www.dropbox.com/s/p9wmkvbqt1xr6lc/ml-latest-small.zip?dl=1'
    urllib.request.urlretrieve(url, 'ml-latest-small.zip')
    zfile = zipfile.ZipFile('ml-latest-small.zip')
    zfile.extractall()
    zfile.close()
    
download_data()
path = 'ml-latest-small'
ratings = pd.read_csv(path + os.path.sep + 'ratings.csv')
movies = pd.read_csv(path + os.path.sep + 'movies.csv')
tags = pd.read_csv(path + os.path.sep + 'tags.csv')
print('read %d ratings, %d movies, and %d tag records' % (len(ratings), len(movies), len(tags)))

read 100004 ratings, 9125 movies, and 1296 tag records


In [23]:
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182


In [24]:
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [25]:
tags.head(3)

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1138537770
1,15,1955,dentist,1193435061
2,15,7478,Cambodia,1170560997


In [6]:
ratings[ratings.userId==1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


**Let's use the item-item method to predict user 1's rating for movie 3671**

In [26]:
movies[movies.movieId==3671]
#print(movies[movies.movieId==3671].iloc[0]['genres'])

Unnamed: 0,movieId,title,genres
2925,3671,Blazing Saddles (1974),Comedy|Western


In [27]:
# how many users are there with ratings?
user_ids = sorted(set(ratings.userId))
print(len(user_ids))
user_ids[:10]

671


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [28]:
# make user ids start at 0.
ratings['userId'] = ratings['userId'] - 1

In [29]:
sorted(set(ratings['userId']))[:10]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [30]:
# What are the ratings for 3671?
ratings[ratings.movieId==3671]

Unnamed: 0,userId,movieId,rating,timestamp
19,0,3671,3.0,1260759117
1679,14,3671,2.0,1166586157
4436,22,3671,3.5,1149868554
5761,29,3671,4.0,960918106
8453,55,3671,4.0,1467003357
10809,72,3671,3.0,1255595938
11981,74,3671,4.0,1165596914
12031,75,3671,3.5,1194384277
12239,76,3671,4.0,1163079471
13017,82,3671,4.5,1156206112


In [31]:
# mean rating for Blazing Saddles?
ratings[ratings.movieId==3671].rating.mean()

3.935483870967742

In [13]:
# Get the ratings from all users assigned to movie 3671
# Store ratings in a numpy array with dimension equal to number of users.
target_movie_id = 3671
target_movie_vector = np.zeros(len(user_ids))
for index, row in ratings[ratings.movieId==3671].iterrows():
    target_movie_vector[int(row.userId)] = row.rating
# Remove target user's rating for this movie:
target_movie_vector[0] = 0
target_movie_vector[:23]

array([0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 2. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3.5])

In [32]:
from scipy.stats import pearsonr
# Correlation between two item vectors (e.g., all ratings given to movie j)
def correlation(v1, v2):
    # only consider entries that are nonzero in both vectors.
    indices = [i for i in range(len(v1)) if v1[i] != 0 and v2[i] != 0]
    if len(indices) < 2:  # not enough evidence...
        return 0
    else:
        return pearsonr(v1[indices], v2[indices])[0]
        
        
correlation(np.array([0,4,0,5,0,5,2]),
            np.array([4,3,0,4,0,5,1]))

0.9660917830792959

In [15]:
# scaled to mean per user
correlation(np.array([0,3,0,4,0,4,1]),  # all ratings reduced by 1
            np.array([4,3,0,4,0,5,1]))

0.9660917830792959

In [33]:
correlation(np.array([0,4,0,5,0,5,2]),
            np.array([4,5,0,4,0,4,5]))  # change second vector 

-0.8164965809277261

In [34]:
# For every other movie that user 0 rated, compute its correlation with movie 3671
correlations = {}  # movieId->correlation dict
for index, row in ratings[ratings.userId==0].iterrows():  # for each movie this user has rated.
    if row.movieId != 3671:  # ignore Blazing Saddles
        movie = movies[movies.movieId==row.movieId].iloc[0]  # iloc: to get index of 
        movie_vector = np.zeros(len(user_ids))
        # get all user ratings for this title.
        for j, row2 in ratings[ratings.movieId==row.movieId].iterrows():
            movie_vector[int(row2.userId)] = int(row2.rating)
        corr = correlation(target_movie_vector, movie_vector)
        correlations[int(row.movieId)] = corr
        
sorted(correlations.items(), key=lambda x: x[1])[::-1]

[(1287, 0.6761234037828132),
 (2294, 0.6165511261899305),
 (2455, 0.6047610936431034),
 (1029, 0.45195010349519066),
 (2193, 0.42206356372217485),
 (1953, 0.40360701146647887),
 (1343, 0.3933978962347216),
 (1293, 0.3439821898276892),
 (31, 0.3273268353539886),
 (1405, 0.31985970521949497),
 (1339, 0.28333941149126035),
 (1371, 0.17334874279536608),
 (2968, 0.14433756729740643),
 (2105, 0.09636241116594314),
 (1129, 0.02886525844924785),
 (1172, 0.0),
 (2150, -0.038437177725349166),
 (1263, -0.14500663746266237),
 (1061, -0.474341649025257)]

In [35]:
# Now, take top K movies and do weighted average to compute predicted score.
def get_top_k_movies(correlations, ratings, movies, K=5):
    top_movies = sorted(correlations.items(),
                        key=lambda x: x[1])[::-1][:K]
    top_movie_ids = [int(x[0]) for x in top_movies]

    # get user 0's ratings for these most similar other movies.
    top_ratings = ratings[(ratings.userId==0) &
                          ratings.movieId.isin(top_movie_ids)]
    # add title for debugging (using a join!)
    top_ratings = top_ratings.merge(movies, on='movieId')
    # add correlations
    top_ratings['correlation'] = [correlations[i]
                                  for i in top_ratings.movieId]
    return top_ratings

top_ratings = get_top_k_movies(correlations, ratings, movies, K=5)
print('top K most similar movies that user 0 has rated')
display(top_ratings.sort_values('correlation', ascending=False))

top K most similar movies that user 0 has rated


Unnamed: 0,userId,movieId,rating,timestamp,title,genres,correlation
1,0,1287,2.0,1260759187,Ben-Hur (1959),Action|Adventure|Drama,0.676123
3,0,2294,2.0,1260759108,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy,0.616551
4,0,2455,2.5,1260759113,"Fly, The (1986)",Drama|Horror|Sci-Fi|Thriller,0.604761
0,0,1029,3.0,1260759179,Dumbo (1941),Animation|Children|Drama|Musical,0.45195
2,0,2193,2.0,1260759198,Willow (1988),Action|Adventure|Fantasy,0.422064


**weighted average**
$$
r_{ix} = \frac{\sum_{j \in N(i;x)} s_{ij} * r_{jx}}{\sum_{j \in N(i;x)} s_{ij}}
$$

In [36]:
# weighted average:
def weighted_average(top_ratings):
    return np.dot(top_ratings.rating, top_ratings.correlation) / top_ratings.correlation.sum()
# True rating: 3.0
weighted_average(top_ratings)

2.2721791276541663

In [37]:
# how does K affect result?
for K in range(1,16):
    top_ratings = get_top_k_movies(correlations, ratings, movies, K=K)
    print(K, weighted_average(top_ratings))

1 2.0
2 2.0
3 2.159362743619897
4 2.321075692940519
5 2.2721791276541663
6 2.491816372553354
7 2.43759694993172
8 2.399123338565876
9 2.406911424033386
10 2.308215922151981
11 2.3779417171314963
12 2.3821596871760136
13 2.3435022275476696
14 2.3738662723578288
15 2.3718246531591425
