In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, pairwise

## Collaborative Filtering


In [2]:
# Movie catalogue: pipe-delimited, latin-1 encoded; column names are supplied here.
item = pd.read_csv(
    "../data/u.item",
    sep="|",
    encoding="latin-1",
    names=[
        # descriptive fields ("imbd_url" spelling is kept as-is: it is the runtime column name)
        "movie_id", "movie_title", "release_date", "video_release_date", "imbd_url",
        # genre columns
        "unknown", "action", "adventure", "animation", "childrens", "comedy", "crime",
        "documentary", "drama", "fantasy", "film_noir", "horror", "musical", "mystery",
        "romance", "sci-fi", "thriller", "war", "western",
    ],
)

# User ratings: tab-delimited; column names are supplied here.
rating = pd.read_csv(
    "../data/u.data",
    sep="\t",
    names=["user_id", "item_id", "rating", "timestamp"],
)

In [3]:
# peak at dataframe
item.head()

# we only need the movie_id and movie_title
movies = item.loc[:, :"movie_title"].copy() 
movies.head() 

Unnamed: 0,movie_id,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
movies["movie_title"]

0                                Toy Story (1995)
1                                GoldenEye (1995)
2                               Four Rooms (1995)
3                               Get Shorty (1995)
4                                  Copycat (1995)
                          ...                    
1677                            Mat' i syn (1997)
1678                             B. Monkey (1998)
1679                         Sliding Doors (1998)
1680                          You So Crazy (1994)
1681    Scream of Stone (Schrei aus Stein) (1991)
Name: movie_title, Length: 1682, dtype: object

In [5]:
# peek at the raw rating frame (not the cell's last expression, so nothing is rendered for it)
rating.head()

# drop the timestamp column; rebinding (instead of inplace=True) together with
# errors="ignore" keeps this cell safe to re-run once the column is already gone
rating = rating.drop(columns="timestamp", errors="ignore")
rating.head()

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


#### Building the Memory-Based Recommendation System

In [6]:
# user x item matrix: one row per user_id, one column per item_id, 0 where no rating exists
user_ratings = rating.pivot_table(index="user_id", columns="item_id", values="rating").fillna(0)

# matrix dimensions: rows are users, columns are items
n_users, n_items = user_ratings.shape

print(f"Users: {n_users}\nItems: {n_items}")
user_ratings.head()

Users: 943
Items: 1682


item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# alterations made on code from https://www.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/ 
# to cater to pandas dataframes

def train_test_split(data: np.ndarray, n_users: int, n_items: int,
                     test_size: int = 5, seed=None):
    """
    Split a user x item rating matrix into train and test matrices.

    For every user, up to ``test_size`` of the items they rated (non-zero
    entries) are picked at random without replacement. Those ratings are
    copied into ``test`` and zeroed out in ``train``, so the two matrices
    never share a rating and ``train + test`` reproduces ``data``.

    Parameters
    ----------
    data : np.ndarray
        Rating matrix of shape (n_users, n_items); 0 means "no rating".
    n_users : int
        Number of rows of ``data`` to process.
    n_items : int
        Number of columns of ``data``.
    test_size : int, default 5
        Maximum number of ratings held out per user. Users with fewer rated
        items have all of their ratings held out; users with no ratings are
        skipped instead of raising ``ValueError``.
    seed : int or None, default None
        Seed for a dedicated ``np.random.RandomState``. When None, NumPy's
        global random state is used, so ``np.random.seed`` still applies.

    Returns
    -------
    (train, test) : tuple of np.ndarray
        ``train`` is a copy of ``data`` with the held-out ratings set to 0;
        ``test`` is a float array that is 0 everywhere except the held-out ratings.
    """
    # both the np.random module and a RandomState instance expose .choice
    rng = np.random if seed is None else np.random.RandomState(seed)

    # create an empty array of shape n x m for test
    test = np.zeros((n_users, n_items))
    train = data.copy()

    for user in range(n_users):
        rated_items = data[user, :].nonzero()[0]
        # never ask for more samples than the user has ratings
        n_held_out = min(test_size, rated_items.size)
        if n_held_out == 0:
            continue

        held_out = rng.choice(rated_items, size=n_held_out, replace=False)
        # zero in train means "no rating"; test keeps the original rating
        train[user, held_out] = 0.
        test[user, held_out] = data[user, held_out]

    return train, test

# seed NumPy's global RNG so the random hold-out (and every metric computed from it) is reproducible
np.random.seed(42)
train, test = train_test_split(data=user_ratings.to_numpy(), n_users=n_users, n_items=n_items)

In [8]:
# cosine similarity between the rows of the training matrix (users) and between
# its columns (items); 1e-9 is added to every entry before the similarity is computed
# (presumably to keep all-zero vectors from having zero norm)
user_similarity, item_similarity = (
    pairwise.cosine_similarity(matrix + 1e-9) for matrix in (train, train.T)
)

print(user_similarity.shape, item_similarity.shape)

(943, 943) (1682, 1682)


#### User Based Collaborative Filtering (UBCF)

In [9]:
# similarity-weighted sum of the training ratings, normalised per user by the
# total absolute similarity of that user's row
user_preds = user_similarity.dot(train) / np.abs(user_similarity).sum(axis=1, keepdims=True)

# compare only on the held-out (non-zero) test cells
nonzero_test = test[test != 0]
nonzero_user_preds = user_preds[test != 0]

user_rating_preds = mean_squared_error(nonzero_test, nonzero_user_preds)
print(f"UBCF Mean Squared Error: {user_rating_preds}")

UBCF Mean Squared Error: 8.314954380040954


#### Item Based Collaborative Filtering (IBCF)

In [10]:
# similarity-weighted sum of the training ratings, normalised per item by the
# total absolute similarity of that item's row
item_preds = train.dot(item_similarity) / np.abs(item_similarity).sum(axis=1).reshape(1, -1)

# compare only on the held-out (non-zero) test cells
nonzero_item_preds = item_preds[test != 0]

item_rating_preds = mean_squared_error(nonzero_test, nonzero_item_preds)
print(f"IBCF Mean Squared Error: {item_rating_preds}")

IBCF Mean Squared Error: 11.389855185521185


## Content Based Filtering

In [17]:
# attach every rating to its movie's metadata (item.movie_id matches rating.item_id)
movies = item.merge(rating, left_on="movie_id", right_on="item_id")
# ratings laid out as users (rows) x movie titles (columns)
movies_pivot = movies.pivot_table(index="user_id", columns="movie_title", values="rating")

# transposed only so the preview fits on the screen
movies_pivot.T.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
1-900 (1994),,,,,,,,,,,...,,,,,,,,,,
101 Dalmatians (1996),2.0,,,,2.0,,,,,,...,2.0,,,2.0,4.0,,,,,
12 Angry Men (1957),5.0,,,,,4.0,4.0,,,5.0,...,,,,,,,,,,
187 (1997),,,2.0,,,,,,,,...,,,,,,,,,,


In [12]:
# mean rating and number of ratings for every movie title
avg_rating, num_ratings = (
    movies.groupby("movie_title")["rating"].agg(stat) for stat in ("mean", "count")
)

# one row per title with both statistics side by side
ratings_counts = pd.concat(
    [avg_rating.rename("avg_rating"), num_ratings.rename("num_of_ratings")],
    axis=1,
)

# attach the per-title statistics to every row of the merged movie/rating data
full_movie_data = movies.merge(ratings_counts, left_on="movie_title", right_index=True)
full_movie_data.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,imbd_url,unknown,action,adventure,animation,childrens,...,romance,sci-fi,thriller,war,western,user_id,item_id,rating,avg_rating,num_of_ratings
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,308,1,4,3.878319,452
1,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,287,1,5,3.878319,452
2,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,148,1,4,3.878319,452
3,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,280,1,4,3.878319,452
4,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,66,1,3,3.878319,452


In [15]:
# https://towardsdatascience.com/recommender-system-in-python-part-2-content-based-system-693a0e4bb306

def get_similar_movies(full_movie_data: pd.DataFrame,
                       movie_matrix: pd.DataFrame,
                       movie_title: str,
                       min_num_of_ratings: int = 100,
                       n_recommendations: int = 5
                       ):
    """
    Get the movies whose ratings correlate most strongly with ``movie_title``.

    Parameters
    ----------
    full_movie_data : pd.DataFrame
        Must contain the columns "movie_title", "avg_rating" and
        "num_of_ratings"; repeated rows per title are allowed.
    movie_matrix : pd.DataFrame
        Rating matrix with one column per movie title (the notebook passes a
        user_id x movie_title pivot table).
    movie_title : str
        Column of ``movie_matrix`` to compare every other column against.
    min_num_of_ratings : int, default 100
        Only movies with strictly more ratings than this are returned.
    n_recommendations : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pd.DataFrame
        Columns "movie_title", "correlation", "avg_rating", "num_of_ratings",
        sorted by descending correlation. The queried movie itself is never
        part of the result.

    Raises
    ------
    KeyError
        If ``movie_title`` is not a column of ``movie_matrix``.
    """
    if movie_title not in movie_matrix.columns:
        raise KeyError(f"{movie_title!r} is not a column of movie_matrix")

    # correlation of every movie with the requested one; NaN means no usable overlap
    correlations = (
        movie_matrix.corrwith(movie_matrix[movie_title])
        .dropna()
        .rename("correlation")
        .rename_axis("movie_title")  # explicit name so the merge key always exists
        .reset_index()
    )

    # one row of statistics per title; de-duplicating before the merge avoids
    # joining against every individual rating row
    movie_stats = full_movie_data[
        ["movie_title", "avg_rating", "num_of_ratings"]].drop_duplicates()

    corr_with_movie = pd.merge(left=correlations,
                               right=movie_stats,
                               on="movie_title")[
        ["movie_title", "correlation", "avg_rating", "num_of_ratings"]].drop_duplicates().reset_index(drop=True)

    # keep sufficiently rated movies and exclude the query by title; dropping the
    # first sorted row instead would discard the best match whenever the query
    # is filtered out or not ranked first
    has_enough_ratings = corr_with_movie["num_of_ratings"] > min_num_of_ratings
    is_other_movie = corr_with_movie["movie_title"] != movie_title
    result = corr_with_movie[has_enough_ratings & is_other_movie].sort_values(
        by="correlation", ascending=False)

    return result.head(n_recommendations)

In [16]:
get_similar_movies(full_movie_data=full_movie_data,
                   movie_matrix=movies_pivot,
                   movie_title="Toy Story (1995)")

Unnamed: 0,movie_title,correlation,avg_rating,num_of_ratings
288,"Craft, The (1996)",0.5491,3.115385,104
356,Down Periscope (1996),0.457995,2.70297,101
825,Miracle on 34th Street (1994),0.456291,3.722772,101
479,G.I. Jane (1997),0.454756,3.36,175
51,Amistad (1997),0.449915,3.854839,124
