In [1]:
import pandas as pd
import numpy as np
import os

# Working directory of the kernel; the input CSVs are located relative to it when loaded.
cwd = os.getcwd()
# Echo the resolved path as the cell output.
cwd

'/Users/javi/Developer/WBS DataScience/0011-popularity/streamlit-app'

In [2]:
# Load the movies, ratings, tags and links tables from the "data" folder under the working directory.
data_dir = os.path.join(cwd, "data")
movies_df = pd.read_csv(os.path.join(data_dir, "movies.csv"))
ratings_df = pd.read_csv(os.path.join(data_dir, "ratings.csv"))
tags_df = pd.read_csv(os.path.join(data_dir, "tags.csv"))
links_df = pd.read_csv(os.path.join(data_dir, "links.csv"))

# Distinct user ids, in order of first appearance in the ratings table.
user_ids = ratings_df['userId'].unique()

In [3]:
# Replace missing values with 0 and cast every column to int.
# NOTE(review): 0 then acts as a sentinel for "no id" — presumably no real id is 0; confirm.
links_df = links_df.fillna(0).astype(int)

In [4]:
# Attach the link ids to every movie (inner join on movieId) and export the combined table.
# NOTE: despite its file name, the exported CSV holds movies + links, not tags.
movies_with_links = pd.merge(movies_df, links_df, on='movieId')

movies_with_links.to_csv('data/movies-tags.csv', index=False)

In [5]:
# One row per distinct user id, sorted ascending, exported without the index column.
user_ids_df = pd.DataFrame({"userId": user_ids}).sort_values("userId")

user_ids_df.to_csv('data/users.csv', index=False)

In [6]:
# Collect every distinct genre label from the pipe-separated `genres` column.
# A vectorised split + explode replaces the row-by-row `iterrows` loop and its repeated
# list-membership scans; rows with a missing genres value are skipped instead of
# raising on `.split`.
genre_labels = (movies_df['genres']
    .dropna()
    .str.split('|')
    .explode()
    .unique()   # first-appearance order, same as the original loop produced
)

# Labels are kept verbatim (placeholders such as "(no genres listed)" are not renamed),
# then sorted alphabetically for the exported lookup table.
genres = pd.DataFrame({"genre": genre_labels}).sort_values("genre")

genres.to_csv('data/genres.csv', index=False)

## Popularity-based Recommendations

In [7]:
# get the mean of ratings of every movie
rating = pd.DataFrame(ratings_df.groupby('movieId')['rating'].mean())

# get the best and most recent ratings
# rating.sort_values(["rating", "timestamp"], ascending=[False, False]).head()
rating.sort_values(["rating"], ascending=[False]).head(10)

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
88448,5.0
100556,5.0
143031,5.0
143511,5.0
143559,5.0
6201,5.0
102217,5.0
102084,5.0
6192,5.0
145994,5.0


In [6]:
# Put the number of ratings each movie received next to its mean rating.
rating = rating.assign(rating_count=ratings_df.groupby('movieId')['rating'].count())
# Most-rated movies first; the mean rating breaks ties.
rating.sort_values(by=["rating_count", "rating"], ascending=[False, False]).head(10)

Unnamed: 0_level_0,rating,rating_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
356,4.164134,329
318,4.429022,317
296,4.197068,307
593,4.16129,279
2571,4.192446,278
260,4.231076,251
480,3.75,238
110,4.031646,237
589,3.970982,224
527,4.225,220


In [7]:
# Add title/genres to the per-movie statistics and rank by rating count, then mean rating.
popular_movies = pd.merge(rating, movies_df, on='movieId').sort_values(
    by=["rating_count", "rating"], ascending=[False, False]
)
popular_movies.head(10)

Unnamed: 0,movieId,rating,rating_count,title,genres
314,356,4.164134,329,Forrest Gump (1994),Comedy|Drama|Romance|War
277,318,4.429022,317,"Shawshank Redemption, The (1994)",Crime|Drama
257,296,4.197068,307,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
510,593,4.16129,279,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
1938,2571,4.192446,278,"Matrix, The (1999)",Action|Sci-Fi|Thriller
224,260,4.231076,251,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
418,480,3.75,238,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
97,110,4.031646,237,Braveheart (1995),Action|Drama|War
507,589,3.970982,224,Terminator 2: Judgment Day (1991),Action|Sci-Fi
461,527,4.225,220,Schindler's List (1993),Drama|War


In [8]:
def get_popular_movies(ratings_df, movies_df, n = 10):
    """Return the ``n`` most-rated movies together with their metadata.

    Parameters
    ----------
    ratings_df : DataFrame with at least ``movieId`` and ``rating`` columns.
    movies_df : DataFrame with a ``movieId`` column; its other columns are
        carried over into the result.
    n : number of rows to return (default 10).

    Returns
    -------
    DataFrame with ``rating`` (mean rating), ``rating_count`` and the
    ``movies_df`` columns, ordered by ``rating_count`` and then ``rating``,
    both descending. Movies missing from either input are left out
    (inner join).
    """
    # Mean and count computed in a single pass over the grouped ratings.
    per_movie_stats = ratings_df.groupby('movieId')['rating'].agg(
        rating='mean', rating_count='count'
    )

    ranked = per_movie_stats.merge(movies_df, on='movieId')
    ranked = ranked.sort_values(by=["rating_count", "rating"], ascending=[False, False])
    return ranked.head(n)


def get_most_popular_movie(popular_movies):
    """Return ``[index_label, movieId]`` for the first row of ``popular_movies``.

    ``popular_movies`` is expected to be already ordered with the most popular
    movie first and must contain a ``movieId`` column. An empty frame raises
    ``IndexError``.
    """
    top_label = popular_movies.index[0]
    top_movie_id = popular_movies['movieId'].iloc[0]

    return [top_label, top_movie_id]


# Top 5 movies by rating count (this rebinds `popular_movies` to the 5-row result),
# then unpack the first row's index label and its movieId.
popular_movies = get_popular_movies(ratings_df, movies_df, n = 5)
mostPopularMovieIdx, mostPopularMovieId = get_most_popular_movie(popular_movies)

# Show both values as the cell output.
[mostPopularMovieIdx, mostPopularMovieId]

[314, 356]

## Item-based Recommendations

In [9]:
# Users x movies rating matrix: one row per userId, one column per movieId,
# NaN wherever a user has not rated a movie.
movies_crosstab = ratings_df.pivot_table(values='rating', index='userId', columns='movieId')
movies_crosstab

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [10]:
# Ratings given to the most popular movie, one entry per user (NaN where the user did not rate it).
popularMovieRatings = movies_crosstab[mostPopularMovieId]
# Show only the users who actually rated it. dropna() states that intent directly instead of
# relying on `>= 0` evaluating to False for NaN (which would also hide any negative value).
popularMovieRatings.dropna()

userId
1      4.0
6      5.0
7      5.0
8      3.0
10     3.5
      ... 
605    3.0
606    4.0
608    3.0
609    4.0
610    3.0
Name: 356, Length: 329, dtype: float64

In [12]:
# Treat "not rated" as a rating of 0 so that every user contributes to each correlation.
# Rebinding (instead of `inplace=True`) keeps this cell idempotent and removes the dependency on
# whether `popularMovieRatings` happens to share memory with the table being modified.
movies_crosstab = movies_crosstab.fillna(0)

# Take the reference column explicitly from the zero-filled table, as get_similar_movies does.
similarToPopularMovie = movies_crosstab.corrwith(movies_crosstab[mostPopularMovieId])

# Movies whose correlation is undefined come back as NaN; drop them.
similarToPopularMovieCorr = similarToPopularMovie.to_frame('PearsonR').dropna()

similarToPopularMovieCorr

Unnamed: 0_level_0,PearsonR
movieId,Unnamed: 1_level_1
1,0.228127
2,0.255733
3,0.090639
4,0.102824
5,0.064031
...,...
193581,-0.042070
193583,-0.042070
193585,-0.042070
193587,-0.042070


In [62]:
# Add rating counts

# Attach the rating counts and remove the reference movie from its own similarity list.
similarToPopularMovieSummary = (similarToPopularMovieCorr
    .join(rating['rating_count'])
    .drop(index=mostPopularMovieId)
)

similarToPopularMovieSummary

Unnamed: 0_level_0,PearsonR,rating_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.303465,215
2,0.367247,110
3,0.534682,52
4,0.388514,7
5,0.349541,49
...,...,...
185585,-1.000000,2
187541,1.000000,4
187593,-0.203519,12
187595,0.870388,5


In [63]:
# Filter movies having less than 10 ratings, and get the top 10, then add the movies data

# Keep movies with at least 10 ratings, rank by correlation (rating count breaks ties),
# attach the movie metadata and keep the first 10 rows.
has_enough_ratings = similarToPopularMovieSummary['rating_count'] >= 10
similarityTop10 = (similarToPopularMovieSummary
    .loc[has_enough_ratings]
    .sort_values(by=['PearsonR', 'rating_count'], ascending=[False, False])
    .merge(movies_df, left_index=True, right_on="movieId")
    .head(10)
)
similarityTop10

Unnamed: 0,PearsonR,rating_count,movieId,title,genres
993,0.932958,11,1295,"Unbearable Lightness of Being, The (1988)",Drama
4573,0.885253,11,6793,Beethoven (1992),Children|Comedy|Drama
286,0.881682,10,328,Tales from the Crypt Presents: Demon Knight (1...,Horror|Thriller
3607,0.865633,11,4954,Ocean's Eleven (a.k.a. Ocean's 11) (1960),Comedy|Crime
693,0.850591,13,911,Charade (1963),Comedy|Crime|Mystery|Romance|Thriller
6607,0.799415,10,55721,Elite Squad (Tropa de Elite) (2007),Action|Crime|Drama|Thriller
165,0.786428,10,195,Something to Talk About (1995),Comedy|Drama|Romance
153,0.785661,17,181,Mighty Morphin Power Rangers: The Movie (1995),Action|Children
7436,0.782601,12,80906,Inside Job (2010),Documentary
9193,0.776636,10,150548,Sherlock: The Abominable Bride (2016),Action|Crime|Drama|Mystery|Thriller


In [23]:
def get_similar_movies(movieId, ratings_df, ratings_mean_df, movies_df, n = 10, min_ratings = 10):
    """Return the ``n`` movies whose ratings correlate best with those of ``movieId``.

    Parameters
    ----------
    movieId : id of the reference movie; it must have at least one rating in
        ``ratings_df``.
    ratings_df : DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
    ratings_mean_df : DataFrame indexed by movieId with a ``rating_count`` column.
    movies_df : DataFrame with a ``movieId`` column; its other columns are
        carried over into the result.
    n : maximum number of rows to return (default 10).
    min_ratings : movies with fewer ratings than this are excluded (default 10).

    Returns
    -------
    DataFrame with ``PearsonR_Value``, ``rating_count`` and the ``movies_df``
    columns, ordered by correlation and then rating count, both descending.
    The reference movie itself is never included. The frame is empty when no
    correlation can be computed (e.g. the reference ratings are constant).

    Raises
    ------
    KeyError
        If ``movieId`` does not appear in ``ratings_df``.
    """
    movies_crosstab = pd.pivot_table(data=ratings_df, values='rating', index='userId', columns='movieId')

    # Unrated cells become 0. This deliberately changes the result: correlation skips
    # missing values pairwise, so without the fill each coefficient would be computed only
    # over the users who rated BOTH movies; with it, every user contributes and
    # "not rated" counts as the lowest possible score.
    movies_crosstab = movies_crosstab.fillna(0)

    target_ratings = movies_crosstab[movieId]

    # Pearson correlation of every movie column with the reference column;
    # undefined coefficients (NaN) are discarded.
    similar_corr = (movies_crosstab
        .corrwith(target_ratings)
        .to_frame('PearsonR_Value')
        .dropna()
    )

    # Drop the reference movie itself. errors='ignore' because its row is already gone
    # when its own correlation was NaN and removed by dropna() above.
    similar_summary = (similar_corr
        .join(ratings_mean_df['rating_count'])
        .drop(index=movieId, errors='ignore')
    )

    return (similar_summary[similar_summary['rating_count']>=min_ratings]
            .sort_values(['PearsonR_Value', 'rating_count'], ascending=[False,False])
            .merge(movies_df, left_index=True, right_on="movieId")
            .head(n)
          )

# Four most similar movies to the most popular one, considering only movies with >= 15 ratings.
get_similar_movies(mostPopularMovieId, ratings_df, rating, movies_df, n=4, min_ratings=15)

Unnamed: 0,PearsonR_Value,rating_count,movieId,title,genres
418,0.453632,238,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
436,0.44046,144,500,Mrs. Doubtfire (1993),Comedy|Drama
277,0.41118,317,318,"Shawshank Redemption, The (1994)",Crime|Drama
123,0.390374,201,150,Apollo 13 (1995),Adventure|Drama|IMAX


In [14]:
# Spot check: show the movies_df row whose index label is 153.
movies_df.loc[153]

movieId                                               181
title      Mighty Morphin Power Rangers: The Movie (1995)
genres                                    Action|Children
Name: 153, dtype: object

## User-based Recommendations

In [95]:
from sklearn.metrics.pairwise import cosine_similarity
import random

def get_user_recommendations(forUserId, ratings_df, movies_df, n = 10):
    """Recommend ``n`` movies the user has not rated, using user-based collaborative filtering.

    Every other user is weighted by the cosine similarity between their rating
    vector and the target user's; the predicted rating of a movie is the
    weighted average of the other users' ratings for it.

    Parameters
    ----------
    forUserId : id of the user to recommend for; must appear in ``ratings_df``.
    ratings_df : DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
    movies_df : DataFrame with a ``movieId`` column; its other columns are
        carried over into the result.
    n : number of recommendations to return (default 10).

    Returns
    -------
    DataFrame with ``predicted_rating`` plus the ``movies_df`` columns, ordered
    by ``predicted_rating`` descending.

    Raises
    ------
    KeyError
        If ``forUserId`` has no ratings in ``ratings_df``.

    Notes
    -----
    Unrated cells are filled with 0 and take part in the weighted average, so
    predictions are pulled towards 0 for rarely rated movies; the values are
    meaningful for ranking, not as star ratings.
    """
    # Users x movies table, with 0 standing in for "not rated".
    users_items = pd.pivot_table(
        data=ratings_df, values='rating', index='userId', columns='movieId'
    ).fillna(0)

    if forUserId not in users_items.index:
        raise KeyError("user {!r} has no ratings in ratings_df".format(forUserId))

    # Pairwise cosine similarities between users, labelled by userId on both axes.
    user_similarities = pd.DataFrame(
        cosine_similarity(users_items),
        columns=users_items.index,
        index=users_items.index
    )

    # Similarity of every OTHER user to the target user, normalised to sum to 1.
    neighbour_similarities = user_similarities[forUserId].drop(index=forUserId)
    total_similarity = neighbour_similarities.sum()
    if total_similarity > 0:
        weights = neighbour_similarities / total_similarity
    else:
        # No overlap with anybody: dividing by 0 would yield NaN predictions, so fall
        # back to equal weights (a plain average over the other users).
        weights = pd.Series(
            1.0 / max(len(neighbour_similarities), 1),
            index=neighbour_similarities.index,
        )

    # Other users' ratings, restricted to the movies the target user has not rated yet.
    is_other_user = users_items.index != forUserId
    is_unrated_by_target = users_items.loc[forUserId, :] == 0
    not_rated_movies = users_items.loc[is_other_user, is_unrated_by_target]

    # Predicted rating per unrated movie: dot product of its ratings column with the weights.
    weighted_averages = not_rated_movies.T.dot(weights).to_frame("predicted_rating")

    # Attach movie metadata and keep the n highest predictions.
    recommendations = weighted_averages.merge(movies_df, left_index=True, right_on="movieId")
    recommendations = recommendations.sort_values("predicted_rating", ascending=False).head(n)

    return recommendations
    

# Pick a demo user reproducibly: a dedicated, seeded generator makes a fresh
# "Restart & Run All" show the same user (and the same recommendations) every time,
# without touching the global random state.
SAMPLE_USER_SEED = 42
sampleUserId = random.Random(SAMPLE_USER_SEED).choice(user_ids)

print("User ID", sampleUserId)

get_user_recommendations(sampleUserId, ratings_df, movies_df, n = 10)

User ID 102


Unnamed: 0,predicted_rating,movieId,title,genres
97,2.621274,110,Braveheart (1995),Action|Drama|War
461,1.89144,527,Schindler's List (1993),Drama|War
46,1.766555,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
0,1.596926,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
512,1.588869,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
31,1.566925,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
1939,1.538051,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
275,1.521506,316,Stargate (1994),Action|Adventure|Sci-Fi
224,1.475375,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
325,1.465296,367,"Mask, The (1994)",Action|Comedy|Crime|Fantasy
