This model combines the recommendations generated from content-based, collaborative filtering and SVD model. The hybrid model overcomes the shortcomings of individual models and improves the diversity of the recommendations

In [None]:
!pip install surprise

In [5]:
import pandas as pd
import numpy as np

In [6]:
from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import dump

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [8]:
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):
    reader = Reader(rating_scale=(0, 5))
    trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)
    testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)
    trainset = trainset.construct_trainset(trainset.raw_ratings)
    testset = testset.construct_testset(testset.raw_ratings)
    return trainset, testset

In [None]:
file_path_train = '../demo/preproc_data/training_data.csv'
file_path_test = '../demo/preproc_data/testing_data.csv'
traindf = pd.read_csv(file_path_train)
testdf = pd.read_csv(file_path_test)
trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)

In [11]:
testdf.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
0,1,50,5.0,964982931,"['Crime', 'Mystery', 'Thriller']",[]
1,1,151,5.0,964984041,"['Action', 'Drama', 'Romance', 'War']",[]
2,1,157,5.0,964984100,"['Comedy', 'War']",[]
3,1,235,4.0,964980908,"['Comedy', 'Drama']",[]
4,1,296,3.0,964982967,"['Comedy', 'Crime', 'Drama', 'Thriller']",[]


### CF and Latent Factor models:

In [None]:
# basic collaborative filtering algorithm taking into account a baseline rating.
sim_options = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
               }
knnbaseline_algo = KNNBaseline(sim_options=sim_options)

knnbaseline_algo.fit(trainset)
knnbaseline_predictions = knnbaseline_algo.test(testset)

file_name = 'KnnBaseline_model'
knn_model_path = "../demo/models/" + file_name
dump.dump(knn_model_path, algo=knnbaseline_predictions)
# _, loaded_algo = dump.load(knn_model_path)

accuracy.rmse(knnbaseline_predictions)
accuracy.mae(knnbaseline_predictions)
print("Done!")

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9044
MAE:  0.6960
Done!


In [None]:
svd_algo = SVD()

svd_algo.fit(trainset)
svd_predictions = svd_algo.test(testset)

file_name = 'svd_model'
svd_model_path = "../demo/models/" + file_name
dump.dump(svd_model_path, algo=svd_algo)
# _, loaded_algo = dump.load(svd_model_path)

accuracy.rmse(svd_predictions)
accuracy.mae(svd_predictions)
print("Done!")

RMSE: 0.8847
MAE:  0.6762
Done!


In [None]:
svdpp_algo = SVDpp()

svdpp_algo.fit(trainset)
svdpp_predictions = svdpp_algo.test(testset)

file_name = 'svdpp_model'
svdpp_model_path = "../demo/models/" + file_name
dump.dump(svdpp_model_path, algo=svdpp_algo)
# _, loaded_algo = dump.load(svdpp_model_path)

accuracy.rmse(svdpp_predictions)
accuracy.mae(svdpp_predictions)
print("Done!")

RMSE: 0.8741
MAE:  0.6666
Done!


##### Movie Similarity model

In [67]:
movies = pd.read_csv("../demo/data/tmdb_5000_movies.csv")

In [63]:
genre_to_idx = {'Adventure': 0,
 'Animation': 1,
 'Children': 2,
 'Comedy': 3,
 'Fantasy': 4,
 'Romance': 5,
 'Drama': 6,
 'Action': 7,
 'Crime': 8,
 'Thriller': 9,
 'Horror': 10,
 'Mystery': 11,
 'Sci-Fi': 12,
 'War': 13,
 'Musical': 14,
 'Documentary': 15,
 'IMAX': 16,
 'Western': 17,
 'Film-Noir': 18,
 '(no genres listed)': 19}

In [64]:
idx_to_genre = {0: 'Adventure',
 1: 'Animation',
 2: 'Children',
 3: 'Comedy',
 4: 'Fantasy',
 5: 'Romance',
 6: 'Drama',
 7: 'Action',
 8: 'Crime',
 9: 'Thriller',
 10: 'Horror',
 11: 'Mystery',
 12: 'Sci-Fi',
 13: 'War',
 14: 'Musical',
 15: 'Documentary',
 16: 'IMAX',
 17: 'Western',
 18: 'Film-Noir',
 19: '(no genres listed)'}

In [68]:
movies['tagline'] = movies['tagline'].fillna('')
movies['description_genre'] = movies['overview'] + movies['tagline'] + 2*movies['genres']
movies['description_genre'] = movies['description_genre'].fillna('')

In [29]:
tf_new = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0.0, stop_words='english')
tfidf_matrix_new = tf_new.fit_transform(movies['description_genre'])

In [30]:
cosine_sim_new = linear_kernel(tfidf_matrix_new, tfidf_matrix_new)

In [31]:
movies = movies.reset_index()
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])
indices.head(2)

title
Avatar                                      0
Pirates of the Caribbean: At World's End    1
dtype: int64

In [73]:
def get_recommendations_new(title):
    idx = indices[title]
    if type(idx) != np.int64:
        if len(idx)>1:
            print("ALERT: Multiple values")
            idx = idx[0]
    sim_scores = list(enumerate(cosine_sim_new[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movies['id'].iloc[movie_indices]

##### Popularity model

In [76]:
def genre_based_popularity(genre):
    mask = movies.genres.apply(lambda x: genre in x)
    filtered_movie = movies[mask]
    filtered_movie = filtered_movie.sort_values(by='popularity', ascending=False)
#     filtered_movie = filtered_movie.sort_values(by='wr', ascending=False)
    return filtered_movie['id'].head(10).values.tolist() 

# genre_based_popularity('Animation')[['title', 'popularity']].head(25)

In [34]:
user_info = pd.read_csv('../demo/preproc_data/user_info.csv')

In [35]:
user_info['user_vector'] = user_info['user_vector'].apply(lambda x: x.replace('[', ' ').replace(']', ' ').strip().split())
user_info['user_vector'] = user_info['user_vector'].apply(lambda x: np.asarray(x).astype(float))


In [36]:
def user_top_genre(userId):
    user_vec = user_info['user_vector'][user_info['userId'] == userId].values[0].copy()
    print("User Vector: ", user_vec)
    top_genre_indices = np.flip(np.argsort(user_vec))
    genre_list = []
    for i in top_genre_indices[:3]:
        genre_list.append(idx_to_genre[i])
    return genre_list

In [37]:
user_top_genre(1)

User Vector:  [4.43076923 4.72727273 4.64705882 4.26086957 4.28947368 4.22727273
 4.56       4.2972973  4.35135135 4.11363636 3.375      4.
 4.23529412 4.44444444 4.76470588 0.         0.         4.25
 5.         0.        ]


['Film-Noir', 'Musical', 'Animation']

##### Hybrid model

In [38]:
knn_baseline = dump.load('../demo/models/KnnBaseline_model')
svdpp = dump.load('../demo/models/svdpp_model')

In [39]:
# List of users in testing data:
user_list = testdf['userId'].unique()

In [40]:
# type(testdf['userId'][0])
test_movies = testdf[testdf['userId'] == 60]
test_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
1745,60,318,4.0,1393542053,"['Crime', 'Drama']",[]
1746,60,805,4.0,1393541754,"['Drama', 'Thriller']",[]
1747,60,1242,4.0,1393541757,"['Drama', 'War']",[]
1748,60,1562,3.0,1393541892,"['Action', 'Adventure', 'Fantasy', 'Thriller']",[]


In [69]:
# Combined model predicion on testing data, using top movies to generate more movies based on movie similarity and popularity

def hybrid(userId):
    user_movies = testdf[testdf['userId'] == userId]
    user_movies['est'] = user_movies['movieId'].apply(lambda x: 0.6*knnbaseline_algo.predict(userId,x).est + 0.4*svdpp_algo.predict(userId, x).est)    
    user_movies = user_movies.sort_values(by ='est', ascending=False).head(4)
    user_movies['Model'] = 'SVD + CF'
#     user_movies = user_movies['movieId'].values.tolist()
#     print("User liked movies list: ", user_movies)
    
    recommend_list = user_movies[['movieId', 'est', 'Model']]
    print(recommend_list.head())

#     top_movie = user_movies['movieId'].iloc[0]
#     print("Top movie id", top_movie)
#     top_movie_title = movies['title'][movies['movieId'] == top_movie].values[0]
#     print("Top movie title", top_movie_title)

    
    movie_list = recommend_list['movieId'].values.tolist()
    print(movie_list)
    sim_movies_list = []
    for movie_id in movie_list:
        # Call content based 
        # movie_title = movies['title'][movies['id'] == movie_id].values[0]
        
        # NOTE: hardcode
        movie_title = "Avatar"
        sim_movies = get_recommendations_new(movie_title)
#         print(sim_movies.values.tolist())
        sim_movies_list.extend(sim_movies)
    
    
    # Compute ratings for the popular movies
    for movie_id in sim_movies_list:
        pred_rating = 0.6*knnbaseline_algo.predict(userId, movie_id).est + 0.4*svdpp_algo.predict(userId, movie_id).est
        row_df = pd.DataFrame([[movie_id, pred_rating, 'Movie similarity']], columns=['movieId', 'est','Model'])
        recommend_list = pd.concat([recommend_list, row_df], ignore_index=True)
    
    # Popular based movies
    top_genre_list = user_top_genre(userId)
    print("User top genre list: ", top_genre_list)
    
    popular_movies = []
    for top_genre in top_genre_list:
        popular_movies.extend(genre_based_popularity(top_genre))
    print("Final list: ", popular_movies)
    
    # Compute ratings for the popular movies
    for movie_id in popular_movies:
        pred_rating = 0.6*knnbaseline_algo.predict(userId, movie_id).est + 0.4*svdpp_algo.predict(userId, movie_id).est
        row_df = pd.DataFrame([[movie_id, pred_rating, 'Popularity']], columns=['movieId', 'est','Model'])
        recommend_list = pd.concat([recommend_list, row_df], ignore_index=True)
    recommend_list = recommend_list.drop_duplicates(subset=['movieId'])
    train_movie_list = traindf[traindf['userId']==userId]['movieId'].values.tolist()
    
    # Remove movies in training for this user
    mask = recommend_list.movieId.apply(lambda x: x not in train_movie_list)
    recommend_list = recommend_list[mask]
    
    return recommend_list

In [42]:
# traindf[traindf['userId'] == 9].sort_values(by = 'rating', ascending = False)
traindf[traindf['userId'] == 524].sort_values(by = 'rating', ascending = False)

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
66038,524,589,5.0,851608875,"['Action', 'Sci-Fi']",[]
66000,524,151,5.0,851608839,"['Action', 'Drama', 'Romance', 'War']",[]
66024,524,457,5.0,851608781,['Thriller'],[]
66019,524,377,5.0,851608745,"['Action', 'Romance', 'Thriller']",[]
66015,524,367,5.0,851608818,"['Action', 'Comedy', 'Crime', 'Fantasy']",[]
...,...,...,...,...,...,...
66057,524,1027,2.0,851609037,"['Adventure', 'Drama']",[]
66060,524,1097,2.0,851609016,"['Children', 'Drama', 'Sci-Fi']",[]
66003,524,173,1.0,851609191,"['Action', 'Crime', 'Sci-Fi']",[]
65988,524,12,1.0,852404800,"['Comedy', 'Horror']",[]


In [43]:
testdf[testdf['userId'] == 574]
# testdf[testdf['userId'] == 574]
# testdf[testdf['userId'] == 576]

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
17795,574,47,5.0,834634527,"['Mystery', 'Thriller']",[]
17796,574,296,3.0,834634383,"['Comedy', 'Crime', 'Drama', 'Thriller']",[]
17797,574,316,4.0,834634443,"['Action', 'Adventure', 'Sci-Fi']",[]
17798,574,339,4.0,834634464,"['Comedy', 'Romance']",[]
17799,574,588,5.0,834634408,"['Adventure', 'Animation', 'Children', 'Comedy...",[]


In [87]:
# movie_ids = hybrid(1)
# movie_ids = hybrid(2)
# movie_ids = hybrid(574)
# movie_ids = hybrid(9)
movie_ids = hybrid(576)

       movieId       est     Model
17808     1231  3.674620  SVD + CF
17807     1188  3.473315  SVD + CF
17809     1347  3.224381  SVD + CF
17810     2826  2.861990  SVD + CF
[1231, 1188, 1347, 2826]
User Vector:  [3.3        3.5        4.         3.44444444 3.625      3.
 3.375      3.         4.75       3.8        3.66666667 3.5
 2.85714286 0.         0.         0.         0.         0.
 0.         0.        ]
User top genre list:  ['Crime', 'Children', 'Thriller']
Final list:  [155, 238, 278, 680, 198184, 272, 49026, 10764, 206647, 240, 76341, 135397, 119450, 131631, 87101, 155, 27205, 205596, 210577, 61791]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_movies['est'] = user_movies['movieId'].apply(lambda x: 0.6*knnbaseline_algo.predict(userId,x).est + 0.4*svdpp_algo.predict(userId, x).est)


In [81]:
def get_title(x):
    mid = x['movieId']
    return movies['title'][movies['id'] == mid].values

In [82]:
def get_genre(x):
    mid = x['movieId']
    return movies['genres'][movies['id'] == mid].values

In [83]:
movie_ids['title'] = movie_ids.apply(get_title, axis=1)
movie_ids['genre'] = movie_ids.apply(get_genre, axis=1)

In [84]:
movie_ids.sort_values(by='est', ascending = False).head(10)

Unnamed: 0,movieId,est,Model,title,genre
0,527,5.0,SVD + CF,[],[]
1,50,4.91908,SVD + CF,[],[]
2,296,4.915235,SVD + CF,[Terminator 3: Rise of the Machines],"[[{""id"": 28, ""name"": ""Action""}, {""id"": 53, ""na..."
3,2028,4.912835,SVD + CF,[],[]
5,18,4.624252,Movie similarity,[The Fifth Element],"[[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ..."
13,127585,4.201521,Movie similarity,[X-Men: Days of Future Past],"[[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""na..."
51,20352,4.201521,Popularity,[Despicable Me],"[[{""id"": 16, ""name"": ""Animation""}, {""id"": 1075..."
49,62177,4.201521,Popularity,[Brave],"[[{""id"": 16, ""name"": ""Animation""}, {""id"": 12, ..."
48,150540,4.201521,Popularity,[Inside Out],"[[{""id"": 18, ""name"": ""Drama""}, {""id"": 35, ""nam..."
47,93456,4.201521,Popularity,[Despicable Me 2],"[[{""id"": 16, ""name"": ""Animation""}, {""id"": 35, ..."
