In [1]:
import pandas as pd
import numpy as np
from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import dump
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [2]:
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):
    reader = Reader(rating_scale=(0, 5))
    trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)
    testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)
    trainset = trainset.construct_trainset(trainset.raw_ratings)
    testset = testset.construct_testset(testset.raw_ratings)
    return trainset, testset

In [3]:
file_path_train = './data/train.csv'
file_path_test = './data/test.csv'
traindf = pd.read_csv(file_path_train)
testdf = pd.read_csv(file_path_test)
trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)

In [4]:
sim_options = {'name': 'cosine',
               'user_based': False
               }
knnbaseline_algo = KNNBaseline(sim_options=sim_options)
knnbaseline_algo.fit(trainset)

svd_algo = SVD()
svd_algo.fit(trainset)

svdpp_algo = SVDpp()
svdpp_algo.fit(trainset)

slopeone_algo = SlopeOne()
slopeone_algo.fit(trainset)

baseline_algo = BaselineOnly()
baseline_algo.fit(trainset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7fa3ca3431c0>

In [5]:
movies = pd.read_csv("./data/movies_tmdb.csv")

In [6]:
genre_to_idx = {'Adventure': 0,
 'Animation': 1,
 'Children': 2,
 'Comedy': 3,
 'Fantasy': 4,
 'Romance': 5,
 'Drama': 6,
 'Action': 7,
 'Crime': 8,
 'Thriller': 9,
 'Horror': 10,
 'Mystery': 11,
 'Sci-Fi': 12,
 'War': 13,
 'Musical': 14,
 'Documentary': 15,
 'IMAX': 16,
 'Western': 17,
 'Film-Noir': 18,
 '(no genres listed)': 19}

idx_to_genre = {0: 'Adventure',
 1: 'Animation',
 2: 'Children',
 3: 'Comedy',
 4: 'Fantasy',
 5: 'Romance',
 6: 'Drama',
 7: 'Action',
 8: 'Crime',
 9: 'Thriller',
 10: 'Horror',
 11: 'Mystery',
 12: 'Sci-Fi',
 13: 'War',
 14: 'Musical',
 15: 'Documentary',
 16: 'IMAX',
 17: 'Western',
 18: 'Film-Noir',
 19: '(no genres listed)'}

In [7]:
movies['description_genre'] = movies['overview'] + 2*movies['genres']
movies['description_genre'] = movies['description_genre'].fillna('')

tf_new = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix_new = tf_new.fit_transform(movies['description_genre'])

cosine_sim_new = linear_kernel(tfidf_matrix_new, tfidf_matrix_new)

In [8]:
movies = movies.reset_index()
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

def get_recommendations_new(title):
    idx = indices[title]
    if type(idx) != np.int64:
        if len(idx)>1:
            print("ALERT: Multiple values")
            idx = idx[0]
    sim_scores = list(enumerate(cosine_sim_new[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movies['movieId'].iloc[movie_indices]

def genre_based_popularity(genre):
    mask = movies.genres.apply(lambda x: genre in x)
    filtered_movie = movies[mask]
    filtered_movie = filtered_movie.sort_values(by='popularity', ascending=False)
    return filtered_movie['movieId'].head(10).values.tolist()

In [9]:
user_info = pd.read_csv('./data/user_info.csv')

user_info['user_vector'] = user_info['user_vector'].apply(lambda x: x.replace('[', ' ').replace(']', ' ').strip().split())
user_info['user_vector'] = user_info['user_vector'].apply(lambda x: np.asarray(x).astype(float))

def user_top_genre(userId):
    user_vec = user_info['user_vector'][user_info['userId'] == userId].values[0].copy()
    top_genre_indices = np.flip(np.argsort(user_vec))
    genre_list = []
    for i in top_genre_indices[:3]:
        genre_list.append(idx_to_genre[i])
    return genre_list



In [10]:
user_list = testdf['userId'].unique()
svd_wt = 1.504
knn_wt = 0.712
svdpp_wt = 0.08
slopeone_wt = 0.88
baseline_wt = -2.53

def hybrid(userId):
    user_movies = testdf[testdf['userId'] == userId]
    user_movies['est'] = user_movies['movieId'].apply(lambda x: knn_wt*knnbaseline_algo.predict(userId, x).est + svdpp_wt*svdpp_algo.predict(userId, x).est\
              + svd_wt*svd_algo.predict(userId, x).est + baseline_wt*baseline_algo.predict(userId, x).est\
                +slopeone_wt*slopeone_algo.predict(userId,x).est)    
    user_movies = user_movies.sort_values(by ='est', ascending=False).head(4)
    user_movies['Model'] = 'SVD + CF'
    
    recommend_list = user_movies[['movieId', 'est', 'Model']]
    
    movie_list = recommend_list['movieId'].values.tolist()
    print(movie_list)
    sim_movies_list = []
    for movie_id in movie_list:
        # Call content based 
        movie_title = movies['title'][movies['movieId'] == movie_id].values[0]
        sim_movies = get_recommendations_new(movie_title)
        sim_movies_list.extend(sim_movies)
        
    for movie_id in sim_movies_list:
        pred_rating = knn_wt*knnbaseline_algo.predict(userId, movie_id).est + svdpp_wt*svdpp_algo.predict(userId, movie_id).est\
              + svd_wt*svd_algo.predict(userId, movie_id).est + baseline_wt*baseline_algo.predict(userId, movie_id).est\
                +slopeone_wt*slopeone_algo.predict(userId,movie_id).est
        row_df = pd.DataFrame([[movie_id, pred_rating, 'Movie similarity']], columns=['movieId', 'est','Model'])
        recommend_list = pd.concat([recommend_list, row_df], ignore_index=True)
    
    # Popular based movies
    top_genre_list = user_top_genre(userId)
    
    popular_movies = []
    for top_genre in top_genre_list:
        popular_movies.extend(genre_based_popularity(top_genre))
    
    # Compute ratings for the popular movies
    for movie_id in popular_movies:
        pred_rating = knn_wt*knnbaseline_algo.predict(userId, movie_id).est + svdpp_wt*svdpp_algo.predict(userId, movie_id).est\
              + svd_wt*svd_algo.predict(userId, movie_id).est + baseline_wt*baseline_algo.predict(userId, movie_id).est\
                +slopeone_wt*slopeone_algo.predict(userId,movie_id).est
        row_df = pd.DataFrame([[movie_id, pred_rating, 'Popularity']], columns=['movieId', 'est','Model'])
        recommend_list = pd.concat([recommend_list, row_df], ignore_index=True)
    recommend_list = recommend_list.drop_duplicates(subset=['movieId'])
    train_movie_list = traindf[traindf['userId']==userId]['movieId'].values.tolist()
    
    # Remove movies in training for this user
    mask = recommend_list.movieId.apply(lambda x: x not in train_movie_list)
    recommend_list = recommend_list[mask]
    
    return recommend_list

In [11]:
def get_title(x):
    mid = x['movieId']
    return movies['title'][movies['movieId'] == mid].values

def get_genre(x):
    mid = x['movieId']
    return movies['genres'][movies['movieId'] == mid].values

In [12]:
movie_ids = hybrid(1)
movie_ids['title'] = movie_ids.apply(get_title,axis=1)
movie_ids['genre'] = movie_ids.apply(get_genre,axis=1)
print(movie_ids.sort_values(by='est', ascending = False).head(10))

[2028, 151, 1517, 553]
    movieId       est             Model                          title  \
38     3494  3.912991  Movie similarity                    [True Grit]   
39     1283  3.807303  Movie similarity                    [High Noon]   
64    79702  3.665759        Popularity  [Scott Pilgrim vs. the World]   
65    81847  3.639001        Popularity                      [Tangled]   
0      2028  3.582133          SVD + CF          [Saving Private Ryan]   
49     1252  3.580368        Popularity                    [Chinatown]   
51     1748  3.547329        Popularity                    [Dark City]   
20     2067  3.529240  Movie similarity               [Doctor Zhivago]   
43    54997  3.480683  Movie similarity                 [3:10 to Yuma]   
53     1245  3.472763        Popularity            [Miller's Crossing]   

                                                genre  
38                [['Adventure', 'Drama', 'Western']]  
39                             [['Drama', 'Western

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_movies['est'] = user_movies['movieId'].apply(lambda x: knn_wt*knnbaseline_algo.predict(userId, x).est + svdpp_wt*svdpp_algo.predict(userId, x).est\


In [13]:
movie_ids = hybrid(3)
movie_ids['title'] = movie_ids.apply(get_title, axis=1)
movie_ids['genre'] = movie_ids.apply(get_genre, axis=1)
print(movie_ids.sort_values(by='est', ascending = False)[['est','Model','title']].head(10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_movies['est'] = user_movies['movieId'].apply(lambda x: knn_wt*knnbaseline_algo.predict(userId, x).est + svdpp_wt*svdpp_algo.predict(userId, x).est\


[70946, 914, 2018, 4518]
         est             Model                                     title
39  1.832586  Movie similarity                        [My Name Is Bruce]
37  1.832586  Movie similarity          [Beyond the Valley of the Dolls]
25  1.832586  Movie similarity  [Little Nemo: Adventures in Slumberland]
20  1.832586  Movie similarity                           [Sweet Charity]
51  0.963071        Popularity                      [Mad Max: Fury Road]
0   0.883210          SVD + CF                                 [Troll 2]
38  0.819171  Movie similarity               [What We Do in the Shadows]
1   0.562931          SVD + CF                            [My Fair Lady]
6   0.478125  Movie similarity                     [Blood and Chocolate]
36  0.404931  Movie similarity                  [Class of Nuke 'Em High]


In [14]:
import gradio as gr

def prediction(text):
    movie_ids = hybrid(text)
    movie_ids['title'] = movie_ids.apply(get_title, axis=1)
    movie_ids['genre'] = movie_ids.apply(get_genre, axis=1)
    # Use extend to concatenate lists
    movie_ids_list = []
    movie_ids['title'].head().apply(lambda x: movie_ids_list.extend(x))    
    return ", ".join(movie_ids_list)

examples = [1]
# Define the Gradio interface
iface = gr.Interface(fn=prediction, inputs="number", outputs="text", examples=examples)
# Launch the Gradio interface
iface.launch()


Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_movies['est'] = user_movies['movieId'].apply(lambda x: knn_wt*knnbaseline_algo.predict(userId, x).est + svdpp_wt*svdpp_algo.predict(userId, x).est\


[70946, 914, 2018, 4518]
[2028, 151, 1517, 553]
[356, 4306, 5816, 33794]
