## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from textblob import TextBlob
import surprise
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import SVDpp
from surprise import Dataset
from surprise import Reader
from surprise import dump
from surprise import accuracy

import pickle

import warnings
warnings.filterwarnings("ignore")


pd.options.display.max_rows = 30
pd.options.display.float_format = '{:.3f}'.format
pd.options.display.max_columns = 100
pd.options.display.width = 10000

%matplotlib inline

## Importing Datasets

### Genres Dataset Will be used for item based similarity

In [2]:
movie_genres = pd.read_csv('data_generated/movie_genres.csv',index_col='movieId')
movie_genres.drop(columns=['title','release_year'], inplace = True)
movie_genres.head()

Unnamed: 0_level_0,Horror,Crime,Sci-Fi,Western,Romance,Comedy,Documentary,Film-Noir,Action,Thriller,Musical,Adventure,Fantasy,Animation,Drama,Children,War,Mystery
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0
3,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


### Titles Dataset will be used for mapping movies to names

In [3]:
movie_titles = pd.read_csv('data_movielens/movies.csv')
movie_titles.drop(columns=['genres'],inplace = True)
movie_titles.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
movie_titles.shape

(9742, 2)

### Tags Dataset Could be used for Sentiment Analysis

In [6]:
movie_tags = pd.read_csv('data_generated/movie_tags.csv')
movie_tags.head()

Unnamed: 0,userId,movieId,tag
0,2,60756,funny
1,2,60756,highly quotable
2,2,60756,will ferrell
3,2,89774,boxing story
4,2,89774,mma


In [7]:
movie_tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   userId   3683 non-null   int64 
 1   movieId  3683 non-null   int64 
 2   tag      3683 non-null   object
dtypes: int64(2), object(1)
memory usage: 86.4+ KB


## Sentiment Analysis on Tags

In [8]:
def get_polarity(df = pd.DataFrame([])):
    df['polarity'] = 0
    
    for i in range(df.shape[0]):
        res_i = TextBlob(df.loc[i,'tag'])
        df.loc[i,'polarity'] = res_i.sentiment.polarity
        
    return df['polarity']

In [9]:
movie_tags['polarity'] = get_polarity(movie_tags)

In [10]:
movie_tags.head(5)

Unnamed: 0,userId,movieId,tag,polarity
0,2,60756,funny,0.25
1,2,60756,highly quotable,0.16
2,2,60756,will ferrell,0.0
3,2,89774,boxing story,0.0
4,2,89774,mma,0.0


In [11]:
print("range of polarity {1:2f} TO {0:2f}".format(movie_tags['polarity'].describe()['max'] ,movie_tags['polarity'].describe()['min']))

range of polarity -1.000000 TO 1.000000


- Polarity ranges from -1 to 1 
- For this model, I would it to range from 0 to 5
- I consider negative polarity as a movie gaining 0 raiting
- Positive polarity will be multiplied by 5
- I will pick the average of the polarity for every user-item tuple

In [12]:
movie_tags.loc[movie_tags['polarity'] < 0] = 0 
movie_tags['polarity'] = movie_tags['polarity']*5
print("range of polarity {1:2f} to {0:2f}".format(movie_tags['polarity'].describe()['max'] ,movie_tags['polarity'].describe()['min']))

range of polarity 0.000000 to 5.000000


### Final Decision
- I decide to hold out the sentiment analysis for now
- I will implement it in future versions

### Ratings dataset will be used for Collabarative Filtering using User Based Similarity

In [13]:
user_ratings = pd.read_csv('data_movielens/ratings.csv')
user_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


### Drop timestamps

In [14]:
user_ratings.drop(columns=['timestamp'],inplace=True)
user_ratings['rating'] = user_ratings['rating'].astype(int)
user_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4
1,1,3,4
2,1,6,4
3,1,47,5
4,1,50,5


In [15]:
min_rating = user_ratings['rating'].describe()['min']
max_rating = user_ratings['rating'].describe()['max']
print(min_rating , " to " ,  max_rating)

0.0  to  5.0


## Creating Collabarative Filter

### Dataset Loading into Surprise

In [16]:
data_reader = Reader(rating_scale=(min_rating,max_rating))
data = Dataset.load_from_df(user_ratings, data_reader)

### Model Selection

In [17]:
benchmark = []
# Iterate over all algorithms
from surprise import SVD
from surprise import SlopeOne
from surprise import NormalPredictor
from surprise import KNNBaseline
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import BaselineOnly
from surprise import CoClustering


algorithms = [SVD(), SlopeOne(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]

print ("Attempting: ", str(algorithms), '\n\n\n')

for algorithm in algorithms:
    print("Starting: " ,str(algorithm))
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    print("Done: " ,str(algorithm), "\n\n")

print ('\n\tDONE\n')

Attempting:  [<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x000002121D779FA0>, <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x000002121D779D00>, <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x000002121D779640>, <surprise.prediction_algorithms.knns.KNNBaseline object at 0x000002121D7796D0>, <surprise.prediction_algorithms.knns.KNNBasic object at 0x000002121D774130>, <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x000002121D774610>, <surprise.prediction_algorithms.knns.KNNWithZScore object at 0x000002121D774820>, <surprise.prediction_algorithms.baseline_only.BaselineOnly object at 0x000002121D774520>, <surprise.prediction_algorithms.co_clustering.CoClustering object at 0x000002121D7749A0>] 



Starting:  <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x000002121D779FA0>
Done:  <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x000002121D779FA0> 


Starting:  <surprise.predi

In [18]:
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
surprise_results.sort_values(by = 'test_rmse')

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.917,0.197,0.209
SVD,0.923,3.875,0.217
KNNBaseline,0.925,0.265,2.027
KNNWithZScore,0.948,0.174,2.053
SlopeOne,0.948,3.849,7.408
KNNWithMeans,0.949,0.119,1.75
CoClustering,0.98,2.039,0.189
KNNBasic,1.01,0.104,1.585
NormalPredictor,1.5,0.097,0.254


### All the algorithms perform fairly similar
- Using Occum's Razor
- I will move ahead with KNNmeans model
- It is the simplest model with competitive results
- KNN means has the advantage that it is well suited for movie recommendations
- As it takes into account "Tough Reviewers" who review trends along with public but often give lower results

### Train Test Split

In [19]:
trainset , testset = train_test_split(data = data , test_size=0.2, random_state=42)

## Dumping Algorithms for future imports

In [20]:
for algorithm in algorithms:
    
    ## fit the algorithm to trainset
    algorithm.fit(trainset)
    
    ## save the mode
    name = str(algorithm).split(' ')[0].split('.')[-1]
    path = 'data_saved_models/' + name
    dump.dump(path)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


In [21]:
print("users : " , trainset.n_users)
print("movies : " , trainset.n_items)

users :  610
movies :  8928


### Train 2 models using KNN with Means
- One of these will be used for item based recommendations
- The other will be used for user based recommendations

In [22]:
similarity_config_item_based = {
    'name':'cosine', 
    'user_based': False, 
    'min_support' : 3
    }

similarity_config_user_based = {
    'name':'cosine', 
    'user_based': True, 
    'min_support' : 3
    }

#### Item Based

In [23]:
algo_item_based = KNNWithMeans(
    k = 15,
    min_k = 5, 
    sim_option = similarity_config_item_based
    )

In [24]:
algo_item_based.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x2121d6f8760>

In [25]:
# Train the algorithm on the trainset, and predict ratings for the testset
predictions = algo_item_based.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.9464


0.946368595043037

### Item Based

In [26]:
algo_user_based = KNNWithMeans(
    k = 15,
    min_k = 1, 
    sim_option = similarity_config_user_based
    )
algo_user_based.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x2121d6f82e0>

In [27]:
# Train the algorithm on the trainset, and predict ratings for the testset
predictions = algo_user_based.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.9540


0.9540140708883638

## Pickle This Model for Future Use

In [28]:
dump.dump('data_saved_models/knnwithmeans_item', algo=algo_item_based)
dump.dump('data_saved_models/knnwithmeans_user', algo=algo_user_based)

In [29]:
_ , algo_item_based = dump.load('data_saved_models/knnwithmeans_item')
_ , algo_item_based = dump.load('data_saved_models/knnwithmeans_user')

In [30]:
## check our similarity matrix

sim_matrix = algo_item_based.sim
sim_matrix.shape

(610, 610)

In [31]:
algo_user_based.sim.shape

(610, 610)

#### Rough Work

In [None]:
algo_user_based.predict(1,3)

In [None]:
algo_user_based.predict(1,3).details['actual_k']

In [None]:
algo_user_based.predict(1,3).est

In [None]:
top_rated = user_ratings['movieId'].value_counts()
top_rated[top_rated.gt(100)].index

## Cold Start Problem
- Even the most sophisticated of algorithms find it hard to deal with cold start problems for new users.
- Youtube's Best Approximation is a "Trending" Tab that highlights the most popular of recently uploaded videos.
- Netflix's Best Approximation is a Category tab for "Recent Releases" and "Netflix Originals".
- To Keep it Simple, I have decided to go with the most highly rated movies of all time.
- This can be refined in future by forcing user to select at least 3 favourite Genres.

In [32]:
## Select movies that have been rated at least 100 times
top_rated = user_ratings['movieId'].value_counts()
top_rated = user_ratings[user_ratings['movieId'].isin(top_rated[top_rated.gt(100)].index)]

## Select Top Rated
top_rated = top_rated.groupby(by = 'movieId')['rating'].mean().sort_values(ascending=False)
top_rated = pd.DataFrame(top_rated,columns=['rating'])
top_rated.reset_index(inplace=True)
top_rated.head()

Unnamed: 0,movieId,rating
0,318,4.328
1,858,4.182
2,1221,4.147
3,260,4.131
4,1208,4.112


### Save this for future

In [33]:
top_rated.to_csv('data_generated/top_rated.csv', index=False)

In [34]:
temp = pd.read_csv('data_generated/top_rated.csv')
temp.head()

Unnamed: 0,movieId,rating
0,318,4.328
1,858,4.182
2,1221,4.147
3,260,4.131
4,1208,4.112


In [42]:
def top_recommendatations(user_ratings , gt = 100 , k = 30):
    
    ## Select movies that have been rated at least 100 times
    top_rated = user_ratings['movieId'].value_counts()
    top_rated = user_ratings[user_ratings['movieId'].isin(top_rated[top_rated.gt(gt)].index)]
    
    ## Select Top Rated
    top_rated = top_rated.groupby(by = 'movieId')['rating'].mean().sort_values(ascending=False)
    top_rated = pd.DataFrame(top_rated,columns=['rating'])
    top_rated.reset_index(inplace=True)
    top_rated = top_rated[0:k]
    
    return top_rated['movieId']

In [43]:
def get_mv_titles(mv_titles = None, mv_id = None):
    """
    given a list of movie IDs, returns their titles
    """
    titles = []
    for idx in mv_id:
        titles.append(mv_titles.loc[mv_titles['movieId'] == idx]['title'].values[0])
    
    return titles

In [44]:
temp = top_recommendatations(user_ratings)
get_mv_titles(movie_titles, temp)

['Shawshank Redemption, The (1994)',
 'Godfather, The (1972)',
 'Godfather: Part II, The (1974)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Apocalypse Now (1979)',
 'Fight Club (1999)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Goodfellas (1990)',
 'Usual Suspects, The (1995)',
 "Schindler's List (1993)",
 'Pulp Fiction (1994)',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
 'Reservoir Dogs (1992)',
 'Princess Bride, The (1987)',
 "One Flew Over the Cuckoo's Nest (1975)",
 'Matrix, The (1999)',
 "Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",
 'Forrest Gump (1994)',
 'Silence of the Lambs, The (1991)',
 'Dark Knight, The (2008)',
 'American History X (1998)',
 'Departed, The (2006)',
 'Star Wars: Episode VI - Return of the Jedi (1983)',
 'Monty Python and the Holy Grail (1975)',
 'Fargo (1996)',
 'Saving Private Ryan (1998)',
 'Taxi Driver (1976)',
 'Green Mile, The (1999)',
 'Full Metal Jacket (1987)',
 'Eternal Suns

## Item Based Filtering

- For item based filtering, I will make use of the genres that extracted earlier
- It is obvious that similar genres can be recommended one after the other
- While an enterprise would probably be interested in creating a long term database for a user
- My model will use this as a feature for recommending a new movie ( possibly sequels ) after someone has finished one movie.
- I tested the model with including 'Release Year' and not including 'Release Year'.
- As a cenophile, I do not mind the year of release, so I found recommendations were better without the release year.
- For a general public, release year may as well be included

In [45]:
from scipy.spatial.distance import cdist

In [142]:
## Calculate the cosine similarity matrix for the dataset of all movies
temp = pd.read_csv('data_generated/movie_genres.csv')
temp = temp.drop(columns=['release_year','title','movieId'])

## similarity = 1 - cosine distance
sim_mat_genre = 1 - cdist(temp,temp,'cosine')

In [145]:
## Convert to Dataframe
sim_mat_genre = pd.DataFrame(sim_mat_genre,index=movie_genres['movieId'], columns=movie_genres['movieId'])

In [146]:
sim_mat_genre.head()

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,36,38,39,40,41,42,43,44,45,46,47,48,49,50,52,53,54,...,184471,184641,184721,184791,184931,184987,184997,185029,185031,185033,185135,185435,185473,185585,186587,187031,187541,187593,187595,187717,188189,188301,188675,188751,188797,188833,189043,189111,189333,189381,189547,189713,190183,190207,190209,190213,190215,190219,190221,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
1,1.0,0.0,0.0,0.0,0.0,0.0,0.816,0.0,0.333,0.0,0.0,0.667,0.0,0.333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408,0.0,0.516,0.0,0.0,0.0,0.408,0.408,0.0,0.0,0.0,0.667,0.0,0.0,0.0,0.258,0.0,0.0,0.0,0.408,0.408,0.0,0.0,1.0,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,0.0,1.0,0.816,0.707,0.0,1.0,0.0,0.0,0.0,0.816,0.5,0.0,0.0,0.408,0.0,0.5,0.707,0.707,0.316,0.408,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.408,0.5,0.0,0.316,0.5,0.0,0.816,0.0,0.5,0.0,0.816,0.0,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,0.0,0.816,1.0,0.577,0.0,0.816,0.0,0.0,0.0,1.0,0.408,0.0,0.577,0.333,0.408,0.816,0.577,0.577,0.516,0.333,0.258,0.0,0.408,0.816,0.577,0.408,0.816,0.258,0.408,0.577,0.0,0.408,0.408,0.577,0.333,0.577,0.0,0.667,0.816,0.0,0.516,0.816,0.0,1.0,0.408,0.408,0.577,1.0,0.0,0.408,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,0.0,0.707,0.577,1.0,0.0,0.707,0.0,0.0,0.0,0.577,0.707,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.447,0.577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707,0.0,0.0,0.0,0.0,0.577,0.0,0.0,0.0,0.0,0.0,0.577,0.0,0.707,0.0,0.577,0.0,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.577,0.667,0.0,0.0,0.0,0.0,0.333,0.408,0.0,0.0,0.0,0.775,0.667,0.516,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408,0.0,0.333,0.0,0.0,0.0,0.667,0.0,0.333,0.333,0.0,0.408,0.0,0.0,0.667,0.0,0.0,0.0,0.0,0.0,0.0,0.408,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [49]:
sim_mat_genre.shape

(9742, 9742)

### Save this for future use

In [None]:
sim_mat_genre.isna().sum()

In [99]:
sim_mat_genre.to_csv('src/data_generated/similarity_matrix_genre.csv')

### Test this for movie for "Toy Story"

In [50]:
def get_similar_movies_genres(k = 20 , mv_id = None, sim_mat = None):
    """
    returns movies similar to a given movie ID
    useful for making recommendations after a user has finished watching a particular movie 
    OR
    when user searches for a particular movie
    """
    temp = sim_mat.loc[:,mv_id].sort_values(ascending=False)[:k]
    temp = temp[temp.index != mv_id]
    
    return temp.index

In [51]:
get_mv_titles(movie_titles, get_similar_movies_genres(20,1,sim_mat_genre))

['Toy Story 3 (2010)',
 'Ant Bully, The (2006)',
 'Tale of Despereaux, The (2008)',
 'Wild, The (2006)',
 'Toy Story 2 (1999)',
 'Turbo (2013)',
 'Monsters, Inc. (2001)',
 'The Good Dinosaur (2015)',
 'Shrek Forever After (a.k.a. Shrek: The Final Chapter) (2010)',
 'Adventures of Rocky and Bullwinkle, The (2000)',
 'Asterix and the Vikings (Astérix et les Vikings) (2006)',
 'Antz (1998)',
 'Moana (2016)',
 "Emperor's New Groove, The (2000)",
 'Shrek the Third (2007)',
 'Inside Out (2015)',
 'Home (2015)',
 "Twelve Tasks of Asterix, The (Les douze travaux d'Astérix) (1976)",
 'Valiant (2005)']

- I get toy story sequels along with other animated movies 
- These are the movies I grew up watching so I am Personally Satisfied with the recommendations

## New User Registration

- A new User should be able to pick out his or her favourite movies and rate them
- This allows my filter to find users similar to that user and how they would rate other movies

In [53]:
## create a user-item matrix
user_item_mat = pd.crosstab(index = user_ratings['userId'],columns = user_ratings['movieId'], values = user_ratings['rating'], aggfunc='mean')
user_item_mat = user_item_mat.fillna(0)
user_item_mat.head()

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,36,38,39,40,41,42,43,44,45,46,47,48,49,50,52,53,54,...,184471,184641,184721,184791,184931,184987,184997,185029,185031,185033,185135,185435,185473,185585,186587,187031,187541,187593,187595,187717,188189,188301,188675,188751,188797,188833,189043,189111,189333,189381,189547,189713,190183,190207,190209,190213,190215,190219,190221,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### hardcoding a dummy user for now
- These are my personal recommendations
- while this is not the scientific way to test any model, if the results generated are something that I would prefer to watch, I will consider it working well

In [54]:
## create a new row for a new user
new_user = user_item_mat.loc[0:1]
new_user = new_user.copy()
new_user.loc[1] = 0

## get ratings for new users
new_user[50] = 5
new_user[54] = 4
new_user[111] = 5
new_user[153] = 3
new_user[260] = 4
new_user[296] = 5
new_user[318] = 5
new_user[364] = 4
new_user[480] = 3
new_user[527] = 3
new_user[575] = 4
new_user[4993] = 5
new_user[4995] = 5
new_user[3535] = 5
new_user[6586] = 3
new_user[59429] = 3
new_user[73106] = 4
new_user[4993] = 5
new_user[4995] = 5

new_user

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,36,38,39,40,41,42,43,44,45,46,47,48,49,50,52,53,54,...,184471,184641,184721,184791,184931,184987,184997,185029,185031,185033,185135,185435,185473,185585,186587,187031,187541,187593,187595,187717,188189,188301,188675,188751,188797,188833,189043,189111,189333,189381,189547,189713,190183,190207,190209,190213,190215,190219,190221,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,0.0,0.0,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Finding Similar Users 
- This is a primitive way 
- Here we find similar users based on a distance metric
- I have chosen 'cosine' distances as the distance metric

In [55]:
from sklearn.neighbors import NearestNeighbors
nn = NearestNeighbors(metric='cosine')

In [56]:
nn.fit(user_item_mat)

NearestNeighbors(metric='cosine')

In [57]:
## [1] -> user or similarity
nn.kneighbors(new_user,return_distance=False)[0]

array([144,   7, 362, 443, 347], dtype=int64)

In [62]:
user_item_mat.loc[7].sort_values(ascending=False).index
temp = [int(mv_id) for mv_id in user_item_mat.loc[7].sort_values(ascending=False).index[0:10]]
get_mv_titles(movie_titles,temp)

['Terminator, The (1984)',
 'Hot Shots! Part Deux (1993)',
 'Contact (1997)',
 'Silence of the Lambs, The (1991)',
 'Planet of the Apes (1968)',
 'Seven Samurai (Shichinin no samurai) (1954)',
 'Naked Gun 2 1/2: The Smell of Fear, The (1991)',
 'Forrest Gump (1994)',
 'Jurassic Park (1993)',
 'Star Wars: Episode IV - A New Hope (1977)']

In [63]:
def get_movie_recommendations_users(new_user, nearest_neighbours, user_item_mat , k_users = 5, k_each_user = 10):
    """
    Returns Movie ID based on similar user preferences (Collabarative Filtring)
    
    new_user -> ratings provided by user (DataFrame)
    nearest_neighbours -> nearest_neigbhour algorithm to be used
    user_item_mat -> user item matrix to be used
    k_users -> number of nearest users to consider
    k_each_user -> number of recommendations to be picked from each neighbiour
    
    """
    neighbours = nn.kneighbors(new_user,k_users,return_distance=False)[0]
    
    mv_ids = []
    for neighbour in neighbours:
        temp = [int(mv_id) for mv_id in user_item_mat.loc[neighbour].sort_values(ascending=False).index[0:k_each_user]]
            
        [mv_ids.append(x) for x in temp if x not in mv_ids]
        
    return mv_ids

In [74]:
temp = get_movie_recommendations_users(new_user,nn,user_item_mat,3,10)
get_mv_titles(movie_titles, temp)

['Shrek (2001)',
 'American History X (1998)',
 'Finding Nemo (2003)',
 'Shrek 2 (2004)',
 'Memento (2000)',
 'Dances with Wolves (1990)',
 'Pirates of the Caribbean: The Curse of the Black Pearl (2003)',
 'Motorcycle Diaries, The (Diarios de motocicleta) (2004)',
 'Seven (a.k.a. Se7en) (1995)',
 'Show Me Love (Fucking Åmål) (1998)',
 'Terminator, The (1984)',
 'Hot Shots! Part Deux (1993)',
 'Contact (1997)',
 'Silence of the Lambs, The (1991)',
 'Planet of the Apes (1968)',
 'Seven Samurai (Shichinin no samurai) (1954)',
 'Naked Gun 2 1/2: The Smell of Fear, The (1991)',
 'Forrest Gump (1994)',
 'Jurassic Park (1993)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Lord of the Rings: The Return of the King, The (2003)',
 'Thing, The (1982)',
 'Clockwork Orange, A (1971)',
 'Fargo (1996)',
 'Lord of the Rings: The Fellowship of the Ring, The (2001)',
 "The Devil's Advocate (1997)",
 'Lord of the Rings: The Two Towers, The (2002)',
 'Good, the Bad and the Ugly, The (Buono, il brutto, 

### Save for future use

In [69]:
pickle.dump(nn,open('data_saved_models/NearestNeighbour.sav','wb'))

In [73]:
nn = pickle.load(open('data_saved_models/NearestNeighbour.sav','rb'))

In [85]:
temp = get_movie_recommendations_users(new_user,nn,user_item_mat,3,10)
type(get_mv_titles(movie_titles, temp))

list

### Success
- These are movies that I would personally prefer watching


In [76]:
user_item_mat.to_csv('data_generated/user_item_similarity',index_label=True)

## API Functions
## DO NOT RUN

In [77]:
def get_similar_users(new_user , user_item_mat, k_users):
    
    """
    When we get a new user registered, 
    we use cosine similarity to find the users most similar to the user
    
    This can be further used to predict what to recommend to the given user based 
    on collabarative filtering
    """

    
    ## calculate similarity between all users
    new_user_sim = 1-cdist(new_user,user_item_mat,'cosine')
    new_user_sim = pd.DataFrame(new_user_sim.T,index=user_item_mat.index)
    
    ## find the most similar users
    new_user_sim = new_user_sim[0].sort_values(ascending = False)
    
    ## get top k users similar to him
    new_user_sim = new_user_sim[0:k_users]
    
    return new_user_sim

In [78]:
def get_recommendations_user(u_id = None, algo_ub = None, mv_titles = None, k = 20):

    """
    built on top of surprise
    given a user id and a user based algorithm
    returns the movies that should be highest rated by the user
    as a DataFrame
    """
    
    top_recommendations = {}
    ## iterate over all movies
    for mv_id in mv_titles['movieId']:
        
        mv_id_prediction = algo_ub.predict(uid = u_id , iid = mv_id)
        
        ## if prediction is not possible, then ignore
        ## else add it to my recommendations
        if mv_id_prediction.details['was_impossible'] == True:
            continue
        else:
            top_recommendations[mv_id] = mv_id_prediction.est
            
    ## sort to get highest rated
    top_recommendations = sorted(top_recommendations.items(), key=lambda x: x[1], reverse=True)
    
    ## select top k
    top_recommendations = list(top_recommendations)[:min(k,len(top_recommendations))]
    
    ## comvert to dataframe
    top_recommendations = pd.DataFrame(top_recommendations,columns=['movieId','rating'])

    return top_recommendations

In [82]:
def get_recommendation_usr_id(rw_u_id = None , algo_ub = None , k = 10):
    """
    Built on top of surprise
    Supposed To find Similar Users
    Does Not Work as intended
    REDUNDANT
    """
    u_id = algo_ub.get_neighbors(algo_ub.trainset.to_inner_iid(rw_u_id) , k)
    u_rwid = [algo_ub.trainset.to_raw_iid(usr) for usr in u_id]
    return u_id

In [83]:
def get_recommendation_mv_id(rw_mv_id = 1 , algo_ib = None, k = 10):
    """
    REDUNDANT
    """
    
    mv_iid = algo_ib.get_neighbors(algo_ib.trainset.to_inner_iid(rw_mv_id) , k)
    mv_rawid = [algo_ib.trainset.to_raw_iid(mv_id) for mv_id in mv_iid]
    return mv_rawid

In [80]:
def get_recommendations_user(u_id = None, algo = None, mv_titles = None, k = 20):

    """
    built on top of surprise
    REDUNDANT
    """
    
    top_recommendations = {}
    for mv_id in mv_titles['movieId']:
        
        mv_id_prediction = algo.predict(uid = u_id , iid = mv_id)
        
        if mv_id_prediction.details['was_impossible'] == True:
            continue
        else:
            top_recommendations[mv_id] = mv_id_prediction.est
            
    top_recommendations = sorted(top_recommendations.items(), key=lambda x: x[1], reverse=True)
    
    top_recommendations = list(top_recommendations)[:min(k,len(top_recommendations))]
    
    ids = []
    for k , v in top_recommendations:
        ids.append(k)

    return ids

In [84]:
def get_mv_titles(mv_titles = None, mv_id = None):
    """
    returns movie titles based on movie ids
    """
    titles = []
    for idx in mv_id:
        titles.append(mv_titles.loc[mv_titles['movieId'] == idx]['title'].values[0])
    
    return titles

In [None]:
def get_recommendation_usr_id(rw_u_id = None , algo_ub = None , k = 10):
    
    u_id = algo_ub.get_neighbors(algo_ub.trainset.to_inner_iid(rw_u_id) , k)
    u_rwid = [algo_ub.trainset.to_raw_iid(usr) for usr in u_id]
    return u_id


def get_recommendations_user(u_id = None, algo = None, mv_titles = None, k = 20):
    
    top_recommendations = {}
    for mv_id in movie_genres['movieId']:
        
        mv_id_prediction = algo.predict(uid = u_id , iid = mv_id)
        
        if mv_id_prediction.details['was_impossible'] == True:
            continue
        else:
            top_recommendations[mv_id] = mv_id_prediction.est
            
    top_recommendations = sorted(top_recommendations.items(), key=lambda x: x[1], reverse=True)
    
    top_recommendations = list(top_recommendations)[:min(k,len(top_recommendations))]
    
    ids = []
    for k , v in top_recommendations:
        ids.append(k)

    return ids

def get_recommendations(algo , movie_genres , u_id = None, number_of_recommendations = 20):
    top_recommendations = {}
    
    for mv_id in movie_genres['movieId']:
        mv_id_prediction = algo.predict(uid = u_id , iid = mv_id)
        
        if mv_id_prediction.details['was_impossible'] == True:
            continue
        else:
            top_recommendations[mv_id] = mv_id_prediction.est
            ##print(top_recommendations[mv_id])
    top_recommendations = sorted(top_recommendations.items(), key=lambda x: x[1], reverse=True)
    
    rtrn_mv_id = list(top_recommendations)[:min(number_of_recommendations,len(top_recommendations))]
    return rtrn_mv_id

def cold_start_recommendations(k = 20 , user_ratings = None):
    ## Select movies that have been rated at least 5 times
    top_rated = user_ratings['movieId'].value_counts()
    top_rated = user_ratings[user_ratings.movieId.isin(top_rated.index[top_rated.gt(5)])]

    ## Select Top Rated
    top_rated = top_rated.groupby(by = 'movieId')['rating'].mean().sort_values(ascending=False)
    top_rated.head()
    top_rated = top_rated.index
    
    top_rated = top_rated[0:min(k,top_rated.shape[0])]
    
    return top_rated

def get_recommendation_mv_id(rw_mv_id = 1 , algo_ib = None, k = 10):
    
    mv_iid = algo_ib.get_neighbors(algo_ib.trainset.to_inner_iid(rw_mv_id) , k)
    mv_rawid = [algo_ib.trainset.to_raw_iid(mv_id) for mv_id in mv_iid]
    return mv_rawid

## API DO NOT RUN

In [None]:
import pickle
import surprise
from surprise import dump
from surprise import SVDpp
from surprise import Reader
from surprise import Dataset
import pandas as pd

from fastapi import FastAPI
from fastapi import Path
from typing import Optional

from pydantic import BaseModel


class data(BaseModel):
    cold_start : bool 
    reviews : dict
    

movie_genres = pd.read_csv('data_movielens/movies.csv')
user_ratings = pd.read_csv('data_movielens/ratings.csv')

_ , algo_ib = dump.load('1_knnwithmeans_item')


app = FastAPI()

@app.get("/")
def welcome():
    return {"Welcome to the homepage of MovieBuzz"}

@app.get("/homepage/{movie_id}")
def generate_movie_recommendations(movie_id : int = Path(None,description="Input Movie ID", gt=0,lt=610)):
    mvids = get_recommendation_mv_id(rw_mv_id=1, algo_ib=algo_ib,k = 20)
    titles = get_mv_titles(movie_genres, mvids)

    return {"details":titles}

@app.get("/popular")
def generate_popularpage():
    top_recommendations = cold_start_recommendations(20,user_ratings);
    titles = get_mv_titles(mv_titles = movie_genres , mv_id= top_recommendations)
    return {"details":titles}

@app.post("/reviewss")
def get_reviews(review : data):
    return {}

In [None]:
get_mv_titles(movie_titles,top_rated.head()['movieId'])

In [94]:
temp = 191
if temp in movie_titles['movieId']:
    print("YES")
else:
    print("NO")

YES


In [95]:
def get_similar_movies_genres(k = 20 , mv_id = None, sim_mat = None):
    """
    

    Parameters
    ----------
    k : TYPE, optional
        DESCRIPTION. The default is 20.
    mv_id : TYPE, optional
        DESCRIPTION. The default is None.
    sim_mat : TYPE, optional
        DESCRIPTION. The default is None.

    Returns
    -------
    TYPE
        DESCRIPTION.

    """
    """
    returns movies similar to a given movie ID
    useful for making recommendations after a user has finished watching a particular movie 
    OR
    when user searches for a particular movie
    """
    temp = sim_mat.loc[:,mv_id].sort_values(ascending=False)[:k]
    temp = temp[temp.index != mv_id]
    
    return temp.index

In [117]:
movie_genres = pd.read_csv('data_generated/movie_genres.csv')
movie_genres.drop(columns=['title','release_year'], inplace = True)

movie_titles = pd.read_csv('data_movielens/movies.csv')
movie_titles.drop(columns=['genres'],inplace = True)

user_ratings = pd.read_csv('data_movielens/ratings.csv')
user_ratings.drop(columns=['timestamp'],inplace=True)
user_ratings['rating'] = user_ratings['rating'].astype(int)

top_rated = pd.read_csv('data_generated/top_rated.csv')

temp = movie_genres.drop(columns=['movieId'])
sim_mat_genre = 1 - cdist(temp,temp,'cosine')
sim_mat_genre = pd.DataFrame(sim_mat_genre,index=movie_genres.index, columns=movie_genres.index)

In [137]:
mv_ids = get_similar_movies_genres(20,1,sim_mat_genre)
mv_ids = list(mv_ids)
mv_ids

[1514,
 1617,
 6655,
 109,
 6075,
 7478,
 53,
 1556,
 1618,
 6751,
 6389,
 9565,
 1799,
 9294,
 9336,
 8641,
 8230,
 7426,
 6629]

In [141]:
sim_mat_genre.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,9692,9693,9694,9695,9696,9697,9698,9699,9700,9701,9702,9703,9704,9705,9706,9707,9708,9709,9710,9711,9712,9713,9714,9715,9716,9717,9718,9719,9720,9721,9722,9723,9724,9725,9726,9727,9728,9729,9730,9731,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
0,1.0,0.775,0.316,0.258,0.447,0.0,0.316,0.632,0.0,0.258,0.258,0.316,0.775,0.0,0.258,0.0,0.0,0.447,0.447,0.2,0.258,0.0,0.0,0.0,0.0,0.0,0.316,0.0,0.4,0.0,0.0,0.0,0.316,0.0,0.632,0.316,0.0,0.0,0.0,0.0,0.516,0.258,0.0,0.0,0.4,0.0,0.0,0.258,0.316,0.632,...,0.516,0.516,0.0,0.447,0.0,0.671,0.316,0.0,0.316,0.258,0.0,0.316,0.447,0.258,0.258,0.2,0.671,0.258,0.447,0.0,0.516,0.6,0.0,0.316,0.447,0.775,0.316,0.0,0.258,0.0,0.0,0.258,0.0,0.0,0.447,0.0,0.0,0.447,0.0,0.447,0.447,0.316,0.316,0.447,0.0,0.671,0.775,0.0,0.316,0.447
1,0.775,1.0,0.0,0.0,0.0,0.0,0.0,0.816,0.0,0.333,0.0,0.0,0.667,0.0,0.333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408,0.0,0.516,0.0,0.0,0.0,0.408,0.0,0.408,0.0,0.0,0.0,0.0,0.0,0.667,0.0,0.0,0.0,0.258,0.0,0.0,0.0,0.408,0.408,...,0.667,0.667,0.0,0.0,0.0,0.866,0.0,0.0,0.408,0.333,0.0,0.0,0.0,0.333,0.333,0.258,0.577,0.0,0.577,0.0,0.333,0.516,0.0,0.0,0.0,0.667,0.0,0.0,0.333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.289,0.0,0.0,0.0,0.0,0.0,0.289,0.333,0.0,0.0,0.0
2,0.316,0.0,1.0,0.816,0.707,0.0,1.0,0.0,0.0,0.0,0.816,0.5,0.0,0.0,0.408,0.0,0.5,0.707,0.707,0.316,0.408,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.0,0.0,0.0,0.0,0.0,0.408,0.5,0.0,0.316,0.5,0.0,0.816,0.0,0.5,...,0.0,0.0,0.0,0.707,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.707,0.0,0.0,0.0,0.0,0.408,0.0,0.0,0.408,0.316,0.0,1.0,0.707,0.408,0.5,0.0,0.0,0.0,0.0,0.408,0.0,0.5,0.707,0.0,0.0,0.0,0.0,0.354,0.354,0.0,0.5,0.0,0.0,0.354,0.408,0.0,0.0,0.707
3,0.258,0.0,0.816,1.0,0.577,0.0,0.816,0.0,0.0,0.0,1.0,0.408,0.0,0.577,0.333,0.408,0.816,0.577,0.577,0.516,0.333,0.258,0.0,0.408,0.816,0.577,0.408,0.816,0.258,0.408,0.577,0.0,0.408,0.408,0.408,0.816,0.577,0.408,0.333,0.577,0.0,0.667,0.816,0.0,0.516,0.816,0.0,1.0,0.408,0.408,...,0.0,0.0,0.408,0.577,0.289,0.0,0.816,0.333,0.0,0.333,0.0,0.408,0.577,0.0,0.0,0.258,0.0,0.333,0.0,0.0,0.333,0.258,0.408,0.816,0.577,0.333,0.816,0.0,0.0,0.0,0.0,0.667,0.0,0.816,0.577,0.577,0.577,0.0,0.0,0.289,0.289,0.408,0.816,0.0,0.0,0.289,0.333,0.577,0.0,0.577
4,0.447,0.0,0.707,0.577,1.0,0.0,0.707,0.0,0.0,0.0,0.577,0.707,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.447,0.577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707,0.707,0.0,0.0,0.0,0.0,0.0,0.577,0.0,0.0,0.0,0.0,0.0,0.577,0.0,0.707,...,0.0,0.0,0.0,1.0,0.0,0.0,0.707,0.0,0.0,0.0,0.0,0.707,1.0,0.0,0.0,0.0,0.0,0.577,0.0,0.0,0.577,0.447,0.0,0.707,1.0,0.577,0.707,0.0,0.0,0.0,0.0,0.577,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.707,0.0,0.0,0.5,0.577,0.0,0.0,1.0
