<a href="https://colab.research.google.com/github/jchen8000/MachineLearning/blob/master/6%20Recommender%20System/Recommendation_Content_Based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Content-Based Recommendation System

In [None]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
from pathlib import Path
import matplotlib.pyplot as plt
from tensorflow import keras
from sklearn.metrics.pairwise import cosine_similarity

### MovieLens dataset

In [None]:
# Name of the MovieLens release; used for the archive file name and for the
# directory that is expected to appear next to it after extraction.
movielens = 'ml-100k'
url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'

# Fetch the archive through Keras' download helper, caching under ./
movielens_file = keras.utils.get_file(
    fname=movielens + '.zip',
    origin=url,
    cache_dir='./'
)
datasets_path = Path(movielens_file).parents[0]
movielens_dir = datasets_path / movielens
# Extract the data file (skipped when the target directory already exists).
if not movielens_dir.exists():
    # The handle is deliberately not called `zip`: at notebook scope that would
    # rebind the builtin zip() to a closed ZipFile for every later cell.
    with ZipFile(movielens_file, "r") as archive:
        archive.extractall(path=datasets_path)


Downloading data from http://files.grouplens.org/datasets/movielens/ml-100k.zip


In [None]:
rating_col = ['userid', 'movieid', 'rating', 'timestamp']

# u.data is read as a tab-separated file without a header row. The two filters
# run in sequence: first keep rows whose movie has more than 30 ratings, then,
# on what is left, keep rows whose user has more than 20 ratings.
ratings = (
    pd.read_csv(movielens_dir / "u.data", sep='\t', header=None, names=rating_col)
    .loc[lambda df: df.groupby("movieid")["movieid"].transform("size") > 30]
    .loc[lambda df: df.groupby("userid")["userid"].transform("size") > 20]
)
ratings

Unnamed: 0,userid,movieid,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
3,244,51,2,880606923
5,298,474,4,884182806
6,115,265,2,881171488
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [None]:
# Column names applied to u.item: five descriptive fields followed by
# nineteen 0/1 genre flags.
movie_col = [
    'movieid', 'title', 'releasedate', 'videoreleasedate', 'IMDbURL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# u.item is parsed as a pipe-separated, header-less file decoded as latin-1.
movies = pd.read_csv(
    movielens_dir / "u.item",
    sep='|',
    header=None,
    names=movie_col,
    encoding='latin-1',
)
movies

Unnamed: 0,movieid,title,releasedate,videoreleasedate,IMDbURL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Distinct users and distinct movies present in the filtered ratings table.
n_users, n_movies = (ratings[key].nunique() for key in ('userid', 'movieid'))
n_users, n_movies

(882, 798)

### Content Based Recommendation

In [None]:
# Keep only the genre flag columns by dropping the descriptive fields.
genre_matrix = movies.drop(
    columns=['movieid', 'title', 'releasedate', 'videoreleasedate', 'IMDbURL']
)
genre_matrix

Unnamed: 0,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1678,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1679,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1680,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Cosine similarity between every pair of rows (movies) of the genre matrix.
movie_similarity = cosine_similarity(genre_matrix)

# Label both axes with movieid so a pair can be looked up by id.
movie_similarity_df = pd.DataFrame(
    data=movie_similarity,
    index=movies['movieid'],
    columns=movies['movieid'],
)
movie_similarity_df

movieid,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.000000,0.000000,0.333333,0.000000,0.000000,0.000000,0.666667,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.57735,0.000000
2,0.000000,1.000000,0.577350,0.333333,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.816497,0.000000,0.000000,0.000000,0.000000,0.000000,0.408248,0.000000,0.00000,0.000000
3,0.000000,0.577350,1.000000,0.000000,0.577350,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.707107,0.000000,0.00000,0.000000
4,0.333333,0.333333,0.000000,1.000000,0.333333,0.577350,0.408248,0.666667,0.577350,0.408248,...,0.408248,0.577350,0.577350,0.577350,0.577350,0.577350,0.000000,0.408248,0.57735,0.577350
5,0.000000,0.333333,0.577350,0.333333,1.000000,0.577350,0.408248,0.333333,0.577350,0.408248,...,0.408248,0.577350,0.577350,0.577350,0.577350,0.577350,0.408248,0.408248,0.00000,0.577350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,0.000000,0.000000,0.000000,0.577350,0.577350,1.000000,0.707107,0.577350,1.000000,0.707107,...,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,0.707107,0.00000,1.000000
1679,0.000000,0.408248,0.707107,0.000000,0.408248,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.500000,0.00000,0.000000
1680,0.000000,0.000000,0.000000,0.408248,0.408248,0.707107,0.500000,0.408248,0.707107,0.500000,...,0.000000,0.707107,0.707107,0.707107,0.707107,0.707107,0.500000,1.000000,0.00000,0.707107
1681,0.577350,0.000000,0.000000,0.577350,0.000000,0.000000,0.000000,0.577350,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.00000,0.000000


### Make Recommendations

Recommend the RECOMM_COUNT movies whose genres are most similar to a specific movie, RECOMM_MOVIE.

In [None]:
# movieid of the seed movie that recommendations are generated for.
RECOMM_MOVIE: int = 50
# Number of similar movies to return.
RECOMM_COUNT: int = 10

In [None]:
# Show the catalogue row of the seed movie.
movies.loc[movies['movieid'].eq(RECOMM_MOVIE)]

Unnamed: 0,movieid,title,releasedate,videoreleasedate,IMDbURL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
49,50,Star Wars (1977),01-Jan-1977,,http://us.imdb.com/M/title-exact?Star%20Wars%2...,0,1,1,0,0,...,0,0,0,0,0,1,1,0,1,0


In [None]:
# Rank every other movie by its genre similarity to RECOMM_MOVIE.
# Only the RECOMM_MOVIE column is taken from the similarity matrix, so the
# full matrix is neither copied nor modified in place.
recommendations = (
    movie_similarity_df[RECOMM_MOVIE]
    .drop(index=RECOMM_MOVIE)        # the seed movie must not recommend itself
    .rename('similarity')
    .sort_values(ascending=False)
    .reset_index()                   # movieid index -> column, needed by the merge
    .merge(movies, how="left", on="movieid")
    .head(RECOMM_COUNT)
)

recommendations[['movieid','title','similarity']]

Unnamed: 0,movieid,title,similarity
0,181,Return of the Jedi (1983),1.0
1,172,"Empire Strikes Back, The (1980)",0.912871
2,498,"African Queen, The (1951)",0.894427
3,271,Starship Troopers (1997),0.894427
4,373,Judge Dredd (1995),0.774597
5,897,Time Tracers (1995),0.774597
6,1239,Cutthroat Island (1995),0.774597
7,241,"Last of the Mohicans, The (1992)",0.774597
8,230,Star Trek IV: The Voyage Home (1986),0.774597
9,229,Star Trek III: The Search for Spock (1984),0.774597


In [None]:
# Persist the top recommendations (all merged columns) to the working directory.
recommendations.to_csv(path_or_buf="recommendations.csv")

### Backups

In [None]:
import numpy as np
from numpy.linalg import norm

# Two hand-written 6-element 0/1 vectors used to work the formula by hand.
star_wars = np.array([1, 1, 0, 0, 1, 1])
legends_fall = np.array([0, 0, 0, 1, 1, 1])

# cosine similarity = (a . b) / (|a| * |b|)
cos_sim = (star_wars @ legends_fall) / (norm(star_wars) * norm(legends_fall))
print("Cosine Similarity is:", cos_sim)

Cosine Similarity is: 0.5773502691896258


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Small hand-written 0/1 matrix (one row per toy movie) for checking
# cosine_similarity on a case that can be verified by hand.
# It is bound to `toy_movies`, not `movies`, so the `movies` DataFrame loaded
# from u.item is not overwritten and earlier cells can still be re-run.
toy_movies = np.array(
    [ [1, 1, 0, 0, 1, 1],
      [0, 0, 0, 1, 1, 1],
      [1, 0, 0, 0, 0, 0],
      [1, 1, 0, 0, 1, 1],
      [1, 1, 0, 0, 0, 1],
      [0, 0, 1, 1, 0, 0] ])
cos_sim = cosine_similarity(toy_movies)
cos_sim_df = pd.DataFrame(cos_sim)
# cos_sim_df.to_csv('sim.csv')
cos_sim_df

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.57735,0.5,1.0,0.866025,0.0
1,0.57735,1.0,0.0,0.57735,0.333333,0.408248
2,0.5,0.0,1.0,0.5,0.57735,0.0
3,1.0,0.57735,0.5,1.0,0.866025,0.0
4,0.866025,0.333333,0.57735,0.866025,1.0,0.0
5,0.0,0.408248,0.0,0.0,0.0,1.0
