# Recommender system

In [2]:
import csv
import numpy as np
from scipy.sparse import lil_matrix

### Data loader

Loading the data and representing it as a sparse utility matrix (review_data).

Note that indices in the dataset are not sequential. That is why we create separate functions for translating dataset indices into utility matrix (review_data) indices and back.

In [3]:
# Position i of each list holds the dataset id that was assigned
# utility-matrix index i (row for users, column for movies).
users_index = []
movies_index = []

# Reverse lookups (dataset id -> utility-matrix index). They mirror the lists
# above so that translating an id does not require scanning a list on every
# record; both structures are only ever updated together by the functions below.
_user_positions = {}
_movie_positions = {}

# Number of distinct ids registered so far.
num_users_processed = 0
num_movies_processed = 0

def get_user_index(dataset_user_id):
    """Return the utility-matrix row index for a dataset user id.

    Ids receive consecutive indices (0, 1, 2, ...) in order of first
    appearance. An id that has not been seen before is registered in
    ``users_index`` and ``num_users_processed`` is incremented.

    Args:
        dataset_user_id: user id as it appears in the dataset (hashable).

    Returns:
        int: position of ``dataset_user_id`` in ``users_index``.
    """
    global num_users_processed
    position = _user_positions.get(dataset_user_id)
    if position is None:
        position = len(users_index)
        users_index.append(dataset_user_id)
        _user_positions[dataset_user_id] = position
        num_users_processed += 1
    return position

def get_movie_index(dataset_movie_id):
    """Return the utility-matrix column index for a dataset movie id.

    Ids receive consecutive indices (0, 1, 2, ...) in order of first
    appearance. An id that has not been seen before is registered in
    ``movies_index`` and ``num_movies_processed`` is incremented.

    Args:
        dataset_movie_id: movie id as it appears in the dataset (hashable).

    Returns:
        int: position of ``dataset_movie_id`` in ``movies_index``.
    """
    global num_movies_processed
    position = _movie_positions.get(dataset_movie_id)
    if position is None:
        position = len(movies_index)
        movies_index.append(dataset_movie_id)
        _movie_positions[dataset_movie_id] = position
        num_movies_processed += 1
    return position

In [4]:
# Hard-coded dimensions of the utility matrix (presumably the number of
# distinct movies / users in ratings.csv -- confirm when swapping datasets).
# The matrix is not resized, so an index beyond these bounds raises IndexError.
num_movies = 9125
num_users = 671

# Rows are users, columns are movies; LIL format supports cheap item assignment.
review_data = lil_matrix((num_users, num_movies))

data_file = 'ratings.csv'

# newline='' lets the csv module do its own newline handling, as its
# documentation requires for file objects passed to csv.reader.
with open(data_file, newline='') as ratings_file:
    rating_reader = csv.reader(ratings_file, delimiter=',')
    next(rating_reader, None)  # skip the header row (tolerates an empty file)

    # enumerate replaces a hand-maintained counter that was named `iter` and
    # therefore shadowed the builtin for the rest of the kernel session.
    for records_done, record in enumerate(rating_reader, start=1):
        user_id, movie_id, rating, _ = record  # fourth column is ignored
        # Translate sparse dataset ids into dense matrix coordinates.
        user_id = get_user_index(int(user_id))
        movie_id = get_movie_index(int(movie_id))
        rating = float(rating)
        review_data[user_id, movie_id] = rating
        if records_done % 10000 == 0:
            print('Processed ', records_done, 'records out of 100k.')

Processed  10000 records out of 100k.
Processed  20000 records out of 100k.
Processed  30000 records out of 100k.
Processed  40000 records out of 100k.
Processed  50000 records out of 100k.
Processed  60000 records out of 100k.
Processed  70000 records out of 100k.
Processed  80000 records out of 100k.
Processed  90000 records out of 100k.
Processed  100000 records out of 100k.


In [12]:
# Dimensions of the utility matrix as (users, movies).
review_data.shape

(671, 9125)

In [5]:
# Maps dataset movie id (int) -> movie title (str).
movies = {}

data_file = 'movies.csv'

# newline='' is what the csv module requires so that quoted fields (titles such
# as "Matrix, The (1999)") are parsed correctly. The explicit encoding keeps
# non-ASCII titles (e.g. 'Alien³') from depending on the platform default.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm for other datasets.
with open(data_file, newline='', encoding='utf-8') as movie_file:
    movie_reader = csv.reader(movie_file, delimiter=',')
    next(movie_reader, None)  # skip the header row (tolerates an empty file)
    for record in movie_reader:
        movie_id, title, _ = record  # third column is ignored
        movies[int(movie_id)] = title

In [6]:
def get_movie_name_by_id(id):
    """Return the title of the movie stored at utility-matrix column ``id``.

    The column index is first translated to the dataset movie id through
    ``movies_index`` and then to a title through ``movies``.
    """
    return movies[movies_index[id]]

def get_movie_id_by_name(movie_name):
    """Return the utility-matrix column index of the movie titled ``movie_name``.

    Args:
        movie_name: exact title as stored in ``movies``.

    Returns:
        int: position of the movie's dataset id in ``movies_index``.

    Raises:
        ValueError: if no movie has that title, or if the movie's dataset id
            is not present in ``movies_index`` (it has no matrix column).
    """
    # First dataset id whose title matches; scans lazily instead of copying
    # the dictionary's keys and values into two lists on every call.
    movie_data_id = next(
        (int(dataset_id) for dataset_id, title in movies.items() if title == movie_name),
        None,
    )
    if movie_data_id is None:
        raise ValueError('Unknown movie title: {!r}'.format(movie_name))
    try:
        return movies_index.index(movie_data_id)
    except ValueError:
        # Same exception type as before, but with an actionable message.
        raise ValueError(
            'Movie {!r} (dataset id {}) has no column in the utility matrix'.format(
                movie_name, movie_data_id
            )
        ) from None

In [15]:
# Sanity check: utility-matrix column index assigned to a known title.
get_movie_id_by_name("Matrix, The (1999)")

402

In [22]:
# Reverse lookup: title of the movie stored at utility-matrix column 402.
get_movie_name_by_id(402)

'Matrix, The (1999)'

### Item-item collaborative filtering

Please compute item-item collaborative filtering from the utility matrix (review_data). The output of your computation should be pairwise similarities between all movies.

As the order of the pair does not change the similarity metric, you can represent it as a dictionary that uses [frozensets](https://docs.python.org/3/library/stdtypes.html#frozenset) as its keys. For example:

similarity = {}

similarity[frozenset((movie1_id, movie2_id))] = 0.67

In [81]:
# similarity = {}

# --------------- YOUR CODE HERE ---------------
def cos_similarity(matrix):
    """
    Compute pairwise cosine similarity between the columns of ``matrix``.

    Entry ``[i, j]`` of the result is ``dot(col_i, col_j) / (|col_i| * |col_j|)``.
    Pairs involving an all-zero column get similarity 0 instead of NaN.

    Equivalent (up to floating-point rounding) to:
    >> from sklearn.metrics.pairwise import cosine_similarity
    >> similarity = cosine_similarity(review_data.transpose())

    Args:
        matrix: 2-D scipy sparse matrix or dense array-like.

    Returns:
        numpy.ndarray of shape (n_columns, n_columns).
    """
    # toarray() yields a plain ndarray (todense() would give the discouraged
    # np.matrix type); dense inputs are accepted as they are.
    if hasattr(matrix, 'toarray'):
        dense_matrix = matrix.toarray()
    else:
        dense_matrix = np.asarray(matrix, dtype=float)

    dot_product = np.dot(dense_matrix.transpose(), dense_matrix)
    norm = np.linalg.norm(dense_matrix, axis=0)
    norm_product = np.outer(norm, norm)

    # Pre-fill with zeros and divide only where the denominator is non-zero,
    # so columns without any entries do not trigger a division by zero.
    similarity = np.zeros(dot_product.shape)
    np.divide(dot_product, norm_product, out=similarity, where=(norm_product != 0))
    return similarity

# ----------------------------------------------

In [82]:
# Movie-by-movie cosine similarity, indexed by utility-matrix column.
similarity = cos_similarity(review_data)

### Finding most similar movies

Using your item-item similarity, find 5 movies you would recommend to someone who likes the following:
- Matrix, The (1999)
- Toy Story (1995)
- From Dusk Till Dawn (1996)
- Gone with the Wind (1939)
- Iron Man (2008)

In other words, find the 5 most similar movies to each of the above using your similarity metric. You may find the functions get_movie_name_by_id() and get_movie_id_by_name() useful here.

In [83]:
# --------------- YOUR CODE HERE ---------------
# Titles for which recommendations are generated.
predict_movies = [
    "Matrix, The (1999)",
    "Toy Story (1995)",
    "From Dusk Till Dawn (1996)",
    "Gone with the Wind (1939)",
    "Iron Man (2008)",
]

def recommend(movie_name, similarities, n_recommendations = 5):
    """
    Return the titles of the movies most similar to ``movie_name``.

    Args:
        movie_name: exact title of the movie to find neighbours for.
        similarities: square array where ``similarities[i][j]`` is the
            similarity between the movies at utility-matrix columns i and j.
        n_recommendations: maximum number of titles to return.

    Returns:
        list of str: titles in descending order of similarity; the query
        movie itself is never included.
    """
    movie_id = get_movie_id_by_name(movie_name)
    # Negate so that argsort's ascending order yields descending similarity.
    ranked_ids = (-similarities[movie_id]).argsort()
    # Remove the query movie explicitly rather than assuming it is ranked
    # first: when another movie ties with the self-similarity, the sort may
    # put it ahead, and skipping position 0 would then drop a real neighbour
    # while recommending the query movie to itself.
    recommended_ids = ranked_ids[ranked_ids != movie_id][:n_recommendations]
    return [get_movie_name_by_id(neighbour_id) for neighbour_id in recommended_ids]

# Print a titled, newline-separated recommendation list for every query movie.
for movie in predict_movies:
    heading = 'Recommendations for those who liked "{}":'.format(movie)
    print(heading)
    recommended_titles = recommend(movie, similarity)
    print('\n'.join(recommended_titles))
    # Blank spacer between consecutive lists.
    print('\n')

# ----------------------------------------------

Recommendations for those who liked "Matrix, The (1999)":
Lord of the Rings: The Fellowship of the Ring, The (2001)
Lord of the Rings: The Two Towers, The (2002)
Fight Club (1999)
Back to the Future (1985)
Lord of the Rings: The Return of the King, The (2003)


Recommendations for those who liked "Toy Story (1995)":
Toy Story 2 (1999)
Star Wars: Episode IV - A New Hope (1977)
Forrest Gump (1994)
Independence Day (a.k.a. ID4) (1996)
Groundhog Day (1993)


Recommendations for those who liked "From Dusk Till Dawn (1996)":
Nightmare on Elm Street, A (1984)
Sleepy Hollow (1999)
Batman Returns (1992)
Candyman (1992)
Alien³ (a.k.a. Alien 3) (1992)


Recommendations for those who liked "Gone with the Wind (1939)":
Casablanca (1942)
It's a Wonderful Life (1946)
Wizard of Oz, The (1939)
African Queen, The (1951)
North by Northwest (1959)


Recommendations for those who liked "Iron Man (2008)":
Dark Knight, The (2008)
Star Trek (2009)
Batman Begins (2005)
Avatar (2009)
Avengers, The (2012)


