### Movie Recommender with matrix factorization by Irinej Slapal

First we import the necessary libraries (we will use nupy and pandas) and
read the data from the csv file into a pandas dataframe.

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import csv
import collections

In [2]:

df_ratings = pd.read_csv('data/ml-latest-small/ratings.csv')
df_movies = pd.read_csv('data/ml-latest-small/movies.csv')
df_links = pd.read_csv('data/ml-latest-small/links.csv')
df_cast = pd.read_csv('data/ml-latest-small/cast.csv')
df_tags = pd.read_csv('data/ml-latest-small/tags.csv')


Now, for the matrix factorization to work, we need to form a matrix X (users * movies), this matriks will show us with what rating and which movies a certain user rated.


In [15]:

print(df_ratings.groupby('movieId').describe())

        userId                                                             \
         count        mean         std    min    25%    50%    75%    max   
movieId                                                                     
1        247.0  338.558704  189.961452    7.0  164.5  353.0  500.5  671.0   
2        107.0  318.906542  196.630078   15.0  142.0  298.0  498.0  665.0   
3         59.0  374.423729  205.514846    5.0  173.5  430.0  545.5  665.0   
4         13.0  355.538462  225.175345   19.0  168.0  391.0  518.0  650.0   
5         56.0  320.785714  198.127389   15.0  147.5  300.0  514.5  665.0   
...        ...         ...         ...    ...    ...    ...    ...    ...   
161944     1.0  287.000000         NaN  287.0  287.0  287.0  287.0  287.0   
162376     1.0   73.000000         NaN   73.0   73.0   73.0   73.0   73.0   
162542     1.0  611.000000         NaN  611.0  611.0  611.0  611.0  611.0   
162672     1.0  611.000000         NaN  611.0  611.0  611.0  611.0  611.0   

In [10]:
#what tables we need from data
# - movie genres                                                ×
# - avg rating for each movie                                   ×
# - avg rating for each genre
# - rating count for each movie                                 ×
# - what movies user has rated and the ratings for each user    ×
# - how much movies has each user rated                         ×
# - movie references (everything connected to one movie id)

numOfMovies = df_movies['movieId'].nunique()
numOfUsers = df_ratings['userId'].nunique()

#set of movie genres
movie_genres = set(genre.lower() 
                    for i in range(len(df_movies)) 
                    for genre in df_movies.loc[i, 'genres'].split('|'))
print(movie_genres)
print("-------------------------------------------")

#avg rating for each movie
avgMovieRating = {movieId[0]: [movieId[1]['rating'].mean(), len(movieId[1])] 
                    for movieId in df_ratings.groupby('movieId')}
print(avgMovieRating)
print("-------------------------------------------")

#avg rating for each genre
avgGenreRating = {movieId[0]: [movieId[1]['rating'].mean(), len(movieId[1])] 
                    for movieId in df_ratings.groupby('movieId')}
print(avgGenreRating)
print("-------------------------------------------")

#movie references movieId: (genres + tags + cast)
list_movie_tag = list(zip(df_tags['movieId'], df_tags['tag']))
print(list_movie_tag)

movie_tags = {movie[0]: movie[1] for movie in list_movie_tag}
print(movie_tags)
print("-------------------------------------------")
movie_references = {movie[0]: (movie[1]['title'].values, movie[1]['cast'].values) 
                    for movie in pd.merge(df_movies, df_cast, on='movieId').groupby('movieId')}
print(movie_references)

for movieId in movie_tags:
    movie_references[movieId] = (movie_tags[movieId], movie_references[movieId][0], movie_references[movieId][1])

print(movie_references)



{'animation', 'thriller', 'war', 'mystery', 'film-noir', 'musical', 'western', 'fantasy', 'crime', 'action', 'horror', 'sci-fi', 'adventure', 'drama', '(no genres listed)', 'comedy', 'romance', 'children', 'documentary', 'imax'}
-------------------------------------------
{1: [3.8724696356275303, 247], 2: [3.4018691588785046, 107], 3: [3.1610169491525424, 59], 4: [2.3846153846153846, 13], 5: [3.267857142857143, 56], 6: [3.8846153846153846, 104], 7: [3.2830188679245285, 53], 8: [3.8, 5], 9: [3.15, 20], 10: [3.4508196721311477, 122], 11: [3.6890243902439024, 82], 12: [2.861111111111111, 18], 13: [3.9375, 8], 14: [3.4516129032258065, 31], 15: [2.3181818181818183, 11], 16: [3.9488636363636362, 88], 17: [3.9244186046511627, 86], 18: [3.2884615384615383, 26], 19: [2.597826086956522, 92], 20: [2.5384615384615383, 13], 21: [3.536842105263158, 95], 22: [3.3552631578947367, 38], 23: [3.090909090909091, 22], 24: [3.0441176470588234, 34], 25: [3.742574257425743, 101], 26: [4.1, 5], 27: [3.14285714

In [8]:

#matrix x for users and movies
users = sorted(list(set(df_ratings['userId'].values)))
x = {userId[0]: list(zip(userId[1]['movieId'].values, userId[1]['rating'].values)) for userId in users}


IndexError: invalid index to scalar variable.

In [7]:
print(pd.merge(df_movies, df_cast, on='movieId'))

      movieId                                              title  \
0           1                                   Toy Story (1995)   
1           2                                     Jumanji (1995)   
2           3                            Grumpier Old Men (1995)   
3           4                           Waiting to Exhale (1995)   
4           5                 Father of the Bride Part II (1995)   
...       ...                                                ...   
9120   162672                                Mohenjo Daro (2016)   
9121   163056                               Shin Godzilla (2016)   
9122   163949  The Beatles: Eight Days a Week - The Touring Y...   
9123   164977                           The Gay Desperado (1936)   
9124   164979                              Women of '69, Unboxed   

                                           genres  \
0     Adventure|Animation|Children|Comedy|Fantasy   
1                      Adventure|Children|Fantasy   
2                       