### Movie Recommender with matrix factorization by Irinej Slapal

First, we import the necessary libraries (we will use NumPy and pandas) and
read the data from the CSV files into pandas DataFrames.

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import csv
import collections

In [2]:

# Load every table of the dataset in one pass; the unpacking order below
# matches the order of the file paths.
df_ratings, df_movies, df_links, df_cast, df_tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/ml-latest-small/ratings.csv',
        'data/ml-latest-small/movies.csv',
        'data/ml-latest-small/links.csv',
        'data/ml-latest-small/cast.csv',
        'data/ml-latest-small/tags.csv',
    )
)


In [6]:
#what tables we need from data
# - movie genres                                                ×
# - avg rating for each movie                                   ×
# - avg rating for each genre
# - rating count for each movie                                 ×
# - what movies user has rated and the ratings for each user    ×
# - how many movies each user has rated                         ×
# - movie references (everything connected to one movie id)

# One row per movie in df_movies, so the row count is the number of movies.
num_movies = df_movies.shape[0]
# df_ratings holds one row per (user, movie) rating, so its row count is the
# number of ratings, not the number of users; count distinct user ids instead.
num_users = df_ratings['userId'].nunique()

#set of movie genres
# Each entry of the 'genres' column is a '|'-separated string; split every
# entry and keep the distinct lower-cased genre names.
movie_genres = {
    genre_name.lower()
    for genre_field in df_movies['genres']
    for genre_name in genre_field.split('|')
}


#avg rating for each movie
# Maps movieId -> [mean rating, number of ratings]; iterating a groupby
# yields (group key, sub-frame) pairs, unpacked here by name.
avgMovieRating = {
    movie_id: [movie_ratings['rating'].mean(), len(movie_ratings)]
    for movie_id, movie_ratings in df_ratings.groupby('movieId')
}


#avg rating for each genre
# Maps lower-cased genre -> [mean rating, number of ratings].
# Ratings carry only a movieId, so they are first joined to the movies table
# to obtain each rated movie's '|'-separated genre string. A rating of a
# multi-genre movie counts once towards every one of that movie's genres.
rated_genres = pd.merge(df_ratings[['movieId', 'rating']],
                        df_movies[['movieId', 'genres']],
                        on='movieId')

genre_ratings = collections.defaultdict(list)
for genre_field, genre_rating in zip(rated_genres['genres'], rated_genres['rating']):
    for genre_name in genre_field.split('|'):
        # Lower-case so the keys match the names stored in movie_genres.
        genre_ratings[genre_name.lower()].append(genre_rating)

avgGenreRating = {genre_name: [np.mean(ratings), len(ratings)]
                  for genre_name, ratings in genre_ratings.items()}


#movie references movieId: (tags + title + cast)
# Flat list of (movieId, tag) pairs, one per row of the tags table.
list_movie_tag = list(zip(df_tags['movieId'], df_tags['tag']))

# movieId -> all of that movie's tags joined with '|'.
# Building the dict straight from the (movieId, tag) pairs would keep only the
# last tag of each movie, so the tags are grouped per movie first. A movie
# with a single tag maps to that tag string unchanged.
movie_tags = {
    movie_id: '|'.join(tags.astype(str))
    for movie_id, tags in df_tags.dropna(subset=['tag']).groupby('movieId')['tag']
}

# movieId -> (title values, cast values) for every movie present in both the
# movies and the cast table (pd.merge performs an inner join by default).
movie_references = {
    movie_id: (movie_rows['title'].values, movie_rows['cast'].values)
    for movie_id, movie_rows in pd.merge(df_movies, df_cast, on='movieId').groupby('movieId')
}

# Prepend the tags for movies that have any, giving (tags, title values,
# cast values). Tagged movies missing from the merge above are skipped rather
# than raising KeyError.
# NOTE(review): entries of untagged movies remain 2-tuples (title, cast), so
# consumers must handle both layouts.
for mid in movie_tags:
    if mid not in movie_references:
        continue
    movie_references[mid] = (movie_tags[mid], movie_references[mid][0], movie_references[mid][1])


#matrix x for users and movies
# Sorted list of the distinct user ids that appear in the ratings table.
users = sorted(set(df_ratings['userId'].values))

# userId -> list of (movieId, rating) pairs rated by that user.
# The per-user rows come from groupby, which yields (user id, sub-frame)
# pairs; iterating the bare ids in `users` cannot work because a scalar id
# has no rows to index into.
x = {user_id: list(zip(user_ratings['movieId'].values, user_ratings['rating'].values))
     for user_id, user_ratings in df_ratings.groupby('userId')}


   movieId                                               cast  movieId  \
0        1  Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...        1   
1        2  Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...        2   
2        3  Walter Matthau|Jack Lemmon|Sophia Loren|Burges...        3   
3        4  Whitney Houston|Angela Bassett|Loretta Devine|...        4   
4        5  Steve Martin|Diane Keaton|Martin Short|George ...        5   

                                title  \
0                    Toy Story (1995)   
1                      Jumanji (1995)   
2             Grumpier Old Men (1995)   
3            Waiting to Exhale (1995)   
4  Father of the Bride Part II (1995)   

                                        genres  userId  movieId  \
0  Adventure|Animation|Children|Comedy|Fantasy    15.0    339.0   
1                   Adventure|Children|Fantasy    15.0   1955.0   
2                               Comedy|Romance    15.0   7478.0   
3                         Comedy|Drama|R

In [10]:
# Inspect the movies table joined with the cast table on their shared movieId.
print(df_movies.merge(df_cast, on='movieId'))

      movieId                                              title  \
0           1                                   Toy Story (1995)   
1           2                                     Jumanji (1995)   
2           3                            Grumpier Old Men (1995)   
3           4                           Waiting to Exhale (1995)   
4           5                 Father of the Bride Part II (1995)   
...       ...                                                ...   
9120   162672                                Mohenjo Daro (2016)   
9121   163056                               Shin Godzilla (2016)   
9122   163949  The Beatles: Eight Days a Week - The Touring Y...   
9123   164977                           The Gay Desperado (1936)   
9124   164979                              Women of '69, Unboxed   

                                           genres  \
0     Adventure|Animation|Children|Comedy|Fantasy   
1                      Adventure|Children|Fantasy   
2                       