In [1]:
import os

# Run the notebook from the repository root so that the `src` package is
# importable and `data/` resolves relative to it. The guard keeps this cell
# idempotent: re-running it without a kernel restart must not climb one more
# directory level.
if not os.path.isdir('src'):
    os.chdir('..')

In [2]:
import pandas as pd
import os 
import src.utils as utils

# Make log records visible in the notebook: take the root logger and lower
# its threshold to DEBUG so no record is filtered out by level.
import logging

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

### Download and load the data from MovieLens

In [3]:
# Download and unzip the MovieLens "ml-latest-small" dataset into ./data
# (source: http://files.grouplens.org/datasets/movielens/ml-latest-small.zip).
dataset_folder = os.path.abspath('data')
dataset_folder_ready = utils.load_dataset(dataset_folder)

INFO:root:downloading dataset http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
INFO:root:dataset stored in: /Users/hcorona/github/recsys-101-workshop/data/ml-latest-small


### How to export your IMDb ratings 
1. Go to [IMDb](http://imdb.com) and create an account if you don't have one.
2. If you have no ratings in your account, rate at least 10 movies you liked and 10 movies you didn't like.
3. Go to your account (top right) and click on "Your Ratings".
4. At the bottom of the page, next to the "next" button, you will find an "Export this list" button. It will export your ratings to your Downloads folder.
5. Move the file into the /data folder of this repo and name it **ratings-imdb.csv**.


In [4]:
# Convert the exported IMDb ratings into the dataset's ratings format.
# Both the links file and the converted output live in the MovieLens folder.
movielens_folder = os.path.join(dataset_folder, 'ml-latest-small')
imdb_ratings = os.path.join(dataset_folder, 'ratings-imdb.csv')
links_file = os.path.join(movielens_folder, 'links.csv')
ratings_file = os.path.join(movielens_folder, 'ratings-merged.csv')
utils.import_imdb_ratings(imdb_ratings, links_file, ratings_file)

# Add the personal ratings to the original dataset ratings. The second return
# value is the customer number used below to select the personal rows.
ratings, my_customer_number = utils.merge_datasets(dataset_folder_ready, ratings_file)

INFO:root:wrote IMDB ratings into the dataset format to /Users/hcorona/github/recsys-101-workshop/data/ml-latest-small/ratings-merged.csv
INFO:root:loaded 44 personal ratings
INFO:root:loaded 9125 movies
INFO:root:loaded 100048 ratings in total


### Understand the data 

In [5]:
# `ratings` is a long-format DataFrame. Pivot it into a [customer x movie]
# table (missing ratings filled with 0), then transpose it so the final
# matrix is [movie x customer], i.e. one row per movie.
ratings_matrix = (
    ratings
    .pivot_table(index='customer', columns='movie', values='rating', fill_value=0)
    .transpose()
)

In [6]:
# The personal ratings share one frame with everybody else's ratings;
# select just the rows whose customer is the personal customer number.
ratings[ratings['customer'] == my_customer_number]

Unnamed: 0,customer,movie,rating
39381,672,Exit Through the Gift Shop (2010),4.0
39493,672,Inception (2010),4.5
39547,672,127 Hours (2010),4.5
41243,672,Gran Torino (2008),4.5
41398,672,Harry Potter and the Deathly Hallows: Part 2 (...,5.0
65134,672,Slumdog Millionaire (2008),4.0
65204,672,"Wrestler, The (2008)",4.5
65771,672,"Serious Man, A (2009)",4.0
65841,672,Up in the Air (2009),3.5
66046,672,Shutter Island (2010),4.5


In [7]:
# Preview some of the movies in the dataset: the row index of ratings_matrix
# holds the movie titles, so turn it into a one-column frame and show the top.
movie_list = ratings_matrix.index.to_frame(index=False)
movie_list.head(20)

Unnamed: 0,movie
0,"""Great Performances"" Cats (1998)"
1,$9.99 (2008)
2,'Hellboy': The Seeds of Creation (2004)
3,'Neath the Arizona Skies (1934)
4,'Round Midnight (1986)
5,'Salem's Lot (2004)
6,'Til There Was You (1997)
7,"'burbs, The (1989)"
8,'night Mother (1986)
9,(500) Days of Summer (2009)
