# Notebook: data gathering: Downloads and reads the data

In [1]:
import os
# Move up to the project root so that `src` is importable and ./data resolves.
# Guarded so that re-running this cell (without restarting the kernel) does not
# climb one more directory each time.
# NOTE(review): assumes the `src` package lives in the project root and that the
# notebook's own folder has no `src` directory -- confirm against the repo layout.
if not os.path.isdir('src'):
    os.chdir('..')

In [2]:
# Import all the packages we need to generate recommendations
import pandas as pd
import src.utils as utils
import src.recommenders as recommenders
import src.similarity as similarity

# Lower the root logger's threshold to DEBUG so that log records
# (e.g. the INFO:root:... lines shown under later cells) appear in the notebook.
import logging
logging.getLogger().setLevel(logging.DEBUG)
logger = logging.getLogger()  # handle on the root logger

#### Download and load the data 

In [3]:
# Download and unzip the MovieLens dataset into <current dir>/data/
# source: 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
# Note: dataset_folder deliberately keeps its trailing '/'.
dataset_folder = '{}/data/'.format(os.getcwd())
dataset_folder_ready = utils.load_dataset(dataset_folder)

INFO:root:dataset was already downloaded
INFO:root:dataset stored in: /Users/hcorona/github/recsys-101-workshop/data/ml-latest-small


In [4]:
# Read the personal ratings file and add it to the original dataset ratings.
# os.path.join is used instead of string concatenation so the path has exactly
# one separator whether or not dataset_folder ends with '/'
# (plain `dataset_folder + '/ratings_humberto.csv'` gave '.../data//ratings_humberto.csv').
my_ratings_file = os.path.join(dataset_folder, 'ratings_humberto.csv')
# merge_datasets hands back two values: the combined ratings and the customer
# number under which the personal ratings were stored.
ratings, my_customer_number = utils.merge_datasets(dataset_folder_ready, my_ratings_file)

INFO:root:loaded 12 personal ratings
INFO:root:loaded 9125 movies
INFO:root:loaded 100016 ratings in total


#### Understand the data 

In [5]:
# `ratings` is a long dataframe with one (customer, movie, rating) entry per row.
# Pivot it to wide form (customers as rows, movies as columns, missing ratings
# filled with 0) and then flip it, so the final matrix is [movie x customer]:
# rows are movies, columns are customers.
ratings_matrix = ratings.pivot_table(
    values='rating',
    index='customer',
    columns='movie',
    fill_value=0,
).T

In [6]:
# The personal ratings are now stored together with the rest of the ratings;
# pick them out with a boolean mask on the customer column.
# `.loc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in 1.0;
# for a boolean mask the two select exactly the same rows.
ratings.loc[ratings.customer == my_customer_number]

Unnamed: 0,customer,movie,rating
17858,672,Annie Hall (1977),3.0
53920,672,"Beach, The (2000)",3.0
75779,672,"Sea Inside, The (Mar adentro) (2004)",4.5
81547,672,Blue Is the Warmest Color (La vie d'Adèle) (2013),4.5
82662,672,Hard Candy (2005),4.0
83296,672,"Perks of Being a Wallflower, The (2012)",5.0
83345,672,Amour (2012),5.0
84290,672,"Single Man, A (2009)",5.0
86667,672,Mysterious Skin (2004),4.0
94841,672,"Skin I Live In, The (La piel que habito) (2011)",4.0


In [7]:
# Preview some of the movies in the dataset: the row labels of ratings_matrix
# are the movie titles, so wrap them in a DataFrame and show the first 20.
movie_list = pd.DataFrame(data=ratings_matrix.index)
movie_list.head(n=20)

Unnamed: 0,movie
0,"""Great Performances"" Cats (1998)"
1,$9.99 (2008)
2,'Hellboy': The Seeds of Creation (2004)
3,'Neath the Arizona Skies (1934)
4,'Round Midnight (1986)
5,'Salem's Lot (2004)
6,'Til There Was You (1997)
7,"'burbs, The (1989)"
8,'night Mother (1986)
9,(500) Days of Summer (2009)
