# MovieLens 25M Dataset ETL
The MovieLens 25M dataset contains some 25 million ratings of 62,000 movies by 162,000 users. This ETL downloads and extracts the data, splits it into training and test sets, and produces several development sets sampled at varying proportions of the original dataset.

In [1]:
import os
import logging
from pprint import pprint
from recsys.data.source import WebDataSource
from recsys.io.zip import extractzip
from recsys.io.file import IOService
from recsys.data.rating import RatingsDataset

In [2]:
# Without any configuration the logging module only emits WARNING and above, so the
# notebook's `logger.info(...)` progress messages would never be shown.
# `basicConfig` is a no-op when the root logger already has handlers, so the level is
# also set explicitly on this notebook's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("MovieLens25M ETL")
logger.setLevel(logging.INFO)

In [3]:
# Configuration
# --- Dataset identity and source ---
NAME = "movielens25m"
DESCRIPTION = "MovieLens 25M Dataset"
WEBSITE = "https://grouplens.org/datasets/movielens/"
URLS = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"

# --- Directories (each ends with a trailing slash) ---
DOWNLOAD_DESTINATION = "data/ext/"
EXTRACT_DESTINATION = "data/raw/"
INPUT_DIR = "data/input/"

# --- Files, expressed relative to the directories above ---
DOWNLOAD_FILEPATH = os.path.join(DOWNLOAD_DESTINATION, "ml-25m.zip")
RATINGS_RAW_FILEPATH = EXTRACT_DESTINATION + "ml-25m/ratings.csv"
RATINGS_FULL_FILEPATH = EXTRACT_DESTINATION + "ratings.pkl"
TRAIN_FILEPATH = INPUT_DIR + "train.pkl"
TEST_FILEPATH = INPUT_DIR + "test.pkl"


## Download, Extract and Pickle Dataset
Download the data from the GroupLens site into the external data directory, extract it into the raw data directory, and convert the ratings to pickle format for faster I/O.

In [4]:
# Download
# Guard on the archive file itself, not on its parent directory: the directory can
# already exist without the zip being present, in which case the download must still run.
if not os.path.exists(DOWNLOAD_FILEPATH):
    source = WebDataSource(name=NAME, description=DESCRIPTION, website=WEBSITE, urls=URLS)
    source.download(destination=DOWNLOAD_DESTINATION, force=False)
assert os.path.exists(DOWNLOAD_FILEPATH), f"Download failed: {DOWNLOAD_FILEPATH} not found"
logger.info("Download Complete")

# Extract
# Skipped when the raw ratings file is already present from an earlier run.
if not os.path.exists(RATINGS_RAW_FILEPATH):
    extractzip(source=DOWNLOAD_FILEPATH, destination=EXTRACT_DESTINATION)
assert os.path.exists(RATINGS_RAW_FILEPATH), f"Extract failed: {RATINGS_RAW_FILEPATH} not found"
logger.info("Extract Complete")

# Convert Data to Pickle Format
# Skipped when the pickled ratings file already exists.
if not os.path.exists(RATINGS_FULL_FILEPATH):
    ratings_raw = IOService.read(RATINGS_RAW_FILEPATH)
    IOService.write(filepath=RATINGS_FULL_FILEPATH, data=ratings_raw)
assert os.path.exists(RATINGS_FULL_FILEPATH), f"Conversion failed: {RATINGS_FULL_FILEPATH} not found"
logger.info("Conversion Complete")

## Full Dataset Summary

In [5]:
# Render the section heading. `HTML` is not imported by the notebook's import cell, so
# calling it raises NameError on a fresh kernel; the markup is instead handed to
# IPython's built-in `display` as a raw MIME bundle.
display({'text/html': '<h3>Full Dataset Summary</h3>'}, raw=True)
ratings = RatingsDataset(filepath=RATINGS_FULL_FILEPATH)
_ = ratings.summarize()

## Create Training and Test Sets
The data will be split using a global temporal strategy: the earliest 80% of the ratings by timestamp are allocated to the training set, and the remaining 20% to the test set.

In [6]:
ratings = RatingsDataset(filepath=RATINGS_FULL_FILEPATH)
# Split only when at least one of the two output files is missing.
# The previous condition, `not exists(train) or exists(test)`, parsed as
# `(not exists(train)) or exists(test)`: it re-split whenever the test file existed
# and skipped the split when only the test file was missing.
split_is_cached = os.path.exists(TRAIN_FILEPATH) and os.path.exists(TEST_FILEPATH)
if not split_is_cached:
    ratings.split(train_filepath=TRAIN_FILEPATH, test_filepath=TEST_FILEPATH, train_prop=0.8)
assert os.path.exists(TRAIN_FILEPATH), f"Split failed: {TRAIN_FILEPATH} not found"
assert os.path.exists(TEST_FILEPATH), f"Split failed: {TEST_FILEPATH} not found"
logger.info("Train/Test Split Complete")

## Training Set Summary

In [7]:
# Render the section heading. `HTML` is not imported by the notebook's import cell, so
# calling it raises NameError on a fresh kernel; the markup is instead handed to
# IPython's built-in `display` as a raw MIME bundle.
display({'text/html': '<h3>Train Dataset Summary</h3>'}, raw=True)
ratings = RatingsDataset(filepath=TRAIN_FILEPATH)
_ = ratings.summarize()

## Test Set Summary

In [8]:
# Render the section heading. `HTML` is not imported by the notebook's import cell, so
# calling it raises NameError on a fresh kernel; the markup is instead handed to
# IPython's built-in `display` as a raw MIME bundle.
display({'text/html': '<h3>Test Dataset Summary</h3>'}, raw=True)
ratings = RatingsDataset(filepath=TEST_FILEPATH)
_ = ratings.summarize()

## Generate Development Sets

In [13]:
# Parallel lists: pcts[i] is the percentage label for the sampling fraction fracs[i].
fracs = [0.5, 0.1, 0.01, 0.005]
pcts = [50, 10, 1, .5]

# Catalogue of development sets, keyed by percentage label. Each entry records the
# file name, a display title, the sampling fraction and the destination path.
dev_sets = {}
for pct, frac in zip(pcts, fracs):
    filename = f"ratings_{pct}_pct.pkl"
    dev_sets[pct] = dict(
        name=os.path.basename(filename),
        title=f"Ratings {pct} %",
        frac=frac,
        filepath=os.path.join('data/dev/', filename),
    )
pprint(dev_sets)


{0.5: {'filepath': 'data/dev/ratings_0.5_pct.pkl',
       'frac': 0.005,
       'name': 'ratings_0.5_pct.pkl',
       'title': 'Ratings 0.5 %'},
 1: {'filepath': 'data/dev/ratings_1_pct.pkl',
     'frac': 0.01,
     'name': 'ratings_1_pct.pkl',
     'title': 'Ratings 1 %'},
 10: {'filepath': 'data/dev/ratings_10_pct.pkl',
      'frac': 0.1,
      'name': 'ratings_10_pct.pkl',
      'title': 'Ratings 10 %'},
 50: {'filepath': 'data/dev/ratings_50_pct.pkl',
      'frac': 0.5,
      'name': 'ratings_50_pct.pkl',
      'title': 'Ratings 50 %'}}


In [14]:
def create_dev_sets(dev_sets, force: bool = False, random_state=None):
    """Sample the full ratings data into the development sets described by `dev_sets`.

    Args:
        dev_sets: Mapping whose values are dicts with at least a 'filepath' key
            (where the sample is written) and a 'frac' key (fraction to sample).
        force: When True, every set is rebuilt even if its file already exists.
        random_state: Optional seed forwarded to `sample` so the development sets
            can be reproduced. The default (None) leaves sampling unseeded.
            Assumes the object returned by `IOService.read` is a pandas DataFrame
            (or accepts the same `sample` keywords) - TODO confirm.

    Raises:
        AssertionError: If a development set file is missing after processing.
    """
    # Work out what actually needs building first, so the full ratings file is only
    # read when there is at least one set to create.
    pending = [dev for dev in dev_sets.values()
               if force or not os.path.exists(dev['filepath'])]
    if pending:
        ratings = IOService.read(RATINGS_FULL_FILEPATH)
        for dev in pending:
            sample = ratings.sample(frac=dev['frac'], random_state=random_state)
            IOService.write(filepath=dev['filepath'], data=sample)
    for dev in dev_sets.values():
        assert os.path.exists(dev['filepath'])
# force=True rebuilds every development set on each run, even when its file already exists.
create_dev_sets(dev_sets=dev_sets, force=True)
logger.info("Development Sets Created")

## Development Sets Summary

In [11]:
def summarize(dev_sets):
    """Print the title and summary of every development set in `dev_sets`.

    Each value of `dev_sets` must provide a 'title' and a 'filepath'; the file is
    loaded as a RatingsDataset and the result of its `summarize()` is printed.
    """
    for spec in dev_sets.values():
        print(f"{spec['title']}")
        dataset = RatingsDataset(spec['filepath'])
        summary = dataset.summarize()
        print(summary)
# Print a summary for each development set in the catalogue.
summarize(dev_sets=dev_sets)

Ratings 50 %
                                               Count
Rows                                    2,500,010.00
Columns                                         4.00
Users                                      30,863.00
Movies                                     46,015.00
Memory Usage                                   64.96
Size                                   10,000,040.00
Matrix Size                         1,420,160,945.00
Sparsity                                        0.70
Density                                         0.30
Maximum Number of Ratings by User          10,062.00
Average Number of Ratings by User              15.38
Median Number of Ratings by User                0.00
Maximum Number of Ratings for Movie         8,486.00
Average Number of Ratings for Movie            42.34
Median Number of Ratings for Movie              2.00
Ratings 10 %
                                             Count
Rows                                    500,002.00
Columns                 