# MovieLens 25M Dataset ETL
MovieLens 25M dataset contains some 62,000 movies and 162,000 users. This ETL will produce several training and test sets of varying proportions of the original data set.

In [1]:
import os
import logging
from pprint import pprint
from recsys.data.source import WebDataSource
from recsys.io.zip import extractzip
from recsys.io.file import IOService
from recsys.data.rating import RatingsDataset

In [2]:
logger = logging.getLogger("MovieLens25M ETL")

In [3]:
# Configuration
NAME = 'movielens25m'
DESCRIPTION = 'MovieLens 25M Dataset'
WEBSITE = "https://grouplens.org/datasets/movielens/"
URLS = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
DOWNLOAD_DESTINATION = "data/ext/"
EXTRACT_DESTINATION = "data/raw/"
DOWNLOAD_FILEPATH = os.path.join(DOWNLOAD_DESTINATION, "ml-25m.zip")
# Ratings Filepaths
RATINGS_RAW_FILEPATH = 'data/raw/ml-25m/ratings.csv'
RATINGS_FULL_FILEPATH = 'data/raw/ratings.pkl'
TRAIN_FILEPATH = "data/input/train.pkl"
TEST_FILEPATH = "data/input/test.pkl"
# directories
INPUT_DIR = "data/input/"


## Download, Extract and Pickle Dataset
Download the data from the grouplens site into the external data directory and convert to pickle format for faster io. 

In [5]:
# Download
source = WebDataSource(name=NAME, description=DESCRIPTION, website=WEBSITE, urls=URLS)
source.download(destination=DOWNLOAD_DESTINATION, force=False)
assert os.path.exists(DOWNLOAD_FILEPATH)
logger.info("Download Complete")

# Extract
if not os.path.exists(RATINGS_RAW_FILEPATH):
    extractzip(source=DOWNLOAD_FILEPATH, destination=EXTRACT_DESTINATION)
assert os.path.exists(RATINGS_RAW_FILEPATH)    
logger.info("Extract Complete")

# Convert Data to Pickle Format
if not os.path.exists(RATINGS_FULL_FILEPATH):
    ratings = IOService.read(RATINGS_RAW_FILEPATH)
    IOService.write(filepath=RATINGS_FULL_FILEPATH, data=ratings)    
assert os.path.exists(RATINGS_FULL_FILEPATH)
logger.info("Conversion Complete")

100%|██████████| 1/1 [00:00<00:00, 3758.34it/s]


## Full Dataset Summary

In [6]:
display(HTML('<h3>Full Dataset Summary</h3>'))
ratings = RatingsDataset(filepath=RATINGS_FULL_FILEPATH)
_ = ratings.summarize()

                                         Count
Size                            800,003,168.00
Rows                             25,000,095.00
Columns                                   4.00
Users                               162,541.00
Movies                               59,047.00
Utility Matrix Size             100,000,380.00
Utility Matrix Memory Size (Mb)          95.37
Utility Matrix Sparsity                   0.25
Maximum Ratings per User             32,202.00
Average Ratings per User                153.81
Median Ratings per User                  71.00
Minimum Ratings per User                 20.00
Maximum Ratings per Movie            81,491.00
Average Ratings per Movie               423.39
Median Ratings per Movie                  6.00
Minimum Ratings per Movie                 1.00


## Create Training and Test Sets
The data will be split using a temporal global strategy, whereby the first 80% of the ratings by timestamp will be allocated for training, the rest for test

In [7]:
ratings = RatingsDataset(filepath=RATINGS_FULL_FILEPATH)
if not os.path.exists(TRAIN_FILEPATH) or os.path.exists(TEST_FILEPATH):    
    ratings.split(train_filepath=TRAIN_FILEPATH, test_filepath=TEST_FILEPATH, train_prop=0.8)
assert os.path.exists(TRAIN_FILEPATH)
assert os.path.exists(TEST_FILEPATH)
logger.info("Train/Test Split Complete")

## Training Set Summary

In [8]:
display(HTML('<h3>Train Dataset Summary</h3>'))
ratings = RatingsDataset(filepath=TRAIN_FILEPATH)
_ = ratings.summarize()

                                         Count
Size                            800,003,040.00
Rows                             20,000,076.00
Columns                                   4.00
Users                               137,883.00
Movies                               34,461.00
Utility Matrix Size              80,000,304.00
Utility Matrix Memory Size (Mb)          76.29
Utility Matrix Sparsity                   0.25
Maximum Ratings per User             12,097.00
Average Ratings per User                145.05
Median Ratings per User                  68.00
Minimum Ratings per User                  1.00
Maximum Ratings per Movie            67,782.00
Average Ratings per Movie               580.37
Median Ratings per Movie                  9.00
Minimum Ratings per Movie                 1.00


## Test Set Summary

In [9]:
display(HTML('<h3>Test Dataset Summary</h3>'))
ratings = RatingsDataset(filepath=TEST_FILEPATH)
_ = ratings.summarize()

                                         Count
Size                            200,000,760.00
Rows                              5,000,019.00
Columns                                   4.00
Users                                31,059.00
Movies                               55,199.00
Utility Matrix Size              20,000,076.00
Utility Matrix Memory Size (Mb)          19.07
Utility Matrix Sparsity                   0.25
Maximum Ratings per User             20,105.00
Average Ratings per User                160.98
Median Ratings per User                  77.00
Minimum Ratings per User                  1.00
Maximum Ratings per Movie            16,800.00
Average Ratings per Movie                90.58
Median Ratings per Movie                  4.00
Minimum Ratings per Movie                 1.00


## Generate Development Sets

In [4]:
dev_sets = {}
fracs = [0.5, 0.1, 0.01]
pcts = [50,10,1]
for pct, frac in zip(pcts, fracs):
    filename = 'ratings_' + str(pct) + '_pct.pkl'
    d = {'name': os.path.basename(filename),
         'title': 'Ratings ' + str(pct) + ' %',
         'frac': frac,
         'filepath': os.path.join('data/dev/',filename),
         }
    dev_sets[pct] = d
pprint(dev_sets)


{1: {'filepath': 'data/dev/ratings_1_pct.pkl',
     'frac': 0.01,
     'name': 'ratings_1_pct.pkl',
     'title': 'Ratings 1 %'},
 10: {'filepath': 'data/dev/ratings_10_pct.pkl',
      'frac': 0.1,
      'name': 'ratings_10_pct.pkl',
      'title': 'Ratings 10 %'},
 50: {'filepath': 'data/dev/ratings_50_pct.pkl',
      'frac': 0.5,
      'name': 'ratings_50_pct.pkl',
      'title': 'Ratings 50 %'}}


In [10]:
def create_dev_sets(dev_sets):
    for pct, dev in dev_sets.items():        
        if not os.path.exists(dev['filepath']):
            _ = ratings.sample(frac=dev['frac'], filepath=dev['filepath'])        
        assert os.path.exists(dev['filepath'])
create_dev_sets(dev_sets)
logger.info("Development Sets Created")

## Development Sets Summary

In [12]:
def summarize(dev_sets):
    for pct, dev in dev_sets.items():
        print(f"{dev['title']}")        
        ratings = RatingsDataset(dev['filepath'])
        _ = ratings.summarize()
summarize(dev_sets)

Ratings 50 %
                                        Count
Size                            80,000,448.00
Rows                             2,500,010.00
Columns                                  4.00
Users                               30,856.00
Movies                              45,994.00
Utility Matrix Size             10,000,040.00
Utility Matrix Memory Size (Mb)          9.54
Utility Matrix Sparsity                  0.25
Maximum Ratings per User             9,966.00
Average Ratings per User                81.02
Median Ratings per User                 39.00
Minimum Ratings per User                 1.00
Maximum Ratings per Movie            8,294.00
Average Ratings per Movie               54.36
Median Ratings per Movie                 3.00
Minimum Ratings per Movie                1.00
Ratings 10 %
                                        Count
Size                            16,000,192.00
Rows                               500,002.00
Columns                                  4.00
Users   