# MovieLens 25M Dataset ETL
The MovieLens 25M dataset contains roughly 25 million ratings of some 62,000 movies by 162,000 users. This ETL produces several training and test sets, each built from a different proportion of the original dataset.

In [1]:
import os
import logging
from pprint import pprint

from recsys.data.source import WebDataSource
from recsys.io.zip import extractzip
from recsys.io.file import IOService
from recsys.data.rating import RatingsDataset

In [2]:
# Named logger used for the ETL progress messages emitted by the cells below.
# NOTE(review): no handler or level is configured in the visible cells, so the
# logger.info(...) calls will likely be dropped under logging's default
# WARNING threshold unless logging is configured elsewhere - confirm.
logger = logging.getLogger("MovieLens25M ETL")

In [3]:
# Configuration
# Metadata describing the web data source; passed to WebDataSource in the download cell.
NAME = 'movielens25m'
DESCRIPTION = 'MovieLens 25M Dataset'
WEBSITE = "https://grouplens.org/datasets/movielens/"
# NOTE(review): plural name but a single URL string; presumably WebDataSource
# accepts either a string or a list - confirm against its signature.
URLS = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
# Directory that receives the downloaded archive, and directory it is extracted into.
DOWNLOAD_DESTINATION = "data/ext/"
EXTRACT_DESTINATION = "data/raw/"
# Full path of the downloaded archive.
DOWNLOAD_FILEPATH = os.path.join(DOWNLOAD_DESTINATION, "ml-25m.zip")
# Raw data filepaths: the extracted ratings CSV and its pickled copy.
RATINGS_RAW_FILEPATH = "data/raw/ml-25m/ratings.csv"
RATINGS_PKL_FILEPATH = "data/raw/ratings.pkl"
# Environment filepaths (prod, dev, test).
# NOTE: only the *_RATINGS_FILEPATH constants are referenced in the visible
# cells; create_tts builds the train.pkl / test.pkl paths from the directory
# of the ratings file rather than from the *_TRAIN_* / *_TEST_* constants.
PROD_RATINGS_FILEPATH = "data/prod/ratings.pkl"
PROD_TRAIN_FILEPATH = "data/prod/train.pkl"
PROD_TEST_FILEPATH = "data/prod/test.pkl"
DEV_RATINGS_FILEPATH = 'data/dev/ratings.pkl'
DEV_TRAIN_FILEPATH = "data/dev/train.pkl"
DEV_TEST_FILEPATH = "data/dev/test.pkl"
TEST_RATINGS_FILEPATH = 'tests/data/ratings.pkl'
TEST_TRAIN_FILEPATH = 'tests/data/train.pkl'
TEST_TEST_FILEPATH = 'tests/data/test.pkl'
# Subsample fractions: share of the full ratings set sampled for each environment.
PROD_FRAC = 1
DEV_FRAC = .01
TEST_FRAC = .001


## Download, Extract and Pickle Dataset
Download the data from the GroupLens site into the external data directory, extract it, and convert the ratings to pickle format for faster I/O.

In [4]:
# Download
# Guard on the archive itself, not on its parent directory: the directory can
# already exist while the zip is missing, in which case the download must
# still run (otherwise the assertion below fails).
if not os.path.exists(DOWNLOAD_FILEPATH):
    source = WebDataSource(name=NAME, description=DESCRIPTION, website=WEBSITE, urls=URLS)
    source.download(destination=DOWNLOAD_DESTINATION, force=False)
assert os.path.exists(DOWNLOAD_FILEPATH), f"Download failed: {DOWNLOAD_FILEPATH} not found"
logger.info("Download Complete")

# Extract
# Skipped when the ratings CSV is already present from an earlier run.
if not os.path.exists(RATINGS_RAW_FILEPATH):
    extractzip(source=DOWNLOAD_FILEPATH, destination=EXTRACT_DESTINATION)
assert os.path.exists(RATINGS_RAW_FILEPATH), f"Extract failed: {RATINGS_RAW_FILEPATH} not found"
logger.info("Extract Complete")

# Convert Data to Pickle Format
# Skipped when the pickled copy already exists.
if not os.path.exists(RATINGS_PKL_FILEPATH):
    ratings = IOService.read(filepath=RATINGS_RAW_FILEPATH)
    IOService.write(filepath=RATINGS_PKL_FILEPATH, data=ratings)
assert os.path.exists(RATINGS_PKL_FILEPATH), f"Conversion failed: {RATINGS_PKL_FILEPATH} not found"
logger.info("Conversion Complete")

## Create Dev, Test and Prod Sets

In [None]:
# Load the full pickled ratings set; every environment subsample below is drawn from it.
ratings = IOService.read(filepath=RATINGS_PKL_FILEPATH)

In [None]:
# Dataset packages
# One entry per environment, mapping its name to the subsample fraction and
# the filepath its sampled ratings are written to.
dp = {
    env: {"frac": fraction, "ratings": ratings_path}
    for env, fraction, ratings_path in (
        ('prod', PROD_FRAC, PROD_RATINGS_FILEPATH),
        ('dev', DEV_FRAC, DEV_RATINGS_FILEPATH),
        ('test', TEST_FRAC, TEST_RATINGS_FILEPATH),
    )
}

In [None]:
def create_ratings(ratings, frac, filepath, random_state=None):
    """Write a random subsample of ``ratings`` to ``filepath``.

    Args:
        ratings: Object exposing ``sample(frac=...)``; presumably a pandas
            DataFrame of ratings (confirm against IOService.read).
        frac: Fraction of rows to sample.
        filepath: Destination file; its parent directory is created if needed.
        random_state: Optional seed forwarded to ``ratings.sample`` so the
            subsample is reproducible. When None (the default) the call is
            exactly ``ratings.sample(frac=frac)``.
    """
    directory = os.path.dirname(filepath)
    # A bare filename has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create the directory when there is one.
    if directory:
        os.makedirs(directory, exist_ok=True)
    sample_kwargs = {"frac": frac}
    if random_state is not None:
        sample_kwargs["random_state"] = random_state
    df = ratings.sample(**sample_kwargs)
    IOService.write(filepath=filepath, data=df)

In [None]:
def create_tts(filepath, train_size: float = 0.8):
    """Split a ratings file into train and test files ordered by timestamp.

    The data read from ``filepath`` is sorted by its 'timestamp' column; the
    first ``int(rows * train_size)`` rows form the training set and the
    remainder the test set. Both are written as 'train.pkl' and 'test.pkl'
    in the same directory as ``filepath``.

    Args:
        filepath: Path of the ratings file to split.
        train_size: Proportion of rows assigned to the training set.
    """
    out_dir = os.path.dirname(filepath)
    ordered = IOService.read(filepath=filepath).sort_values(by='timestamp')
    cutoff = int(ordered.shape[0] * train_size)
    # Rows before the cutoff go to train, rows from the cutoff on go to test.
    partitions = (
        ('train.pkl', ordered[:cutoff]),
        ('test.pkl', ordered[cutoff:]),
    )
    for filename, subset in partitions:
        IOService.write(filepath=os.path.join(out_dir, filename), data=subset)



In [None]:
# Write the subsampled ratings file for every environment described in dp.
for mode in dp:
    spec = dp[mode]
    create_ratings(ratings, spec['frac'], spec['ratings'])

In [None]:
# Split each environment's ratings file into train and test sets (80% train).
for mode in dp:
    spec = dp[mode]
    create_tts(spec['ratings'], train_size=0.8)