# MovieLens 25M Dataset ETL
The MovieLens 25M dataset contains some 25 million ratings of roughly 62,000 movies by 162,000 users. This ETL produces a train/test split of the ratings, plus several smaller development samples of varying proportions of the original dataset.

In [1]:
import logging
import os

from IPython.display import HTML, display

from recsys.data.rating import RatingsDataset
from recsys.data.source import WebDataSource
from recsys.io.file import IOService
from recsys.io.zip import extractzip

In [2]:
# Notebook-wide logger for ETL progress messages.
#
# With no handler configured, Python routes records to its last-resort handler,
# which only emits WARNING and above, so every logger.info(...) call in this
# notebook would be silently dropped. basicConfig() attaches a stream handler to
# the root logger (it is a no-op when one is already installed) and the level is
# set on this logger only, so third-party INFO chatter stays suppressed.
logging.basicConfig(format="%(asctime)s %(name)s %(levelname)s: %(message)s")
logger = logging.getLogger("MovieLens25M ETL")
logger.setLevel(logging.INFO)

In [3]:
# --- Data source -------------------------------------------------------------
NAME = "movielens25m"
DESCRIPTION = "MovieLens 25M Dataset"
WEBSITE = "https://grouplens.org/datasets/movielens/"
URLS = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"

# --- Download / extract locations --------------------------------------------
DOWNLOAD_DESTINATION = "data/ext/"
EXTRACT_DESTINATION = "data/raw/"
DOWNLOAD_FILEPATH = os.path.join(DOWNLOAD_DESTINATION, "ml-25m.zip")

# --- Ratings files -------------------------------------------------------------
# Raw CSV inside the extracted archive, and its pickled copy; both live under
# the extract directory.
RATINGS_RAW_FILEPATH = os.path.join(EXTRACT_DESTINATION, "ml-25m/ratings.csv")
RATINGS_FULL_FILEPATH = os.path.join(EXTRACT_DESTINATION, "ratings.pkl")
# Train/test split outputs.
TRAIN_FILEPATH = "data/input/train.pkl"
TEST_FILEPATH = "data/input/test.pkl"
# Development samples (50%, 10% and 1% of the ratings).
RATINGS_50_FILEPATH = "data/input/ratings_50.pkl"
RATINGS_10_FILEPATH = "data/input/ratings_10.pkl"
RATINGS_1_FILEPATH = "data/input/ratings_1.pkl"


## Download, Extract and Pickle Dataset
Download the data from the GroupLens site into the external data directory, extract the archive into the raw data directory, and convert the ratings to pickle format for faster I/O.

In [4]:
# Step 1 - Download the archive into DOWNLOAD_DESTINATION.
# NOTE(review): force=False presumably skips the download when the archive is
# already present - confirm against WebDataSource.download.
source = WebDataSource(
    name=NAME,
    description=DESCRIPTION,
    website=WEBSITE,
    urls=URLS,
)
source.download(destination=DOWNLOAD_DESTINATION, force=False)
assert os.path.exists(DOWNLOAD_FILEPATH)
logger.info("Download Complete")

# Step 2 - Extract the archive, unless the raw ratings CSV is already on disk.
if not os.path.exists(RATINGS_RAW_FILEPATH):
    extractzip(source=DOWNLOAD_FILEPATH, destination=EXTRACT_DESTINATION)
assert os.path.exists(RATINGS_RAW_FILEPATH)
logger.info("Extract Complete")

# Step 3 - Re-save the raw ratings as a pickle, unless that file already exists.
if not os.path.exists(RATINGS_FULL_FILEPATH):
    IOService.write(
        filepath=RATINGS_FULL_FILEPATH,
        data=IOService.read(RATINGS_RAW_FILEPATH),
    )
assert os.path.exists(RATINGS_FULL_FILEPATH)
logger.info("Conversion Complete")

100%|██████████| 1/1 [00:00<00:00, 5833.52it/s]


## Create Training Sets
The data will be split using a temporal global strategy, whereby the first 80% of the ratings by timestamp will be allocated for training and the remaining 20% for testing.

In [5]:
# Load the full pickled ratings and write the train/test split files.
ratings = RatingsDataset(filepath=RATINGS_FULL_FILEPATH)

# Split unless BOTH output files already exist. The previous condition,
# `not exists(train) or exists(test)`, re-ran the split whenever the test file
# was present and skipped it when only the train file existed, which then
# tripped the test-file assertion below.
if not (os.path.exists(TRAIN_FILEPATH) and os.path.exists(TEST_FILEPATH)):
    ratings.split(train_filepath=TRAIN_FILEPATH, test_filepath=TEST_FILEPATH, train_prop=0.8)
assert os.path.exists(TRAIN_FILEPATH), f"Missing train file: {TRAIN_FILEPATH}"
assert os.path.exists(TEST_FILEPATH), f"Missing test file: {TEST_FILEPATH}"
logger.info("Train/Test Split Complete")

## Create Development Sets

In [6]:
# Development samples as (fraction passed to ratings.sample, output file) pairs.
dev_samples = (
    (0.5, RATINGS_50_FILEPATH),
    (0.1, RATINGS_10_FILEPATH),
    (0.01, RATINGS_1_FILEPATH),
)

# Pass 1: draw each sample whose output file is not on disk yet.
# NOTE(review): sample() is presumably what writes `filepath` - confirm.
for sample_frac, sample_path in dev_samples:
    if os.path.exists(sample_path):
        continue
    _ = ratings.sample(frac=sample_frac, filepath=sample_path)

# Pass 2: only after all sampling is done, check every output file exists.
for _frac, sample_path in dev_samples:
    assert os.path.exists(sample_path)
logger.info("Development Sets Created")

Unnamed: 0,userId,movieId,rating,timestamp
0,53343,223,4.00,894421847
1,60121,3462,4.00,1449160376
2,23458,2324,5.00,1494326564
3,72162,6410,3.00,1054349476
4,115636,521,5.00,841934409
...,...,...,...,...
12500043,2154,48516,2.50,1213919232
12500044,151394,2717,2.50,1248386056
12500045,112231,454,3.00,841600068
12500046,45581,1283,4.00,1122763211


Unnamed: 0,userId,movieId,rating,timestamp
0,33312,3983,4.00,979867353
1,12708,1,5.00,833979506
2,60455,2,4.00,844449688
3,141430,81834,4.00,1437443097
4,9826,32,3.00,956926374
...,...,...,...,...
2500005,38041,42725,5.00,1209536127
2500006,86467,1541,3.00,909183904
2500007,95032,147,2.50,1061787535
2500008,80307,1641,4.00,943961884


Unnamed: 0,userId,movieId,rating,timestamp
0,139709,11,1.00,1111482534
1,57115,2944,4.00,974758173
2,144938,1965,3.00,1047410572
3,30311,512,4.00,918656810
4,48881,2121,2.50,1111817359
...,...,...,...,...
249996,125667,1,5.00,1140146498
249997,84056,3409,3.50,1461061376
249998,125871,513,2.00,976137976
249999,146999,2616,4.00,997508023


## Dataset Summary

In [13]:
# Full dataset: render an <h3> heading, load the ratings stored at
# RATINGS_FULL_FILEPATH and call summarize() on them.
# The value returned by summarize() is bound to `_` and not used in this cell.
# NOTE(review): the statistics table shown below presumably comes from
# summarize() itself - confirm.
display(HTML('<h3>Full Dataset Summary</h3>'))
ratings = RatingsDataset(filepath=RATINGS_FULL_FILEPATH)
_ = ratings.summarize()

                                   Count
Size                      800,003,168.00
Rows                       25,000,095.00
Columns                             4.00
Users                         162,541.00
Movies                         59,047.00
Maximum Ratings per User       32,202.00
Average Ratings per User          153.81
Median Ratings per User            71.00
Minimum Ratings per User           20.00
Maximum Ratings per Movie      81,491.00
Average Ratings per Movie         423.39
Median Ratings per Movie            6.00
Minimum Ratings per Movie           1.00


In [14]:
# Train split: render an <h3> heading, load the ratings stored at
# TRAIN_FILEPATH and call summarize() on them (return value discarded in `_`).
# Note: this rebinds the notebook-level name `ratings`, which the split and
# sampling cells use for the full dataset; re-running those cells after this
# one without reloading would operate on the train split instead.
display(HTML('<h3>Train Dataset Summary</h3>'))
ratings = RatingsDataset(filepath=TRAIN_FILEPATH)
_ = ratings.summarize()

                                   Count
Size                      800,003,040.00
Rows                       20,000,076.00
Columns                             4.00
Users                         137,883.00
Movies                         34,461.00
Maximum Ratings per User       12,097.00
Average Ratings per User          145.05
Median Ratings per User            68.00
Minimum Ratings per User            1.00
Maximum Ratings per Movie      67,782.00
Average Ratings per Movie         580.37
Median Ratings per Movie            9.00
Minimum Ratings per Movie           1.00


In [15]:
# Test split: render an <h3> heading, load the ratings stored at
# TEST_FILEPATH and call summarize() on them (return value discarded in `_`).
# Note: rebinds the notebook-level name `ratings` to the test split.
display(HTML('<h3>Test Dataset Summary</h3>'))
ratings = RatingsDataset(filepath=TEST_FILEPATH)
_ = ratings.summarize()

                                   Count
Size                      200,000,760.00
Rows                        5,000,019.00
Columns                             4.00
Users                          31,059.00
Movies                         55,199.00
Maximum Ratings per User       20,105.00
Average Ratings per User          160.98
Median Ratings per User            77.00
Minimum Ratings per User            1.00
Maximum Ratings per Movie      16,800.00
Average Ratings per Movie          90.58
Median Ratings per Movie            4.00
Minimum Ratings per Movie           1.00


In [16]:
# 50% development sample: render an <h3> heading, load the ratings stored at
# RATINGS_50_FILEPATH and call summarize() on them (return value discarded in `_`).
# Note: rebinds the notebook-level name `ratings` to this sample.
display(HTML('<h3>Ratings 50 Dataset Summary</h3>'))
ratings = RatingsDataset(filepath=RATINGS_50_FILEPATH)
_ = ratings.summarize()

                                   Count
Size                      400,001,664.00
Rows                       12,500,048.00
Columns                             4.00
Users                         162,541.00
Movies                         50,948.00
Maximum Ratings per User       16,047.00
Average Ratings per User           76.90
Median Ratings per User            35.00
Minimum Ratings per User            1.00
Maximum Ratings per Movie      40,811.00
Average Ratings per Movie         245.35
Median Ratings per Movie            4.00
Minimum Ratings per Movie           1.00


In [17]:
# 10% development sample: render an <h3> heading, load the ratings stored at
# RATINGS_10_FILEPATH and call summarize() on them (return value discarded in `_`).
# Note: rebinds the notebook-level name `ratings` to this sample.
display(HTML('<h3>Ratings 10 Dataset Summary</h3>'))
ratings = RatingsDataset(filepath=RATINGS_10_FILEPATH)
_ = ratings.summarize()

                                  Count
Size                      80,000,448.00
Rows                       2,500,010.00
Columns                            4.00
Users                        159,385.00
Movies                        31,736.00
Maximum Ratings per User       3,196.00
Average Ratings per User          15.69
Median Ratings per User            7.00
Minimum Ratings per User           1.00
Maximum Ratings per Movie      8,259.00
Average Ratings per Movie         78.78
Median Ratings per Movie           3.00
Minimum Ratings per Movie          1.00


In [18]:
# 1% development sample: render an <h3> heading, load the ratings stored at
# RATINGS_1_FILEPATH and call summarize() on them (return value discarded in `_`).
# Note: after this cell the notebook-level name `ratings` refers to the 1%
# sample, not the full dataset.
display(HTML('<h3>Ratings 1 Dataset Summary</h3>'))
ratings = RatingsDataset(filepath=RATINGS_1_FILEPATH)
_ = ratings.summarize()

                                 Count
Size                      8,000,160.00
Rows                        250,001.00
Columns                           4.00
Users                        90,105.00
Movies                       14,283.00
Maximum Ratings per User        316.00
Average Ratings per User          2.77
Median Ratings per User           2.00
Minimum Ratings per User          1.00
Maximum Ratings per Movie       841.00
Average Ratings per Movie        17.50
Median Ratings per Movie          3.00
Minimum Ratings per Movie         1.00
