# Config

In [14]:
import pandas as pd
from datetime import datetime, timedelta
import warnings
# Installs a blanket "ignore" filter, so warnings raised after this point are hidden.
# NOTE(review): this also hides deprecation warnings from the libraries used below;
# consider narrowing the filter with `category=` or `module=`.
warnings.filterwarnings("ignore")

from msr.python_splitters import (
    python_random_split, 
    python_chrono_split, 
    python_stratified_split
)

In [2]:
# Column names of the ratings table; the split cells below pass these
# to the splitter functions and use them for column lookups.
(COL_USER, COL_ITEM, COL_RATING, COL_TIMESTAMP) = (
    "userId",
    "movieId",
    "rating",
    "timestamp",
)

# Data Preparation

In [3]:
# Load the ratings table from a CSV path relative to the working directory.
# The previews further down show the columns userId, movieId, rating and timestamp.
data = pd.read_csv("./data/ratings.csv")

In [None]:
# Convert the epoch-second timestamps into "YYYY-MM-DD HH:MM:SS" strings
# (naive, counted from 1970-01-01 00:00:00, same as the previous row-wise code).
#
# The numeric-dtype guard makes this cell safe to re-run: once the column holds
# strings there is nothing left to convert, whereas an unguarded conversion
# raises on a second execution.
if pd.api.types.is_numeric_dtype(data[COL_TIMESTAMP]):
    # Vectorized conversion: unit="s" interprets the values as seconds since
    # the Unix epoch, replacing the per-row datetime + timedelta arithmetic.
    data[COL_TIMESTAMP] = pd.to_datetime(
        data[COL_TIMESTAMP], unit="s"
    ).dt.strftime("%Y-%m-%d %H:%M:%S")

# Random Split

## 2분할

In [25]:
# Two-way random split: a single float ratio is unpacked into (train, test).
# With ratio=0.9 the recorded run produced 90752 training rows and 10084 test rows.
data_train, data_test = python_random_split(data, ratio=0.9)

In [27]:
# Preview the first 10 rows of the random-split training set.
data_train.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
14430,91,1829,2.0,2005-04-05 15:58:44
43498,292,98,2.0,2010-02-09 01:49:19
73590,474,1321,3.5,2003-05-15 17:57:35
19181,124,97,3.5,2012-05-09 17:25:26
97254,605,3571,2.5,2010-06-22 03:21:21
3660,21,9445,3.0,2017-07-22 05:32:39
84414,542,43,5.0,2006-11-13 03:01:53
71070,453,2802,5.0,2000-10-27 04:37:55
73420,474,912,4.0,2006-03-30 01:51:22
23224,159,9256,4.5,2017-10-22 02:59:57


In [28]:
# Preview the first 10 rows of the random-split test set.
data_test.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
67037,432,7316,4.5,2012-04-23 00:07:21
42175,288,412,3.0,2001-01-02 19:59:25
93850,599,3217,3.0,2017-06-27 00:49:02
6187,42,2248,4.0,2001-07-27 19:37:57
12229,75,1210,4.0,2006-09-23 05:37:21
7433,51,149,4.0,2009-01-02 21:10:34
53802,354,6416,3.5,2008-01-20 22:58:50
65098,416,602,4.5,2007-08-19 03:54:19
68041,438,4403,0.5,2005-01-13 22:11:59
11854,73,5253,3.5,2016-05-25 17:39:38


In [29]:
# Row counts of the two partitions, shown as a (train, test) tuple.
len(data_train), len(data_test)

(90752, 10084)

In [30]:
# Users that appear in the training partition but are absent from the test partition.
# The recorded run shows a non-empty set here, i.e. some users ended up only in train.
# The column is looked up via COL_USER so this check follows the config cell
# instead of a hard-coded attribute name.
set(data_train[COL_USER]) - set(data_test[COL_USER])

{138, 158, 257, 375, 392, 496, 578}

## 다중 분할

In [31]:
# Multi-way random split: a list of ratios is unpacked into one frame per entry.
# The recorded run produced 60502 / 20167 / 20167 rows for [0.6, 0.2, 0.2].
data_train, data_validate, data_test = python_random_split(data, ratio=[0.6, 0.2, 0.2])

In [32]:
# Row counts of the three partitions, shown as a (train, validate, test) tuple.
len(data_train), len(data_validate), len(data_test)

(60502, 20167, 20167)

## 정수 분할

In [33]:
# Same three-way split expressed with integer weights instead of fractions.
# The recorded run produced the same sizes as [0.6, 0.2, 0.2] (60502 / 20167 / 20167);
# presumably the weights are normalized to sum to 1 — confirm against python_random_split.
data_train, data_validate, data_test = python_random_split(data, ratio=[3, 1, 1])

In [34]:
# Row counts of the three partitions, shown as a (train, validate, test) tuple.
len(data_train), len(data_validate), len(data_test)

(60502, 20167, 20167)

# Chronological Split

In [35]:
# Chronological split on the timestamp column with ratio 0.7 and filter_by="user".
# In the recorded run, the head of user 1's training rows carries earlier
# timestamps than the tail of that user's test rows.
data_train, data_test = python_chrono_split(
    data, ratio=0.7, filter_by="user",
    col_user=COL_USER, col_item=COL_ITEM, col_timestamp=COL_TIMESTAMP,
)

In [None]:
# First 10 training rows belonging to user 1.
data_train.loc[data_train[COL_USER] == 1].head(10)

Unnamed: 0,userId,movieId,rating,timestamp
56,1,789,5.0,2000-07-30 18:46:31
138,1,1598,5.0,2000-07-30 18:47:18
127,1,1542,5.0,2000-07-30 18:47:18
38,1,551,5.0,2000-07-30 18:47:18
131,1,1558,4.0,2000-07-30 18:47:18
35,1,513,5.0,2000-07-30 18:47:18
128,1,1552,5.0,2000-07-30 18:47:18
53,1,786,5.0,2000-07-30 18:47:35
51,1,782,5.0,2000-07-30 18:47:56
135,1,1576,5.0,2000-07-30 18:47:56


In [None]:
# Last 10 test rows belonging to user 1.
data_test.loc[data_test[COL_USER] == 1].tail(10)

Unnamed: 0,userId,movieId,rating,timestamp
230,1,2987,4.0,2000-07-30 18:48:23
122,1,1504,5.0,2000-07-30 18:48:23
54,1,787,3.0,2000-07-30 18:48:23
4,1,46,5.0,2000-07-30 18:48:51
36,1,520,5.0,2000-07-30 18:48:51
74,1,913,5.0,2000-07-30 18:49:11
103,1,1217,5.0,2000-07-30 18:49:11
62,1,827,5.0,2000-07-30 18:49:11
16,1,257,3.0,2000-07-30 18:49:27
144,1,1686,4.0,2000-07-30 18:49:49


In [38]:
# Users that appear in the training partition but are absent from the test partition.
# The recorded run shows an empty set for the chronological split.
# The column is looked up via COL_USER so this check follows the config cell
# instead of a hard-coded attribute name.
set(data_train[COL_USER]) - set(data_test[COL_USER])

set()

# Stratified Split

In [39]:
# Stratified split with ratio 0.7 and filter_by="user"; no timestamp column is passed.
# NOTE(review): presumably this keeps the same ratio within each user's ratings —
# confirm against python_stratified_split.
data_train, data_test = python_stratified_split(
    data, ratio=0.7, filter_by="user",
    col_user=COL_USER, col_item=COL_ITEM,
)

In [None]:
# First 10 training rows belonging to user 1.
data_train.loc[data_train[COL_USER] == 1].head(10)

In [None]:
# Last 10 training rows belonging to user 1.
# NOTE(review): the chronological section pairs a train head with a test tail;
# this cell inspects data_train a second time — confirm whether data_test was intended.
data_train[data_train[COL_USER] == 1].tail(10)

In [None]:
# Users that appear in the training partition but are absent from the test partition.
# The column is looked up via COL_USER so this check follows the config cell
# instead of a hard-coded attribute name.
set(data_train[COL_USER]) - set(data_test[COL_USER])