# Config

In [1]:
import pandas as pd
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings("ignore")

from msr.python_splitters import (
    python_random_split, 
    python_chrono_split, 
    python_stratified_split
)

from msr.pandas_df_utils import negative_feedback_sampler

In [None]:
# Column names of the ratings DataFrame; passed to the sampler/splitter calls
# in this notebook so the schema is defined in one place.
COL_USER = "userId"
COL_ITEM = "movieId"
COL_RATING = "rating"
COL_TIMESTAMP = "timestamp"
# Passed as `ratio_neg_per_user` to negative_feedback_sampler.
RATIO_NEG_PER_USER = 1
# Random seed handed to the stochastic helpers (passed as `seed` to
# negative_feedback_sampler).
SEED = 42

# Data Preparation

In [3]:
# Load the ratings table. The path is relative, so it resolves against the
# kernel's current working directory.
data = pd.read_csv("./data/ratings.csv")

In [4]:
# Convert the epoch-seconds timestamp column into "YYYY-MM-DD HH:MM:SS" strings
# (naive, measured from 1970-01-01 00:00:00).
# This is a vectorised replacement for a row-wise `DataFrame.apply`: it yields
# the same strings without one Python-level call per row.
# The numeric-dtype guard keeps the cell re-runnable: once the column holds
# strings it is left untouched instead of raising on a second execution.
if pd.api.types.is_numeric_dtype(data[COL_TIMESTAMP]):
    data[COL_TIMESTAMP] = pd.to_datetime(
        data[COL_TIMESTAMP], unit="s"
    ).dt.strftime("%Y-%m-%d %H:%M:%S")

In [5]:
# Display the ratings frame to inspect the timestamp column.
data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,0,4.0,2000-07-30 18:45:03
1,1,2,4.0,2000-07-30 18:20:47
2,1,5,4.0,2000-07-30 18:37:04
3,1,43,5.0,2000-07-30 19:03:35
4,1,46,5.0,2000-07-30 18:48:51
...,...,...,...,...
100831,610,9416,4.0,2017-05-03 21:53:22
100832,610,9443,5.0,2017-05-03 22:21:31
100833,610,9444,5.0,2017-05-08 19:50:47
100834,610,9445,5.0,2017-05-03 21:19:12


# Negative Sampling

In [6]:
# Build `sample` from the ratings frame via negative_feedback_sampler.
# COL_RATING is handed in as the label column; the cells that follow filter
# `sample` on label values 1 and 0.
# NOTE(review): presumably 1 marks observed interactions and 0 the sampled
# negatives (sampler defaults) — confirm against msr.pandas_df_utils.
sample = negative_feedback_sampler(
    df=data,
    col_user=COL_USER,
    col_item=COL_ITEM,
    col_label=COL_RATING,
    ratio_neg_per_user=RATIO_NEG_PER_USER,
    seed=SEED
)

In [7]:
# Rows of `sample` whose label column equals 1.
sample.loc[sample[COL_RATING] == 1]

Unnamed: 0,userId,movieId,rating
0,1,0,1
230,1,2987,1
83,1,956,1
82,1,954,1
81,1,938,1
...,...,...,...
199936,610,7120,1
199937,610,7122,1
199938,610,7123,1
199939,610,7128,1


In [8]:
# Rows of `sample` whose label column equals 0.
sample.loc[sample[COL_RATING] == 0]

Unnamed: 0,userId,movieId,rating
316,1,9664,0
315,1,5528,0
314,1,3684,0
313,1,6163,0
312,1,8681,0
...,...,...,...
201241,610,3492,0
201242,610,1245,0
201224,610,8688,0
201670,610,422,0


# Random Split

## 2분할

In [25]:
# Two-way random split; a single float ratio yields (train, test).
# The seed is passed explicitly so the split follows the notebook-wide SEED
# rather than whatever default the splitter uses internally.
data_train, data_test = python_random_split(data, ratio=0.9, seed=SEED)

In [27]:
# Peek at the first 10 rows of the training split.
data_train.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
14430,91,1829,2.0,2005-04-05 15:58:44
43498,292,98,2.0,2010-02-09 01:49:19
73590,474,1321,3.5,2003-05-15 17:57:35
19181,124,97,3.5,2012-05-09 17:25:26
97254,605,3571,2.5,2010-06-22 03:21:21
3660,21,9445,3.0,2017-07-22 05:32:39
84414,542,43,5.0,2006-11-13 03:01:53
71070,453,2802,5.0,2000-10-27 04:37:55
73420,474,912,4.0,2006-03-30 01:51:22
23224,159,9256,4.5,2017-10-22 02:59:57


In [28]:
# Peek at the first 10 rows of the test split.
data_test.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
67037,432,7316,4.5,2012-04-23 00:07:21
42175,288,412,3.0,2001-01-02 19:59:25
93850,599,3217,3.0,2017-06-27 00:49:02
6187,42,2248,4.0,2001-07-27 19:37:57
12229,75,1210,4.0,2006-09-23 05:37:21
7433,51,149,4.0,2009-01-02 21:10:34
53802,354,6416,3.5,2008-01-20 22:58:50
65098,416,602,4.5,2007-08-19 03:54:19
68041,438,4403,0.5,2005-01-13 22:11:59
11854,73,5253,3.5,2016-05-25 17:39:38


In [29]:
# Row counts of the (train, test) splits.
len(data_train), len(data_test)

(90752, 10084)

In [30]:
# Users present in the training split but absent from the test split.
# Uses the COL_USER constant instead of hard-coded attribute access so the
# check keeps working if the column name is changed in the config cell.
set(data_train[COL_USER]) - set(data_test[COL_USER])

{138, 158, 257, 375, 392, 496, 578}

## 다중 분할

In [31]:
# Three-way random split; a list of ratios yields one frame per entry.
# The seed is passed explicitly so the split follows the notebook-wide SEED
# rather than whatever default the splitter uses internally.
data_train, data_validate, data_test = python_random_split(
    data, ratio=[0.6, 0.2, 0.2], seed=SEED
)

In [32]:
# Row counts of the (train, validate, test) splits.
len(data_train), len(data_validate), len(data_test)

(60502, 20167, 20167)

## 정수 분할

In [33]:
# Three-way random split with integer weights instead of fractions.
# The seed is passed explicitly so the split follows the notebook-wide SEED
# rather than whatever default the splitter uses internally.
data_train, data_validate, data_test = python_random_split(
    data, ratio=[3, 1, 1], seed=SEED
)

In [34]:
# Row counts of the (train, validate, test) splits.
len(data_train), len(data_validate), len(data_test)

(60502, 20167, 20167)

# Chronological Split

In [35]:
# Chronological split into (train, test) with a 0.7 ratio.
# At this point the timestamp column holds "YYYY-MM-DD HH:MM:SS" strings
# (converted in the data-preparation cell); that format sorts
# lexicographically in time order.
# NOTE(review): presumably filter_by="user" makes the split per user, with each
# user's earliest interactions going to train — confirm against
# msr.python_splitters.
data_train, data_test = python_chrono_split(
    data,
    ratio=0.7,
    filter_by="user",
    col_user=COL_USER,
    col_item=COL_ITEM,
    col_timestamp=COL_TIMESTAMP
)

In [None]:
# First 10 training rows belonging to user 1.
data_train[data_train[COL_USER] == 1].head(10)

Unnamed: 0,userId,movieId,rating,timestamp
56,1,789,5.0,2000-07-30 18:46:31
138,1,1598,5.0,2000-07-30 18:47:18
127,1,1542,5.0,2000-07-30 18:47:18
38,1,551,5.0,2000-07-30 18:47:18
131,1,1558,4.0,2000-07-30 18:47:18
35,1,513,5.0,2000-07-30 18:47:18
128,1,1552,5.0,2000-07-30 18:47:18
53,1,786,5.0,2000-07-30 18:47:35
51,1,782,5.0,2000-07-30 18:47:56
135,1,1576,5.0,2000-07-30 18:47:56


In [None]:
# Last 10 test rows belonging to user 1.
data_test[data_test[COL_USER] == 1].tail(10)

Unnamed: 0,userId,movieId,rating,timestamp
230,1,2987,4.0,2000-07-30 18:48:23
122,1,1504,5.0,2000-07-30 18:48:23
54,1,787,3.0,2000-07-30 18:48:23
4,1,46,5.0,2000-07-30 18:48:51
36,1,520,5.0,2000-07-30 18:48:51
74,1,913,5.0,2000-07-30 18:49:11
103,1,1217,5.0,2000-07-30 18:49:11
62,1,827,5.0,2000-07-30 18:49:11
16,1,257,3.0,2000-07-30 18:49:27
144,1,1686,4.0,2000-07-30 18:49:49


In [38]:
# Users present in the training split but absent from the test split.
# Uses the COL_USER constant instead of hard-coded attribute access so the
# check keeps working if the column name is changed in the config cell.
set(data_train[COL_USER]) - set(data_test[COL_USER])

set()

# Stratified Split

In [9]:
# Stratified split into (train, test) with a 0.7 ratio, stratifying by user.
# The seed is passed explicitly so the split follows the notebook-wide SEED
# rather than whatever default the splitter uses internally.
data_train, data_test = python_stratified_split(
    data=data,
    filter_by="user",
    ratio=0.7,
    col_user=COL_USER,
    col_item=COL_ITEM,
    seed=SEED,
)

In [10]:
# First 10 training rows belonging to user 1.
data_train[data_train[COL_USER] == 1].head(10)

Unnamed: 0,userId,movieId,rating,timestamp
219,1,2670,5.0,2000-07-30 18:11:08
66,1,855,4.0,2000-07-30 18:41:53
9,1,130,5.0,2000-07-30 19:08:20
170,1,1971,2.0,2000-07-30 18:43:08
15,1,224,5.0,2000-07-30 18:28:00
201,1,2301,5.0,2000-07-30 19:08:06
25,1,398,5.0,2000-07-30 18:31:49
197,1,2257,4.0,2000-07-30 18:14:56
154,1,1805,4.0,2000-07-30 18:38:30
126,1,1525,5.0,2000-07-30 18:40:00


In [11]:
# Last 10 training rows belonging to user 1.
data_train[data_train[COL_USER] == 1].tail(10)

Unnamed: 0,userId,movieId,rating,timestamp
43,1,632,4.0,2000-07-30 18:08:19
145,1,1690,4.0,2000-07-30 18:38:30
207,1,2432,3.0,2000-07-30 18:51:48
3,1,43,5.0,2000-07-30 19:03:35
105,1,1223,5.0,2000-07-30 18:58:24
53,1,786,5.0,2000-07-30 18:47:35
133,1,1566,4.0,2000-07-30 18:28:45
209,1,2458,5.0,2000-07-30 18:58:56
177,1,1996,3.0,2000-07-30 18:57:06
220,1,2692,4.0,2000-07-30 18:11:23


In [12]:
# Users present in the training split but absent from the test split.
# Uses the COL_USER constant instead of hard-coded attribute access so the
# check keeps working if the column name is changed in the config cell.
set(data_train[COL_USER]) - set(data_test[COL_USER])

set()