# Sampling Strategies


In [1]:
from IPython.display import display, HTML
import numpy as np
import pandas as pd
from recsys.services.io import IOService
from recsys.data.dataset import Dataset
from recsys.operator.data.sampling import UserRandomSampling, UserStratifiedRandomSampling, TemporalInteractionThresholdSampling, RandomTemporalInteractionThresholdSampling

## UserRandomSampling

## Temporal Interaction Threshold Sampling
For each user, keep only the last `umax` interactions; for each item, keep only the last `imax` interactions.

### Interaction Threshold Sampling


In [8]:
# Input ratings file and the pickle path handed to the sampler as its destination.
source = "tests/testdata/operators/data_operators/ratings.csv"
destination = "tests/testdata/operators/data_operators/sampling/temporaralthreshold/ratings_temporal_sampling_1000.pkl"

# Thresholds of 1000 for both users (umax) and items (imax). The cell ends on
# the execute() call so that its return value is what the notebook displays.
# NOTE(review): force=True presumably re-runs even if the destination exists — confirm.
TemporalInteractionThresholdSampling(
    source=source,
    destination=destination,
    umax=1000,
    imax=1000,
    force=True,
).execute()

Unnamed: 0,userId,movieId,rating,timestamp
3,1,665,5.00,1147878820
14,1,2068,2.50,1147869044
16,1,2351,4.50,1147877957
17,1,2573,4.00,1147878923
18,1,2632,5.00,1147878248
...,...,...,...,...
25000075,162541,7042,3.50,1240952153
25000080,162541,7193,5.00,1240950032
25000082,162541,8610,2.50,1240950626
25000083,162541,8905,3.00,1240949310


In [9]:
# Read the raw ratings (source) and the sampled ratings (destination), in that
# order; the destination is presumably the file produced by the sampling step above.
df1, df2 = (IOService.read(path) for path in (source, destination))

In [10]:
# Wrap both frames in Dataset objects. The `name` values are the labels that
# show up as column headers in the comparison and summary tables further down.
ratings1 = Dataset(
    name="ratings_raw",
    description="Ratings Raw",
    data=df1,
)
ratings2 = Dataset(
    name="ratings_subsampled",
    description="Ratings Subsampled",
    data=df2,
)

In [None]:
# Compare the raw ratings with the temporal-threshold sample; as the cell's
# final expression, the result of compare() is what the notebook renders.
ratings1.compare(ratings2)

## Random Temporal Threshold Sampling

In [13]:
# Same input file as before; the random-temporal sample gets its own pickle path.
source = "tests/testdata/operators/data_operators/ratings.csv"
destination = "tests/testdata/operators/data_operators/sampling/temporaralthreshold/ratings_random_temporal_sampling_1000.pkl"

# Random variant of the temporal threshold sampler, using the same umax/imax
# thresholds of 1000 as the non-random run.
s = RandomTemporalInteractionThresholdSampling(
    source=source,
    destination=destination,
    umax=1000,
    imax=1000,
    force=True,
)

# Keep the value returned by execute() so the next cell can wrap it in a Dataset.
df3 = s.execute()


In [14]:
# Wrap the random temporal sample in a Dataset so it can be compared and
# summarised alongside the other two datasets.
ratings3 = Dataset(
    name='ratings_random_temporal_subsampling',
    description='Random Temporal Subsample',
    data=df3,
)

# The output saved with this cell is the raw-vs-random-temporal comparison
# table, which a bare assignment cannot produce. Ending the cell with the
# compare() call makes a fresh Restart & Run All reproduce that table.
ratings1.compare(ratings3)


Unnamed: 0,ratings_raw,ratings_random_temporal_subsampling,% change
nrows,25000095.0,5957526.0,76.17
ncols,8.0,8.0,0.0
n_users,162541.0,141779.0,12.77
n_items,59047.0,47171.0,20.11
max_ratings_per_user,32202.0,878.0,97.27
mean_ratings_per_user,153.81,42.02,72.68
max_ratings_per_item,81491.0,1000.0,98.77
mean_ratings_per_item,423.39,126.3,70.17
user_item_ratio,2.75,3.01,-9.19
item_user_ratio,0.36,0.33,8.41


In [15]:
# Compare the temporal-threshold sample with the random temporal sample; the
# result of compare() is rendered as the cell output.
ratings2.compare(ratings3)

Unnamed: 0,ratings_subsampled,ratings_random_temporal_subsampling,% change
nrows,6233491.0,5957526.0,4.43
ncols,8.0,8.0,0.0
n_users,135642.0,141779.0,-4.52
n_items,54563.0,47171.0,13.55
max_ratings_per_user,1000.0,878.0,12.2
mean_ratings_per_user,45.96,42.02,8.56
max_ratings_per_item,1000.0,1000.0,0.0
mean_ratings_per_item,114.24,126.3,-10.55
user_item_ratio,2.49,3.01,-20.9
item_user_ratio,0.4,0.33,17.29


In [19]:
# Collect the summary of each dataset (raw, temporal sample, random temporal
# sample), in that order.
r1, r2, r3 = (dataset.summary() for dataset in (ratings1, ratings2, ratings3))

# Place the three summaries side by side, one column block per dataset, and
# leave the combined table as the final expression so it is displayed.
r = pd.concat([r1, r2, r3], axis="columns")
r

Unnamed: 0,ratings_raw,ratings_subsampled,ratings_random_temporal_subsampling
nrows,25000095.0,6233491.0,5957526.0
ncols,8.0,8.0,8.0
n_users,162541.0,135642.0,141779.0
n_items,59047.0,54563.0,47171.0
max_ratings_per_user,32202.0,1000.0,878.0
mean_ratings_per_user,153.81,45.96,42.02
max_ratings_per_item,81491.0,1000.0,1000.0
mean_ratings_per_item,423.39,114.24,126.3
user_item_ratio,2.75,2.49,3.01
item_user_ratio,0.36,0.4,0.33
