# Create Development Datasets
We have two datasets to sample: the core and common features datasets.

- Core Dataset: The core dataset should be a stratified sample of the complete core dataset, reflecting the same frequency  distribution of clicks and conversions. The target sample size is a parameter set to 0.001 of the complete dataset.
- Common Features Dataset: This is a lookup dataset referenced by a foreign key in the core dataset. We sample it with the constraint that every common feature set also exists in the core dataset sample. 

In [1]:
import pandas as pd
from deepcvr.data.profile import CoreProfiler
from deepcvr.data.sampling import TaobaoSampler
from deepcvr.utils.io import load_csv
def _format_float(value):
    """Render a float with five decimal places for pandas display output."""
    return '%.5f' % value


# Show floats in fixed-point notation so large counts are not printed in
# scientific notation.
pd.set_option('display.float_format', _format_float)


In [2]:
# Sampling parameters
FRAC_CORE = 0.001                 # target sample size as a fraction of the complete dataset
MAX_FRAC_COMMON_FEATURES = 0.01   # not referenced by the cells shown in this notebook
RANDOM_STATE = 602                # seed for reproducible sampling

# Locations of the raw train/test files, keyed by split and dataset
FILEPATHS = {
    'TRAIN_CORE': "data/archive/raw/sample_skeleton_train.csv",
    'TRAIN_COMMON_FEATURES': "data/archive/raw/common_features_train.csv",
    'TEST_CORE': "data/archive/raw/sample_skeleton_test.csv",
    'TEST_COMMON_FEATURES': "data/archive/raw/common_features_test.csv",
}


## Read Data

In [3]:
# Read the raw training core dataset (about 42.3M rows per the saved progress
# output, so this cell takes a few minutes).
# load_csv is a project helper; presumably it returns a pandas DataFrame with
# integer column labels, as the later core[3] and core.head() usage suggests.
core = load_csv(filepath=FILEPATHS['TRAIN_CORE'])

Rows read: 100%|██████████| 42300135/42300135 [02:42<00:00, 260356.82it/s]


In [4]:
# Read the raw training common-features lookup dataset (730,600 rows per the
# saved progress output).
# load_csv is a project helper; presumably it returns a pandas DataFrame whose
# column 0 holds the common-feature key, since it is merged on column 0 later.
common = load_csv(filepath=FILEPATHS['TRAIN_COMMON_FEATURES'])

Rows read: 100%|██████████| 730600/730600 [01:46<00:00, 6866.71it/s]


## Create Sample

In [5]:
# Profile the complete core dataset; the saved output reports impressions,
# clicks, conversions, their rates and the number of unique common feature sets.
# These figures are the reference the sample's profile is compared against.
# NOTE(review): `stats` is reassigned by later cells, so this summary is only
# available until those cells run.
profiler = CoreProfiler(core)
stats = profiler.execute()
stats

memory                       16324884838.00000
impressions                     42300135.00000
no_action                       40655649.00000
clicks                           1644256.00000
conversions                         9032.00000
click_through_rate                     3.88712
conversion_rate                        0.02135
unique_common_feature_sets        730600.00000
min_features_per_sample                1.00000
max_features_per_sample               59.00000
num_na                                 0.00000
dtype: float64

### Core Dataset Target Sample Statistics

In [6]:
# Preview the first five rows of the core dataset.
core.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,1,0,0,bacff91692951881,9,21090522181.021090645531.021090934451....
1,2,0,0,bacff91692951881,10,21091097321.021090462841.021090990351....
2,3,1,0,bacff91692951881,20,21090897311.021090475601.050995117692....
3,4,0,0,bacff91692951881,13,30193516651.021090503641.021090833881....
4,5,0,0,bacff91692951881,9,20549456631.030193516651.021691721791....


### Core Dataset Common Features Frequency Distribution


In [7]:
# Count how many core rows reference each common-feature key (core column 3),
# most frequent first, as a two-column frame.
common_features_value_counts = (
    core[3]
    .value_counts()
    .rename_axis('common_feature_index')
    .reset_index(name='count')
)
common_features_value_counts.head()

Unnamed: 0,common_feature_index,count
0,d38b408b739306ae,1229
1,974869115883abef,1210
2,9a625acb23321d58,933
3,106b9ac1a4714d88,919
4,a9df094140bfa51f,914


## Common Features Selection
Merge the common features dataset with the counts from the core dataset, sort it by count in descending order, and add a cumulative count.

In [8]:
# Attach to each common-features row the number of core rows that reference it,
# order by that count (largest first) and add a running total of core rows.
common_with_counts = (
    pd.merge(
        left=common,
        right=common_features_value_counts,
        left_on=0,
        right_on='common_feature_index',
    )
    .sort_values(by='count', ascending=False)
    .assign(cum=lambda frame: frame['count'].cumsum())
)
common_with_counts.head()

Unnamed: 0,0,1,2,common_feature_index,count,cum
428460,d38b408b739306ae,1592,110_1410826420.6931512134387491.0122343...,d38b408b739306ae,1229,1229
274490,974869115883abef,827,110_1425085611.098611011690031.01243438...,974869115883abef,1210,2439
337614,9a625acb23321d58,2030,109_144559161.09861101240111.0124343876...,9a625acb23321d58,933,3372
571873,106b9ac1a4714d88,1592,110_1420512390.6931512134387491.0122343...,106b9ac1a4714d88,919,4291
283804,a9df094140bfa51f,1883,110_1432968991.6094412134386581.0122343...,a9df094140bfa51f,914,5205


Obtain FRAC_CORE observations from the common dataset, and sum the count to get the number of core observations from which to sample.

### Select by Size of Feature List

In [9]:
# Keep the FRAC_CORE share of common-feature rows with the smallest values in
# column 1 (the feature-list size, per the section heading).
n_by_size = int(len(common_with_counts) * FRAC_CORE)
select_by_size = common_with_counts.sort_values(by=1, axis=0).iloc[:n_by_size]
select_by_size.head()

Unnamed: 0,0,1,2,common_feature_index,count,cum
425110,ccdd5f84bcb52a0a,1,1011481851.0,ccdd5f84bcb52a0a,27,40453652
62317,7d051c0cc01b70a6,1,1013448181.0,7d051c0cc01b70a6,41,34825499
248596,63d664c8169c2362,1,1013450881.0,63d664c8169c2362,32,39339705
252076,6a9d35b8700db8e5,1,1013460281.0,6a9d35b8700db8e5,48,28709734
676718,fa674f78249df72f,1,101942691.0,fa674f78249df72f,50,26748547


In [10]:
# Summarise the size-based selection: how many common feature sets were kept,
# how many core rows reference them (sum of their counts), and the deep memory
# footprint in bytes of the three original common-features columns.
stats = pd.Series({
    'total_common_features': len(common),
    'sample_common_features': len(select_by_size),
    'total_core_observations': select_by_size['count'].sum(),
    'size': select_by_size[[0, 1, 2]].memory_usage(deep=True).sum(),
})
stats

total_common_features      730600
sample_common_features        730
total_core_observations     28678
size                       169735
dtype: int64

### Select by Count

In [11]:
# Keep the FRAC_CORE share of common-feature rows that are referenced by the
# most core rows (largest 'count' first).
n_by_count = int(len(common_with_counts) * FRAC_CORE)
select_by_count = (
    common_with_counts
    .sort_values(by='count', ascending=False, axis=0)
    .iloc[:n_by_count]
)
select_by_count.head()

Unnamed: 0,0,1,2,common_feature_index,count,cum
428460,d38b408b739306ae,1592,110_1410826420.6931512134387491.0122343...,d38b408b739306ae,1229,1229
274490,974869115883abef,827,110_1425085611.098611011690031.01243438...,974869115883abef,1210,2439
337614,9a625acb23321d58,2030,109_144559161.09861101240111.0124343876...,9a625acb23321d58,933,3372
571873,106b9ac1a4714d88,1592,110_1420512390.6931512134387491.0122343...,106b9ac1a4714d88,919,4291
283804,a9df094140bfa51f,1883,110_1432968991.6094412134386581.0122343...,a9df094140bfa51f,914,5205


In [12]:
# Summarise the count-based selection: how many common feature sets were kept,
# how many core rows reference them (sum of their counts), and the deep memory
# footprint in bytes of the three original common-features columns.
stats = pd.Series({
    'total_common_features': len(common),
    'sample_common_features': len(select_by_count),
    'total_core_observations': select_by_count['count'].sum(),
    'size': select_by_count[[0, 1, 2]].memory_usage(deep=True).sum(),
})
stats

total_common_features        730600
sample_common_features          730
total_core_observations      396235
size                       17310479
dtype: int64

## Filter Core Sample by Selected Common Features

In [13]:
# Join the selected common-feature keys onto the core rows that reference them
# (core column 3). Both inputs carry a column labelled 0, so the merge suffixes
# them; '0_y' is core's own column 0. Only the core columns are kept, and they
# are relabelled 0..5 afterwards.
selected_keys = select_by_count[0]
core_filtered = pd.merge(
    left=selected_keys,
    right=core,
    left_on=0,
    right_on=3,
)[['0_y', 1, 2, 3, 4, 5]]
core_filtered.columns = list(range(6))
core_filtered.head()

Unnamed: 0,0,1,2,3,4,5
0,13678876,0,0,d38b408b739306ae,18,21090202881.021090832581.021090515691....
1,13678877,0,0,d38b408b739306ae,12,20789790851.021090713481.021090895191....
2,13678878,0,0,d38b408b739306ae,11,21090603421.021692983741.021090309701....
3,13678879,0,0,d38b408b739306ae,14,853100223112.74084853100580683.58815210...
4,13678880,0,0,d38b408b739306ae,18,50893563095.14749853100247865.510068531...


## Create Stratified Sample

In [14]:
# Draw the stratified core sample and its matching common-features sample.
# The sampling fraction and seed come from the parameters cell instead of being
# repeated as literals, so editing FRAC_CORE / RANDOM_STATE there actually
# changes the sample produced here.
# NOTE(review): the saved output of this cell is
# "NameError: name 'sample' is not defined". Nothing in this cell references
# `sample`, so the error presumably originates inside TaobaoSampler — confirm,
# then re-run the notebook from a fresh kernel.
sampler = TaobaoSampler(
    core_data=core,
    common_features_data=common,
    frac=FRAC_CORE,
    random_state=RANDOM_STATE,
)
core_sample, common_features_sample = sampler.execute()

NameError: name 'sample' is not defined

In [15]:
# Profile the core sample with the same profiler used on the complete dataset,
# so click-through and conversion rates can be compared side by side.
# Note: this rebinds `profiler`, which previously wrapped the complete dataset.
profiler = CoreProfiler(core_sample)
sample_stats = profiler.execute()
sample_stats

memory                       16720886.00000
impressions                     42298.00000
no_action                       40655.00000
clicks                           1643.00000
conversions                         8.00000
click_through_rate                  3.88434
conversion_rate                     0.01891
unique_common_feature_sets        730.00000
min_features_per_sample             4.00000
max_features_per_sample            42.00000
num_na                              0.00000
dtype: float64

In [16]:
# Inspect the structure of the common-features sample.
# NOTE(review): the saved output lists 42298 rows and 6 columns, which matches
# the core sample's impression count rather than the 730 unique common feature
# sets reported for it. Verify that sampler.execute() really returns the
# common-features sample as its second value.
common_features_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42298 entries, 0 to 42297
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       42298 non-null  int64 
 1   1       42298 non-null  int64 
 2   2       42298 non-null  int64 
 3   3       42298 non-null  object
 4   4       42298 non-null  int64 
 5   5       42298 non-null  object
dtypes: int64(4), object(2)
memory usage: 2.3+ MB
