# Create Development Datasets
We have two datasets to sample: the core and common features datasets.

- Core Dataset: The core sample should be a stratified sample of the complete core dataset, reflecting the same frequency distribution of clicks and conversions. The target sample size is a parameter set to 0.001 of the complete dataset.
- Common Features Dataset: This is a lookup dataset referenced by a foreign key in the core dataset. We sample it with the constraint that every common feature set in the sample is also referenced in the core dataset sample.

In [1]:
import os
import tarfile
import pandas as pd
import boto3
from botocore.exceptions import ClientError
import logging
import tempfile

from deepcvr.data.sampling import TaobaoSampler
from deepcvr.data.profile import CoreProfiler
from deepcvr.utils.io import CsvIO
# Render floats with five decimal places whenever pandas displays a frame.
pd.set_option('display.float_format', lambda x: '%.5f' % x)
# ------------------------------------------------------------------------------------------------ #
# Notebook-wide logging: request INFO on the root logger and create the named logger used by the
# cells below.
# NOTE(review): logging.basicConfig does nothing when the root logger already has handlers. The
# DEBUG-level botocore lines captured in the upload cells suggest the root logger may have been
# configured elsewhere (possibly by an imported module) — confirm.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
# Sampling parameters
# NOTE(review): FRAC_CORE and MAX_FRAC_COMMON_FEATURES are not referenced by any other cell
# visible in this notebook; only RANDOM_STATE is handed to the sampler — confirm the sampler's
# own defaults match these values.
FRAC_CORE = 0.001
MAX_FRAC_COMMON_FEATURES = 0.01
RANDOM_STATE = 602

# Raw production files, keyed by split (TRAIN/TEST) and dataset (CORE/COMMON_FEATURES)
FILEPATHS = {
    'TRAIN_CORE': "data/archive/production/raw/sample_skeleton_train.csv",
    'TRAIN_COMMON_FEATURES': "data/archive/production/raw/common_features_train.csv",
    'TEST_CORE': "data/archive/production/raw/sample_skeleton_test.csv",
    'TEST_COMMON_FEATURES': "data/archive/production/raw/common_features_test.csv",
}


## Read Data

In [3]:
def read_data(core_filepath, common_features_filepath):
    """Load a core dataset and its companion common features dataset.

    Both files are read through one CsvIO instance, each with ``n_chunks=10``;
    the core file is read first.

    Args:
        core_filepath: Path to the core csv file.
        common_features_filepath: Path to the common features csv file.

    Returns:
        A ``(core, common)`` pair holding whatever ``CsvIO.load`` returns for
        each path (presumably pandas DataFrames — confirm against CsvIO).
    """
    reader = CsvIO()
    paths = (core_filepath, common_features_filepath)
    core, common = (reader.load(path, n_chunks=10) for path in paths)
    return core, common


In [4]:
# Load the raw training pair, then the raw test pair (core file plus its common features file).
core_train, common_train = read_data(
    core_filepath=FILEPATHS['TRAIN_CORE'],
    common_features_filepath=FILEPATHS['TRAIN_COMMON_FEATURES'],
)
core_test, common_test = read_data(
    core_filepath=FILEPATHS['TEST_CORE'],
    common_features_filepath=FILEPATHS['TEST_COMMON_FEATURES'],
)

Rows read: 100%|██████████| 42300135/42300135 [02:38<00:00, 266541.83it/s]
Rows read: 100%|██████████| 730600/730600 [01:49<00:00, 6660.60it/s]
Rows read: 100%|██████████| 43016840/43016840 [03:04<00:00, 232735.14it/s]
Rows read: 100%|██████████| 884212/884212 [02:33<00:00, 5747.07it/s]


In [37]:
# Print the leading rows of each raw frame, training pair first.
for frame in (core_train, common_train, core_test, common_test):
    print(frame.head())


   0  1  2                 3   4  \
0  1  0  0  bacff91692951881   9   
1  2  0  0  bacff91692951881  10   
2  3  1  0  bacff91692951881  20   
3  4  0  0  bacff91692951881  13   
4  5  0  0  bacff91692951881   9   

                                                   5  
0  21090522181.021090645531.021090934451....  
1  21091097321.021090462841.021090990351....  
2  21090897311.021090475601.050995117692....  
3  30193516651.021090503641.021090833881....  
4  20549456631.030193516651.021691721791....  
                  0    1                                                  2
0  84dceed2e3a667f8  343  101313191.012534387741.012634387791.0...
1  0000350f0c2121e7  811  127_1437162241.94591127_1435146270.69315...
2  000091a89d1867ab    7  12534387731.012434387691.012234387611....
3  0001a4114b0ae8bf  231  150_1439166842.3979150_1439407981.070561...
4  0001def19d7cb335  964  150_1439091500.84715150_1439330134.44265.

## Create Samples

In [22]:
def create_sample(core_data, common_features_data):
    """Draw a core sample and a common features sample, then log their sizes.

    The sampling itself is delegated to ``TaobaoSampler.execute``; the sampler
    is seeded with the notebook-level ``RANDOM_STATE``.
    NOTE(review): no sampling fraction is passed here, so the sampler's own
    defaults apply — confirm they match the parameters cell.

    Args:
        core_data: The complete core dataset.
        common_features_data: The complete common features dataset.

    Returns:
        A ``(core_sample, common_features_sample)`` pair as produced by the
        sampler.
    """
    core_sample, common_features_sample = TaobaoSampler(
        core_data=core_data,
        common_features_data=common_features_data,
        random_state=RANDOM_STATE,
    ).execute()

    n_core = core_sample.shape[0]
    n_common = common_features_sample.shape[0]
    logger.info(
        "Samples Created:\n\tCore Data Observations: {}\n\tCommon Features Observations: {}".format(
            n_core, n_common
        )
    )
    return core_sample, common_features_sample


### Create Training Samples

In [23]:
# Sample the training pair.
core_train_sample, common_train_sample = create_sample(
    core_data=core_train,
    common_features_data=common_train,
)


INFO:__main__:Samples Created:
	Core Data Observations: 42298
	Common Features Observations: 730


### Create Test Samples

In [24]:
# Sample the test pair.
core_test_sample, common_test_sample = create_sample(
    core_data=core_test,
    common_features_data=common_test,
)


INFO:__main__:Samples Created:
	Core Data Observations: 43016
	Common Features Observations: 884


## Save Samples

In [25]:
def save_sample(data, filepath):
    """Write ``data`` to ``filepath`` as csv and log the destination.

    The file is comma separated and written without a header row and without
    the index column.

    Args:
        data: Object exposing a pandas-style ``to_csv`` method.
        filepath: Destination path of the csv file.
    """
    csv_options = {"header": False, "index": False, "sep": ","}
    data.to_csv(filepath, **csv_options)
    logger.info("{} Saved".format(filepath))

### Save Training Sample

In [26]:
# Relative filenames for the training samples.
train_core_sample_filepath = "sample_skeleton_train.csv"
train_common_sample_filepath = "common_features_train.csv"

# Core sample first, then the common features sample. `data` is left bound to the
# common features sample, which the following cell inspects.
data = core_train_sample
save_sample(data=data, filepath=train_core_sample_filepath)

data = common_train_sample
save_sample(data=data, filepath=train_common_sample_filepath)

INFO:__main__:sample_skeleton_train.csv Saved
INFO:__main__:common_features_train.csv Saved


In [39]:
# NOTE(review): `data` is rebound several times in this notebook. Read top to bottom it holds
# common_train_sample here, but this cell's execution count (39) is out of sequence with its
# neighbours — confirm which frame the saved output actually shows.
data.head()

Unnamed: 0,0,1,2
0,01abe8393343d4de,1091,127_1437970180.69315127_1434927010.69315...
45,22c9833fa97388be,980,150_1438787872.80838150_1438742481.92176...
88,2836d62cf5190ea1,1062,150_1439348391.55814150_1439245181.20387...
132,37ec091d91ab49d9,749,110_1414067420.6931512134386701.0122343...
168,53075e7639dc5634,241,150_1439076971.09861150_1438895831.17865...


### Save Test Sample

In [27]:
# Relative filenames for the test samples.
test_core_sample_filepath = "sample_skeleton_test.csv"
test_common_sample_filepath = "common_features_test.csv"

# Core sample first, then the common features sample; `data` ends up bound to the latter.
data = core_test_sample
save_sample(data=data, filepath=test_core_sample_filepath)

data = common_test_sample
save_sample(data=data, filepath=test_common_sample_filepath)

INFO:__main__:sample_skeleton_test.csv Saved
INFO:__main__:common_features_test.csv Saved


## Create TARGZ Files

In [28]:
def create_targz_file(core_sample_filepath, common_features_filepath, tar_filepath):
    """Bundle the two sample files into one gzip-compressed tar archive.

    The core sample is added first, followed by the common features sample,
    and each addition is logged.

    Args:
        core_sample_filepath: Path of the core sample csv file.
        common_features_filepath: Path of the common features sample csv file.
        tar_filepath: Path of the archive to create.
    """
    members = (core_sample_filepath, common_features_filepath)
    with tarfile.open(tar_filepath, "w:gz") as archive:
        for member in members:
            archive.add(member)
            logger.info("Added {} to {}".format(member, tar_filepath))
        

### Create Training TarGZ Files


In [29]:
# Archive holding both training sample files.
train_tarfile = "taobao_train.tar.gz"
create_targz_file(
    core_sample_filepath=train_core_sample_filepath,
    common_features_filepath=train_common_sample_filepath,
    tar_filepath=train_tarfile,
)

INFO:__main__:Added sample_skeleton_train.csv to taobao_train.tar.gz
INFO:__main__:Added common_features_train.csv to taobao_train.tar.gz


### Create Test TarGZ File

In [30]:
# Archive holding both test sample files.
test_tarfile = "taobao_test.tar.gz"
create_targz_file(
    core_sample_filepath=test_core_sample_filepath,
    common_features_filepath=test_common_sample_filepath,
    tar_filepath=test_tarfile,
)

INFO:__main__:Added sample_skeleton_test.csv to taobao_test.tar.gz
INFO:__main__:Added common_features_test.csv to taobao_test.tar.gz


## Upload Sample Data

In [31]:
def upload(tar_filepath, bucket, object_name):
    """Upload a local file to an S3 bucket.

    Args:
        tar_filepath: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        object_name: Key under which the file is stored in the bucket.

    Returns:
        True when the upload completed; False when S3 reported an error, in
        which case the error is logged instead of raised.
    """
    # Imported here so the function carries its own dependency on the boto3 exception type.
    from boto3.exceptions import S3UploadFailedError

    s3 = boto3.resource('s3')
    try:
        s3.Bucket(bucket).upload_file(Filename=tar_filepath, Key=object_name)
    except (ClientError, S3UploadFailedError) as e:
        # upload_file re-raises client errors hit during the transfer as
        # S3UploadFailedError, which is not a ClientError subclass. Catching
        # only ClientError would let those failures escape instead of
        # producing the documented False return.
        logger.error(e)
        return False
    logger.info("Uploaded {}".format(tar_filepath))
    return True

### Upload Training File

In [32]:
# Destination key of the training archive inside the bucket.
object_name = "development/taobao_train.tar.gz"
upload(
    tar_filepath=train_tarfile,
    bucket="deepcvr-data",
    object_name=object_name,
)

DEBUG:botocore.hooks:Event choose-service-name: calling handler <function handle_service_name_alias at 0x7fbb0aa34ee0>
DEBUG:botocore.hooks:Event creating-client-class.s3: calling handler <function add_generate_presigned_post at 0x7fbb0aa62dc0>
DEBUG:botocore.hooks:Event creating-client-class.s3: calling handler <function lazy_call.<locals>._handler at 0x7fb7fd342d30>
DEBUG:botocore.hooks:Event creating-client-class.s3: calling handler <function add_generate_presigned_url at 0x7fbb0aa62b80>
DEBUG:botocore.endpoint:Setting s3 timeout as (60, 60)
DEBUG:botocore.client:Registering retry handlers for service: s3
DEBUG:boto3.resources.factory:Loading s3:s3
DEBUG:boto3.resources.factory:Loading s3:Bucket
DEBUG:boto3.resources.model:Renaming Bucket attribute name
DEBUG:botocore.hooks:Event creating-resource-class.s3.Bucket: calling handler <function lazy_call.<locals>._handler at 0x7fb7fd3831f0>
DEBUG:s3transfer.utils:Acquiring 0
DEBUG:s3transfer.tasks:UploadSubmissionTask(transfer_id=0, {'tr

True

### Upload Test File

In [33]:
# Destination key of the test archive inside the bucket.
object_name = "development/taobao_test.tar.gz"
upload(
    tar_filepath=test_tarfile,
    bucket="deepcvr-data",
    object_name=object_name,
)

DEBUG:botocore.hooks:Event choose-service-name: calling handler <function handle_service_name_alias at 0x7fbb0aa34ee0>
DEBUG:botocore.hooks:Event creating-client-class.s3: calling handler <function add_generate_presigned_post at 0x7fbb0aa62dc0>
DEBUG:botocore.hooks:Event creating-client-class.s3: calling handler <function lazy_call.<locals>._handler at 0x7fb7fd342d30>
DEBUG:botocore.hooks:Event creating-client-class.s3: calling handler <function add_generate_presigned_url at 0x7fbb0aa62b80>
DEBUG:botocore.endpoint:Setting s3 timeout as (60, 60)
DEBUG:botocore.client:Registering retry handlers for service: s3
DEBUG:boto3.resources.factory:Loading s3:s3
DEBUG:boto3.resources.factory:Loading s3:Bucket
DEBUG:boto3.resources.model:Renaming Bucket attribute name
DEBUG:botocore.hooks:Event creating-resource-class.s3.Bucket: calling handler <function lazy_call.<locals>._handler at 0x7fb7fd3831f0>
DEBUG:s3transfer.utils:Acquiring 0
DEBUG:s3transfer.tasks:UploadSubmissionTask(transfer_id=0, {'tr

True

## Cleanup
Remove files created during the process

In [34]:
def remove_file(filepath):
    """Delete ``filepath`` when it exists; do nothing when it does not.

    Args:
        filepath: Path of the file to delete.
    """
    if not os.path.exists(filepath):
        return
    os.remove(filepath)

In [35]:
# Local artefacts produced above: the four sample csv files and the two archives.
files = [
    train_core_sample_filepath,
    train_common_sample_filepath,
    test_core_sample_filepath,
    test_common_sample_filepath,
    train_tarfile,
    test_tarfile,
]
for file in files:
    remove_file(filepath=file)

## Check Results
### Core Data

In [2]:
# Read back the staged core training sample for inspection.
filepath = "tests/data/development/staged/sample_skeleton_train.csv"
df = pd.read_csv(filepath_or_buffer=filepath)

In [3]:
# First five rows of the staged core sample.
df.head(n=5)

Unnamed: 0,sample_id,click_label,conversion_label,common_features_index,num_core_features,features_list,partition
0,11515523,0,0,d5f794198192a713,8,20787184361.021091122781.021090428991....,0
1,11060573,0,0,5ab1af84e729a269,13,21090299621.021090904571.021691506511....,1
2,19579112,0,0,15100e25b3982cc3,17,30193516661.021692536751.021091144811....,2
3,12131559,0,0,4e21ad4cf5b148d7,10,20555650711.021090633511.021090746901....,3
4,34008079,0,0,654765381422bb63,17,21090623511.021090762221.050893550770....,4


### Common Features

In [4]:
# Read back the staged common features training sample for inspection (rebinds `filepath` and `df`).
filepath = "tests/data/development/staged/common_features_train.csv"
df = pd.read_csv(filepath_or_buffer=filepath)

In [5]:
# First five rows of the staged common features sample.
df.head(n=5)

Unnamed: 0,common_features_index,num_common_features,features_list,partition
0,023a8f5b7b8a3348,1052,110_1414381141.09861110_1418460592.07944...,0
1,030dab7c09c9213d,748,150_1438980633.37304150_1439196042.03693...,1
2,05b3fd32a3e72c87,852,127_1434944021.09861127_1438180851.09861...,2
3,09ed88afc2780752,459,150_1439081542.19722150_1438815952.99987...,3
4,0b7a30a3cacee086,459,150_1439261452.6390612134386581.0122343...,4
