# Create Development Datasets
We have two datasets to sample: the core and common features datasets.

- Core Dataset: The core sample should be a stratified sample of the complete core dataset, reflecting the same frequency distribution of clicks and conversions. The target sample size is a parameter set to 0.001 of the complete dataset.
- Common Features Dataset: This is a lookup dataset referenced by a foreign key in the core dataset. We sample it with the constraint that every common feature set in the sample is also referenced in the core dataset sample.

In [1]:
import os
import tarfile
import pandas as pd
import boto3
from botocore.exceptions import ClientError
import logging
import tempfile

from deepcvr.data.sampling import TaobaoSampler
from deepcvr.data.profile import CoreProfiler
from deepcvr.utils.io import load_csv
def _format_float(value):
    """Render a float with five decimal places for pandas display output."""
    return '%.5f' % value


# Show floats with a fixed five-decimal precision whenever pandas prints them.
pd.set_option('display.float_format', _format_float)
# ------------------------------------------------------------------------------------------------ #
# Emit INFO-level (and above) log records; `logger` is the notebook-wide logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
# Parameters
# Target fraction of the complete core dataset to sample.
FRAC_CORE = 0.001
# Presumably the upper bound on the sampled fraction of the common features
# dataset -- TODO confirm against the sampler.
MAX_FRAC_COMMON_FEATURES = 0.01
# Seed handed to the sampler so the samples are reproducible.
RANDOM_STATE = 602

# Filepaths of the raw core / common features files for the train and test sets.
FILEPATHS = {
    'TRAIN_CORE': "data/archive/raw/sample_skeleton_train.csv",
    'TRAIN_COMMON_FEATURES': "data/archive/raw/common_features_train.csv",
    'TEST_CORE': "data/archive/raw/sample_skeleton_test.csv",
    'TEST_COMMON_FEATURES': "data/archive/raw/common_features_test.csv",
}


## Read Data

In [3]:
def read_data(core_filepath, common_features_filepath):
    """Load a core dataset and its companion common features dataset.

    Args:
        core_filepath (str): Path to the core dataset csv file.
        common_features_filepath (str): Path to the common features csv file.

    Returns:
        tuple: ``(core, common)``, the objects ``load_csv`` returns for the
        core path and the common features path, in that order.
    """
    # Both files are read through the same loader with 10 chunks each,
    # core first and common features second.
    paths = (core_filepath, common_features_filepath)
    core, common = [load_csv(path, n_chunks=10) for path in paths]
    return core, common


In [4]:
# Load the complete training and test datasets (core + common features).
core_train, common_train = read_data(
    core_filepath=FILEPATHS['TRAIN_CORE'],
    common_features_filepath=FILEPATHS['TRAIN_COMMON_FEATURES'],
)
core_test, common_test = read_data(
    core_filepath=FILEPATHS['TEST_CORE'],
    common_features_filepath=FILEPATHS['TEST_COMMON_FEATURES'],
)

Rows read: 100%|██████████| 42300135/42300135 [02:44<00:00, 257582.59it/s]
Rows read: 100%|██████████| 730600/730600 [01:46<00:00, 6888.36it/s]
Rows read: 100%|██████████| 43016840/43016840 [03:04<00:00, 232717.48it/s]
Rows read: 100%|██████████| 884212/884212 [02:31<00:00, 5820.16it/s]


## Create Samples

In [5]:
def create_sample(core_data, common_features_data):
    """Sample the core and common features datasets and log the sample sizes.

    Args:
        core_data: The complete core dataset handed to ``TaobaoSampler``.
        common_features_data: The complete common features dataset handed to
            ``TaobaoSampler``.

    Returns:
        tuple: ``(core_sample, common_features_sample)`` as produced by
        ``TaobaoSampler.execute()``.
    """
    # NOTE(review): FRAC_CORE and MAX_FRAC_COMMON_FEATURES from the parameters
    # cell are not passed to the sampler here; presumably its own defaults
    # apply -- confirm they match the intended fractions.
    core_sample, common_features_sample = TaobaoSampler(
        core_data=core_data,
        common_features_data=common_features_data,
        random_state=RANDOM_STATE,
    ).execute()

    n_core = str(core_sample.shape[0])
    n_common = str(common_features_sample.shape[0])
    template = "Samples Created:\n\tCore Data Observations: {}\n\tCommon Features Observations: {}"
    logger.info(template.format(n_core, n_common))
    return core_sample, common_features_sample


### Create Training Samples

In [6]:
# Draw the development sample from the complete training data.
core_train_sample, common_train_sample = create_sample(
    core_data=core_train,
    common_features_data=common_train,
)


INFO:__main__:Samples Created:
	Core Data Observations: 42298
	Common Features Observations: 730


### Create Test Samples

In [7]:
# Draw the development sample from the complete test data.
core_test_sample, common_test_sample = create_sample(
    core_data=core_test,
    common_features_data=common_test,
)


INFO:__main__:Samples Created:
	Core Data Observations: 43016
	Common Features Observations: 884


## Save Samples

In [8]:
def save_sample(data, filepath):
    """Write a sample to a headerless, comma-separated file.

    The parent directory of ``filepath`` is created if it does not exist.

    Args:
        data: Object exposing a pandas-style ``to_csv`` method (the sample to save).
        filepath (str): Destination path of the csv file.
    """
    parent = os.path.dirname(filepath)
    # A bare filename has an empty directory part, and os.makedirs("") raises
    # FileNotFoundError; in that case the file goes to the current directory.
    if parent:
        os.makedirs(parent, exist_ok=True)
    data.to_csv(filepath, header=False, index=False, sep=",")
    logger.info("{} Saved".format(filepath))

### Save Training Sample

In [9]:
# Destination paths for the training samples; the file names mirror the raw files.
directory = "data/temp"
train_core_sample_filepath = os.path.join(directory, "sample_skeleton_train.csv")
train_common_sample_filepath = os.path.join(directory, "common_features_train.csv")

# Save the core sample first, then the common features sample.
data = core_train_sample
save_sample(data=data, filepath=train_core_sample_filepath)

data = common_train_sample
save_sample(data=data, filepath=train_common_sample_filepath)

INFO:__main__:data/temp/sample_skeleton_train.csv Saved
INFO:__main__:data/temp/common_features_train.csv Saved


### Save Test Sample

In [10]:
# Destination paths for the test samples; the file names mirror the raw files.
directory = "data/temp"
test_core_sample_filepath = os.path.join(directory, "sample_skeleton_test.csv")
test_common_sample_filepath = os.path.join(directory, "common_features_test.csv")

# Save the core sample first, then the common features sample.
data = core_test_sample
save_sample(data=data, filepath=test_core_sample_filepath)

data = common_test_sample
save_sample(data=data, filepath=test_common_sample_filepath)

INFO:__main__:data/temp/sample_skeleton_test.csv Saved
INFO:__main__:data/temp/common_features_test.csv Saved


## Create TARGZ Files

In [11]:
def create_targz_file(core_sample_filepath, common_features_filepath, tar_filepath):
    """Bundle the two sample files into one gzip-compressed tar archive.

    The core sample is added first, followed by the common features sample.
    No alternative archive name is supplied, so each file is stored under the
    path it was given.

    Args:
        core_sample_filepath (str): Path of the core sample file to archive.
        common_features_filepath (str): Path of the common features sample
            file to archive.
        tar_filepath (str): Path of the tar.gz archive to create.
    """
    members = (core_sample_filepath, common_features_filepath)
    with tarfile.open(tar_filepath, "w:gz") as archive:
        for member in members:
            archive.add(member)
            logger.info("Added {} to {}".format(member, tar_filepath))

### Create Training TarGZ File


In [12]:
# Archive both training sample files into a single tar.gz under `directory`.
train_tarfile = os.path.join(directory, "taobao_train.tar.gz")
create_targz_file(
    core_sample_filepath=train_core_sample_filepath,
    common_features_filepath=train_common_sample_filepath,
    tar_filepath=train_tarfile,
)

INFO:__main__:Added data/temp/sample_skeleton_train.csv to data/temp/taobao_train.tar.gz
INFO:__main__:Added data/temp/common_features_train.csv to data/temp/taobao_train.tar.gz


### Create Test TarGZ File

In [13]:
# Archive both test sample files into a single tar.gz under `directory`.
test_tarfile = os.path.join(directory, "taobao_test.tar.gz")
create_targz_file(
    core_sample_filepath=test_core_sample_filepath,
    common_features_filepath=test_common_sample_filepath,
    tar_filepath=test_tarfile,
)

INFO:__main__:Added data/temp/sample_skeleton_test.csv to data/temp/taobao_test.tar.gz
INFO:__main__:Added data/temp/common_features_test.csv to data/temp/taobao_test.tar.gz


## Upload Sample Data

In [14]:
def upload(tar_filepath, bucket, object_name):
    """Upload a local file to an S3 bucket.

    Args:
        tar_filepath (str): Path of the local file to upload.
        bucket (str): Name of the destination S3 bucket.
        object_name (str): Key under which the file is stored in the bucket.

    Returns:
        bool: True if the upload completed, False if S3 reported an error.
    """
    s3 = boto3.resource('s3')
    try:
        s3.Bucket(bucket).upload_file(Filename=tar_filepath, Key=object_name)
    except (ClientError, boto3.exceptions.S3UploadFailedError) as e:
        # upload_file reports failed transfers as S3UploadFailedError, which is
        # not a ClientError, so both are handled to honour the False-on-failure
        # contract. Log through the notebook logger rather than the root logger.
        logger.error(e)
        return False
    logger.info("Uploaded {}".format(tar_filepath))
    return True

### Upload Training File

In [16]:
# Upload the training archive; the cell displays True on success, False on failure.
object_name = "development/taobao_train.tar.gz"
upload(
    tar_filepath=train_tarfile,
    bucket="deepcvr-data",
    object_name=object_name,
)

DEBUG:botocore.hooks:Event choose-service-name: calling handler <function handle_service_name_alias at 0x7f289fd91ee0>
DEBUG:botocore.hooks:Event creating-client-class.s3: calling handler <function add_generate_presigned_post at 0x7f289fdbcdc0>
DEBUG:botocore.hooks:Event creating-client-class.s3: calling handler <function lazy_call.<locals>._handler at 0x7f289fc83af0>
DEBUG:botocore.hooks:Event creating-client-class.s3: calling handler <function add_generate_presigned_url at 0x7f289fdbcb80>
DEBUG:botocore.endpoint:Setting s3 timeout as (60, 60)
DEBUG:botocore.client:Registering retry handlers for service: s3
DEBUG:boto3.resources.factory:Loading s3:s3
DEBUG:boto3.resources.factory:Loading s3:Bucket
DEBUG:boto3.resources.model:Renaming Bucket attribute name
DEBUG:botocore.hooks:Event creating-resource-class.s3.Bucket: calling handler <function lazy_call.<locals>._handler at 0x7f2592f17c10>
DEBUG:s3transfer.utils:Acquiring 0
DEBUG:s3transfer.tasks:UploadSubmissionTask(transfer_id=0, {'tr

True

### Upload Test File

In [18]:
# Upload the test archive; the cell displays True on success, False on failure.
object_name = "development/taobao_test.tar.gz"
upload(
    tar_filepath=test_tarfile,
    bucket="deepcvr-data",
    object_name=object_name,
)

DEBUG:botocore.hooks:Event choose-service-name: calling handler <function handle_service_name_alias at 0x7f289fd91ee0>
DEBUG:botocore.hooks:Event creating-client-class.s3: calling handler <function add_generate_presigned_post at 0x7f289fdbcdc0>
DEBUG:botocore.hooks:Event creating-client-class.s3: calling handler <function lazy_call.<locals>._handler at 0x7f289fc83af0>
DEBUG:botocore.hooks:Event creating-client-class.s3: calling handler <function add_generate_presigned_url at 0x7f289fdbcb80>
DEBUG:botocore.endpoint:Setting s3 timeout as (60, 60)
DEBUG:botocore.client:Registering retry handlers for service: s3
DEBUG:boto3.resources.factory:Loading s3:s3
DEBUG:boto3.resources.factory:Loading s3:Bucket
DEBUG:boto3.resources.model:Renaming Bucket attribute name
DEBUG:botocore.hooks:Event creating-resource-class.s3.Bucket: calling handler <function lazy_call.<locals>._handler at 0x7f2592f17c10>
DEBUG:s3transfer.utils:Acquiring 0
DEBUG:s3transfer.tasks:UploadSubmissionTask(transfer_id=0, {'tr

True