# Alibaba 
## Extract Transform Load
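This notebook optionally runs the ETL pipeline configured in `config/alibaba.yml`, optionally inspects the raw Alibaba files, and builds a smaller development set by sampling impressions and keeping the related user, ad, and behavior records.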

In [1]:
import pandas as pd
from deepctr.data.dag import DagBuilder
from deepctr.utils.io import YamlIO
from deepctr.utils.sample import sample_from_file

In [2]:
# --- Flags -------------------------------------------------------------
# ETL:     when True, the ETL cell builds and runs the DAG from the YAML config.
# INSPECT: when True, the inspection cell prints an overview of each raw file.
# SAMPLE:  NOTE(review): not read by any cell visible in this notebook — confirm intent.
ETL = False
INSPECT = False
SAMPLE = True

# --- Parameters --------------------------------------------------------
# Both values are forwarded to sample_from_file() when sampling impressions.
SAMPLE_SIZE = 50000
HEADER = True

# --- File paths --------------------------------------------------------
# Same four datasets in two stages: 'raw' holds the source files and
# 'development' receives the sampled subsets written by this notebook.
FILEPATH = {
    'raw': {
        'impression': 'data/alibaba/raw/raw_sample.csv',
        'ad': 'data/alibaba/raw/ad_feature.csv',
        'user': 'data/alibaba/raw/user_profile.csv',
        'behavior': 'data/alibaba/raw/behavior_log.csv',
    },
    'development': {
        'impression': 'data/alibaba/development/raw_sample.csv',
        'ad': 'data/alibaba/development/ad_feature.csv',
        'user': 'data/alibaba/development/user_profile.csv',
        'behavior': 'data/alibaba/development/behavior_log.csv',
    },
}

In [3]:
# Run the ETL pipeline only when explicitly enabled via the ETL flag.
if ETL:
    config_filepath = "config/alibaba.yml"
    # Load the pipeline configuration from YAML.
    config = YamlIO().read(config_filepath)
    # Build the DAG described by the configuration, then execute it.
    dag = DagBuilder(config=config).build()
    dag.run()


## Data Inspection

In [4]:
def inspect(name, filepath):
    """Print a quick overview of a CSV dataset.

    Reads the CSV file at ``filepath`` into a DataFrame and prints a heading
    with its column and row counts, followed by the ``DataFrame.info()``
    summary and the first rows.

    Args:
        name: Label for the dataset; it is printed capitalized in the heading.
        filepath: Path of the CSV file to read.

    Returns:
        None. All results are written to standard output.
    """
    df = pd.read_csv(filepath, index_col=False)
    print("\n\nDataset {} has {} columns and {} observations".format(name.capitalize(), str(df.shape[1]), str(df.shape[0])),"\n")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it is called directly; wrapping it in print() emitted a stray "None".
    df.info()
    print(df.head())


In [5]:
# Print an overview of every raw file, but only when the INSPECT flag is set.
if INSPECT:
    files = {
        'raw': "data/alibaba/raw/raw_sample.csv",
        'user': "data/alibaba/raw/user_profile.csv",
        'ad': "data/alibaba/raw/ad_feature.csv",
        'behavior': "data/alibaba/raw/behavior_log.csv",
    }
    for name, filepath in files.items():
        inspect(name, filepath)


## Alibaba Development Set
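Create a development set by sampling `SAMPLE_SIZE` impressions (currently 50,000) and keeping only the user, ad, and behavior records related to the sampled impressions.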

In [6]:
def summarize(df: pd.DataFrame, name: str):
    """Print a one-line size report for a DataFrame.

    Args:
        df: The frame to report on.
        name: Label printed at the start of the report line.

    The report gives the row count, the column count, and the total memory
    footprint in bytes as computed by ``df.memory_usage(deep=True)``.
    """
    n_rows, n_cols = df.shape
    total_bytes = df.memory_usage(deep=True).sum()
    template = "{} file has {} rows and {} columns. Total memory usage is {} bytes"
    print(template.format(name, n_rows, n_cols, total_bytes))

In [7]:
# Sample impressions from the raw impression file.
# sample_from_file is a project helper; presumably it returns a DataFrame
# holding `size` sampled rows — confirm against its implementation.
impression = sample_from_file(
    source=FILEPATH['raw']['impression'],
    size=SAMPLE_SIZE,
    header=HEADER,
)

# Time span covered by the sample; the behavior-log cell filters on these bounds.
begin_date = impression['time_stamp'].min()
end_date = impression['time_stamp'].max()

# Persist the sample as the development impression file and report its size.
impression.to_csv(FILEPATH['development']['impression'], header=True, index=False)
summarize(df=impression, name='impression')

impression file has 50000 rows and 6 columns. Total memory usage is 5400128 bytes


In [8]:
# Sample user: keep only the profiles whose userid occurs in the sampled impressions.
user = pd.read_csv(
    FILEPATH['raw']['user'],
    sep=",",
    header=0,
    index_col=None,
    low_memory=False,
).loc[lambda profiles: profiles['userid'].isin(impression['user'])]

# Write the subset to the development folder and report its size.
user.to_csv(FILEPATH['development']['user'], header=True, index=False)
summarize(df=user, name='user')

user file has 40734 rows and 9 columns. Total memory usage is 3258720 bytes


In [9]:
# Sample AD: keep only the ads whose adgroup_id occurs in the sampled impressions.
ad = pd.read_csv(
    FILEPATH['raw']['ad'],
    sep=",",
    header=0,
    index_col=None,
    low_memory=False,
).loc[lambda ads: ads['adgroup_id'].isin(impression['adgroup_id'])]

# Write the subset to the development folder and report its size.
ad.to_csv(FILEPATH['development']['ad'], header=True, index=False)
summarize(df=ad, name='ad')

ad file has 34683 rows and 6 columns. Total memory usage is 1942248 bytes


In [10]:
# Sample Behavior: keep log rows that belong to a sampled user AND whose
# time_stamp lies inside the [begin_date, end_date] span of the sampled
# impressions (both bounds inclusive).
behavior = pd.read_csv(
    FILEPATH['raw']['behavior'],
    sep=",",
    header=0,
    index_col=None,
    low_memory=False,
).loc[
    lambda log: (log['user'].isin(impression['user']))
    & (log['time_stamp'] >= begin_date)
    & (log['time_stamp'] <= end_date)
]

# Write the subset to the development folder and report its size.
behavior.to_csv(FILEPATH['development']['behavior'], header=True, index=False)
summarize(df=behavior, name='behavior')

behavior file has 18473919 rows and 5 columns. Total memory usage is 1830046295 bytes
