# Alibaba Data Generation
This notebook generates class-proportional, distribution-preserving downsamples of the Alibaba Ad Display Click dataset at downsampling rates of 1% and 10%.

In [10]:
import numpy as np
import pandas as pd
import multiprocessing
import pickle
from scipy.stats import anderson
from datetime import datetime
from pyspark.sql.functions import col
from deepctr.dal.file import DataParams, DataAccessObject

## Impressions Data

In [11]:
# Relative path of the raw impressions sample CSV.
filepath = "data/alibaba/vesuvio/raw/raw_sample.csv"
# header=0 takes the column names from the first row of the file.
# low_memory=False asks pandas to parse the file in a single pass instead of
# in chunks, which avoids mixed-dtype inference within a column.
df = pd.read_csv(filepath, header=0, low_memory=False)
# Preview the first rows to confirm the expected columns were loaded.
print(df.head())

     user  time_stamp  adgroup_id          pid  nonclk  clk
0  581738  1494137644           1  430548_1007       1    0
1  449818  1494638778           3  430548_1007       1    0
2  914836  1494650879           4  430548_1007       1    0
3  914836  1494651029           5  430548_1007       1    0
4  399907  1494302958           8  430548_1007       1    0


In [15]:
def distribution_numeric(column: str, data: pd.Series) -> tuple:
    """Compute the Anderson-Darling normality statistic for one numeric column.

    Args:
        column: Name of the column; passed through unchanged so the caller can
            match the result to its column when results are collected from
            worker processes.
        data: The column's values as a pandas Series. Missing values are
            dropped before the test is run.

    Returns:
        A ``(column, statistic)`` tuple, where ``statistic`` is the
        Anderson-Darling test statistic against the normal distribution.
    """
    # Use a separate name rather than rebinding `data` to a different type.
    values = data.dropna().values
    # Element 0 of the scipy result is the test statistic; the critical values
    # and significance levels are not needed here.
    stat = anderson(values, dist='norm')[0]
    return column, stat


In [16]:
def driver(data=None,
           columns=('user', 'time_stamp', 'adgroup_id', 'nonclk', 'clk'),
           processes=5):
    """Compute the Anderson-Darling statistic for several columns in parallel.

    Args:
        data: DataFrame holding the columns to analyse. When ``None`` (the
            default) the module-level ``df`` is used, so a bare ``driver()``
            call behaves as it always has.
        columns: Names of the numeric columns to analyse. Defaults to the five
            numeric columns of the impressions data.
        processes: Number of worker processes in the pool.

    Returns:
        dict mapping each column name to the statistic returned by
        ``distribution_numeric`` for that column.
    """
    frame = df if data is None else data
    # One (name, Series) argument pair per column; each pair becomes one task.
    params = [(name, frame[name]) for name in columns]
    # NOTE(review): pool workers must be able to import/pickle
    # `distribution_numeric`; with the "spawn" start method, functions defined
    # in a notebook cell may not be picklable — confirm on the target platform.
    with multiprocessing.Pool(processes) as pool:
        # starmap unpacks each pair into (column, data), blocks until every
        # task finishes, and returns the results in the order of `params`.
        results = pool.starmap(distribution_numeric, params)
    # Each result is a (column, statistic) pair, so it converts directly.
    return dict(results)
        




In [17]:
# Path of a pickle file under the metadata directory. It is not used within
# this cell — presumably the statistics are persisted there later (TODO confirm).
metadata = "data/alibaba/vesuvio/metadata/distributions.pickle"
# driver() returns a dict mapping each analysed column name to its
# Anderson-Darling statistic.
# NOTE(review): the recorded output of this cell shows printed (column, stat)
# tuples, but nothing in this cell prints — the output may come from an earlier
# version of the code; re-run on a fresh kernel to confirm.
distributions = driver()


	 ('user', 291249.6999502629)
	 ('time_stamp', 314456.14361579716)
	 ('adgroup_id', 477517.62918816507)
	 ('nonclk', 9569965.01305689)
	 ('clk', 9569965.01305689)
