In [1]:
import numpy as np
import pandas as pd
import causalml
import random

## 1. Load data and train / validation / test split

In [2]:
# Load the raw Criteo uplift CSV into `df` and preview its first rows.
raw_csv_path = '../data/criteo-uplift-v2.1.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,treatment,conversion,visit,exposure
0,12.616365,10.059654,8.976429,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
1,12.616365,10.059654,9.002689,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
2,12.616365,10.059654,8.964775,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
3,12.616365,10.059654,9.002801,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
4,12.616365,10.059654,9.037999,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0


In [3]:
# Add a readable label column derived from the 0/1 `treatment` flag:
# 0 -> 'control', 1 -> 'treatment'.
group_labels = {0: 'control', 1: 'treatment'}
df['treatment_group'] = df['treatment'].replace(group_labels)

df.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,treatment,conversion,visit,exposure,treatment_group
0,12.616365,10.059654,8.976429,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0,treatment
1,12.616365,10.059654,9.002689,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0,treatment
2,12.616365,10.059654,8.964775,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0,treatment
3,12.616365,10.059654,9.002801,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0,treatment
4,12.616365,10.059654,9.037999,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0,treatment


In [4]:
# Share of rows (in %) per treatment group: non-null `conversion` counts
# in each group divided by the total number of rows in `df`.
rows_per_group = df.groupby("treatment_group")["conversion"].count()
rows_per_group / df.shape[0] * 100

treatment_group
control      14.999987
treatment    85.000013
Name: conversion, dtype: float64

In [5]:
from sklearn.model_selection import train_test_split

# Split configuration, named so the proportions and seed are visible and tunable.
SPLIT_SEED = 123               # random_state shared by both splits
HOLDOUT_FRACTION = 0.3         # share of `df` held out of training (validation + test)
VAL_FRACTION_OF_HOLDOUT = 0.5  # share of the held-out rows assigned to validation

# First split: training set vs. held-out rows, stratified on the treatment
# group so each part keeps the same control/treatment ratio.
train_df, val_test_df = train_test_split(
    df,
    test_size=HOLDOUT_FRACTION,
    random_state=SPLIT_SEED,
    stratify=df.treatment_group,
)

# Second split: divide the held-out rows between test and validation.
# train_test_split returns (remainder, test_size part), so `val_df` is the
# part sized by VAL_FRACTION_OF_HOLDOUT.
test_df, val_df = train_test_split(
    val_test_df,
    test_size=VAL_FRACTION_OF_HOLDOUT,
    random_state=SPLIT_SEED,
    stratify=val_test_df.treatment_group,
)

# Rebind to freshly indexed frames instead of mutating the split results in
# place: the cell stays idempotent, and later column assignments operate on
# new DataFrame objects (in-place mutation of split results can surface as
# SettingWithCopyWarning in some pandas / scikit-learn versions).
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Sanity check: the three splits partition the original rows.
assert len(train_df) + len(val_df) + len(test_df) == len(df)

In [6]:
# Report the size of the training split and its share of `df`, then show the
# percentage of training rows in each treatment group.
n_train = train_df.shape[0]
train_share = round(n_train / df.shape[0] * 100)
print("Train: {} rows representing {}% of data".format(n_train, train_share))
train_df.groupby("treatment_group")["conversion"].count() / n_train * 100

Train: 9785714 rows representing 70% of data


treatment_group
control      14.999989
treatment    85.000011
Name: conversion, dtype: float64

In [7]:
# Report the size of the validation split and its share of `df`, then show the
# percentage of validation rows in each treatment group.
n_val = val_df.shape[0]
val_share = round(n_val / df.shape[0] * 100)
print("Val: {} rows representing {}% of data".format(n_val, val_share))
val_df.groupby("treatment_group")["conversion"].count() / n_val * 100

Val: 2096939 rows representing 15% of data


treatment_group
control      15.000007
treatment    84.999993
Name: conversion, dtype: float64

In [8]:
# Report the size of the test split and its share of `df`, then show the
# percentage of test rows in each treatment group.
n_test = test_df.shape[0]
test_share = round(n_test / df.shape[0] * 100)
print("Test: {} rows representing {}% of data".format(n_test, test_share))
test_df.groupby("treatment_group")["conversion"].count() / n_test * 100

Test: 2096939 rows representing 15% of data


treatment_group
control      14.999959
treatment    85.000041
Name: conversion, dtype: float64

## 2. Rescaling

In [9]:
# Feature columns to rescale: f0 .. f11, kept in lexicographic order
# (so 'f10' and 'f11' come right after 'f1').
cols = sorted(f"f{i}" for i in range(12))

In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler  # MinMaxScaler is not used in this cell

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to every split so validation and test data do not
# influence the scaler.
scaler = StandardScaler().fit(train_df[cols])
for split_df in (train_df, val_df, test_df):
    split_df[cols] = scaler.transform(split_df[cols])

In [11]:
# Preview the first five training rows after rescaling.
train_df.head(n=5)

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,treatment,conversion,visit,exposure,treatment_group
0,0.42317,-0.098397,-0.7758,0.374792,-0.169941,0.201578,0.190755,-0.222262,0.675653,-0.404445,-0.196344,0.100345,1,0,0,0,treatment
1,1.223937,-0.098397,-0.7758,0.374792,-0.169941,0.201578,0.035312,-0.222262,0.675653,-0.404445,-0.196344,0.100345,1,0,0,0,treatment
2,1.099174,-0.098397,-0.7758,0.374792,-0.169941,0.201578,0.626275,-0.222262,0.675653,-0.404445,-0.196344,0.100345,1,0,0,0,treatment
3,0.504254,-0.098397,-0.7758,0.374792,-0.169941,0.201578,0.381003,-0.222262,0.675653,-0.404445,-0.196344,0.100345,1,0,0,0,treatment
4,0.426042,-0.098397,-0.7758,0.374792,-0.169941,0.201578,0.035312,-0.222262,0.675653,-0.404445,-0.196344,0.100345,1,0,0,0,treatment


## 3. Save

In [12]:
from pathlib import Path

# Destination file for each split; written in this order, without the index.
split_outputs = {
    "../outputs/criteo_train.csv": train_df,
    "../outputs/criteo_val.csv": val_df,
    "../outputs/criteo_test.csv": test_df,
}

for out_path, split_df in split_outputs.items():
    # to_csv does not create missing parent directories; make sure the target
    # folder exists so the notebook does not fail at its final step.
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    split_df.to_csv(out_path, index=False)