# Preprocessing

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import pickle
from tqdm import tqdm
%matplotlib inline
plt.style.use('ggplot')

In [17]:
#根据选出的12个渠道过滤数据
def data_pickup(datapd, camp):
    criteo_process = datapd[datapd.campaign==camp[0]]
    for i in camp[1:]:
        tmp = datapd[datapd.campaign==i]
        criteo_process =pd.concat([criteo_process, tmp], axis=0)
    return criteo_process

# 特征维度展示
def show_feature_dims(criteo_process):
    for i in criteo_process.columns[-9:]:
        print(len(set(criteo_process[i])))

# 数据集包含用户个数
def num_of_users(criteo_process):
    return len(list(set(criteo_process.uid)))

#数据集正负样本比例
def pos_neg(criteo_process):
    return len(criteo_process[criteo_process.conversion==1])/len(criteo_process[criteo_process.conversion==0])


In [18]:
def data_reput(criteo_process):
    users = list(set(criteo_process.uid))
    U = []
    C = []
    cat1_9 = []
    cost = []
    T = []
    Y = []
    CPO = []
    count_1 = 0
    count_0 = 0
    for i in tqdm(users):
        cp = criteo_process[criteo_process.uid==i].reset_index()

        if cp['conversion'].tolist().count(1) > 0:
            id_1_list= cp[cp['conversion']==1].index.tolist()
            id_start = 0
            id_end = id_1_list[-1]
            
            for id_1 in id_1_list:
                count_1 += 1
                U.append(i)
                tmp = cp.iloc[id_start:id_1+1, :]

                C.append(tmp.campaign.values.tolist())
                cat_jn = []
                for idx, catp in tmp.iterrows():
                    cat_jn.append(catp[-9:].values.tolist())
                cat1_9.append(cat_jn)
                cost.append(tmp.cost.values.tolist())
                T.append(tmp.timestamp.values.tolist())
                Y.append(1)
                CPO.append(tmp.cpo.values.tolist())
                id_start = id_1+1
            
            if id_end < len(cp)-3: # 只保存list
                count_0 += 1
                U.append(i)
                tmp = cp.iloc[id_end+1:, :]

                C.append(tmp.campaign.values.tolist())
                cat_jn = []
                for idx, catp in tmp.iterrows():
                    cat_jn.append(catp[-9:].values.tolist())
                cat1_9.append(cat_jn)
                cost.append(tmp.cost.values.tolist())
                T.append(tmp.timestamp.values.tolist())
                Y.append(0)
                CPO.append(tmp.cpo.values.tolist())
        else:
            count_0 += 1
            U.append(i)
            tmp = cp
            C.append(tmp.campaign.values.tolist())
            cat_jn = []
            for idx, catp in tmp.iterrows():
                cat_jn.append(catp[-9:].values.tolist())
            cat1_9.append(cat_jn)
            cost.append(tmp.cost.values.tolist())
            T.append(tmp.timestamp.values.tolist())
            Y.append(0)
            CPO.append(tmp.cpo.values.tolist())
    return U, C, cat1_9, cost, T, CPO, Y

In [19]:
def score_alg(data):
    rank = {}
    s_p = 1
    s_n = 0.1
    for i in list(set(data.campaign)):
        rank[i] = 0

    for i, row in data.iterrows():
        if row['conversion'] == 1:
            rank[row['campaign']] += s_p
        else:
            rank[row['campaign']] -= s_n
    
    sorted(rank.items(), key=lambda x:x[1])
    rank = pd.DataFrame(list(rank.items()), columns=['c_id','score'])
    rank = rank.sort_values('score', ascending=False)
    camp15 = list(rank.iloc[:15, 0])
    return camp15

def longseq_alg(data):
    rank = {}
    for i in list(set(data.campaign)):
        rank[i] = 0


    for i in list(set(data.campaign)):
        userdf = data[data.campaign==i]['uid'].value_counts()
        rank[i] = userdf.mean()
    
    sorted(rank.items(), key=lambda x:x[1])
    rank_df = pd.DataFrame(list(rank.items()), columns=['c_id','score'])
    rank_df = rank_df.sort_values('score', ascending=False)
    camp150 = list(rank_df.iloc[:150, 0])
    return camp150

def random_alg(data):
    return random.sample(list(set(data.campaign)),12)

In [20]:
def encoding_features(criteo_process):
    users = {}
    camps = {}
    cats = {}
    for i in range(9):
        cats[i] = {}
    
    for idx, u in enumerate(set(criteo_process.uid)):
        users[u] = idx
    for idx, c in enumerate(set(criteo_process.campaign)):
        camps[c] = idx
    for i in range(9):
        for idx, cat in enumerate(set(criteo_process[f'cat{i+1}'])):
            cats[i][cat] = idx
            
    for idx, row in criteo_process.iterrows():
        criteo_process.loc[idx, 'uid'] = users[row['uid']]
        criteo_process.loc[idx, 'campaign'] = camps[row['campaign']]
        for i in range(9):
            criteo_process.loc[idx, f'cat{i+1}'] = cats[i][row[f'cat{i+1}']]


    return criteo_process

In [21]:
print('extract data...')
DATA_FILE='./criteo_attribution_dataset.tsv.gz'
df = pd.read_csv(DATA_FILE, sep='\t', compression='gzip')
FEATURES_col = ['conversion','uid','campaign', 'timestamp', 'cpo', 'cost', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 
            'cat7', 'cat8', 'cat9']
data_raw = df[FEATURES_col]
data_raw.head()

extract data...


Unnamed: 0,conversion,uid,campaign,timestamp,cpo,cost,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9
0,0,20073966,22589171,0,0.390794,1e-05,5824233,9312274,3490278,29196072,11409686,1973606,25162884,29196072,29196072
1,0,24607497,884761,2,0.0596,1e-05,30763035,9312274,14584482,29196072,11409686,1973606,22644417,9312274,21091111
2,0,28474333,18975823,2,0.149706,0.000183,138937,9312274,10769841,29196072,5824237,138937,1795451,29196072,15351056
3,1,7306395,29427842,3,0.154785,9.4e-05,28928366,26597095,12435261,23549932,5824237,1973606,9180723,29841067,29196072
4,0,25357769,13365547,3,0.037583,3.2e-05,138937,26597094,31616034,29196072,11409684,26597096,4480345,29196072,29196072


In [56]:
# camp150 = longseq_alg(data_raw)
# c150 = data_pickup(data_raw, camp150)
# c150.head()
# camp15= score_alg(c150)
# c15 = data_pickup(c150, camp15)
# c15.head()
while(1):
    camp12 = random_alg(data_raw)
    c12 = data_pickup(data_raw, camp12)
    tp_len = len(c12)
    user_len = len(set(c12['uid']))
    print(user_len, tp_len, pos_neg(c12))
    if user_len > 150000 and tp_len > 700000:
        break

88227 206774 0.040456087030905634
108105 192710 0.032904363486286725
101510 237976 0.05018909723172244
116023 237873 0.07640199285937309
96807 206368 0.06836748428779987
102016 219285 0.1058468148626296
161264 438500 0.1098203280613704
188183 456817 0.12313176884212079
55576 110575 0.036112855014477
104980 191459 0.025979175932822823
80649 152833 0.02605537354315484
128970 266942 0.02069751346886961
21280 36875 0.05351122792983258
130919 338317 0.06713496702236045
122649 276020 0.056402444858639864
81586 166672 0.043676462300481535
86508 218805 0.06977324063480889
71920 157452 0.03413352599257824
256990 514278 0.04107176475589537
103211 230467 0.10269709047238557
207172 516571 0.06921518180369299
126329 253560 0.02686634862266428
91486 151343 0.06898013095347408
88267 221552 0.05443711508990358
54939 98764 0.028406014411262445
103539 211544 0.08736706297192966
109618 182903 0.023182049575126287
144462 309724 0.0795162246000488
166299 279707 0.03764282534500668
78420 143366 0.0498619624

In [58]:
print('encoding f...')
data = encoding_features(c12)
data.head()

encoding f...


Unnamed: 0,conversion,uid,campaign,timestamp,cpo,cost,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9
172,0,218265,5,95,0.152029,0.00066,5,19,37,5,4,4,528,1,3
281,0,108442,5,164,0.178822,0.000337,7,4,14,1,4,4,171,1,13
282,0,108442,5,165,0.178822,0.001348,7,4,14,1,4,4,171,1,24
653,0,318327,5,380,0.256796,0.000461,3,20,36,1,4,15,1017,2,29
820,0,65758,5,489,0.233612,1.9e-05,7,7,2,1,4,4,1017,1,13


In [59]:
print('data reputing...')
U, C, cat1_9, cost, T, CPO, Y = data_reput(c12)

data reputing...


100%|██████████| 362962/362962 [04:31<00:00, 1339.20it/s]


In [62]:
print('dataset info')
print(f'# users: {len(set(U))}')
print(f'# campaigns: {12}')
print(f'# journeys: {len(C)}')
print(f'# convert journeys: {len(c12[c12.conversion==1])}')
print(f'# touch points: {len(c12)}')


dataset info
# users: 362962
# campaigns: 12
# journeys: 398370
# convert journeys: 52296
# touch points: 865435


In [64]:
c1 = 0
c3 = 0
c5 = 0
cn = 0
csame = 0
for i in C:
    if len(i) == 1:
        c1 += 1
    if len(i) == 3:
        c3 += 1
    if len(i) == 5:
        c5 += 5
    if len(i) > 5:
        cn += 1
    if len(set(i)) == 1:
        csame += 1
print(c1, c3, c5, cn, csame)

240502 32466 54170 26131 396478


In [28]:
len(C)

633796

In [61]:
print('saving')
f_list = ['U','C','cat1_9','cost','T','CPO', 'Y']
info = [U,C,cat1_9,cost,T,CPO, Y]
for i,inf in zip(f_list,info):
    with open('../'+i+'.pkl', 'wb') as f:
        pickle.dump(inf, f)

saving
