In [13]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt

# Seed both NumPy's global RNG and the stdlib RNG so stochastic steps are repeatable.
np.random.seed(2019)
random.seed(2019)

# Widen pandas' text rendering so wide frames and long comma-separated
# cell values stay readable when printed.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 280)
pd.set_option('display.max_colwidth', 150)

# Root directory of the dataset. The default is the original workspace
# location; set TENCENT_ADS_DATA_PATH to run the notebook on another machine
# without editing this cell.
data_path = os.environ.get(
    'TENCENT_ADS_DATA_PATH',
    '/data/workspace/kimi/tencent_ads/2019/dataset',
)

In [4]:
# Read the tab-separated ad operation file. Column names are supplied
# explicitly, so pandas treats the first line of the file as data.
op_df = pd.read_csv(
    f'{data_path}/pre/ad_operation.dat',
    sep='\t',
    names=['aid', 'updateTime', 'op_type', 'updateAttr', 'updateAttrValue'],
)
print(op_df)

aid      updateTime  op_type  updateAttr                                                                                                                                        updateAttrValue
0       593323               0        2           2                                                                                                                                                     90
1       593323               0        2           3                                                                                                                                                    all
2       593323               0        2           4                                        281474976710655,281474976710655,281474976710655,281474976710655,281474976710655,281474976710655,281474976710655
3       593323  20190217000000        1           1                                                                                                                                                    

In [5]:
# Read the tab-separated static ad feature file. Column names are supplied
# explicitly, so pandas treats the first line of the file as data.
ad_static_df = pd.read_csv(
    f'{data_path}/pre/ad_static_feature.out',
    sep='\t',
    names=[
        'aid',
        'createTime',
        'account',
        'goods_id',
        'goods_type',
        'industry_id',
        'size',
    ],
)
print(ad_static_df)

aid  createTime  account goods_id  goods_type industry_id size
0       106452  1529958950    22226    16088          13         225  NaN
1       233649  1538221936    25681     7356          13         136    1
2       547531  1550731020    20696       -1           1         186   40
3       707841  1551857857     3968       -1           3         186   40
4       457009  1550439402    23614     7447          13         172  NaN
5       733436  1552977426    22405    31722           5         117   64
6       249105  1552641796    11360    29999          18         145   44
7       160014  1552532512     6441     2373          18         198   36
8       541096  1552467888     5117      220           5         232   44
9       634000  1552292527    28588    20164          18          76   64
10      367826  1552113270    26093       -1           1         221   64
11      660913  1551920796    10035    24783           3         164   64
12      584892  1550913776    12891    32505     

In [6]:
# Read the tab-separated user attribute file. Column names are supplied
# explicitly, so pandas treats the first line of the file as data.
# NOTE(review): 'consuption' is spelled as in the original; it is a runtime
# column name, so renaming it would affect any code that selects it.
user_df = pd.read_csv(
    f"{data_path}/pre/user_data",
    sep='\t',
    names=[
        'uid',
        'age',
        'gender',
        'area',
        'status',
        'edu',
        'consuption',
        'device',
        'work',
        'connection_type',
        'behavior',
    ],
)
print(user_df)

                                         12527,14188,9841,9019,14605,6222,9962,6643,1720,1053,1231,7202,8361,5547,-1     13    7           3       2    4                4   
3         863717    1       2                                                   12527,3435,2398,11069,1292,9019,14605,14348,6222,6643,10341,7260,11570,1053,14223,13157,1231,5440,-1      6    5           3       2    0                2   
4        1305469  714       3                                             8928,9019,1540,719,12527,6051,10545,12118,7107,14605,2947,6222,3728,6643,917,1053,1231,12237,14369,1764,-1     13    5           3       2    0                2   
5         690515  731       2                                                                       12610,12527,8057,3228,9019,14605,6222,5779,9543,6643,2322,1053,9516,1231,1384,-1     13    2           2       2    2                4   
6         575658  202       2                                                                      12527,2103,64

In [7]:
# Load the exposure log stored under the 'totalExposureLog' key of the HDF5 file.
log_df = pd.read_hdf(
    f'{data_path}/pre/totalExposureLog.h5',
    key='totalExposureLog',
)
print(log_df)



aid_request  request_time  aid_location      uid     aid  aid_size  bid       pctr  quality_ecpm     totalEcpm
0             53991770    1550409746            94  1160618  451525        50   46  47.218750    944.340027   3122.340088
1             25942318    1550370892            79   203814  214797        64   10  49.093750    981.880005   1471.880005
2             66156247    1550416600            18   808543   92253        40   96   3.824219     76.480003    443.584015
3              5935886    1550365898           198     7270  160082        64   85   6.121094    122.459999    642.914978
4             11624425    1550361159           168   852707  253902        64   60   4.328125     86.580002    346.320007
5             87085212    1550406108           209   794015  126362        64  236  13.914062    278.279999   3561.983887
6             13220086    1550412325           320   228155  307878        64  107   7.539062    150.759995    957.325989
7             86036534    155036640

In [8]:
# Read the tab-separated test sample file. Column names are supplied
# explicitly, so pandas treats the first line of the file as data.
test_df = pd.read_csv(
    f"{data_path}/pre/test_sample.dat",
    sep='\t',
    names=[
        'id',
        'aid',
        'createTime',
        'size',
        'industry_id',
        'goods_type',
        'goods_id',
        'account',
        'exposure_time',
        'crowd',
        'bid',
    ],
)
print(test_df)


id     aid  createTime  size  industry_id  goods_type  goods_id  account                                                                                                    exposure_time  \
0          1  394352  1529648412    34           84          13     29663    26657  281474976645120,281474976645120,281474976645120,281474976645120,281474976645120,281474976645120,281474976645120   
1          2  585401  1553076190    40          221           1        -1     6262  281474976579587,281474976579587,281474976579587,281474976579587,281474976579587,281474976579587,281474976579587   
2          3  419408  1553031394    30          122          13     32110    17436         17592185782272,17592185782272,17592185782272,17592185782272,17592185782272,17592185782272,17592185782272   
3          4  405326  1553238836    64          136           1        -1    22359  281474976694272,281474976694272,281474976694272,281474976694272,281474976694272,281474976694272,281474976694272   
4          5  5

In [9]:
 # Group the test samples' bids by ad (`aid`), keeping `id` as the row index.
 # Despite its name, `test_output_df` is the grouped `bid` selection, not a DataFrame.
 test_output_df = test_df.set_index('id')[['aid', 'bid']].groupby('aid')['bid']
 # Rank every bid within its own ad group (ties share the average rank) and
 # scale by 1/6. The built-in GroupBy.rank() replaces the per-group Python
 # lambda and always returns a Series aligned to the original `id` index.
 # NOTE(review): the divisor 6 is unexplained in this notebook - confirm its meaning.
 print(test_output_df.rank() / 6)


id
1        0.500000
2        1.166667
3        0.166667
4        1.000000
5        1.000000
6        1.500000
7        1.000000
8        0.500000
9        1.166667
10       0.833333
11       1.666667
12       1.333333
13       1.833333
14       1.000000
15       1.000000
16       0.166667
17       1.166667
18       1.000000
19       0.833333
20       1.500000
21       1.000000
22       1.833333
23       0.833333
24       1.333333
25       0.333333
           ...   
20266    0.500000
20267    1.000000
20268    1.666667
20269    0.166667
20270    0.666667
20271    1.166667
20272    0.500000
20273    0.833333
20274    0.666667
20275    0.666667
20276    1.000000
20277    1.333333
20278    1.833333
20279    1.500000
20280    0.333333
20281    0.333333
20282    0.166667
20283    1.666667
20284    1.166667
20285    1.333333
20286    0.333333
20287    0.666667
20288    0.500000
20289    0.166667
20290    0.166667
Name: bid, Length: 20290, dtype: float64


In [16]:
# Disabled exploration (kept for reference): would convert the epoch-second
# `request_time` column of `log_df` into a 'YYYYMMDD' string column named
# `request_date`, then print how many log rows fall on each date.
#log_df['request_date'] = pd.to_datetime(log_df['request_time'], unit='s')
#log_df['request_date'] = log_df['request_date'].apply(lambda x:x.strftime('%Y%m%d'))
#print(log_df['request_date'].value_counts()) #.plot.bar()
#print(log_df)


In [11]:
# Collect the distinct `aid` values found in `test_output_df` and report how many there are.
# NOTE(review): this cell needs `test_output_df` to be a frame with an `aid`
# column, but the earlier ranking cell binds that name to a grouped `bid`
# selection. The recorded count also exceeds the 20290 test rows, so this
# appears to depend on a value from a previous kernel state - confirm which
# frame was intended before relying on a fresh Run All.
aids = {record[-1] for record in test_output_df[['aid']].values}
print(len(aids))




509280
