In [1]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from  collections import Counter
from pandarallel import pandarallel

# Start pandarallel (with its progress bar) before any parallel work is requested.
pandarallel.initialize(progress_bar=True)

# Seed both random number generators so reruns are repeatable.
random.seed(2019)
np.random.seed(2019)

# Keep printed frames short but wide: few rows, many columns.
for option_name, option_value in (
    ('display.max_rows', 6),
    ('display.max_columns', 500),
    ('display.width', 280),
    ('display.max_colwidth', 150),
):
    pd.set_option(option_name, option_value)

# Input/output locations.
# NOTE(review): `data_path` is an absolute, machine-specific path.
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
def get_cols_statics(grouped_df,pivod,output_col):
    """Summarise ``output_col`` per ``pivod`` group with max/min/mean/std.

    Parameters
    ----------
    grouped_df : pandas.DataFrame
        Frame containing at least the ``pivod`` and ``output_col`` columns.
    pivod : str
        Name of the column to group by.
    output_col : str
        Name of the column to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``pivod`` value with the columns
        ``pivod``, ``{output_col}_max``, ``{output_col}_min``,
        ``{output_col}_mean`` and ``{output_col}_std`` (in that order).
        ``std`` is the pandas default sample standard deviation, so groups
        with a single row get NaN.
    """
    # One group-by pass computes all four statistics; the list order fixes
    # the output column order.
    stats = grouped_df.groupby([pivod])[output_col].agg(['max', 'min', 'mean', 'std'])
    return stats.add_prefix(f'{output_col}_').reset_index()


In [3]:
def get_user_statics(log):
    """Build per-user totals from the click log.

    Returns a frame with one row per ``user_id`` holding ``total_clks``
    (sum of ``click_times``) and ``active_days`` (number of distinct
    ``time`` values).
    """
    per_user = log.groupby(['user_id'])
    clicks = per_user['click_times'].sum().reset_index(name='total_clks')
    days = per_user['time'].nunique().reset_index(name='active_days')
    return clicks.merge(days, on='user_id')

In [4]:
def get_user_day_statics(log):
    """Count log rows per (user, day) and attach week/weekday/month labels.

    Parameters
    ----------
    log : pandas.DataFrame
        Needs ``user_id``, ``time`` and ``creative_id`` columns; ``time`` is
        used as a numeric day index.

    Returns
    -------
    pandas.DataFrame
        One row per (``user_id``, ``time``) with:
        ``day_clks``   - number of non-null ``creative_id`` rows that day
                         (a row count, not the sum of ``click_times``);
        ``week``       - ``(time + 5) / 7`` truncated to an integer;
        ``wday``       - ``(time + 5) % 7`` as an integer;
        ``is_weekend`` - 1 when ``wday > 5``, else 0;
        ``month``      - ``time / 30`` truncated to an integer.
    """
    user_day_clks_df = (
        log.groupby(['user_id', 'time'])['creative_id']
        .count()
        .reset_index(name='day_clks')
    )

    # Vectorised column arithmetic instead of per-row Python lambdas;
    # ``astype(int)`` truncates toward zero exactly like ``int(...)`` did.
    day = user_day_clks_df['time']
    shifted_day = day + 5
    user_day_clks_df['week'] = (shifted_day / 7).astype(int)
    user_day_clks_df['wday'] = (shifted_day % 7).astype(int)
    # NOTE(review): ``wday > 5`` flags only wday == 6, i.e. one day in every
    # seven. If a two-day weekend was intended this probably wants ``>= 5``;
    # confirm which weekday time=1 falls on before changing it, because the
    # weekend/workday features are derived from this flag.
    user_day_clks_df['is_weekend'] = (user_day_clks_df['wday'] > 5).astype(int)
    user_day_clks_df['month'] = (day / 30).astype(int)
    return user_day_clks_df

In [5]:
# Load the pickled merged training log from `preprocess_path`.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that were produced by a trusted step of this pipeline.
merged_df = pd.read_pickle(f'{preprocess_path}/train_merged_log.pkl' )
print(merged_df)

          time   user_id  creative_id  click_times      ad_id  product_id  product_category  advertiser_id  industry
0          9.0   30920.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0
1         15.0  320815.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0
2         11.0  355089.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0
...        ...       ...          ...          ...        ...         ...               ...            ...       ...
30082768  76.0  309198.0    3686554.0          2.0  3172440.0      3979.0               2.0        52506.0     247.0
30082769  50.0  309198.0    2386973.0          1.0  2057811.0      1946.0               2.0        17757.0     259.0
30082770  12.0   30920.0     717026.0          1.0   634512.0        -1.0              18.0        26466.0     126.0

[30082771 rows x 9 columns]


In [6]:
# Per-user base table: summed click_times and number of distinct `time` values.
user_df = get_user_statics(merged_df)
print(user_df)

         user_id  total_clks  active_days
0            1.0        14.0           10
1            2.0        46.0           28
2            3.0        30.0           23
...          ...         ...          ...
899997  899998.0        15.0           10
899998  899999.0        22.0           17
899999  900000.0        12.0           12

[900000 rows x 3 columns]


In [7]:
# Per-(user, day) row counts with week / wday / is_weekend / month labels.
user_day_df = get_user_day_statics(merged_df)
print(user_day_df)
# Drop the reference to the ~30M-row log; the later cells shown in this
# notebook work from `user_day_df` and `user_df` only.
del merged_df


           user_id  time  day_clks  week  wday  is_weekend  month
0              1.0  20.0         3     3     4           0      0
1              1.0  39.0         1     6     2           0      1
2              1.0  40.0         1     6     3           0      1
...            ...   ...       ...   ...   ...         ...    ...
19825288  900000.0  72.0         1    11     0           0      2
19825289  900000.0  74.0         1    11     2           0      2
19825290  900000.0  91.0         1    13     5           0      3

[19825291 rows x 7 columns]


In [8]:
# Per-user max / min / mean / std of the daily counts in `day_clks`.
day_statics_df =  get_cols_statics(user_day_df,'user_id','day_clks')
print(day_statics_df)

         user_id  day_clks_max  day_clks_min  day_clks_mean  day_clks_std
0            1.0             3             1       1.300000      0.674949
1            2.0             4             1       1.607143      0.994030
2            3.0             3             1       1.304348      0.634950
...          ...           ...           ...            ...           ...
899997  899998.0             3             1       1.400000      0.699206
899998  899999.0             2             1       1.294118      0.469668
899999  900000.0             1             1       1.000000      0.000000

[900000 rows x 5 columns]


In [9]:
# Add up the daily counts inside each (user, week) bucket, then summarise
# those weekly totals per user.
user_week_clks_df = (
    user_day_df
    .groupby(['user_id', 'week'])['day_clks']
    .sum()
    .reset_index(name='week_clks')
)
week_statics_df = get_cols_statics(user_week_clks_df, 'user_id', 'week_clks')
print(week_statics_df)

         user_id  week_clks_max  week_clks_min  week_clks_mean  week_clks_std
0            1.0              3              1        2.166667       0.983192
1            2.0              7              1        4.090909       2.256304
2            3.0              6              1        3.000000       1.763834
...          ...            ...            ...             ...            ...
899997  899998.0              7              1        2.333333       2.338090
899998  899999.0              4              1        2.444444       1.130388
899999  900000.0              2              1        1.333333       0.500000

[900000 rows x 5 columns]


In [10]:
# Restrict to the user-days flagged as weekend.
weekend_df = user_day_df.loc[user_day_df['is_weekend'] > 0]
print(weekend_df)

# Weekend total per (user, week), then per-user summary statistics.
user_weekend_clks_df = (
    weekend_df
    .groupby(['user_id', 'week'])['day_clks']
    .sum()
    .reset_index(name='weekend_clks')
)
weekend_statics_df = get_cols_statics(user_weekend_clks_df, 'user_id', 'weekend_clks')
print(weekend_statics_df)

           user_id  time  day_clks  week  wday  is_weekend  month
3              1.0  43.0         1     6     6           1      1
7              1.0  64.0         2     9     6           1      2
32             2.0  78.0         1    11     6           1      2
...            ...   ...       ...   ...   ...         ...    ...
19825269  899999.0  29.0         2     4     6           1      0
19825286  900000.0  57.0         1     8     6           1      1
19825287  900000.0  71.0         1    10     6           1      2

[2966620 rows x 7 columns]
         user_id  weekend_clks_max  weekend_clks_min  weekend_clks_mean  weekend_clks_std
0            1.0                 2                 1               1.50          0.707107
1            2.0                 1                 1               1.00          0.000000
2            3.0                 2                 1               1.25          0.500000
...          ...               ...               ...                ...             

In [11]:
# Restrict to the user-days that are not flagged as weekend.
workday_df = user_day_df.loc[user_day_df['is_weekend'] == 0]

# Workday total per (user, week), then per-user summary statistics.
user_work_clks_df = (
    workday_df
    .groupby(['user_id', 'week'])['day_clks']
    .sum()
    .reset_index(name='workday_clks')
)
workday_statics_df = get_cols_statics(user_work_clks_df, 'user_id', 'workday_clks')
print(workday_statics_df)

         user_id  workday_clks_max  workday_clks_min  workday_clks_mean  workday_clks_std
0            1.0                 3                 1           1.666667          0.816497
1            2.0                 7                 1           4.300000          2.057507
2            3.0                 5                 1           2.500000          1.509231
...          ...               ...               ...                ...               ...
899992  899998.0                 4                 1           2.000000          1.224745
899993  899999.0                 4                 1           2.222222          1.201850
899994  900000.0                 2                 1           1.250000          0.462910

[899995 rows x 5 columns]


In [12]:
# Show more rows for this one print only. The context manager puts
# `display.max_rows` back to its previous value afterwards, even if the
# print raises, instead of relying on a hard-coded reset.
with pd.option_context('display.max_rows', 100):
    print(workday_statics_df)

         user_id  workday_clks_max  workday_clks_min  workday_clks_mean  workday_clks_std
0            1.0                 3                 1           1.666667          0.816497
1            2.0                 7                 1           4.300000          2.057507
2            3.0                 5                 1           2.500000          1.509231
3            4.0                11                 1           4.166667          3.868678
4            5.0                 7                 1           2.727273          1.848833
5            6.0                 4                 1           2.500000          1.224745
6            7.0                 2                 1           1.166667          0.408248
7            8.0                 6                 1           3.363636          1.286291
8            9.0                 5                 1           2.000000          1.322876
9           10.0                 2                 1           1.500000          0.547723
10        

In [13]:
# Add up the daily counts inside each (user, month) bucket, then summarise
# those monthly totals per user.
user_month_clks_df = (
    user_day_df
    .groupby(['user_id', 'month'])['day_clks']
    .sum()
    .reset_index(name='month_clks')
)
month_statics_df = get_cols_statics(user_month_clks_df, 'user_id', 'month_clks')
print(month_statics_df)

         user_id  month_clks_max  month_clks_min  month_clks_mean  month_clks_std
0            1.0               5               3         4.333333        1.154701
1            2.0              21               1        11.250000        8.958236
2            3.0              12               8        10.000000        2.000000
...          ...             ...             ...              ...             ...
899997  899998.0               9               1         3.500000        3.785939
899998  899999.0               9               5         7.333333        2.081666
899999  900000.0               4               1         3.000000        1.414214

[900000 rows x 5 columns]


In [14]:
# Attach every per-user statistics table to the base user table.
#
# The joins are LEFT joins on purpose: some users have no rows in one of the
# tables (workday_statics_df has 899995 rows against 900000 users, and the
# weekend table is built from a filtered subset too). The default inner join
# would silently drop those users from the saved training table; with a left
# join they are kept and their missing statistics become NaN, which the
# fillna(0) below turns into 0.
for stats_df in (
    day_statics_df,
    week_statics_df,
    weekend_statics_df,
    workday_statics_df,
    month_statics_df,
):
    user_df = user_df.merge(stats_df, on='user_id', how='left')

# Zero-fill both kinds of gaps: statistics missing because the user had no
# rows in a table, and std values that are NaN for single-observation groups.
user_df = user_df.fillna(0)
user_df.to_pickle(f'{preprocess_path}/train_user.pkl' )
print(user_df)
