In [1]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.feature_extraction.text import TfidfVectorizer
from  collections import Counter

np.random.seed(2019)
random.seed(2019)
pd.set_option('display.max_rows', 4)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 280)
pd.set_option('display.max_colwidth', 150)
base_path = '/newdata/worksapce/kimi/vs_code/tencent_ads/2020/kimi'
data_path = f'{base_path}/../dataset'
preprocess_path = f'{base_path}/preprocess'

In [2]:
log_df = pd.read_pickle(f'{preprocess_path}/filtered_merged_log.pkl')
print(log_df)

          time    user_id  creative_id  click_times      ad_id  product_id  product_category  advertiser_id  industry
2134125    1.0  3969503.0     146454.0          1.0   131508.0        -1.0              18.0        34137.0      40.0
25196907   1.0     2267.0     249047.0          1.0   223979.0      1927.0               2.0        15784.0     322.0
...        ...        ...          ...          ...        ...         ...               ...            ...       ...
23168403  91.0   498277.0    3877769.0          1.0  3331468.0     26858.0               3.0           23.0      60.0
13125865  91.0   141441.0    4329985.0          1.0  3712996.0     39503.0              17.0        13328.0      -1.0

[58904657 rows x 9 columns]


In [3]:
agg_dic = {
    'time':['nunique'],
    'click_times':['sum','count'], 
    'creative_id':['nunique'],
    'ad_id':['nunique'],
    'product_id':['nunique'],
    'product_category':['nunique'],
    'advertiser_id':['nunique'],
    'industry':['nunique'],
    }
user_base_statics_df =  log_df.groupby(['user_id']).agg(agg_dic,axis=0)

user_base_statics_df.columns = = ['_'.join(i) for i in  user_base_statics_df.columns.values]
user_base_statics_df.to_pickle(f'{preprocess_path}/user_base_statics.pkl')
print(user_base_statics_df)


             time click_times       creative_id   ad_id product_id product_category advertiser_id industry
          nunique         sum count     nunique nunique    nunique          nunique       nunique  nunique
user_id                                                                                                   
1.0            10        14.0    13          12      12          6                3            12        9
2.0            28        46.0    45          42      42         20                3            36       15
...           ...         ...   ...         ...     ...        ...              ...           ...      ...
3999999.0      45        86.0    80          79      79         26                4            63       29
4000000.0      39        67.0    63          63      63         27                6            59       27

[1878303 rows x 9 columns]


In [6]:
gc.collect()
clk_times_agg_dic = {
    'click_times':['sum']
}


usr_statics_with_feat_df = None
feats =  ['time','creative_id','ad_id','product_id','product_category','advertiser_id','industry']
for i in feats:
    print(f'start feat {i}...')
    feat_df = log_df[['user_id',i,'click_times']].groupby(['user_id',i]).agg(clk_times_agg_dic,axis=0)
    feat_df.columns  = ['_'.join(i) for i in  feat_df.columns.values]
    feat_df = feat_df.reset_index().rename(columns={'click_times_sum':f'{i}_click_times_sum'})
    print(feat_df)
    user_feat_agg_dic = {
    f'{i}_click_times_sum':['mean','max','min','median','std']
    }

    feat_user_df = feat_df.groupby(['user_id']).agg(user_feat_agg_dic,axis=0)
    feat_user_df.columns  = ['_'.join(i) for i in  feat_user_df.columns.values]
    feat_user_df = feat_user_df.reset_index()
    print(feat_user_df)
    
    if usr_statics_with_feat_df is None:
        usr_statics_with_feat_df = feat_user_df
    else:
        usr_statics_with_feat_df = usr_statics_with_feat_df.merge(feat_user_df,on='user_id',how='left')
print(usr_statics_with_feat_df)
    

start feat time...
            user_id  time  time_click_times_sum
0               1.0  20.0                   3.0
1               1.0  39.0                   1.0
...             ...   ...                   ...
40364232  4000000.0  76.0                   2.0
40364233  4000000.0  77.0                   1.0

[40364234 rows x 3 columns]
           user_id  time_click_times_sum_mean  time_click_times_sum_max  time_click_times_sum_min  time_click_times_sum_median  time_click_times_sum_std
0              1.0                   1.400000                       3.0                       1.0                          1.0                  0.699206
1              2.0                   1.642857                       4.0                       1.0                          1.0                  1.026114
...            ...                        ...                       ...                       ...                          ...                       ...
1878301  3999999.0                   1.911111       

In [7]:
usr_statics_with_feat_df.to_pickle(f'{preprocess_path}/user_feat_statics.pkl')


In [10]:
user_statics_df = user_base_statics_df.merge(usr_statics_with_feat_df,on='user_id',how='left')
print(user_statics_df)

           user_id  time_nunique  click_times_sum  click_times_count  creative_id_nunique  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  time_click_times_sum_mean  time_click_times_sum_max  time_click_times_sum_min  \
0              1.0            10             14.0                 13                   12             12                   6                         3                     12                 9                   1.400000                       3.0                       1.0   
1              2.0            28             46.0                 45                   42             42                  20                         3                     36                15                   1.642857                       4.0                       1.0   
...            ...           ...              ...                ...                  ...            ...                 ...                       ...                    ...     

In [11]:
user_statics_df.to_pickle(f'{preprocess_path}/user_statics.pkl')