In [1]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from  collections import Counter
from pandarallel import pandarallel

# Initialize pandarallel with progress bars enabled.
pandarallel.initialize(progress_bar=True)

# Seed NumPy's and the stdlib's global RNGs with the same fixed value.
for seed_fn in (np.random.seed, random.seed):
    seed_fn(2019)

# pandas display settings: few rows, many columns, wide lines.
for option_name, option_value in (
    ('display.max_rows', 6),
    ('display.max_columns', 500),
    ('display.width', 280),
    ('display.max_colwidth', 150),
):
    pd.set_option(option_name, option_value)

# Raw dataset location and the directory holding preprocessed pickles.
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# `flag` is interpolated into the pickle file name below.
flag = 'test'

# Read the merged log for the selected flag from the preprocess directory.
merged_log_file = f'{preprocess_path}/{flag}_merged_log.pkl'
merged_df = pd.read_pickle(merged_log_file)
print(merged_df)

          time  user_id  creative_id  click_times   ad_id  product_id  product_category  advertiser_id  industry
0           20  3131989       645764            1  573314          58                 2          14689         6
1           10  3142948       645764            1  573314          58                 2          14689         6
2           14  3170643       645764            1  573314          58                 2          14689         6
...        ...      ...          ...          ...     ...         ...               ...            ...       ...
33585509     5  3131984       322785            1  290114        2031                 2          19976       238
33585510    16  3131984       900585            1  793805        1766                 2          33097       319
33585511    16  3131988       899450            1  792851          -1                18          22897        47

[33585512 rows x 9 columns]


In [3]:
# Aggregation spec: source column -> list of reductions applied per user.
agg_dic = {
    'click_times': ['sum', 'count'],
    'ad_id': ['nunique'],
    'product_id': ['nunique'],
    'product_category': ['nunique'],
    'advertiser_id': ['nunique'],
    'industry': ['nunique'],
}

# One row per user_id; the result has a two-level (column, reduction) header.
# No `axis` argument is passed: groupby aggregation with a dict spec does not
# take one, and pandas versions that forward extra keyword arguments to the
# named reducers would fail on it.
user_df = merged_df.groupby('user_id').agg(agg_dic)

print(user_df)

        click_times         ad_id product_id product_category advertiser_id industry
                sum count nunique    nunique          nunique       nunique  nunique
user_id                                                                             
3000001          11    11      11          7                4            10        5
3000002          65    61      49         24                5            41       21
3000003          36    36      35         15                5            29       17
...             ...   ...     ...        ...              ...           ...      ...
3999998          20    19      18         10                5            16       13
3999999          86    80      79         26                4            63       29
4000000          67    63      63         27                6            59       27

[1000000 rows x 7 columns]


In [4]:

# Defensive fill so the persisted table contains no missing values.
user_df = user_df.fillna(0)

# Collapse the two-level (column, reduction) header into flat names such as
# 'click_times_sum'. Guarded so that re-running this cell does not join the
# individual characters of already-flattened string names.
if isinstance(user_df.columns, pd.MultiIndex):
    user_df.columns = ['_'.join(levels) for levels in user_df.columns.values]

# NOTE: a bare `user_df.reset_index()` used to sit here; its result was
# discarded, so it only cost a copy. `user_id` intentionally stays the index,
# matching what was previously written to the pickle.
user_df.to_pickle(f'{preprocess_path}/{flag}_user_base_statics.pkl')
print(user_df)


         click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique
user_id                                                                                                                                          
3000001               11                 11             11                   7                         4                     10                 5
3000002               65                 61             49                  24                         5                     41                21
3000003               36                 36             35                  15                         5                     29                17
...                  ...                ...            ...                 ...                       ...                    ...               ...
3999998               20                 19             18                  10                         5                    