In [1]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from  collections import Counter
from pandarallel import pandarallel

# Set up pandarallel with its progress bar enabled.
pandarallel.initialize(progress_bar=True)

# Seed both the NumPy and the standard-library generators with the same value.
np.random.seed(2019)
random.seed(2019)

# Display settings, passed as (option, value) pairs in a single call:
# at most 6 rows and 500 columns, 280-character output width, and
# column contents cut off after 150 characters.
pd.set_option(
    'display.max_rows', 6,
    'display.max_columns', 500,
    'display.width', 280,
    'display.max_colwidth', 150,
)

# Raw competition data (absolute path) and the directory holding
# preprocessed artefacts (relative to the working directory).
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# User table read from the preliminary training set's user.csv.
label_df = pd.read_csv(f'{data_path}/train_preliminary/user.csv')

# Load the pickled merged log and join the user table onto it by user_id
# (default inner join, so only user_ids present in both frames are kept).
merged_df = (
    pd.read_pickle(f'{preprocess_path}/train_merged_log.pkl')
    .merge(label_df, on='user_id')
)
print(merged_df)

          time   user_id  creative_id  click_times      ad_id  product_id  product_category  advertiser_id  industry
0          9.0   30920.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0
1         15.0  320815.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0
2         11.0  355089.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0
...        ...       ...          ...          ...        ...         ...               ...            ...       ...
30082768  76.0  309198.0    3686554.0          2.0  3172440.0      3979.0               2.0        52506.0     247.0
30082769  50.0  309198.0    2386973.0          1.0  2057811.0      1946.0               2.0        17757.0     259.0
30082770  12.0   30920.0     717026.0          1.0   634512.0        -1.0              18.0        26466.0     126.0

[30082771 rows x 9 columns]


In [None]:
# Aggregation spec handed to groupby(...).agg: the statistics to compute
# on the `age` column within each group.
agg_dic = dict(age=['count', 'min', 'max', 'std', 'var'])

In [7]:
# Statistics listed in agg_dic, computed per `industry` value.
# The former `axis=0` argument is dropped: DataFrameGroupBy.agg has no `axis`
# parameter, and extra keywords are forwarded to the aggregation functions.
industry_df = merged_df.groupby('industry').agg(agg_dic)
print(industry_df.head(500))

         click_times         ad_id product_id product_category advertiser_id industry
                 sum count nunique    nunique          nunique       nunique  nunique
user_id                                                                              
1.0             14.0    13      12          6                3            12        9
2.0             46.0    45      42         20                3            36       15
3.0             30.0    30      30         17                6            28        8
...              ...   ...     ...        ...              ...           ...      ...
899998.0        15.0    14      14          5                4             9        5
899999.0        22.0    22      18          5                7            16       14
900000.0        12.0    12      12          2                2            10       10

[900000 rows x 7 columns]


In [10]:
# Statistics listed in agg_dic, computed per `advertiser_id` value.
# The former `axis=0` argument is dropped: DataFrameGroupBy.agg has no `axis`
# parameter, and extra keywords are forwarded to the aggregation functions.
advertiser_id_df = merged_df.groupby('advertiser_id').agg(agg_dic)
print(advertiser_id_df.head(500))

          click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique
user_id                                                                                                                                           
1.0                  14.0                 13             12                   6                         3                     12                 9
2.0                  46.0                 45             42                  20                         3                     36                15
3.0                  30.0                 30             30                  17                         6                     28                 8
...                   ...                ...            ...                 ...                       ...                    ...               ...
899998.0             15.0                 14             14                   5                         4             

In [None]:
# Statistics listed in agg_dic, computed per `product_category` value.
# The former `axis=0` argument is dropped: DataFrameGroupBy.agg has no `axis`
# parameter, and extra keywords are forwarded to the aggregation functions.
product_category_df = merged_df.groupby('product_category').agg(agg_dic)
print(product_category_df.head(500))

In [None]:
# Statistics listed in agg_dic, computed per `product_id` value.
# The former `axis=0` argument is dropped: DataFrameGroupBy.agg has no `axis`
# parameter, and extra keywords are forwarded to the aggregation functions.
product_id_df = merged_df.groupby('product_id').agg(agg_dic)
# Print the frame that was just built; the previous code referenced the
# undefined name `product_id`, which raises NameError.
print(product_id_df.head(500))