In [1]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from  collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold


# Seed both NumPy's global RNG and Python's `random` module with the same
# value so that any stochastic step in the notebook is repeatable.
for seed_fn in (np.random.seed, random.seed):
    seed_fn(2019)

# Keep printed DataFrames short (few rows) but wide enough that every column
# of the merged log fits on a single line.
for option_name, option_value in (
    ('display.max_rows', 6),
    ('display.max_columns', 500),
    ('display.width', 280),
    ('display.max_colwidth', 150),
):
    pd.set_option(option_name, option_value)

# data_path: root of the raw dataset; preprocess_path: directory holding the
# intermediate pickles read and written by the cells below.
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'


In [2]:
# Read the preprocessed click log and the per-user label table, then attach
# the labels to every log row. A left join keeps log rows even when the user
# has no entry in the label file (their age/gender become NaN).
merged_df = pd.read_pickle(f'{preprocess_path}/train_merged_log.pkl')
label_df = pd.read_csv(f'{data_path}/train_preliminary/user.csv')
merged_df = merged_df.merge(label_df, on='user_id', how='left')

# Shift both label columns down by one.
# NOTE(review): presumably the raw labels are 1-based and this makes them
# 0-based class indices — confirm against the dataset description.
for label_col in ('age', 'gender'):
    merged_df[label_col] = merged_df[label_col] - 1

print(merged_df)

          time   user_id  creative_id  click_times      ad_id  product_id  product_category  advertiser_id  industry  age  gender
0          9.0   30920.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0    1       0
1         15.0  320815.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0    2       0
2         11.0  355089.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0    0       0
...        ...       ...          ...          ...        ...         ...               ...            ...       ...  ...     ...
30082768  76.0  309198.0    3686554.0          2.0  3172440.0      3979.0               2.0        52506.0     247.0    1       0
30082769  50.0  309198.0    2386973.0          1.0  2057811.0      1946.0               2.0        17757.0     259.0    1       0
30082770  12.0   30920.0     717026.0          1.0   634512.0        -1.0              18.

In [3]:

# One-hot encode the age label into indicator columns age0 .. age9.
#
# Each column is built directly from the boolean comparison and stored as
# int8. The previous approach seeded the column with None (creating an
# object-dtype column over the whole log) and relied on fillna(0) to turn it
# back into numbers, which is slow, memory-hungry, and depends on pandas
# silently downcasting object columns.
#
# Rows whose age is missing compare False against every class, so they get 0
# in all ten columns — the same values the None/fillna version produced.
for i in range(10):
    merged_df[f'age{i}'] = (merged_df['age'] == i).astype('int8')

print(merged_df)


          time   user_id  creative_id  click_times      ad_id  product_id  product_category  advertiser_id  industry  age  gender  age0  age1  age2  age3  age4  age5  age6  age7  age8  age9
0          9.0   30920.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0    1       0     0     1     0     0     0     0     0     0     0     0
1         15.0  320815.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0    2       0     0     0     1     0     0     0     0     0     0     0
2         11.0  355089.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0    0       0     1     0     0     0     0     0     0     0     0     0
...        ...       ...          ...          ...        ...         ...               ...            ...       ...  ...     ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...
30082768  76.0  309198.0    3686554.0          2.0

In [4]:

# One-hot encode the gender label into indicator columns gender0 and gender1.
#
# Each column is built directly from the boolean comparison and stored as
# int8, instead of seeding an object-dtype column with None and repairing it
# with fillna(0); that avoids the object column entirely and does not depend
# on pandas silently downcasting it afterwards.
#
# Rows whose gender is missing compare False against both classes, so they
# get 0 in both columns — the same values the None/fillna version produced.
for i in range(2):
    merged_df[f'gender{i}'] = (merged_df['gender'] == i).astype('int8')

print(merged_df)

          time   user_id  creative_id  click_times      ad_id  product_id  product_category  advertiser_id  industry  age  gender  age0  age1  age2  age3  age4  age5  age6  age7  age8  age9  gender0  gender1
0          9.0   30920.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0    1       0     0     1     0     0     0     0     0     0     0     0        1        0
1         15.0  320815.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0    2       0     0     0     1     0     0     0     0     0     0     0        1        0
2         11.0  355089.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0    0       0     1     0     0     0     0     0     0     0     0     0        1        0
...        ...       ...          ...          ...        ...         ...               ...            ...       ...  ...     ...   ...   ...   ...   ...   ...   ...   

In [5]:
def get_targe_encoder(log, feats=('creative_id', 'ad_id', 'advertiser_id'),
                      out_dir='preprocess', verbose=True):
    """Compute per-ID mean label indicators and pickle one table per ID column.

    For every column in ``feats`` the log is grouped by that column and the
    mean of each one-hot label column (``age0``..``age9``, ``gender0``,
    ``gender1``) is taken, i.e. the fraction of log rows with that ID that
    carry each label. The result is written to
    ``{out_dir}/{feat}_target_encode.pkl`` with columns
    ``[feat, age0, ..., age9, gender0, gender1]``, one row per distinct ID.

    Parameters
    ----------
    log : pandas.DataFrame
        Must contain every column in ``feats`` plus the twelve one-hot label
        columns listed above.
    feats : iterable of str, optional
        ID columns to encode. Defaults to the three columns that were
        previously hard-coded.
    out_dir : str, optional
        Directory that receives the pickles; created if it does not exist.
    verbose : bool, optional
        When True (default) print each per-label mean Series and the final
        table for every feature, matching the original output.

    Returns
    -------
    None
        The tables are only written to disk.

    Raises
    ------
    KeyError
        If ``log`` lacks one of the required columns.

    Notes
    -----
    NOTE(review): the means are taken over every row of ``log`` with no
    out-of-fold split, so a row's own label contributes to its encoding.
    Confirm this is acceptable for how the tables are consumed downstream.
    """
    label_cols = [f'age{i}' for i in range(10)] + [f'gender{i}' for i in range(2)]

    # Make sure the destination exists so to_pickle does not fail on a fresh
    # checkout.
    os.makedirs(out_dir, exist_ok=True)

    for feat in feats:
        # A single multi-column groupby yields the same table as aggregating
        # each label column separately and merging the pieces on `feat`,
        # without twelve passes over the log and eleven merges.
        means = log.groupby(feat)[label_cols].mean()

        if verbose:
            for col in label_cols:
                print(means[col])

        feat_df = means.reset_index()
        feat_df.to_pickle(f'{out_dir}/{feat}_target_encode.pkl')

        if verbose:
            print(feat_df)


In [6]:
# Build and pickle the target-encoding tables from the full merged log.
get_targe_encoder(log=merged_df)

creative_id
1.0          0.0
4.0          0.0
7.0          0.0
            ... 
4445716.0    0.0
4445717.0    0.0
4445718.0    0.0
Name: age0, Length: 2481135, dtype: float64
creative_id
1.0          0.00
4.0          0.00
7.0          0.25
             ... 
4445716.0    0.00
4445717.0    0.00
4445718.0    0.50
Name: age1, Length: 2481135, dtype: float64
creative_id
1.0          0.400000
4.0          0.333333
7.0          0.250000
               ...   
4445716.0    1.000000
4445717.0    0.000000
4445718.0    0.000000
Name: age2, Length: 2481135, dtype: float64
creative_id
1.0          0.20
4.0          0.50
7.0          0.25
             ... 
4445716.0    0.00
4445717.0    0.00
4445718.0    0.00
Name: age3, Length: 2481135, dtype: float64
creative_id
1.0          0.200000
4.0          0.166667
7.0          0.000000
               ...   
4445716.0    0.000000
4445717.0    1.000000
4445718.0    0.000000
Name: age4, Length: 2481135, dtype: float64
creative_id
1.0          0.0
4.0         