In [1]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.feature_extraction.text import TfidfVectorizer


# Seed both random number generators used by this notebook so that repeated
# runs from a fresh kernel produce the same results.
np.random.seed(2019)
random.seed(2019)

# Keep printed DataFrames compact: few rows, but wide enough for every column.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 280)
pd.set_option('display.max_colwidth', 150)

# Root folder of the dataset. The original location is kept as the default,
# but it can be redirected without editing the notebook by exporting
# TENCENT_ADS_DATA_PATH before starting the kernel.
data_path = os.environ.get(
    'TENCENT_ADS_DATA_PATH',
    '/data/workspace/kimi/tencent_ads/2020/dataset',
)

In [2]:
# Read the raw click log; the printed preview shows the columns
# time, user_id, creative_id and click_times.
clk_df = pd.read_csv(
    f'{data_path}/train_preliminary/click_log.csv',
)
print(clk_df)

          time  user_id  creative_id  click_times
0            9    30920       567330            1
1           65    30920      3072255            1
2           56    30920      2361327            1
3            6   309204       325532            1
4           59   309204      2746730            1
5           12   309204       726402            1
6           79   309204      2851451            1
7           32   309204      1569716            1
8            5   309204        71956            1
9            8   309204       322354            1
...        ...      ...          ...          ...
30082761    52    30920      2361397            1
30082762    52    30920         9148            1
30082763    53    30920      2361397            1
30082764    40    30920        22298            2
30082765    72    30920      2361397            1
30082766     6    30920       228129            1
30082767    57    30920      2361397            1
30082768    26    30920       629802            1


In [3]:
# Total number of clicks per user, summed over every row of the click log.
user_click_sum_df = (
    clk_df
    .groupby(['user_id'])['click_times']
    .sum()
    .rename('click_times_total')
    .reset_index()
)
print(user_click_sum_df)

        user_id  click_times_total
0             1                 14
1             2                 46
2             3                 30
3             4                 29
4             5                 34
5             6                 17
6             7                 10
7             8                 52
8             9                 21
9            10                 10
...         ...                ...
899990   899991                153
899991   899992                 37
899992   899993                 47
899993   899994                 14
899994   899995                 20
899995   899996                 14
899996   899997                 20
899997   899998                 15
899998   899999                 22
899999   900000                 12

[900000 rows x 2 columns]


In [4]:
# Number of distinct `time` values on which each user has at least one click.
user_day_count_df = (
    clk_df
    .groupby(['user_id'])['time']
    .nunique()
    .rename('active_days')
    .reset_index()
)
print(user_day_count_df)

        user_id  active_days
0             1           10
1             2           28
2             3           23
3             4           15
4             5           26
5             6           14
6             7           10
7             8           34
8             9           17
9            10           10
...         ...          ...
899990   899991           68
899991   899992           23
899992   899993           30
899993   899994           11
899994   899995           16
899995   899996           12
899996   899997           14
899997   899998           10
899998   899999           17
899999   900000           12

[900000 rows x 2 columns]


In [5]:
# Combine the two per-user aggregates into a single frame keyed by user_id.
user_df = pd.merge(
    user_day_count_df,
    user_click_sum_df,
    on='user_id',
)
print(user_df)

        user_id  active_days  click_times_total
0             1           10                 14
1             2           28                 46
2             3           23                 30
3             4           15                 29
4             5           26                 34
5             6           14                 17
6             7           10                 10
7             8           34                 52
8             9           17                 21
9            10           10                 10
...         ...          ...                ...
899990   899991           68                153
899991   899992           23                 37
899992   899993           30                 47
899993   899994           11                 14
899994   899995           16                 20
899995   899996           12                 14
899996   899997           14                 20
899997   899998           10                 15
899998   899999           17            

In [6]:
# Read the ad table; it is joined to the click log on creative_id below.
ad_df = pd.read_csv(
    f'{data_path}/train_preliminary/ad.csv',
)

In [7]:
# Attach the ad attributes to every click row by joining on creative_id.
# (Week and month buckets derived from `time` were tried here at some point
# and are currently disabled.)
merged_df = pd.merge(clk_df, ad_df, on='creative_id')
print(merged_df)

# The raw click log is no longer needed once it has been merged.
del clk_df

          time  user_id  creative_id  click_times    ad_id product_id  product_category  advertiser_id industry
0            9    30920       567330            1   504423      30673                 3          32638      319
1           15   320815       567330            1   504423      30673                 3          32638      319
2           11   355089       567330            1   504423      30673                 3          32638      319
3            9   363442       567330            1   504423      30673                 3          32638      319
4           14   370513       567330            1   504423      30673                 3          32638      319
5            9   372445       567330            1   504423      30673                 3          32638      319
6           16   399556       567330            2   504423      30673                 3          32638      319
7           10   429834       567330            1   504423      30673                 3          32638  

In [8]:

def tfidf(log,pivot,f,flag,L,min_df=30,max_features=100000):
    """Build one TF-IDF vector per ``pivot`` value from the ``f`` values it is seen with.

    Every distinct value of ``log[pivot]`` becomes one "document" whose words
    are the stringified values of ``log[f]`` on that pivot's rows. The
    documents are vectorised with ``TfidfVectorizer`` and returned as a dense
    DataFrame.

    Parameters
    ----------
    log : pandas.DataFrame
        Click log. Must contain the columns ``'time'``, ``pivot`` and ``f``.
        It is only read; the caller's frame is left untouched.
    pivot : str
        Column that identifies a document (e.g. ``'user_id'``).
    f : str
        Column whose values are used as words. Missing values are replaced by
        ``-1`` before being converted to strings.
    flag, L :
        Not used by this function; kept so that existing calls keep working.
    min_df : int or float, default 30
        Forwarded to ``TfidfVectorizer``.
    max_features : int or None, default 100000
        Forwarded to ``TfidfVectorizer``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct pivot value, in order of first appearance when
        the log is ordered by ``'time'`` (ties keep their original row order).
        The first column is always named ``'user_id'`` (whatever ``pivot``
        is) and holds the pivot value; it is followed by ``tfidf_0`` ...
        ``tfidf_{n-1}`` with the TF-IDF weights.

    Notes
    -----
    The result is dense: ``n_documents x n_features`` floats. With many
    documents and a large ``max_features`` this can exceed available memory.

    NOTE(review): ``TfidfVectorizer``'s default ``token_pattern`` only keeps
    tokens of two or more word characters, so single-character ids and the
    ``-1`` placeholder are presumably dropped from the vocabulary -- confirm
    that this is intended.
    """
    print("tdidf:",pivot,f)

    # Build the documents.
    # Positions of the rows in time order. A stable sort keeps the original row
    # order among equal timestamps, so the output row order is reproducible.
    order = np.argsort(log['time'].values, kind='mergesort')
    # Derive the keys and words from fresh arrays instead of writing the
    # stringified column back into `log`, which would alter the caller's frame.
    keys = log[pivot].values[order].tolist()
    words = log[f].fillna(-1).astype(str).values[order].tolist()

    # Group the words by pivot value; dict insertion order fixes the row order.
    words_by_key = {}
    for key, word in tqdm(zip(keys, words), total=len(keys)):
        words_by_key.setdefault(key, []).append(word)
    sentence = [" ".join(tokens) for tokens in words_by_key.values()]
    print(len(sentence))
    print(sentence[:3])

    # Fit the TF-IDF model and transform the documents in one step.
    print('training...')
    tfidf_matrix = TfidfVectorizer(min_df=min_df,max_features=max_features).fit_transform(sentence)

    print('outputing...')
    n_features = tfidf_matrix.shape[1]
    print(n_features)
    cols = [f'tfidf_{i}' for i in range(n_features)]
    # Convert the whole sparse matrix at once rather than row by row through
    # Python lists, which multiplies the memory needed for the same values.
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=cols)
    tfidf_df.insert(0, 'user_id', list(words_by_key.keys()))
    return tfidf_df



In [None]:
# Create the output folder before starting the long TF-IDF computation, so
# that the final to_pickle cannot fail on a missing directory after all the
# work has been done.
os.makedirs("preprocess", exist_ok=True)

# One TF-IDF vector per user over the ad_id values the user clicked.
# The last two arguments ('train', 64) are not used by tfidf().
tfidf_df  = tfidf(merged_df,'user_id','ad_id','train',64)
print(tfidf_df)
print(tfidf_df.shape)
tfidf_df.to_pickle("preprocess/train_tfidf_ad_id_age.pkl")

tdidf: user_id ad_id


100%|██████████| 30082771/30082771 [00:36<00:00, 820219.50it/s]


900000
['19795 830863 98161 957742 1389902 8817 781605 1503442 1179303 546184 98161 218984 36050 20627 50437 1590599 2668141 50437 361112 1590599 2966129 361112 3154105 645289', '1987 375895 274066 421202 416757 113365 372482 281421 541204 689820 603359 781743 1004050 77588 1068365 42488 66362 1284704 1273221 256790 33048 1037268 628585 691985 1357225 214929 1022255 1037268 1079521 171487 18677 691985 45721 1288065 1299028 45721 192570 1037268 1022255 44540 1370291 1423553 273021 1282811 691985 1514583 1037268 56505 1573382 876195 691985 36050 1248304 36050 986453 1791905 1831856 1844292 114353 36050 547477 89833 1981795 18260 36050 1966194 86828 2497700 2811670 83984 3217162 33052 31961 2755834 3048693 3125777 3090079 2888766 3062684 2562573 2562495 2562525 3397048 3456811 2143509 64402 2704157 3455012 3614323 3586103 71140 3614342 3440593', '54883 95685 204809 54552 56387 817021 847786 1193330 876084 22877 1338283 646322 927816 960258 1226542 1226542 1226542 1295167 942883 1219782 15