In [1]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from  collections import Counter

# Seed Python's and NumPy's global RNGs so shuffling and sampling repeat across runs.
random.seed(2019)
np.random.seed(2019)

# Keep printed frames compact: only a few rows, but wide enough to show every column.
pd.options.display.max_rows = 6
pd.options.display.max_columns = 500
pd.options.display.width = 280
pd.options.display.max_colwidth = 150

# preprocess_path is the (relative) directory the intermediate pickles are read from
# and written to; data_path is presumably the raw dataset directory.
preprocess_path = 'preprocess'
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'

In [2]:
def get_merged_log(flag):
    """Load the pickled merged log for one data split.

    Reads ``{preprocess_path}/{flag}_merged_log.pkl`` (``preprocess_path`` is
    the notebook-level setting), prints the loaded frame, and returns it.

    Args:
        flag: split name used as the file-name prefix, e.g. ``'train'``.

    Returns:
        The DataFrame stored in the pickle.
    """
    pickle_file = f'{preprocess_path}/{flag}_merged_log.pkl'
    frame = pd.read_pickle(pickle_file)
    print(frame)
    return frame

In [3]:
# Load the training-split merged log (preprocess/train_merged_log.pkl) and print it.
# NOTE(review): the captured output shows every column as float for this split, while
# the test split prints integers — presumably an upstream merge upcast them; confirm.
train_merged_log_df = get_merged_log('train')

          time   user_id  creative_id  click_times      ad_id  product_id  product_category  advertiser_id  industry
0          9.0   30920.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0
1         15.0  320815.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0
2         11.0  355089.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0
...        ...       ...          ...          ...        ...         ...               ...            ...       ...
30082768  76.0  309198.0    3686554.0          2.0  3172440.0      3979.0               2.0        52506.0     247.0
30082769  50.0  309198.0    2386973.0          1.0  2057811.0      1946.0               2.0        17757.0     259.0
30082770  12.0   30920.0     717026.0          1.0   634512.0        -1.0              18.0        26466.0     126.0

[30082771 rows x 9 columns]


In [4]:
# Load the test-split merged log (preprocess/test_merged_log.pkl) and print it.
test_merged_log_df = get_merged_log('test')

          time  user_id  creative_id  click_times   ad_id  product_id  product_category  advertiser_id  industry
0           20  3131989       645764            1  573314          58                 2          14689         6
1           10  3142948       645764            1  573314          58                 2          14689         6
2           14  3170643       645764            1  573314          58                 2          14689         6
...        ...      ...          ...          ...     ...         ...               ...            ...       ...
33585509     5  3131984       322785            1  290114        2031                 2          19976       238
33585510    16  3131984       900585            1  793805        1766                 2          33097       319
33585511    16  3131988       899450            1  792851          -1                18          22897        47

[33585512 rows x 9 columns]


In [5]:
# Stack the train and test logs into one frame, then order all rows by the `time` column.
# The original row labels are kept, so labels repeat across the two splits.
total_merged_df = pd.concat([train_merged_log_df, test_merged_log_df])
total_merged_df = total_merged_df.sort_values(by='time')
print(total_merged_df)

# The per-split frames are no longer needed; drop the references and collect garbage.
del train_merged_log_df, test_merged_log_df
gc.collect()

          time    user_id  creative_id  click_times      ad_id  product_id  product_category  advertiser_id  industry
2134125    1.0  3969503.0     146454.0          1.0   131508.0        -1.0              18.0        34137.0      40.0
25196907   1.0     2267.0     249047.0          1.0   223979.0      1927.0               2.0        15784.0     322.0
13936212   1.0   512898.0     168451.0          1.0   150988.0      1243.0               2.0        29966.0     322.0
...        ...        ...          ...          ...        ...         ...               ...            ...       ...
28730719  91.0  3773900.0    3680751.0          1.0  3167517.0     42718.0              17.0        10259.0      -1.0
23168403  91.0   498277.0    3877769.0          1.0  3331468.0     26858.0               3.0           23.0      60.0
13125865  91.0   141441.0    4329985.0          1.0  3712996.0     39503.0              17.0        13328.0      -1.0

[63668283 rows x 9 columns]


54

In [6]:
def w2v(log,pivot,f,flag,L,model_path,seq_len=200,sentence_len=100,window=5):
    """Train a Word2Vec embedding over per-``pivot`` sequences of ``f`` values.

    Rows of ``log`` are grouped by ``pivot`` in their current row order; each
    group's ``f`` values (cast to int, then to str) form one "sentence". A
    Word2Vec model is trained on the shuffled sentences and saved, and the
    un-shuffled sequences are pickled and returned.

    Args:
        log: exposure/click log DataFrame containing the ``pivot`` and ``f``
            columns. It is only read; no column is added to it.
        pivot: column whose values key the sequences (e.g. ``'user_id'``).
        f: column whose values become the tokens to embed.
        flag: label used only in the output pickle's file name.
        L: embedding dimension.
        model_path: prefix of the saved model file; ``_{L}`` is appended.
        seq_len: accepted for backward compatibility; not used.
        sentence_len: accepted for backward compatibility; not used.
        window: Word2Vec context window size.

    Returns:
        DataFrame with one row per distinct ``pivot`` value and two columns:
        ``user_id`` (the pivot value) and ``{f}_seq`` (list of str tokens).
        The same frame is written to
        ``{preprocess_path}/{f}_s{L}_{flag}_seq.pkl``.

    Raises:
        ValueError: if a value of ``f`` is NaN and so cannot be cast to int.
    """
    print("w2v:",pivot,f,model_path)

    # Build the corpus: one token list per pivot value, in the row order of `log`.
    dic={}
    print('build...')
    for key,value in tqdm(log[[pivot,f]].values,total=len(log)):
        # Convert before touching the dict so a bad value leaves no partial entry.
        token=str(int(value))
        # setdefault replaces the old try/bare-except, which also caught
        # KeyboardInterrupt and then overwrote the key's accumulated sequence.
        dic.setdefault(key,[]).append(token)

    sentence=list(dic.values())
    print(sentence[:5])
    print(len(sentence))

    # Train the Word2Vec model. Only the outer list is shuffled; the per-key
    # token lists held by `dic` keep their order.
    print('shuffle...')
    random.shuffle(sentence)
    print('training...')
    # NOTE(review): `size`/`iter` are the gensim 3.x keyword names; gensim 4
    # renamed them to `vector_size`/`epochs` — confirm the installed version.
    model = Word2Vec(sentence, size=L, window=window, min_count=1, workers=10,iter=10)
    print(model_path)
    model.save(model_path+f'_{L}')
    del sentence
    print(model)

    # Persist the per-key sequences in first-seen key order.
    ret=[[key,seq] for key,seq in dic.items()]
    print(ret[:20])
    # NOTE(review): the key column is always labelled 'user_id', even when
    # `pivot` is a different column — confirm this is intended.
    cols = ['user_id'] + [f'{f}_seq']
    ret_df = pd.DataFrame(ret,columns=cols)
    output_path = f'{preprocess_path}/{f}_s{L}_{flag}_seq.pkl'
    ret_df.to_pickle(output_path)
    return ret_df
    


In [None]:
# Build per-user sequences of the `time` column and train a 64-dimensional embedding on them.
# The positional 600 and 500 bind to seq_len and sentence_len, which w2v never reads;
# the final 10 is the Word2Vec window.
# NOTE(review): the model path is an absolute, machine-specific location.
time_seq_df =  w2v(total_merged_df,'user_id','time','total',64,'/data/workspace/kimi/tencent_ads/2020/kimi/model/time_emb.model',600,500,10)


In [None]:
# Build per-user sequences of the `product_category` column and train a 64-dimensional embedding on them.
# The positional 600 and 500 bind to seq_len and sentence_len, which w2v never reads;
# the final 10 is the Word2Vec window.
# NOTE(review): the model path is an absolute, machine-specific location.
product_category_seq_df =  w2v(total_merged_df,'user_id','product_category','total',64,'/data/workspace/kimi/tencent_ads/2020/kimi/model/product_category_emb.model',600,500,10)

In [7]:


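# Disabled: the output shown below this cell comes from an earlier run of this call.
#industry_seq_df =  w2v(total_merged_df,'user_id','industry','total',64,'/data/workspace/kimi/tencent_ads/2020/kimi/model/industry_emb.model',600,500,10)
# NOTE: industy_seq_path is not defined in the visible notebook; w2v already pickles its result.
#industry_seq_df.to_pickle(industy_seq_path)
#print(industry_seq_df)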

w2v: user_id industry /data/workspace/kimi/tencent_ads/2020/kimi/model/industry_emb.model
build...


100%|██████████| 63668283/63668283 [01:39<00:00, 641434.63it/s]


[['40', '326', '277', '322', '6', '164', '319', '28', '73', '319', '319', '54', '297', '107', '73', '302', '6', '6', '6', '319', '291', '322', '6', '6', '238', '322', '291', '322', '319', '6', '54', '322', '6', '291', '6', '36', '54', '326', '291', '302', '28', '6', '320', '317', '288', '322', '6', '6', '326', '6', '6', '322', '246', '205', '54', '6', '6', '6', '322'], ['322', '322', '319', '6', '6', '319', '334', '322', '202', '319', '6', '6', '88', '297', '241', '322', '334', '29', '29', '319', '6', '47', '200', '200', '319', '25', '47', '230', '319', '29', '293', '238', '319', '238', '322', '6', '238', '6', '322', '238', '6', '242', '319', '319', '238', '319', '300', '291', '6', '241', '319', '300', '277', '6', '322', '6', '319', '161', '334', '28', '300', '319', '300', '13', '319', '322', '319', '290', '289', '6', '13', '322', '319', '238', '319', '6', '321', '230', '6', '317', '-1', '6', '6', '317', '322', '319', '322'], ['322', '319', '27', '322', '73', '322', '322', '322', '322'

In [8]:
#product_id_seq_df = w2v(total_merged_df,'user_id','product_id','total',64,'/data/workspace/kimi/tencent_ads/2020/kimi/model/product_id_emb.model',500,500,10)


w2v: user_id product_id /data/workspace/kimi/tencent_ads/2020/kimi/model/product_id_emb.model
build...


100%|██████████| 63668283/63668283 [01:49<00:00, 582154.16it/s]


[['-1', '-1', '11853', '1828', '1261', '-1', '1769', '-1', '34138', '1769', '1769', '-1', '-1', '-1', '34138', '-1', '-1', '1261', '1261', '-1', '-1', '1674', '-1', '1031', '1862', '-1', '-1', '1243', '-1', '-1', '-1', '2258', '1031', '-1', '1261', '-1', '-1', '-1', '-1', '-1', '-1', '1261', '164', '1033', '-1', '1674', '1261', '-1', '-1', '1261', '1261', '1674', '2400', '-1', '-1', '-1', '1261', '-1', '1674'], ['1927', '-1', '1769', '-1', '-1', '23', '-1', '-1', '-1', '23', '-1', '-1', '-1', '-1', '1367', '1674', '-1', '1523', '1523', '23', '-1', '-1', '-1', '-1', '23', '-1', '-1', '-1', '23', '1523', '-1', '1862', '-1', '1862', '-1', '1261', '1862', '1261', '-1', '1862', '1567', '1064', '23', '1769', '1862', '23', '-1', '-1', '1261', '1367', '1769', '-1', '11853', '1261', '-1', '-1', '1224', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '23', '1873', '1036', '32171', '-1', '-1', '-1', '1674', '23', '2031', '23', '-1', '23', '-1', '1261', '107', '39144', '-1', '-1', '1249', '2103', '23', 

In [9]:
#advertiser_id_seq_df = w2v(total_merged_df,'user_id','advertiser_id','total',64,'/data/workspace/kimi/tencent_ads/2020/kimi/model/advertiser_id_emb.model',400,500,10)


w2v: user_id advertiser_id /data/workspace/kimi/tencent_ads/2020/kimi/model/advertiser_id_emb.model
build...


100%|██████████| 63668283/63668283 [01:46<00:00, 600482.00it/s]


[['34137', '10986', '30278', '25217', '10985', '8200', '13792', '21602', '11985', '13792', '13792', '22237', '10989', '10989', '11985', '21306', '13732', '7115', '10988', '24894', '10989', '13559', '13732', '24850', '31183', '7273', '10989', '34656', '24894', '10986', '22237', '34813', '24850', '10989', '10988', '33540', '10372', '10989', '10986', '21306', '6762', '918', '41149', '17283', '6974', '18129', '918', '13732', '10989', '918', '10988', '23059', '51496', '14682', '8371', '14681', '918', '13732', '18023'], ['15784', '7273', '13792', '9058', '9058', '9065', '10973', '10699', '10989', '9065', '9058', '9058', '14681', '14681', '18234', '18355', '10970', '25', '25', '26775', '9058', '25260', '953', '953', '26775', '953', '25260', '38162', '15479', '25', '10986', '19451', '24894', '19451', '7212', '10985', '28444', '7115', '7212', '31183', '36282', '39230', '26775', '13792', '28784', '26775', '6968', '10989', '10988', '18234', '13792', '8279', '30278', '10985', '7212', '13732', '110

In [10]:
#ad_id_seq_df = w2v(total_merged_df,'user_id','ad_id','total',64,'/data/workspace/kimi/tencent_ads/2020/kimi/model/ad_id_emb.model',400,500,10)


In [11]:
#creative_id_seq_df = w2v(total_merged_df,'user_id','creative_id','total',64,'/data/workspace/kimi/tencent_ads/2020/kimi/model/creative_id_emb.model',400,500,10)
