In [1]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from  collections import Counter

# Seed both random number generators used in this notebook with the same value.
random.seed(2019)
np.random.seed(2019)

# pandas display settings: keep printed frames short but wide.
for option_name, option_value in {
    'display.max_rows': 6,
    'display.max_columns': 500,
    'display.width': 280,
    'display.max_colwidth': 150,
}.items():
    pd.set_option(option_name, option_value)

# Location of the raw dataset, and of the preprocessed pickles
# (the latter is relative to the working directory).
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [2]:
def get_merged_log(flag):
    """Load the pickled merged log named ``{flag}_merged_log.pkl``.

    The file is read from the directory held in the module-level
    ``preprocess_path``. The loaded frame is printed and then returned.

    Args:
        flag: prefix of the pickle file name (the notebook passes
            'train' and 'test').

    Returns:
        The object stored in the pickle (a DataFrame in this notebook).
    """
    file_name = f'{flag}_merged_log.pkl'
    # NOTE: pickles can execute code on load; only read files you produced.
    frame = pd.read_pickle(f'{preprocess_path}/{file_name}')
    print(frame)
    return frame

In [3]:
# Load the train and test logs (train first, as the concat order matters for
# the resulting index), stack them and order the rows by 'time'.
merged_logs = [get_merged_log(split_name) for split_name in ('train', 'test')]
total_merged_df = pd.concat(merged_logs).sort_values(by='time')
print(total_merged_df)

# Drop the references to the per-split frames so their memory can be reclaimed.
del merged_logs
gc.collect()

          time   user_id  creative_id  click_times      ad_id  product_id  product_category  advertiser_id  industry
0          9.0   30920.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0
1         15.0  320815.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0
2         11.0  355089.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0
...        ...       ...          ...          ...        ...         ...               ...            ...       ...
30082768  76.0  309198.0    3686554.0          2.0  3172440.0      3979.0               2.0        52506.0     247.0
30082769  50.0  309198.0    2386973.0          1.0  2057811.0      1946.0               2.0        17757.0     259.0
30082770  12.0   30920.0     717026.0          1.0   634512.0        -1.0              18.0        26466.0     126.0

[30082771 rows x 9 columns]
          time  user_id  creative_i

14

In [4]:
def w2v(log,pivot,f,flag,L,model_path,seq_len=200,sentence_len=100,window=5,sg=1,negative=5):
    """Train a Word2Vec model on per-``pivot`` sequences of ``f`` values and save it.

    The log is aggregated per (``pivot``, ``f``) pair: ``click_times`` is summed
    and, unless ``f`` is 'time', the latest ``time`` is kept. Within each pivot
    value the pairs are ordered by total clicks (descending) and then by that
    latest time (ascending). The ordered ``f`` values, converted to
    ``str(int(value))``, form one sentence per pivot value. Sentences are
    shuffled with the ``random`` module before training.

    Args:
        log: DataFrame holding at least the columns ``pivot``, ``f``,
            'click_times' and 'time'. It is not modified.
        pivot: name of the column that identifies a sentence (e.g. 'user_id').
        f: name of the column whose values become the tokens to embed.
        flag: tag embedded in the saved model's file name.
        L: embedding dimension.
        model_path: directory that receives the model file; created if missing.
        seq_len: unused; kept so existing callers keep working.
        sentence_len: unused; kept so existing callers keep working.
        window: context window size passed to Word2Vec.
        sg: passed to Word2Vec as ``sg``.
        negative: passed to Word2Vec as ``negative``.

    Returns:
        None. The model is written to
        ``{model_path}/{f}_{flag}_s{L}_w{window}_emb.model``.
    """
    print("w2v:",pivot,f,model_path)
    # Build the corpus.
    print('build...')
    if f != 'time':
        agg_spec = {'click_times':'sum','time':'max'}
        sort_columns = [pivot,'click_times','time']
        sort_ascending = [True, False, True]
    else:
        # 'time' is a grouping key here, so it cannot also be aggregated.
        agg_spec = {'click_times':'sum'}
        sort_columns = [pivot,'click_times']
        sort_ascending = [True, False]
    # Group on `pivot` (not a hard-coded 'user_id') so the frame always
    # contains the column that is read back below.
    grouped_df = (
        log.groupby([pivot, f])
        .agg(agg_spec)
        .reset_index()
        .sort_values(by=sort_columns, ascending=sort_ascending)
    )
    print(grouped_df)

    # One token list per pivot value, in the sorted order established above.
    tokens_by_pivot = {}
    pairs = zip(grouped_df[pivot].values, grouped_df[f].values)
    for key, value in tqdm(pairs, total=len(grouped_df)):
        tokens_by_pivot.setdefault(key, []).append(str(int(value)))

    sentence = list(tokens_by_pivot.values())
    print(sentence[:5])
    print(len(sentence))

    # Make sure the output directory exists before the expensive training
    # step, so a missing directory cannot cost a finished model.
    os.makedirs(model_path, exist_ok=True)

    # Train the Word2Vec model.
    print('shuffle...')
    random.shuffle(sentence)
    print('training...')
    # NOTE: `size` and `iter` are the gensim 3.x keyword names.
    model = Word2Vec(sentence, size=L, window=window, min_count=1, workers=10,sg=sg,negative=negative,iter=10)
    model_file_path = f'{model_path}/{f}_{flag}_s{L}_w{window}_emb.model'
    print(model_file_path)
    model.save(model_file_path)
    print(model)
    
#     ret = []
#     for key in dic:
#          ret.append([key,dic[key]])
#     print(ret[:20])
#     cols = ['user_id'] + [f'{f}_clk_times_seq']
#     ret_df = pd.DataFrame(ret,columns=cols)
#     output_path = f'{preprocess_path}/{f}_{flag}_s{L}_clk_times_ns_seq.pkl'
#     ret_df.to_pickle(output_path)
    
    


In [5]:
# Embedding dimension handed to w2v() for every trained model.
size = 128
# Tag that w2v() embeds in each saved model's file name.
flag = 'clk_ns_total'
# Directory handed to w2v() as the destination for the trained models.
model_dir = '/data/workspace/kimi/tencent_ads/2020/kimi/model'

In [None]:
# Columns to embed, one Word2Vec model per column. 'time' may be prepended to
# this list as well; it is left out of the current run.
embedding_fields = ['creative_id', 'ad_id','product_id','advertiser_id','product_category','industry']

for field in embedding_fields:
    w2v(total_merged_df, 'user_id', field, flag, size, model_dir, window=10)
    # Collect garbage between fields.
    gc.collect()

w2v: user_id creative_id /data/workspace/kimi/tencent_ads/2020/kimi/model
build...
            user_id  creative_id  click_times  time
9               1.0    2456345.0          2.0  73.0
11              1.0    3592928.0          2.0  76.0
3               1.0     209778.0          1.0  20.0
...             ...          ...          ...   ...
58373027  4000000.0     366858.0          1.0  76.0
58373063  4000000.0    3333680.0          1.0  76.0
58373067  4000000.0    3697105.0          1.0  77.0

[58373068 rows x 4 columns]


HBox(children=(IntProgress(value=0, max=58373068), HTML(value='')))


[['2456345', '3592928', '209778', '821396', '877468', '1683713', '122032', '71691', '1940159', '90171', '2087846', '3068256'], ['1145367', '1696925', '2085566', '15558', '63441', '155822', '39714', '609050', '13069', '441462', '1266180', '1657530', '769749', '1074235', '1252062', '1662244', '96192', '157180', '396652', '2369339', '647980', '2143574', '2489057', '2552139', '2496722', '2624965', '2108551', '2785305', '2862407', '738378', '3246898', '3443654', '2995416', '1416518', '3765766', '3766271', '3574990', '2270719', '22013', '513298', '4194247', '625944'], ['661347', '808612', '593522', '710859', '825434', '726940', '392052', '1173863', '862241', '1725184', '72533', '1014211', '2073029', '2198902', '66009', '1899318', '2081930', '2502798', '3386299', '3403177', '3255755', '599128', '3560473', '3684862', '3696597', '3491821', '2816845', '2032927', '3581615', '2556703'], ['39588', '589886', '574787', '1892854', '31070', '1230094', '1962706', '2264105', '2348342', '2189510', '26972

HBox(children=(IntProgress(value=0, max=58270928), HTML(value='')))


[['2116146', '3093461', '188507', '724607', '773445', '1458878', '109959', '66210', '1678901', '82421', '1803758', '2638858'], ['1003786', '1469873', '1801889', '16946', '58788', '139702', '38066', '541125', '14495', '392680', '1107111', '1436687', '680028', '942883', '1095314', '1440744', '87644', '140916', '354247', '2042824', '575249', '1851077', '2143842', '2197677', '2150246', '2259448', '1821340', '2394728', '2461002', '653169', '2793270', '2964754', '2576212', '1235723', '3238255', '3238685', '3078081', '1959296', '22877', '456635', '3598073', '555841'], ['586668', '713448', '527601', '629278', '728308', '643108', '350235', '1027874', '760110', '1493871', '66929', '891470', '1791274', '1898111', '60437', '1640307', '1798799', '2155455', '2914064', '2118485', '2800842', '532510', '3065742', '3171030', '3180793', '3007026', '2421620', '1757244', '3083678', '2201596'], ['37966', '524312', '511235', '1638619', '30773', '1076286', '1698206', '1953681', '2024723', '1890064', '2320452

HBox(children=(IntProgress(value=0, max=23254114), HTML(value='')))


[['-1', '64', '1261', '136', '1334', '1454'], ['-1', '129', '1064', '87', '80', '111', '1849', '107', '1400', '1261', '1268', '2031', '154', '2103', '15872', '2637', '2065', '128', '1323', '11119'], ['-1', '1674', '1786', '2258', '23', '36256', '40905', '35985', '1031', '38847', '2887', '1261', '1243', '1873', '37933', '37243', '38250'], ['-1', '154', '1261', '2065', '1862', '2625', '38743', '37758', '39422', '39904', '41265', '129', '29', '58', '1334', '39856', '40071', '43761'], ['-1', '136', '87', '1064', '2620', '2510', '1036']]
1900000
shuffle...
training...
/data/workspace/kimi/tencent_ads/2020/kimi/model/product_id_clk_ns_total_s128_w10_emb.model
Word2Vec(vocab=39057, size=128, alpha=0.025)
w2v: user_id advertiser_id /data/workspace/kimi/tencent_ads/2020/kimi/model
build...
            user_id  advertiser_id  click_times  time
0               1.0          188.0          2.0  73.0
4               1.0        10988.0          2.0  76.0
2               1.0         7293.0          1

HBox(children=(IntProgress(value=0, max=44625291), HTML(value='')))


[['188', '10988', '7293', '9702', '29455', '14668', '11411', '14681', '17189', '367', '44865', '23575'], ['42272', '22885', '27843', '26006', '10686', '57597', '17951', '33664', '18562', '25932', '768', '34505', '918', '34503', '17284', '13732', '20033', '25260', '2302', '38785', '14938', '14002', '52049', '29097', '52193', '36700', '45025', '11989', '14681', '40936', '25896', '16473', '38517', '818', '25704', '20193'], ['8371', '32974', '9877', '14186', '17018', '18492', '9058', '2336', '39500', '45169', '14289', '13119', '36384', '48608', '9970', '16764', '15351', '14498', '55248', '16895', '52263', '918', '31365', '34656', '7516', '15120', '17151', '28648'], ['8877', '14002', '19056', '19451', '7976', '13084', '12130', '811', '10172', '13299', '23664', '14682', '20033', '32762', '25896', '817', '19524', '52640', '6783', '29402', '34504', '14681', '11948', '21701', '50584', '55259'], ['2862', '13808', '13558', '11882', '992', '22885', '9706', '38760', '17745', '31552', '22918', '355

HBox(children=(IntProgress(value=0, max=7266177), HTML(value='')))


[['2', '5', '18'], ['2', '18', '8'], ['2', '18', '17', '12', '8', '4'], ['2', '17', '18', '4', '5', '7'], ['18', '2', '5', '8']]
1900000
shuffle...
training...
/data/workspace/kimi/tencent_ads/2020/kimi/model/product_category_clk_ns_total_s128_w10_emb.model
Word2Vec(vocab=18, size=128, alpha=0.025)
w2v: user_id industry /data/workspace/kimi/tencent_ads/2020/kimi/model
build...
            user_id  industry  click_times  time
8               1.0     326.0          3.0  43.0
1               1.0       6.0          3.0  76.0
7               1.0     245.0          2.0  73.0
...             ...       ...          ...   ...
24750304  4000000.0     297.0          1.0  71.0
24750287  4000000.0      36.0          1.0  75.0
24750285  4000000.0      21.0          1.0  76.0

[24750309 rows x 4 columns]


HBox(children=(IntProgress(value=0, max=24750309), HTML(value='')))


[['326', '6', '245', '106', '-1', '73', '217', '64', '238'], ['6', '47', '317', '242', '238', '36', '318', '319', '207', '5', '322', '289', '21', '250', '328'], ['322', '-1', '205', '6', '54', '319', '302', '247'], ['-1', '6', '319', '238', '317', '88', '25', '248', '295', '47'], ['6', '322', '319', '203', '54', '296', '297', '-1', '318', '288', '238', '242', '220', '28', '109', '289', '25', '8']]
1900000
shuffle...
training...
