In [1]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from  collections import Counter

np.random.seed(2019)
random.seed(2019)
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 280)
pd.set_option('display.max_colwidth', 150)
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [2]:
total_merged= 'total_merged.pkl'
total_merged_path = f'{preprocess_path}/{total_merged}'
total_merged_df = pd.read_pickle(total_merged_path)
print(total_merged_df)


          time  user_id  creative_id  click_times    ad_id  product_id  product_category  advertiser_id  industry
32216896     1  3969503       146454            1   131508      -34137                18          34137        40
25196907     1     2267       249047            1   223979        1927                 2          15784       322
13936212     1   512898       168451            1   150988        1243                 2          29966       322
...        ...      ...          ...          ...      ...         ...               ...            ...       ...
58813490    91  3773900      3680751            1  3167517       42718                17          10259        -1
23168403    91   498277      3877769            1  3331468       26858                 3             23        60
13125865    91   141441      4329985            1  3712996       39503                17          13328        -1

[63668283 rows x 9 columns]


In [3]:
def w2v(log,pivot,f,flag,L,model_path,seq_len=200,sentence_len=100,window=5):
    #word2vec算法
    #log为曝光日志，以pivot为主键，f为embedding的对象，flag为dev或test，L是embedding的维度
    print("w2v:",pivot,f,model_path)
    #构造文档
    sentence=[]
    dic={}
    day=0
    log['day'] = log['time']
    print('build...')
    for item in tqdm(log[['day',pivot,f]].values,total=len(log)):
        try:
            user_values = dic[item[1]]
            user_list = user_values[0] + [str(int(item[2]))]
            user_values[1].update([str(int(item[2]))])
            dic[item[1]] = [user_list,user_values[1],user_values[2] +1]
        except:
            c=  Counter([str(int(item[2]))])
            dic[item[1]]=[[str(int(item[2]))],c,1]

    for key in dic:
        if len(dic[key][0]) < seq_len:
            sentence.append(dic[key][0])
    print(f"sentence counter:{c}")
    print(len(sentence))
    #训练Word2Vec模型
    print('shuffle...')
    random.shuffle(sentence)
    print('training...')
    model = Word2Vec(sentence, size=L, window=window, min_count=1, workers=10,iter=10)
    print(model_path)
    model.save(model_path+f'_{L}')
    del sentence
    print(model)
    
    ret = []
    for key in dic:
         ret.append([key,dic[key][0]])
    print(ret[:3])
    cols = ['user_id'] + [f'{f}_seq']
    ret_df = pd.DataFrame(ret,columns=cols)
    
    del model
    del dic
    del ret
    return ret_df
    


In [4]:


industy_seq_df =  w2v(total_merged_df,'user_id','industry','train',32,'/data/workspace/kimi/tencent_ads/2020/kimi/industry_emb_model',600,500,10)
#industy_seq_df.to_pickle(industy_seq_path)
#print(industy_seq_df)

w2v: user_id industry /data/workspace/kimi/tencent_ads/2020/kimi/industry_emb_model
build...


100%|██████████| 63668283/63668283 [15:48<00:00, 67136.87it/s]  


sentence counter:Counter({'-1': 13})
1899857
shuffle...
training...
/data/workspace/kimi/tencent_ads/2020/kimi/industry_emb_model
Word2Vec(vocab=332, size=32, alpha=0.025)
[[3969503, ['40', '326', '277', '322', '6', '164', '319', '28', '73', '319', '319', '54', '297', '107', '73', '302', '6', '6', '6', '319', '291', '322', '6', '6', '238', '322', '291', '322', '319', '6', '54', '322', '6', '291', '6', '36', '54', '326', '291', '302', '28', '6', '320', '317', '288', '322', '6', '6', '326', '6', '6', '322', '246', '205', '54', '6', '6', '6', '322']], [2267, ['322', '322', '319', '6', '6', '319', '334', '322', '202', '319', '6', '6', '88', '297', '241', '322', '334', '29', '29', '319', '6', '47', '200', '200', '319', '25', '47', '230', '319', '29', '293', '238', '319', '238', '322', '6', '238', '6', '322', '238', '6', '242', '319', '319', '238', '319', '300', '291', '6', '241', '319', '300', '277', '6', '322', '6', '319', '161', '334', '28', '300', '319', '300', '13', '319', '322', '319',

In [9]:
product_id_seq_df = w2v(total_merged_df,'user_id','product_id','train',64,'/data/workspace/kimi/tencent_ads/2020/kimi/product_id_emb_model',500,500,10)
# product_id_seq= 'product_id_seq.pkl'
# product_id_seq_path = f'{preprocess_path}/{product_id_seq}'
# product_id_seq_df.to_pickle(product_id_seq_path)

w2v: user_id product_id /data/workspace/kimi/tencent_ads/2020/kimi/product_id_emb_model
build...


100%|██████████| 63668283/63668283 [15:49<00:00, 67072.25it/s]  


sentence counter:Counter({11: 93286, 12: 87214, 13: 81029, 10: 77471, 14: 76229, 15: 70573, 16: 66238, 17: 62740, 18: 57944, 19: 55232, 20: 52154, 21: 48979, 22: 46991, 23: 43486, 24: 41647, 25: 39427, 26: 37492, 27: 35835, 28: 34030, 29: 32371, 30: 30529, 31: 29284, 32: 27737, 33: 26608, 34: 25306, 35: 23813, 36: 22840, 9: 22055, 37: 21589, 38: 20817, 39: 19824, 40: 18832, 41: 17805, 42: 16972, 43: 16152, 44: 15485, 45: 14859, 46: 14021, 47: 13440, 48: 12881, 49: 12346, 50: 11843, 51: 10976, 52: 10637, 53: 10448, 54: 9820, 55: 9333, 56: 9043, 57: 8753, 58: 8157, 59: 8077, 60: 7668, 61: 7403, 62: 7140, 63: 6954, 64: 6448, 65: 6262, 66: 6033, 67: 5835, 68: 5552, 69: 5301, 70: 5197, 71: 4957, 72: 4824, 8: 4776, 74: 4573, 73: 4404, 75: 4318, 76: 4079, 77: 3945, 78: 3828, 79: 3707, 80: 3503, 81: 3424, 82: 3370, 83: 3307, 84: 3072, 85: 3062, 86: 2866, 87: 2828, 88: 2781, 89: 2672, 90: 2539, 91: 2440, 92: 2437, 94: 2303, 93: 2298, 95: 2197, 96: 2041, 98: 2003, 97: 1999, 100: 1890, 101: 1863,

In [5]:
advertiser_id_seq_df = w2v(total_merged_df,'user_id','advertiser_id','train',64,'/data/workspace/kimi/tencent_ads/2020/kimi/advertiser_id_emb_model',400,500,10)
# advertiser_id_seq= 'advertiser_id_seq.pkl'
# advertiser_id_seq_path = f'{preprocess_path}/{advertiser_id_seq}'
# advertiser_id_seq_df.to_pickle(advertiser_id_seq_path)

w2v: user_id advertiser_id /data/workspace/kimi/tencent_ads/2020/kimi/advertiser_id_emb_model
build...


100%|██████████| 63668283/63668283 [16:09<00:00, 65672.29it/s]  


sentence counter:Counter({11: 93286, 12: 87214, 13: 81029, 10: 77471, 14: 76229, 15: 70573, 16: 66238, 17: 62740, 18: 57944, 19: 55232, 20: 52154, 21: 48979, 22: 46991, 23: 43486, 24: 41647, 25: 39427, 26: 37492, 27: 35835, 28: 34030, 29: 32371, 30: 30529, 31: 29284, 32: 27737, 33: 26608, 34: 25306, 35: 23813, 36: 22840, 9: 22055, 37: 21589, 38: 20817, 39: 19824, 40: 18832, 41: 17805, 42: 16972, 43: 16152, 44: 15485, 45: 14859, 46: 14021, 47: 13440, 48: 12881, 49: 12346, 50: 11843, 51: 10976, 52: 10637, 53: 10448, 54: 9820, 55: 9333, 56: 9043, 57: 8753, 58: 8157, 59: 8077, 60: 7668, 61: 7403, 62: 7140, 63: 6954, 64: 6448, 65: 6262, 66: 6033, 67: 5835, 68: 5552, 69: 5301, 70: 5197, 71: 4957, 72: 4824, 8: 4776, 74: 4573, 73: 4404, 75: 4318, 76: 4079, 77: 3945, 78: 3828, 79: 3707, 80: 3503, 81: 3424, 82: 3370, 83: 3307, 84: 3072, 85: 3062, 86: 2866, 87: 2828, 88: 2781, 89: 2672, 90: 2539, 91: 2440, 92: 2437, 94: 2303, 93: 2298, 95: 2197, 96: 2041, 98: 2003, 97: 1999, 100: 1890, 101: 1863,

In [7]:
user_total= 'user_total_df.pkl'
user_total_path = f'{preprocess_path}/{user_total}'
user_total_df = pd.read_pickle(user_total_path)

In [9]:
total_final = 'total_final.pkl'
total_final_path = f'{preprocess_path}/{total_final}'


total_df =  user_total_df.merge(advertiser_id_seq_df,on='user_id')
total_df  = total_df.merge(industy_seq_df,on='user_id')
total_df.to_pickle(total_final_path)
print(total_df)

         user_id  active_days  click_times_total                                                                                                                                      advertiser_id_seq  \
0              1           10                 14                                                                    [7293, 9702, 29455, 14668, 11411, 14681, 17189, 367, 44865, 188, 23575, 188, 10988]   
1              2           28                 46  [22885, 10686, 18562, 25932, 22885, 34505, 768, 26006, 918, 34503, 17284, 13732, 20033, 27843, 25260, 27843, 42272, 26006, 38785, 2302, 42272, 149...   
2              3           23                 30  [32974, 9877, 18492, 14186, 17018, 9058, 8371, 2336, 39500, 45169, 14289, 13119, 36384, 48608, 16764, 9970, 15351, 14498, 55248, 16895, 52263, 918...   
...          ...          ...                ...                                                                                                                                            