In [12]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from  collections import Counter
from mittens import GloVe


# Reproducibility: seed Python's and NumPy's global random generators with the same value.
random.seed(2019)
np.random.seed(2019)

# Keep DataFrame echoes compact: few rows, many columns, wide lines.
pd.set_option(
    'display.max_rows', 6,
    'display.max_columns', 500,
    'display.width', 280,
    'display.max_colwidth', 150,
)

# Locations used by the cells below.
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [3]:
def get_merged_log(flag):
    """Load the pickled merged log for one split, print it, and return it.

    Args:
        flag: Prefix of the pickle file name; the file read is
            ``<preprocess_path>/<flag>_merged_log.pkl`` where
            ``preprocess_path`` is the notebook-level setting.

    Returns:
        Whatever ``pd.read_pickle`` yields for that file.
    """
    pickle_path = '/'.join((preprocess_path, f'{flag}_merged_log.pkl'))
    # NOTE(review): unpickling runs arbitrary code; only load files this project produced.
    frame = pd.read_pickle(pickle_path)
    print(frame)
    return frame
# Load both splits, stack them, and order the combined log by the `time` column.
train_merged_log_df, test_merged_log_df = (
    get_merged_log(split) for split in ('train', 'test')
)
total_merged_df = pd.concat((train_merged_log_df, test_merged_log_df)).sort_values('time')
print(total_merged_df)

# The per-split frames are no longer needed once concatenated; release them.
del train_merged_log_df, test_merged_log_df
gc.collect()

          time   user_id  creative_id  click_times      ad_id  product_id  product_category  advertiser_id  industry
0          9.0   30920.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0
1         15.0  320815.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0
2         11.0  355089.0     567330.0          1.0   504423.0     30673.0               3.0        32638.0     319.0
...        ...       ...          ...          ...        ...         ...               ...            ...       ...
30082768  76.0  309198.0    3686554.0          2.0  3172440.0      3979.0               2.0        52506.0     247.0
30082769  50.0  309198.0    2386973.0          1.0  2057811.0      1946.0               2.0        17757.0     259.0
30082770  12.0   30920.0     717026.0          1.0   634512.0        -1.0              18.0        26466.0     126.0

[30082771 rows x 9 columns]
          time  user_id  creative_i

14

In [20]:
def countCOOC(cooccurrence, window, coreIndex,vocab_dic):
    """Add one window's worth of counts to the co-occurrence matrix.

    For every position in ``window`` other than ``coreIndex``, the cell
    ``[vocab_dic[center]][vocab_dic[neighbour]]`` is incremented by one.

    Args:
        cooccurrence: Current co-occurrence matrix; updated in place.
        window: Tokens inside the current sliding window.
        coreIndex: Position of the center token within ``window``.
        vocab_dic: Mapping from token to its row/column index.

    Returns:
        The same ``cooccurrence`` object that was passed in.
    """
    for position, neighbour in enumerate(window):
        if position != coreIndex:
            # Center lookup stays inside the loop so a window holding only
            # the center never touches vocab_dic.
            row = vocab_dic[window[coreIndex]]
            col = vocab_dic[neighbour]
            cooccurrence[row][col] += 1
    return cooccurrence

def get_cooccurrence(sentences,vocab_size,vocab_dic,window=10,varbose=1):
    """Count token co-occurrences within a symmetric sliding window.

    Every token of every sentence is used as a center once. For each center,
    the tokens at most ``window`` positions to its left or right (clipped at
    the sentence boundaries) each add 1 to
    ``cooccurrence[vocab_dic[center]][vocab_dic[neighbour]]``.

    Args:
        sentences: Iterable of token sequences; every token must be a key of
            ``vocab_dic``.
        vocab_size: Side length of the square matrix; must exceed the largest
            index stored in ``vocab_dic``.
        vocab_dic: Mapping from token to row/column index.
        window: Number of neighbours considered on each side of the center.
        varbose: When >= 1, print progress after every 1000 sentences.
            (The parameter name is kept as-is for existing callers.)

    Returns:
        A dense ``(vocab_size, vocab_size)`` int64 ``numpy`` array of counts.

    Raises:
        KeyError: If a sentence contains a token missing from ``vocab_dic``.
    """
    # Dense allocation: memory grows with vocab_size ** 2.
    cooccurrence = np.zeros((vocab_size, vocab_size), "int64")
    # The notebook imports the class (`from datetime import datetime`),
    # so the clock is `datetime.now()`.
    start_time = datetime.now()
    for done, sentence in enumerate(sentences, start=1):
        # Resolve tokens to indices once per sentence rather than once per pair.
        token_ids = [vocab_dic[token] for token in sentence]
        for center_index, center_id in enumerate(token_ids):
            left = max(0, center_index - window)
            right = min(len(token_ids), center_index + window + 1)
            row = cooccurrence[center_id]  # view: updates land in the matrix
            for neighbour_index in range(left, right):
                if neighbour_index != center_index:
                    row[token_ids[neighbour_index]] += 1
        if done % 1000 == 0 and varbose >= 1:
            print("已经计算了%s条数据，用时%s" % (done, datetime.now() - start_time))
    return cooccurrence

In [18]:
def glove(log,pivot,f,flag,L,model_path,seq_len=200,sentence_len=100,window=5):
    """Train GloVe embeddings for the values of column ``f``, grouped by ``pivot``.

    One "sentence" is built per distinct ``pivot`` value: the ``f`` values of
    that pivot's rows, in the row order of ``log``, each rendered as
    ``str(int(value))``. A co-occurrence matrix over those sentences is passed
    to ``GloVe.fit`` and the resulting embedding is saved and returned.

    Args:
        log: Exposure/click log DataFrame. It is only read, not modified.
        pivot: Column whose values group rows into sentences (e.g. a user id).
        f: Column whose values are the tokens to embed.
        flag: Accepted for call compatibility; not used here.
        L: Embedding dimension.
        model_path: Path prefix for saved artifacts.
        seq_len: Accepted for call compatibility; not used here.
        sentence_len: Accepted for call compatibility; not used here.
        window: Co-occurrence window size on each side of the center token.

    Returns:
        The value returned by ``GloVe.fit`` for the co-occurrence matrix.
        Row ``i`` of the co-occurrence matrix corresponds to the token with
        ``vocab_dic`` index ``i``; index 0 is reserved and never assigned.
    """
    print("glove:",pivot,f,model_path)
    # Build the vocabulary. Keys use the same str(int(...)) rendering as the
    # sentence tokens below so that lookups in get_cooccurrence succeed.
    vocab = log[f].unique()
    vocab_dic = {str(int(word)): index + 1 for index, word in enumerate(vocab)}
    vocab_size = len(vocab) + 1  # +1 because index 0 is reserved
    print(f'vocab_size:{vocab_size}')

    # Group tokens by pivot value, preserving the row order of `log`.
    dic = {}
    print('build...')
    for key, value in tqdm(log[[pivot, f]].values, total=len(log)):
        dic.setdefault(key, []).append(str(int(value)))

    sentence = list(dic.values())
    print(sentence[:5])
    print(len(sentence))

    print('shuffle...')
    random.shuffle(sentence)

    # NOTE(review): get_cooccurrence allocates a dense vocab_size x vocab_size
    # int64 matrix. For a vocabulary in the millions (the recorded run printed
    # 3027361 for ad_id) that is tens of terabytes; restrict the vocabulary or
    # switch to a sparse pipeline before running on such columns.
    print('cooccurrence ...')
    cooccurrence = get_cooccurrence(sentence, vocab_size, vocab_dic, window=window)

    # Train the GloVe model on the co-occurrence counts.
    print('training...')
    model = GloVe(n=L, max_iter=1000)
    print(model)
    emb = model.fit(cooccurrence)

    # Persist the embedding first so a failure below cannot lose the result.
    # np.save appends ".npy" to the given path.
    np.save(model_path + f'np_{L}', emb)
    # NOTE(review): a save() method on mittens' GloVe could not be confirmed;
    # call it only when this build provides one instead of crashing after training.
    save_model = getattr(model, 'save', None)
    if callable(save_model):
        save_model(model_path + f'_{L}')
    print(model)
    print(emb)
    return emb
    


In [None]:
# Train 64-dimensional GloVe embeddings of `ad_id`, one sentence per `user_id`.
f = 'ad_id'
time_seq_df = glove(
    total_merged_df,
    pivot='user_id',
    f=f,
    flag='total',
    L=64,
    model_path=f'/data/workspace/kimi/tencent_ads/2020/kimi/model/{f}_glove.model',
    seq_len=600,
    sentence_len=500,
    window=10,
)


glove: user_id ad_id /data/workspace/kimi/tencent_ads/2020/kimi/model/ad_id_glove.model
3027361
build...


HBox(children=(IntProgress(value=0, max=63668283), HTML(value='')))

In [9]:


#industy_seq_df =  w2v(total_merged_df,'user_id','industry','total',64,'/data/workspace/kimi/tencent_ads/2020/kimi/model/industry_emb.model',600,500,10)
#industy_seq_df.to_pickle(industy_seq_path)
#print(industy_seq_df)

In [10]:
#product_id_seq_df = w2v(total_merged_df,'user_id','product_id','total',64,'/data/workspace/kimi/tencent_ads/2020/kimi/model/product_id_emb.model',500,500,10)


In [11]:
#advertiser_id_seq_df = w2v(total_merged_df,'user_id','advertiser_id','total',64,'/data/workspace/kimi/tencent_ads/2020/kimi/model/advertiser_id_emb.model',400,500,10)


In [12]:
#ad_id_seq_df = w2v(total_merged_df,'user_id','ad_id','total',64,'/data/workspace/kimi/tencent_ads/2020/kimi/model/ad_id_emb.model',400,500,10)


In [13]:
#creative_id_seq_df = w2v(total_merged_df,'user_id','creative_id','total',64,'/data/workspace/kimi/tencent_ads/2020/kimi/model/creative_id_emb.model',400,500,10)
