In [1]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from  collections import Counter

# Seed the stdlib and NumPy global RNGs with the same fixed value.
random.seed(2019)
np.random.seed(2019)

# pandas display settings, applied in one call (set_option accepts
# alternating option-name / value pairs).
pd.set_option(
    'display.max_rows', 6,
    'display.max_columns', 500,
    'display.width', 280,
    'display.max_colwidth', 150,
)

# Filesystem locations used by the notebook.
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [2]:
# Name of the pickled DataFrame and its location under `preprocess_path`.
total_merged = 'total_merged.pkl'
total_merged_path = '/'.join((preprocess_path, total_merged))

# Load the frame and print its (truncated) text representation.
total_merged_df = pd.read_pickle(total_merged_path)
print(total_merged_df)


          time  user_id  creative_id  click_times    ad_id  product_id  product_category  advertiser_id  industry
32216896     1  3969503       146454            1   131508      -34137                18          34137        40
25196907     1     2267       249047            1   223979        1927                 2          15784       322
13936212     1   512898       168451            1   150988        1243                 2          29966       322
...        ...      ...          ...          ...      ...         ...               ...            ...       ...
58813490    91  3773900      3680751            1  3167517       42718                17          10259        -1
23168403    91   498277      3877769            1  3331468       26858                 3             23        60
13125865    91   141441      4329985            1  3712996       39503                17          13328        -1

[63668283 rows x 9 columns]


In [5]:

def _build_bipartite_graph(log, f1, f2):
    """Build the user/item adjacency map from the (f1, f2) pairs in `log`.

    Every row contributes an undirected edge between the node
    ``'user_<int(f1)>'`` and the node ``'item_<int(f2)>'``. Rows whose ids
    cannot be converted to ``int`` are skipped.

    Returns:
        dict mapping each node name to a sorted list of its neighbours.
    """
    graph = {}
    for left, right in tqdm(log[[f1, f2]].values, total=len(log)):
        try:
            item_node = 'item_' + str(int(right))
            user_node = 'user_' + str(int(left))
        except (ValueError, TypeError, OverflowError):
            # NaN / None / inf ids cannot become node names; skip the row.
            continue
        graph.setdefault(item_node, set()).add(user_node)
        graph.setdefault(user_node, set()).add(item_node)
    # Iteration order of a set of strings depends on the interpreter's hash
    # seed, so sort the neighbours to make the walks repeatable under
    # `random.seed(...)`. Values are replaced in place to avoid a second dict.
    for node in graph:
        graph[node] = sorted(graph[node])
    return graph


def _random_walks(graph, path_length=10):
    """Generate one random walk per node of `graph`.

    A walk starts at the node, repeatedly samples a uniformly random
    neighbour of its last node, and ends either when it holds `path_length`
    nodes or when the sampled neighbour is the node visited two steps
    earlier (i.e. the walk would immediately step back).
    """
    walks = []
    for start in tqdm(graph, total=len(graph)):
        walk = [start]
        while len(walk) != path_length:
            candidate = random.choice(graph[walk[-1]])
            if len(walk) >= 2 and candidate == walk[-2]:
                break
            walk.append(candidate)
        walks.append(walk)
    return walks


def _export_embeddings(model, log, column, prefix, f1, f2, flag, L, out_dir):
    """Collect the vectors of every distinct `log[column]` id and pickle them.

    The frame has the id in a column named `column` followed by `L` embedding
    columns; ids that are not integers or are missing from the model's
    vocabulary are left out.
    """
    columns = [column] + [
        f1 + '_' + f2 + '_' + column + '_deepwalk_embedding_' + str(L) + '_' + str(i)
        for i in range(L)
    ]
    rows = []
    for value in set(log[column].values):
        try:
            node_id = int(value)
        except (ValueError, TypeError, OverflowError):
            continue
        try:
            # `model.wv[...]` replaces the deprecated `model[...]` lookup.
            vector = model.wv[prefix + str(node_id)]
        except KeyError:
            # Node never entered the graph (its row was skipped).
            continue
        rows.append([node_id, *vector])
    # Passing `columns=` here keeps an empty result a valid (0-row) frame
    # instead of failing on a column-count mismatch.
    out_df = pd.DataFrame(rows, columns=columns)
    print(out_df.head())
    file_name = f1 + '_' + f2 + '_' + column + '_' + flag + '_deepwalk_' + str(L) + '.pkl'
    out_df.to_pickle(os.path.join(out_dir, file_name))


def deepwalk(log,f1,f2,flag,L,out_dir='preprocess_path'):
    """Train DeepWalk embeddings on the bipartite graph linking `f1` and `f2`.

    Steps: build a graph whose nodes are the `f1` values (prefixed
    ``user_``) and the `f2` values (prefixed ``item_``), generate one random
    walk of at most 10 nodes per node, train a skip-gram Word2Vec model on
    the shuffled walks, save the model as ``deepwalk_{f2}_{L}`` in the
    current directory, and write one pickle of embeddings per column into
    `out_dir`.

    Args:
        log: DataFrame containing the columns `f1` and `f2`.
        f1: name of the column whose values become ``user_`` nodes.
        f2: name of the column whose values become ``item_`` nodes.
        flag: tag inserted into the output pickle file names.
        L: embedding dimensionality.
        out_dir: directory for the two output pickles; created if missing.
            NOTE(review): the default keeps the original literal directory
            name 'preprocess_path', while the notebook also defines a
            `preprocess_path` variable ('preprocess') -- confirm which one
            was intended and pass `out_dir` explicitly if needed.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    print("deepwalk:",f1,f2)
    # Create the output directory before the expensive work so a missing
    # directory cannot make the final write fail after training.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Build the graph.
    graph = _build_bipartite_graph(log, f1, f2)

    # Build the walks.
    print("creating")
    sentences = _random_walks(graph, path_length=10)
    print(np.mean([len(walk) for walk in sentences]))
    print(len(sentences))

    # Train the DeepWalk (skip-gram) model.
    print('training...')
    random.shuffle(sentences)
    model = Word2Vec(sentences, size=L, window=4,min_count=1,sg=1, workers=10,iter=20)

    # Output.
    print('outputing...')
    model.save(f'deepwalk_{f2}_{L}')
    _export_embeddings(model, log, f1, 'user_', f1, f2, flag, L, out_dir)
    _export_embeddings(model, log, f2, 'item_', f1, f2, flag, L, out_dir)

In [None]:
# Train 64-dimensional DeepWalk embeddings on the user_id / advertiser_id graph.
deepwalk(
    log=total_merged_df,
    f1='user_id',
    f2='advertiser_id',
    flag='train',
    L=64,
)

deepwalk: user_id advertiser_id



  0%|          | 0/63668283 [00:00<?, ?it/s][A
  0%|          | 22367/63668283 [00:00<04:44, 223663.81it/s][A
  0%|          | 44968/63668283 [00:00<04:43, 224362.44it/s][A
  0%|          | 56446/63668283 [00:04<2:00:53, 8769.22it/s][A
  0%|          | 78616/63668283 [00:04<1:26:02, 12318.62it/s][A
  0%|          | 100764/63668283 [00:04<1:01:38, 17188.31it/s][A
  0%|          | 123883/63668283 [00:04<44:30, 23796.48it/s]  [A
  0%|          | 146365/63668283 [00:04<32:33, 32519.76it/s][A
  0%|          | 168904/63668283 [00:04<24:11, 43751.38it/s][A
  0%|          | 190912/63668283 [00:05<18:22, 57594.77it/s][A
  0%|          | 212243/63668283 [00:05<14:20, 73744.65it/s][A
  0%|          | 234878/63668283 [00:05<11:26, 92441.86it/s][A
  0%|          | 257476/63668283 [00:05<09:24, 112361.06it/s][A
  0%|          | 280211/63668283 [00:05<07:58, 132458.92it/s][A
  0%|          | 303086/63668283 [00:05<06:57, 151603.30it/s][A
  1%|          | 325812/63668283 [00:05<06:16, 

  5%|▍         | 2941375/63668283 [00:21<04:18, 234543.50it/s][A
  5%|▍         | 2965432/63668283 [00:21<04:16, 236318.91it/s][A
  5%|▍         | 2989140/63668283 [00:21<04:16, 236125.01it/s][A
  5%|▍         | 3012917/63668283 [00:21<04:16, 236613.82it/s][A
  5%|▍         | 3036616/63668283 [00:21<04:17, 235896.90it/s][A
  5%|▍         | 3060519/63668283 [00:21<04:15, 236826.73it/s][A
  5%|▍         | 3084316/63668283 [00:21<04:15, 237166.93it/s][A
  5%|▍         | 3108607/63668283 [00:21<04:13, 238860.03it/s][A
  5%|▍         | 3132540/63668283 [00:21<04:13, 238999.18it/s][A
  5%|▍         | 3156449/63668283 [00:22<04:13, 238940.21it/s][A
  5%|▍         | 3180350/63668283 [00:22<04:14, 237735.97it/s][A
  5%|▌         | 3204130/63668283 [00:22<04:14, 237545.62it/s][A
  5%|▌         | 3227889/63668283 [00:22<04:15, 236685.07it/s][A
  5%|▌         | 3251812/63668283 [00:22<04:14, 237440.55it/s][A
  5%|▌         | 3275560/63668283 [00:22<04:16, 235178.48it/s][A
  5%|▌    

  9%|▉         | 5863098/63668283 [00:38<04:37, 208475.98it/s][A
  9%|▉         | 5886093/63668283 [00:38<04:29, 214484.84it/s][A
  9%|▉         | 5910009/63668283 [00:39<04:20, 221333.54it/s][A
  9%|▉         | 5933881/63668283 [00:39<04:15, 226277.37it/s][A
  9%|▉         | 5957568/63668283 [00:39<04:11, 229352.65it/s][A
  9%|▉         | 5981448/63668283 [00:39<04:08, 232105.23it/s][A
  9%|▉         | 6005497/63668283 [00:39<04:05, 234557.87it/s][A
  9%|▉         | 6029449/63668283 [00:39<04:04, 236024.19it/s][A
 10%|▉         | 6053214/63668283 [00:39<04:03, 236508.14it/s][A
 10%|▉         | 6077085/63668283 [00:39<04:02, 237162.47it/s][A
 10%|▉         | 6100927/63668283 [00:39<04:02, 237536.37it/s][A
 10%|▉         | 6124838/63668283 [00:39<04:01, 238004.14it/s][A
 10%|▉         | 6148655/63668283 [00:40<04:05, 234772.00it/s][A
 10%|▉         | 6172297/63668283 [00:40<04:04, 235263.04it/s][A
 10%|▉         | 6196631/63668283 [00:40<04:01, 237628.67it/s][A
 10%|▉    

 14%|█▍        | 8848017/63668283 [00:51<03:49, 239221.10it/s][A
 14%|█▍        | 8871943/63668283 [00:51<03:49, 238810.98it/s][A
 14%|█▍        | 8896334/63668283 [00:51<03:47, 240315.86it/s][A
 14%|█▍        | 8920370/63668283 [00:51<03:48, 239653.71it/s][A
 14%|█▍        | 8945020/63668283 [00:51<03:46, 241666.38it/s][A
 14%|█▍        | 8969193/63668283 [00:51<03:47, 240147.81it/s][A
 14%|█▍        | 8993644/63668283 [00:51<03:46, 241438.91it/s][A
 14%|█▍        | 9017795/63668283 [00:52<03:47, 240573.80it/s][A
 14%|█▍        | 9042146/63668283 [00:52<03:46, 241444.19it/s][A
 14%|█▍        | 9066295/63668283 [00:52<03:46, 240611.04it/s][A
 14%|█▍        | 9090758/63668283 [00:52<03:45, 241800.81it/s][A
 14%|█▍        | 9114943/63668283 [00:52<03:46, 241238.03it/s][A
 14%|█▍        | 9139355/63668283 [00:52<03:45, 242095.18it/s][A
 14%|█▍        | 9163568/63668283 [00:52<03:48, 238689.67it/s][A
 14%|█▍        | 9187450/63668283 [00:52<03:52, 234130.37it/s][A
 14%|█▍   

 18%|█▊        | 11768855/63668283 [01:03<03:39, 236921.81it/s][A
 19%|█▊        | 11792859/63668283 [01:03<03:38, 237848.12it/s][A
 19%|█▊        | 11816648/63668283 [01:03<03:38, 236929.48it/s][A
 19%|█▊        | 11840345/63668283 [01:03<03:38, 236903.42it/s][A
 19%|█▊        | 11864038/63668283 [01:04<03:41, 234079.65it/s][A
 19%|█▊        | 11887763/63668283 [01:04<03:40, 235021.73it/s][A
 19%|█▊        | 11911273/63668283 [01:04<03:40, 234658.82it/s][A
 19%|█▊        | 11935306/63668283 [01:04<03:38, 236331.27it/s][A
 19%|█▉        | 11958946/63668283 [01:04<03:39, 235833.37it/s][A
 19%|█▉        | 11983003/63668283 [01:04<03:37, 237233.22it/s][A
 19%|█▉        | 12006732/63668283 [01:04<03:37, 237031.31it/s][A
 19%|█▉        | 12030981/63668283 [01:04<03:36, 238642.84it/s][A
 19%|█▉        | 12054851/63668283 [01:04<03:37, 237387.83it/s][A
 19%|█▉        | 12078618/63668283 [01:05<03:37, 237471.46it/s][A
 19%|█▉        | 12102369/63668283 [01:05<03:37, 237074.65it/s

 23%|██▎       | 14683682/63668283 [01:15<03:27, 236630.59it/s][A
 23%|██▎       | 14707565/63668283 [01:16<03:26, 237283.85it/s][A
 23%|██▎       | 14731296/63668283 [01:16<03:27, 236274.82it/s][A
 23%|██▎       | 14755087/63668283 [01:16<03:26, 236762.91it/s][A
 23%|██▎       | 14778766/63668283 [01:16<03:27, 235804.59it/s][A
 23%|██▎       | 14802677/63668283 [01:16<03:26, 236785.80it/s][A
 23%|██▎       | 14826359/63668283 [01:16<03:26, 236159.05it/s][A
 23%|██▎       | 14850261/63668283 [01:16<03:25, 237008.67it/s][A
 23%|██▎       | 14873965/63668283 [01:16<03:26, 236510.86it/s][A
 23%|██▎       | 14898062/63668283 [01:16<03:25, 237829.56it/s][A
 23%|██▎       | 14921848/63668283 [01:16<03:26, 235496.64it/s][A
 23%|██▎       | 14945944/63668283 [01:17<03:25, 237108.19it/s][A
 24%|██▎       | 14969663/63668283 [01:17<03:26, 236083.24it/s][A
 24%|██▎       | 14993669/63668283 [01:17<03:25, 237260.93it/s][A
 24%|██▎       | 15017401/63668283 [01:17<03:25, 236303.15it/s

 28%|██▊       | 17566291/63668283 [01:28<03:17, 233452.42it/s][A
 28%|██▊       | 17589733/63668283 [01:28<03:17, 233740.25it/s][A
 28%|██▊       | 17613109/63668283 [01:28<03:17, 233080.27it/s][A
 28%|██▊       | 17636419/63668283 [01:28<03:17, 233066.58it/s][A
 28%|██▊       | 17660136/63668283 [01:28<03:16, 234281.36it/s][A
 28%|██▊       | 17683567/63668283 [01:28<03:16, 234117.39it/s][A
 28%|██▊       | 17706981/63668283 [01:28<03:16, 233727.73it/s][A
 28%|██▊       | 17730356/63668283 [01:28<03:16, 233286.15it/s][A
 28%|██▊       | 17754025/63668283 [01:29<03:15, 234295.39it/s][A
 28%|██▊       | 17777595/63668283 [01:29<03:15, 234712.07it/s][A
 28%|██▊       | 17801199/63668283 [01:29<03:15, 235106.77it/s][A
 28%|██▊       | 17824711/63668283 [01:29<03:15, 233972.87it/s][A
 28%|██▊       | 17848111/63668283 [01:29<03:16, 233062.80it/s][A
 28%|██▊       | 17871420/63668283 [01:29<03:16, 233047.52it/s][A
 28%|██▊       | 17894904/63668283 [01:29<03:15, 233580.63it/s