In [1]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from  collections import Counter

np.random.seed(2019)
random.seed(2019)
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 280)
pd.set_option('display.max_colwidth', 150)
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [2]:
total_merged= 'total_merged.pkl'
total_merged_path = f'{preprocess_path}/{total_merged}'
total_merged_df = pd.read_pickle(total_merged_path)
print(total_merged_df)


          time  user_id  creative_id  click_times    ad_id  product_id  product_category  advertiser_id  industry
32216896     1  3969503       146454            1   131508      -34137                18          34137        40
25196907     1     2267       249047            1   223979        1927                 2          15784       322
13936212     1   512898       168451            1   150988        1243                 2          29966       322
...        ...      ...          ...          ...      ...         ...               ...            ...       ...
58813490    91  3773900      3680751            1  3167517       42718                17          10259        -1
23168403    91   498277      3877769            1  3331468       26858                 3             23        60
13125865    91   141441      4329985            1  3712996       39503                17          13328        -1

[63668283 rows x 9 columns]


In [3]:

def deepwalk(log,f1,f2,flag,L,window,size,name):
    #Deepwalk算法，
    print("deepwalk:",f1,f2)
    #构建图
    dic={}
    for item in tqdm(log[[f1,f2]].values,total=len(log)):
        try:
            str(int(item[1]))
            str(int(item[0]))
        except:
            continue
        try:
            dic['item_'+str(int(item[1]))].add('user_'+str(int(item[0])))
        except:
            dic['item_'+str(int(item[1]))]=set(['user_'+str(int(item[0]))])
        try:
            dic['user_'+str(int(item[0]))].add('item_'+str(int(item[1])))
        except:
            dic['user_'+str(int(item[0]))]=set(['item_'+str(int(item[1]))])
    dic_cont={}
    for key in dic:
        dic[key]=list(dic[key])
        dic_cont[key]=len(dic[key])
    print("creating")     
    #构建路径
    path_length=size        
    sentences=[]
    length=[]
    for key in tqdm(dic,total=len(dic)):
        sentence=[key]
        while len(sentence)!=path_length:
            key=dic[sentence[-1]][random.randint(0,dic_cont[sentence[-1]]-1)]
            if len(sentence)>=2 and key == sentence[-2]:
                break
            else:
                sentence.append(key)
        sentences.append(sentence)
        length.append(len(sentence))
        if len(sentences)%100000==0:
            print(len(sentences))
    print(np.mean(length))
    print(len(sentences))
    #训练Deepwalk模型
    print('training...')
    random.shuffle(sentences)
    model = Word2Vec(sentences, size=L, window=window,min_count=1,sg=1, workers=10,iter=20)
    print('outputing...')
    model.save(f'model/deepwalk_{f2}_{L}_{window}_{size}_{name}.model')
    #输出
    values=set(log[f1].values)
    w2v=[]
    for v in values:
        try:
            a=[int(v)]
            a.extend(model['user_'+str(int(v))])
            w2v.append(a)
        except:
            pass
    out_df=pd.DataFrame(w2v)
    names=[f1]
    for i in range(L):
        names.append(f1+'_'+ f2+'_'+names[0]+'_deepwalk_embedding_'+str(L)+'_'+str(i))
    out_df.columns = names
    print(out_df.head())
    out_df.to_pickle('preprocess_path/' +f1+'_'+ f2+'_'+f1 +'_'+flag +'_deepwalk_'+ f'w{window}_s{size}_l{str(L)}_{name}.pkl') 
    ########################
    values=set(log[f2].values)
    w2v=[]
    for v in values:
        try:
            a=[int(v)]
            a.extend(model['item_'+str(int(v))])
            w2v.append(a)
        except:
            pass
    out_df=pd.DataFrame(w2v)
    names=[f2]
    for i in range(L):
        names.append(f1+'_'+ f2+'_'+names[0]+'_deepwalk_embedding_'+str(L)+'_'+str(i))
    out_df.columns = names
    print(out_df.head())
    out_df.to_pickle('preprocess_path/' +f1+'_'+ f2+'_'+f2 +'_'+flag +'_deepwalk_' + f'w{window}_s{size}_l{str(L)}_{name}.pkl') 

In [4]:
deepwalk(total_merged_df,'user_id','advertiser_id','train',64,10,50,'first')

deepwalk: user_id advertiser_id


100%|██████████| 63668283/63668283 [04:45<00:00, 223297.58it/s]
  0%|          | 0/1957870 [00:00<?, ?it/s]

creating


  5%|▌         | 101948/1957870 [00:20<02:15, 13677.42it/s]

100000


 10%|█         | 202671/1957870 [00:27<02:08, 13621.50it/s]

200000


 15%|█▌        | 301806/1957870 [00:35<02:01, 13585.25it/s]

300000


 21%|██        | 402683/1957870 [00:42<01:55, 13489.17it/s]

400000


 26%|██▌       | 502176/1957870 [00:50<01:48, 13373.03it/s]

500000


 31%|███       | 600397/1957870 [00:57<01:43, 13071.45it/s]

600000


 36%|███▌      | 701562/1957870 [01:10<01:38, 12767.52it/s]

700000


 41%|████      | 802321/1957870 [01:17<01:27, 13132.13it/s]

800000


 46%|████▌     | 901774/1957870 [01:25<01:17, 13697.16it/s]

900000


 51%|█████     | 1002669/1957870 [01:32<01:10, 13563.70it/s]

1000000


 56%|█████▋    | 1102173/1957870 [01:39<01:03, 13556.37it/s]

1100000


 61%|██████▏   | 1201457/1957870 [01:47<00:56, 13317.00it/s]

1200000


 66%|██████▋   | 1301756/1957870 [02:01<00:55, 11895.97it/s]

1300000


 72%|███████▏  | 1402653/1957870 [02:08<00:41, 13539.30it/s]

1400000


 77%|███████▋  | 1502627/1957870 [02:16<00:33, 13395.39it/s]

1500000


 82%|████████▏ | 1601914/1957870 [02:23<00:27, 13155.68it/s]

1600000


 87%|████████▋ | 1702590/1957870 [02:30<00:19, 13175.73it/s]

1700000


 92%|█████████▏| 1801687/1957870 [02:38<00:11, 13652.43it/s]

1800000


 97%|█████████▋| 1902565/1957870 [02:45<00:04, 13678.22it/s]

1900000


100%|██████████| 1957870/1957870 [02:49<00:00, 11522.87it/s]


31.33056178397953
1957870
training...
outputing...




   user_id  user_id_advertiser_id_user_id_deepwalk_embedding_64_0  user_id_advertiser_id_user_id_deepwalk_embedding_64_1  user_id_advertiser_id_user_id_deepwalk_embedding_64_2  user_id_advertiser_id_user_id_deepwalk_embedding_64_3  \
0        1                                               0.317821                                              -0.595060                                              -0.322108                                              -0.106295   
1        2                                               0.559834                                              -0.181549                                              -0.642175                                               0.464080   
2        3                                              -0.286303                                               0.280946                                              -0.491918                                              -0.426345   
3        4                                               0.32352

FileNotFoundError: [Errno 2] No such file or directory: 'preprocess_path/user_id_advertiser_id_user_id_train_deepwalk_w10_s50_l64_first.pkl'

In [None]:
deepwalk(total_merged_df,'user_id','industry','train',64,10,50,'first')