In [2]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from  collections import Counter

np.random.seed(2019)
random.seed(2019)
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 280)
pd.set_option('display.max_colwidth', 150)
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [3]:
total_final = 'total_final.pkl'
total_final_path = f'{preprocess_path}/{total_final}'
total_final_df = pd.read_pickle(total_final_path)
print(total_final_df)



         user_id  active_days  click_times_total                                                                                                                                      advertiser_id_seq  \
0              1           10                 14                                                                    [7293, 9702, 29455, 14668, 11411, 14681, 17189, 367, 44865, 188, 23575, 188, 10988]   
1              2           28                 46  [22885, 10686, 18562, 25932, 22885, 34505, 768, 26006, 918, 34503, 17284, 13732, 20033, 27843, 25260, 27843, 42272, 26006, 38785, 2302, 42272, 149...   
2              3           23                 30  [32974, 9877, 18492, 14186, 17018, 9058, 8371, 2336, 39500, 45169, 14289, 13119, 36384, 48608, 16764, 9970, 15351, 14498, 55248, 16895, 52263, 918...   
...          ...          ...                ...                                                                                                                                            

In [9]:
train_df = total_final_df[total_final_df.user_id <= 720000]
valid_df = total_final_df[total_final_df.user_id > 720000]
valid_df = valid_df[valid_df.user_id < 2000000]
print(train_df)
print(valid_df)
topN = 3



        user_id  active_days  click_times_total                                                                                                                                      advertiser_id_seq  \
0             1           10                 14                                                                    [7293, 9702, 29455, 14668, 11411, 14681, 17189, 367, 44865, 188, 23575, 188, 10988]   
1             2           28                 46  [22885, 10686, 18562, 25932, 22885, 34505, 768, 26006, 918, 34503, 17284, 13732, 20033, 27843, 25260, 27843, 42272, 26006, 38785, 2302, 42272, 149...   
2             3           23                 30  [32974, 9877, 18492, 14186, 17018, 9058, 8371, 2336, 39500, 45169, 14289, 13119, 36384, 48608, 16764, 9970, 15351, 14498, 55248, 16895, 52263, 918...   
...         ...          ...                ...                                                                                                                                                 

In [10]:

def forfor(a): 
    return [item for sublist in a for item in sublist] 

def w2v(log,pivot,f,flag,L,model_path,topN=3):
    #训练Word2Vec模型
    model = Word2Vec.load(model_path+ f'_{L}')
    print(model)
    
    is_first_user = True
    result=[]
    print('outputing...')
    for row in tqdm(log[['user_id',f'{f}_seq']].values,total=len(log)):
        user_sentence = None
        c = Counter()
        for w in row[1]:
            c.update([w])
            try:
                emb_vec =  model.wv[w]
            except Exception as e:
                emb_vec = [0  for i in range(L)]
            
            if user_sentence is None:
                user_sentence = np.array(emb_vec)
            else:
                user_sentence = user_sentence + np.array(emb_vec)
                
        if user_sentence is None:
            new_list = [0  for i in range(L)]
            user_sentence = np.array(new_list)
        user_sentence = user_sentence / len(row[1])
        key_counts =[]
        for k,v in  c.items():
            key_counts.append(v)
        key_counts = np.array(key_counts)
        
        top_list = c.most_common(topN)
        top_list_len = len(top_list)
        if top_list_len < topN:
            rlen = topN- top_list_len
            top_list = top_list + ["-2" for i in range(rlen)]
        top_ret = []
        for t in top_list:
            try:
                top_vec_count = t[1]
                top_vec =  model.wv[t[0]]
            except Exception as e:
                top_vec = np.array([0  for i in range(L)])
            top_ret = top_ret + top_vec.flatten().tolist() + [top_vec_count]
            
        if len(top_ret) != L * topN + topN:
            print(f"len error!{len(top_ret)} need {L * topN + topN}")
        data= [row[0]] + user_sentence.flatten().tolist() + top_ret + [np.mean(key_counts),np.std(key_counts),np.min(key_counts)]
        result.append(data)
    cols = ['user_id'] + [f'{f}_{i}'  for i in range(L)]  +forfor([[f'{f}_top{i}_{j}'  for j in range(L + 1)]  for i in range(topN)])  + [f'{f}_mean',f'{f}_std',f'{f}_min']  
    ret_df = pd.DataFrame(result,columns=cols)
    #保存文件
    return ret_df 


In [79]:
industry_df   = w2v(train_df,'user_id','industry','train',16,'/data/workspace/kimi/tencent_ads/2020/kimi/industry_emb_model',topN=topN)
print(industry_df)
industry_topn = f'industry_top{topN}_l16'
industry_topn_path = f'{preprocess_path}/{industry_topn}'
industry_df.to_pickle(industry_topn_path)


  0%|          | 499/720000 [00:00<02:24, 4988.89it/s]

Word2Vec(vocab=332, size=16, alpha=0.025)
outputing...


100%|██████████| 720000/720000 [02:29<00:00, 4815.85it/s]


        user_id  industry_0  industry_1  industry_2  industry_3  industry_4  industry_5  industry_6  industry_7  industry_8  industry_9  industry_10  industry_11  industry_12  industry_13  industry_14  industry_15  industry_top0_0  industry_top0_1  industry_top0_2  \
0             1   -2.407525   -2.480743    0.258189    0.068892    2.275061    0.609698   -1.124533   -2.170345   -3.801786   -1.069965    -1.431871    -0.779060    -2.454330     3.554383    -3.526832     2.944196        -3.168944        -9.633119         0.228927   
1             2    0.528324    0.173733    0.413423   -2.342237    2.927688    2.095639    4.499999   -2.768379   -0.016797    0.830495    -3.164093     2.597350    -2.405309     4.605047    -3.062250    -0.915257        -1.735877         2.013226         1.120578   
2             3   -3.704870   -3.906977    0.328102   -1.694905   -0.775484    2.218845   -2.914866   -2.651409    0.200202    2.303957     0.117288    -0.835449    -3.200287    -1.932386    -0.75

In [80]:
advertiser_id_df   = w2v(train_df,'user_id','advertiser_id','train',32,'/data/workspace/kimi/tencent_ads/2020/kimi/advertiser_id_emb_model')
print(advertiser_id_df)

  0%|          | 453/720000 [00:00<02:38, 4527.76it/s]

Word2Vec(vocab=57870, size=32, alpha=0.025)
outputing...


100%|██████████| 720000/720000 [02:38<00:00, 4535.27it/s]


        user_id  advertiser_id_0  advertiser_id_1  advertiser_id_2  advertiser_id_3  advertiser_id_4  advertiser_id_5  advertiser_id_6  advertiser_id_7  advertiser_id_8  advertiser_id_9  advertiser_id_10  advertiser_id_11  advertiser_id_12  advertiser_id_13  advertiser_id_14  \
0             1         1.070448         0.451069        -0.070068         2.615680        -0.031586        -1.202846         0.494680         0.343328        -1.931134         0.109485         -0.385217          0.351432          0.824578          1.762892         -0.695981   
1             2         0.520920         1.156148        -0.667012         2.474591         0.621758         0.798649         1.628432        -0.367765        -0.460772        -0.527746          0.281460          1.193100         -0.761302          1.549958         -3.661474   
2             3         0.917123        -1.390631         0.399550         0.420246        -1.785983        -0.447509        -0.293782        -0.655995         0.6

In [11]:
product_id_size = 32
product_id_df   = w2v(train_df,'user_id','product_id','train',product_id_size,'/data/workspace/kimi/tencent_ads/2020/kimi/product_id_emb_model',topN=topN)
print(product_id_df)
product_id_topn = f'product_id_top{topN}_l{product_id_size}'
product_id_topn_path = f'{preprocess_path}/{product_id_topn}'
product_id_df.to_pickle(product_id_topn_path)

Word2Vec(vocab=63524, size=32, alpha=0.025)
outputing...


100%|██████████| 720000/720000 [02:41<00:00, 4468.54it/s]


        user_id  product_id_0  product_id_1  product_id_2  product_id_3  product_id_4  product_id_5  product_id_6  product_id_7  product_id_8  product_id_9  product_id_10  product_id_11  product_id_12  product_id_13  product_id_14  product_id_15  product_id_16  product_id_17  \
0             1      0.295044     -0.483192     -1.988056     -0.702914     -3.164695      2.112247     -1.713281     -0.786448      0.641502     -0.303342       0.103733      -1.214379      -0.361489      -1.711542      -1.336796      -2.674590       0.117874      -0.595163   
1             2      0.020646      1.588273     -2.702562      0.005208      0.297846      2.164231     -2.916312      1.575071     -0.933505      1.204409      -0.448533      -1.730390       0.015396       0.690417      -1.841663      -2.921460      -0.627399      -1.902242   
2             3     -0.976395     -2.022372     -1.675200      1.320888     -1.423621      1.533523      0.110557     -0.740175     -1.354827     -2.364457      -0

In [33]:
label_df = pd.read_csv(f'{data_path}/train_preliminary/user.csv')

In [34]:
train_df = train_df.merge(label_df,on='user_id')
train_df = train_df.merge(industry_df,on='user_id')
train_df = train_df.merge(advertiser_id_df,on='user_id')
train_df = train_df.merge(product_id_df,on='user_id')
print(train_df)
print(train_df['age'].value_counts())
print(train_df['gender'].value_counts())
train_df.to_pickle("train5.pkl")


        user_id  active_days  click_times_total                                                                                                                                      advertiser_id_seq  \
0             1           10                 14                                                                    [7293, 9702, 29455, 14668, 11411, 14681, 17189, 367, 44865, 188, 23575, 188, 10988]   
1             2           28                 46  [22885, 10686, 18562, 25932, 22885, 34505, 768, 26006, 918, 34503, 17284, 13732, 20033, 27843, 25260, 27843, 42272, 26006, 38785, 2302, 42272, 149...   
2             3           23                 30  [32974, 9877, 18492, 14186, 17018, 9058, 8371, 2336, 39500, 45169, 14289, 13119, 36384, 48608, 16764, 9970, 15351, 14498, 55248, 16895, 52263, 918...   
...         ...          ...                ...                                                                                                                                                 

In [35]:
industry_test_df   = w2v(valid_df,'user_id','industry','train',16,'/data/workspace/kimi/tencent_ads/2020/kimi/industry_emb_model')
print(industry_test_df)

  0%|          | 678/180000 [00:00<00:26, 6771.58it/s]

Word2Vec(vocab=332, size=16, alpha=0.025)
outputing...


100%|██████████| 180000/180000 [00:27<00:00, 6429.90it/s]


        user_id  industry_0  industry_1  industry_2  industry_3  industry_4  industry_5  industry_6  industry_7  industry_8  industry_9  industry_10  industry_11  industry_12  industry_13  industry_14  industry_15  industry_top0_0  industry_top0_1  industry_top0_2  \
0        720001   -1.989230   -2.485385   -0.775828   -2.405642   -0.579944    4.934428    6.225451   -2.301669   -2.214990    0.856269    -2.414757     1.870960     1.029048     3.209267    -1.862676    -1.774886        -1.735877         2.013226         1.120578   
1        720002   -1.502039    2.271748    0.350132    1.125167   -1.233660   -1.070818   -3.583268   -0.954239   -1.935579   -0.525664    -1.712189    -2.707626    -0.668274     1.354530     0.002641     3.774900        -1.735877         2.013226         1.120578   
2        720003    0.915773    2.172321   -1.929452   -1.091829   -0.282038    0.805745   -0.664731   -0.263200   -1.436507    2.145942     0.755215     0.446492    -1.664001     0.509256     3.54

In [36]:
advertiser_id_test_df   = w2v(valid_df,'user_id','advertiser_id','train',32,'/data/workspace/kimi/tencent_ads/2020/kimi/advertiser_id_emb_model')
print(advertiser_id_test_df)

  0%|          | 627/180000 [00:00<00:28, 6230.58it/s]

Word2Vec(vocab=57870, size=32, alpha=0.025)
outputing...


100%|██████████| 180000/180000 [00:28<00:00, 6339.55it/s]


        user_id  advertiser_id_0  advertiser_id_1  advertiser_id_2  advertiser_id_3  advertiser_id_4  advertiser_id_5  advertiser_id_6  advertiser_id_7  advertiser_id_8  advertiser_id_9  advertiser_id_10  advertiser_id_11  advertiser_id_12  advertiser_id_13  advertiser_id_14  \
0        720001         1.136321         0.213856         0.235502         0.533652         0.641636        -0.825572         0.474796        -3.250204        -1.069312        -0.434412         -4.983632          0.528394         -1.142879          1.671987         -0.639480   
1        720002        -0.444156        -1.480911        -1.698048         0.921614        -1.396144         0.542396         0.222608         0.949205        -3.359164        -0.913740          0.151295          1.423159          0.468865          2.096266         -1.824014   
2        720003        -0.316999        -0.413298        -1.352538         2.920290        -0.045855         1.507078         1.606123         1.657228        -0.6

In [None]:
label_test_df = pd.read_csv(f'{data_path}/train_preliminary/user.csv')

In [60]:

valid_df = valid_df.merge(label_df,on='user_id')
valid_df = valid_df.merge(industry_test_df,on='user_id')
valid_df = valid_df.merge(advertiser_id_test_df,on='user_id')
print(valid_df)
print(valid_df['age'].value_counts())
print(valid_df['gender'].value_counts())
valid_df.to_pickle("valid5.pkl")




        user_id  active_days  click_times_total                                                                                                                                      advertiser_id_seq  \
0        720001           19                 32  [19065, 918, 10985, 10985, 19065, 13732, 7733, 41149, 24894, 16320, 37513, 15736, 918, 20264, 41389, 41389, 38357, 10975, 6465, 6465, 23210, 953, ...   
1        720002           21                 29  [19862, 1461, 19862, 14682, 11425, 18103, 2367, 18786, 14636, 15095, 11101, 10986, 10989, 10986, 36700, 7817, 7809, 14682, 14681, 52180, 14681, 25...   
2        720003           18                 21             [10831, 14513, 14681, 15385, 12270, 18103, 18103, 18103, 18103, 14681, 14681, 14681, 19056, 6974, 14681, 14681, 4751, 14682, 19056, 17018]   
...         ...          ...                ...                                                                                                                                                 