In [1]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.feature_extraction.text import TfidfVectorizer
from  collections import Counter

# Seed the stdlib and NumPy global RNGs with the same fixed value.
random.seed(2019)
np.random.seed(2019)

# pandas display settings, passed as (option, value) pairs in a single call.
pd.set_option(
    'display.max_rows', 6,
    'display.max_columns', 500,
    'display.width', 280,
    'display.max_colwidth', 150,
)

# Root directory that the CSV inputs below are read from.
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'

In [2]:
# Read the click log of the test split into memory and show a truncated preview.
clk_df = pd.read_csv(filepath_or_buffer=f'{data_path}/test/click_log.csv')
print(clk_df)

          time  user_id  creative_id  click_times
0           20  3131989       645764            1
1           20  3131989      1027422            1
2           20  3131989      1106443            1
...        ...      ...          ...          ...
33585509    12  3131989       589027            1
33585510    61  3131989      2404831            1
33585511    14  3131989       665822            1

[33585512 rows x 4 columns]


In [3]:
# Per-user sum of click_times over all rows of the click log.
user_click_sum_df = (
    clk_df.groupby('user_id')['click_times']
    .sum()
    .rename('click_times_total')
    .reset_index()
)
print(user_click_sum_df)

        user_id  click_times_total
0       3000001                 11
1       3000002                 65
2       3000003                 36
...         ...                ...
999997  3999998                 20
999998  3999999                 86
999999  4000000                 67

[1000000 rows x 2 columns]


In [4]:
# Per-user number of distinct `time` values appearing in the click log.
user_day_count_df = (
    clk_df.groupby('user_id')['time']
    .nunique()
    .rename('active_days')
    .reset_index()
)
print(user_day_count_df)

        user_id  active_days
0       3000001            7
1       3000002           37
2       3000003           23
...         ...          ...
999997  3999998           18
999998  3999999           45
999999  4000000           39

[1000000 rows x 2 columns]


In [5]:
# One row per user: active_days joined with click_times_total on user_id.
user_df = pd.merge(user_day_count_df, user_click_sum_df, how='inner', on='user_id')
print(user_df)

        user_id  active_days  click_times_total
0       3000001            7                 11
1       3000002           37                 65
2       3000003           23                 36
...         ...          ...                ...
999997  3999998           18                 20
999998  3999999           45                 86
999999  4000000           39                 67

[1000000 rows x 3 columns]


In [6]:
# Read the ad attribute table of the test split (joined to clicks on creative_id below).
ad_df = pd.read_csv(filepath_or_buffer=f'{data_path}/test/ad.csv')

In [7]:
# Attach the ad attributes to every click row, turn the "\N" placeholder into -1,
# and order the rows by `time`.
merged_df = (
    clk_df
    .merge(ad_df, on='creative_id')
    .replace("\\N", -1)
    .sort_values(by='time')
)
#merged_df['wday'] = merged_df['time'].apply(lambda x :int(x /7))
#merged_df['month'] = merged_df['time'].apply(lambda x :int(x /30))
print(merged_df)
# The raw click log is no longer referenced after this point; drop the name.
del clk_df

          time  user_id  creative_id  click_times    ad_id product_id  product_category  advertiser_id industry
5115271      1  3134048       171985            1   154333       1234                 2          11990      321
7160664      1  3048376        44862            1    42085      26858                 3           6776       60
4777789      1  3147347        27216            1    27539         -1                12          21199       54
...        ...      ...          ...          ...      ...        ...               ...            ...      ...
29622320    91  3836892      4367150            1  3744662         -1                 5           7133      109
29622315    91  3559996      4367150            1  3744662         -1                 5           7133      109
21595115    91  3259979      4180721            1  3586494       8588                 3          12274      259

[33585512 rows x 9 columns]


In [8]:
def w2v(log,pivot,f,flag,L,model_path,is_train=True):
    """Build per-user embedding features from a pre-trained Word2Vec model.

    Every row of ``log`` contributes one token, ``str(int(log[f]))``, to the
    token sequence of its ``log[pivot]`` value.  For each distinct pivot value
    two vectors of length ``L`` are produced:

    * the mean embedding of all its tokens (tokens missing from the model's
      vocabulary contribute a zero vector but still count in the denominator);
    * the embedding of its most frequent token (zeros if that token is not in
      the vocabulary; ties are resolved in favour of the token seen first).

    No training happens here: the model is only loaded from ``model_path``.

    Parameters
    ----------
    log : pandas.DataFrame
        Click log; must contain the columns named by ``pivot`` and ``f``.
        It is not modified.
    pivot : str
        Column whose values identify the entities being embedded.
    f : str
        Column whose values are the tokens; each value must be accepted by
        ``int()``.
    flag : str
        Unused; kept for call-site compatibility.
    L : int
        Embedding dimension; must equal the loaded model's vector size.
    model_path : str
        Path handed to ``Word2Vec.load``.
    is_train : bool, optional
        Unused; kept for call-site compatibility.

    Returns
    -------
    pandas.DataFrame
        One row per distinct pivot value with columns ``user_id`` (the pivot
        value, whatever ``pivot`` is called), ``{f}_0 .. {f}_{L-1}`` (mean
        embedding) and ``{f}_top1_0 .. {f}_top1_{L-1}`` (top-1 embedding).

    Raises
    ------
    ValueError
        If the loaded model reports a vector size different from ``L``.
    """
    print("w2v:",pivot,f,model_path)

    # Build the per-entity "documents".  Only token counts are needed for the
    # mean and top-1 features, so a Counter per entity is kept instead of the
    # full token list (which was previously re-copied on every appended row).
    print('build...')
    token_counts = {}
    for key, raw_token in tqdm(zip(log[pivot].values, log[f].values), total=len(log)):
        token = str(int(raw_token))
        counts = token_counts.get(key)
        if counts is None:
            counts = token_counts[key] = Counter()
        counts[token] += 1

    # Distribution of sequence lengths.  Both log lines are kept for output
    # compatibility; they have always reported the same quantity.
    length_counter = Counter(sum(counts.values()) for counts in token_counts.values())
    print(f"sentence counter:{length_counter}")
    print(f"seq counter:{length_counter}")

    # Load the pre-trained Word2Vec model.
    model = Word2Vec.load(model_path)
    print(model)

    model_dim = getattr(model, 'vector_size', None)
    if model_dim is not None and model_dim != L:
        raise ValueError(
            f"L={L} does not match the vector size {model_dim} of the model at {model_path}"
        )

    # Look vectors up through the KeyedVectors object: indexing the model
    # itself is not supported by newer gensim releases, and the previous
    # catch-all handler turned that failure into all-zero features.
    wv = model.wv

    print('outputing...')
    result = []
    for key, counts in token_counts.items():
        total = 0
        acc = np.zeros(L, dtype=np.float64)
        for token, n in counts.items():
            total += n
            if token in wv:
                acc += n * np.asarray(wv[token], dtype=np.float64)
        # Out-of-vocabulary tokens add nothing to `acc` but are part of `total`.
        mean_vec = acc / total

        top_token = counts.most_common(1)[0][0]
        if top_token in wv:
            top1_vec = np.asarray(wv[top_token], dtype=np.float64)
        else:
            top1_vec = np.zeros(L, dtype=np.float64)

        result.append([key] + mean_vec.flatten().tolist() + top1_vec.flatten().tolist())

    cols = ['user_id'] + [f'{f}_{i}'  for i in range(L)]  + [f'{f}_top1_{i}'  for i in range(L)]
    ret_df = pd.DataFrame(result,columns=cols)
    return ret_df



In [9]:
# 16-dimensional industry embedding features per user, from the saved industry model.
industry_df = w2v(
    log=merged_df,
    pivot='user_id',
    f='industry',
    flag='train',
    L=16,
    model_path='/data/workspace/kimi/tencent_ads/2020/kimi/industry_emb_model',
    is_train=False,
)
print(industry_df)


w2v: user_id industry /data/workspace/kimi/tencent_ads/2020/kimi/industry_emb_model
build...


100%|██████████| 33585512/33585512 [08:38<00:00, 64832.79it/s] 


sentence counter:Counter({11: 49162, 12: 46149, 13: 42552, 10: 40774, 14: 40154, 15: 36887, 16: 34851, 17: 33150, 18: 30580, 19: 29167, 20: 27466, 21: 25817, 22: 24766, 23: 22834, 24: 21859, 25: 20821, 26: 19745, 27: 18936, 28: 17762, 29: 16906, 30: 16079, 31: 15463, 32: 14637, 33: 13945, 34: 13207, 35: 12427, 36: 11953, 9: 11714, 37: 11430, 38: 10906, 39: 10469, 40: 9937, 41: 9370, 42: 8885, 43: 8513, 44: 8175, 45: 7865, 46: 7400, 47: 6999, 48: 6828, 49: 6537, 50: 6335, 51: 5777, 52: 5560, 53: 5456, 54: 5205, 55: 4987, 56: 4749, 57: 4606, 58: 4324, 59: 4283, 60: 4040, 62: 3877, 61: 3792, 63: 3658, 64: 3358, 65: 3301, 66: 3169, 67: 3111, 68: 2968, 69: 2785, 70: 2760, 71: 2613, 72: 2543, 8: 2506, 74: 2427, 73: 2336, 75: 2227, 76: 2115, 77: 2063, 78: 2013, 79: 1923, 80: 1825, 81: 1810, 82: 1787, 83: 1735, 85: 1666, 84: 1607, 86: 1485, 88: 1456, 87: 1443, 89: 1372, 91: 1335, 90: 1284, 92: 1258, 94: 1173, 93: 1156, 95: 1135, 96: 1076, 97: 1055, 98: 1043, 101: 990, 100: 962, 99: 919, 102: 9



        user_id  industry_0  industry_1  industry_2  industry_3  industry_4  industry_5  industry_6  industry_7  industry_8  industry_9  industry_10  industry_11  industry_12  industry_13  industry_14  industry_15  industry_top1_0  industry_top1_1  industry_top1_2  \
0       3134048    4.220266    0.518563   -0.581375    0.881568   -3.340080    2.188093   -0.775323    3.621813    2.190655   -0.290929     0.746681     0.936941    -0.549196    -1.284381     0.958341     0.337710         3.614543        -0.057782        -1.737635   
1       3048376    5.278951   -5.128386    3.394961    1.871873    7.824369   -4.311634    2.332213   -0.835217    1.070822    2.515646     1.179100    -5.097178    -1.300128     0.694701    -1.963471     1.612436         7.930371        -8.691642         7.804227   
2       3147347   -0.737395    2.393932    0.825859   -0.768066   -0.430783   -0.447260   -0.407899   -1.196429    0.019001    0.449663    -0.289007     0.449761     1.546074    -0.514982     0.33

In [10]:
# 64-dimensional advertiser_id embedding features per user, from the saved advertiser model.
advertiser_id_df = w2v(
    log=merged_df,
    pivot='user_id',
    f='advertiser_id',
    flag='train',
    L=64,
    model_path='/data/workspace/kimi/tencent_ads/2020/kimi/advertiser_id_emb_model',
    is_train=False,
)
print(advertiser_id_df)

w2v: user_id advertiser_id /data/workspace/kimi/tencent_ads/2020/kimi/advertiser_id_emb_model
build...


100%|██████████| 33585512/33585512 [09:06<00:00, 61505.69it/s]


sentence counter:Counter({11: 49162, 12: 46149, 13: 42552, 10: 40774, 14: 40154, 15: 36887, 16: 34851, 17: 33150, 18: 30580, 19: 29167, 20: 27466, 21: 25817, 22: 24766, 23: 22834, 24: 21859, 25: 20821, 26: 19745, 27: 18936, 28: 17762, 29: 16906, 30: 16079, 31: 15463, 32: 14637, 33: 13945, 34: 13207, 35: 12427, 36: 11953, 9: 11714, 37: 11430, 38: 10906, 39: 10469, 40: 9937, 41: 9370, 42: 8885, 43: 8513, 44: 8175, 45: 7865, 46: 7400, 47: 6999, 48: 6828, 49: 6537, 50: 6335, 51: 5777, 52: 5560, 53: 5456, 54: 5205, 55: 4987, 56: 4749, 57: 4606, 58: 4324, 59: 4283, 60: 4040, 62: 3877, 61: 3792, 63: 3658, 64: 3358, 65: 3301, 66: 3169, 67: 3111, 68: 2968, 69: 2785, 70: 2760, 71: 2613, 72: 2543, 8: 2506, 74: 2427, 73: 2336, 75: 2227, 76: 2115, 77: 2063, 78: 2013, 79: 1923, 80: 1825, 81: 1810, 82: 1787, 83: 1735, 85: 1666, 84: 1607, 86: 1485, 88: 1456, 87: 1443, 89: 1372, 91: 1335, 90: 1284, 92: 1258, 94: 1173, 93: 1156, 95: 1135, 96: 1076, 97: 1055, 98: 1043, 101: 990, 100: 962, 99: 919, 102: 9

Word2Vec(vocab=57870, size=64, alpha=0.025)
outputing...




        user_id  advertiser_id_0  advertiser_id_1  advertiser_id_2  advertiser_id_3  advertiser_id_4  advertiser_id_5  advertiser_id_6  advertiser_id_7  advertiser_id_8  advertiser_id_9  advertiser_id_10  advertiser_id_11  advertiser_id_12  advertiser_id_13  advertiser_id_14  \
0       3134048         0.084437         0.839025         1.169019        -0.374529         0.492250         0.713683        -0.109928        -0.208917        -3.070115        -0.561719          0.061440          0.517303         -1.745582         -1.806765         -0.909491   
1       3048376        -2.755799         0.422950        -1.218510         0.044212        -1.186199        -0.873965         1.434246         0.096302        -1.357887        -1.661709         -0.321143         -0.179717          0.291023         -1.169828          1.346599   
2       3147347        -0.927423        -1.335005         1.282845        -0.962732        -0.314379         0.611004        -0.125951        -0.683560         1.2

In [11]:
# Join the per-user statistics with both embedding feature tables on user_id,
# preview the result, then persist it as a pickle file.
user_final_df = (
    user_df
    .merge(industry_df, on='user_id')
    .merge(advertiser_id_df, on='user_id')
)
print(user_final_df)
user_final_df.to_pickle("test4.pkl")

         index  user_id  active_days  click_times_total  industry_0  industry_1  industry_2  industry_3  industry_4  industry_5  industry_6  industry_7  industry_8  industry_9  industry_10  industry_11  industry_12  industry_13  industry_14  industry_15  industry_top1_0  \
0            0  3000001            7                 11    2.532888    1.878715    1.682341    5.494179   -4.078212    5.800343    2.827987    5.240605    6.578812   -4.025118    -2.619265     2.496614    -0.016113    -0.293491    -3.742475    -1.982195         6.615202   
1            1  3000002           37                 65   -1.831123   -1.709645   -0.813786    1.221696    0.296469    1.149126   -3.300779    1.260832   -2.453826   -1.773209    -2.491741     0.341830    -4.217973    -0.916762     1.460926     3.130889        -4.283564   
2            2  3000003           23                 36   -3.716056   -1.042728   -3.551677   -2.050015   -0.921868    1.912847   -3.276425    3.905490    2.156450    1.718182   