In [1]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.feature_extraction.text import TfidfVectorizer


np.random.seed(2019)
random.seed(2019)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 280)
pd.set_option('display.max_colwidth', 150)
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'

In [2]:
clk_df = pd.read_csv(f'{data_path}/train_preliminary/click_log.csv' )
print(clk_df)

          time  user_id  creative_id  click_times
0            9    30920       567330            1
1           65    30920      3072255            1
2           56    30920      2361327            1
3            6   309204       325532            1
4           59   309204      2746730            1
5           12   309204       726402            1
6           79   309204      2851451            1
7           32   309204      1569716            1
8            5   309204        71956            1
9            8   309204       322354            1
...        ...      ...          ...          ...
30082761    52    30920      2361397            1
30082762    52    30920         9148            1
30082763    53    30920      2361397            1
30082764    40    30920        22298            2
30082765    72    30920      2361397            1
30082766     6    30920       228129            1
30082767    57    30920      2361397            1
30082768    26    30920       629802            1


In [3]:
user_click_sum_df = clk_df.groupby(['user_id']).click_times.sum().reset_index().rename(columns={'click_times':'click_times_total'})
print(user_click_sum_df)

        user_id  click_times_total
0             1                 14
1             2                 46
2             3                 30
3             4                 29
4             5                 34
5             6                 17
6             7                 10
7             8                 52
8             9                 21
9            10                 10
...         ...                ...
899990   899991                153
899991   899992                 37
899992   899993                 47
899993   899994                 14
899994   899995                 20
899995   899996                 14
899996   899997                 20
899997   899998                 15
899998   899999                 22
899999   900000                 12

[900000 rows x 2 columns]


In [4]:
user_day_count_df = clk_df.groupby(['user_id']).time.nunique().reset_index().rename(columns={'time':'active_days'})
print(user_day_count_df)

        user_id  active_days
0             1           10
1             2           28
2             3           23
3             4           15
4             5           26
5             6           14
6             7           10
7             8           34
8             9           17
9            10           10
...         ...          ...
899990   899991           68
899991   899992           23
899992   899993           30
899993   899994           11
899994   899995           16
899995   899996           12
899996   899997           14
899997   899998           10
899998   899999           17
899999   900000           12

[900000 rows x 2 columns]


In [5]:
user_df = user_day_count_df.merge(user_click_sum_df,on='user_id')
print(user_df)

        user_id  active_days  click_times_total
0             1           10                 14
1             2           28                 46
2             3           23                 30
3             4           15                 29
4             5           26                 34
5             6           14                 17
6             7           10                 10
7             8           34                 52
8             9           17                 21
9            10           10                 10
...         ...          ...                ...
899990   899991           68                153
899991   899992           23                 37
899992   899993           30                 47
899993   899994           11                 14
899994   899995           16                 20
899995   899996           12                 14
899996   899997           14                 20
899997   899998           10                 15
899998   899999           17            

In [6]:
ad_df = pd.read_csv(f'{data_path}/train_preliminary/ad.csv' )

In [7]:
merged_df = clk_df.merge(ad_df,on='creative_id')
#merged_df['wday'] = merged_df['time'].apply(lambda x :int(x /7))
#merged_df['month'] = merged_df['time'].apply(lambda x :int(x /30))
print(merged_df)
del clk_df

          time  user_id  creative_id  click_times    ad_id product_id  product_category  advertiser_id industry
0            9    30920       567330            1   504423      30673                 3          32638      319
1           15   320815       567330            1   504423      30673                 3          32638      319
2           11   355089       567330            1   504423      30673                 3          32638      319
3            9   363442       567330            1   504423      30673                 3          32638      319
4           14   370513       567330            1   504423      30673                 3          32638      319
5            9   372445       567330            1   504423      30673                 3          32638      319
6           16   399556       567330            2   504423      30673                 3          32638      319
7           10   429834       567330            1   504423      30673                 3          32638  

In [None]:

def tfidf(log,pivot,f,flag,L):
    #word2vec算法
    #log为曝光日志，以pivot为主键，f为embedding的对象，flag为dev或test，L是embedding的维度
    print("tdidf:",pivot,f)
    
    #构造文档
    log[f]=log[f].fillna(-1).astype(str)
    sentence=[]
    dic={}
    day=0
    log=log.sort_values(by='time')
    log['day']=log['time']
    for item in tqdm(log[['day',pivot,f]].values,total=len(log)):
        try:
            dic[item[1]].append(str(item[2]))
        except:
            dic[item[1]]=[str(item[2])]
    for key in dic:
        sentence.append(" ".join(dic[key]))
    print(len(sentence))
    print(sentence[:3])
    #训练Word2Vec模型
    print('training...')
    #random.shuffle(sentence)
    tfidf_list = TfidfVectorizer().fit_transform(sentence)
    print('outputing...')
    df_data = []
    arr_len = 0
    for v1,v2 in zip(list(dic.keys()),tfidf_list):
        arr = np.array(v2.todense()).flatten().tolist()
        if arr_len == 0:
            arr_len = len(arr)
            print(arr_len)
        df_data.append([v1] + arr)
    cols = ['user_id'] + [f'tfidf_{i}'  for i in range(arr_len)]
    tfidf_df= pd.DataFrame(df_data,columns=cols)
    #保存文件
    return tfidf_df 

tfidf_df  = tfidf(merged_df,'user_id','industry','train',64)
print(tfidf_df)


tdidf: user_id industry



  0%|          | 0/30082771 [00:00<?, ?it/s][A
  0%|          | 87949/30082771 [00:00<00:34, 876723.32it/s][A
  1%|          | 176850/30082771 [00:00<00:33, 880373.14it/s][A
  1%|          | 264813/30082771 [00:00<00:33, 880149.06it/s][A
  1%|          | 355921/30082771 [00:01<01:39, 299316.93it/s][A
  1%|▏         | 442480/30082771 [00:01<01:19, 372405.53it/s][A
  2%|▏         | 530404/30082771 [00:01<01:05, 450272.52it/s][A
  2%|▏         | 623237/30082771 [00:01<00:55, 532544.59it/s][A
  2%|▏         | 713088/30082771 [00:01<00:48, 606673.28it/s][A
  3%|▎         | 810507/30082771 [00:01<00:42, 684096.46it/s][A
  3%|▎         | 907481/30082771 [00:01<00:38, 750402.13it/s][A
  3%|▎         | 1006104/30082771 [00:01<00:35, 808392.88it/s][A
  4%|▎         | 1104765/30082771 [00:01<00:33, 854709.42it/s][A
  4%|▍         | 1203039/30082771 [00:01<00:32, 889472.46it/s][A
  4%|▍         | 1301487/30082771 [00:02<00:31, 915990.30it/s][A
  5%|▍         | 1400196/30082771 [00:

 40%|████      | 12099961/30082771 [00:13<00:18, 955860.59it/s][A
 41%|████      | 12198762/30082771 [00:13<00:18, 965283.16it/s][A
 41%|████      | 12296342/30082771 [00:14<00:18, 968413.92it/s][A
 41%|████      | 12393231/30082771 [00:14<00:18, 939424.76it/s][A
 42%|████▏     | 12487396/30082771 [00:14<00:18, 933404.32it/s][A
 42%|████▏     | 12583737/30082771 [00:14<00:18, 942205.55it/s][A
 42%|████▏     | 12680927/30082771 [00:14<00:18, 950920.53it/s][A
 42%|████▏     | 12779170/30082771 [00:14<00:18, 960157.34it/s][A
 43%|████▎     | 12877689/30082771 [00:14<00:17, 967531.45it/s][A
 43%|████▎     | 12976437/30082771 [00:14<00:17, 973429.77it/s][A
 43%|████▎     | 13075356/30082771 [00:14<00:17, 978103.75it/s][A
 44%|████▍     | 13174821/30082771 [00:14<00:17, 983008.42it/s][A
 44%|████▍     | 13274030/30082771 [00:15<00:17, 985713.57it/s][A
 44%|████▍     | 13373012/30082771 [00:15<00:16, 986940.79it/s][A
 45%|████▍     | 13471901/30082771 [00:15<00:16, 987522.91it/s

 80%|████████  | 24207028/30082771 [00:26<00:05, 993284.63it/s][A
 81%|████████  | 24306730/30082771 [00:26<00:05, 994399.74it/s][A
 81%|████████  | 24406172/30082771 [00:26<00:05, 992838.86it/s][A
 81%|████████▏ | 24506801/30082771 [00:26<00:05, 996834.46it/s][A
 82%|████████▏ | 24606490/30082771 [00:26<00:05, 994565.15it/s][A
 82%|████████▏ | 24705952/30082771 [00:26<00:05, 993684.26it/s][A
 82%|████████▏ | 24806603/30082771 [00:26<00:05, 997494.99it/s][A
 83%|████████▎ | 24906359/30082771 [00:26<00:05, 996149.68it/s][A
 83%|████████▎ | 25007592/30082771 [00:26<00:05, 1000948.78it/s][A
 83%|████████▎ | 25107696/30082771 [00:26<00:04, 998586.72it/s] [A
 84%|████████▍ | 25207562/30082771 [00:27<00:04, 996913.45it/s][A
 84%|████████▍ | 25307260/30082771 [00:27<00:04, 995962.07it/s][A
 84%|████████▍ | 25406861/30082771 [00:27<00:04, 994759.39it/s][A
 85%|████████▍ | 25506341/30082771 [00:27<00:04, 993457.33it/s][A
 85%|████████▌ | 25605690/30082771 [00:27<00:04, 990617.00it

900000
['317 319 291 297 319 317 238 116 319 200 291 5 5 291 202 6 216 202 231 6 6 231 \\N 231', '60 247 6 60 88 247 319 253 26 6 247 6 6 60 288 6 5 5 5 172 6 6 6 6 5 6 27 6 54 8 8 6 326 242 6 326 242 6 27 5 322 84 60 73 6 297 6 5 6 297 6 5 4 5 60 73 319 27 242 5 238 6 6 242 5 232 5 291 82 21 6 6 6 6 88 5 242 6 242 6 6 6 6 6 242 5 6 6 73 242 5 73 252', '54 54 297 54 54 242 242 54 297 319 54 54 289 252 88 88 88 6 242 242 297 242 297 183 6 27 6 321 297 6 88 6 \\N 6 183 183 248 248 6 242 6 183 54']
training...
outputing...
317


In [None]:
#print(tfidf_df)
print(tfidf_df.dtypes)
#tfidf_df.to_pickle("tfidf.pkl")

In [None]:
train_user_df = pd.read_csv(f'{data_path}/train_preliminary/user.csv' )
print(train_user_df)

In [None]:
user_final_df = user_df.merge(train_user_df,on='user_id')
user_final_df = user_final_df.merge(tfidf_df,on='user_id')
