In [1]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from  collections import Counter

np.random.seed(2019)
random.seed(2019)
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 280)
pd.set_option('display.max_colwidth', 150)
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [2]:
clk_train_file= 'click_log.pkl'
clk_train_file_path = f'{preprocess_path}/{clk_train_file}'
if not os.path.exists(clk_train_file_path):
    clk_df = pd.read_csv(f'{data_path}/train_preliminary/click_log.csv' )
    clk_df.to_pickle(clk_train_file_path)
else: 
    clk_df = pd.read_pickle(clk_train_file_path)
    
print(clk_df)

          time  user_id  creative_id  click_times
0            9    30920       567330            1
1           65    30920      3072255            1
2           56    30920      2361327            1
...        ...      ...          ...          ...
30082768    26    30920       629802            1
30082769    86    30920      2713031            1
30082770    25    30920       629802            1

[30082771 rows x 4 columns]


In [3]:
user_click_sum_df = clk_df.groupby(['user_id']).click_times.sum().reset_index().rename(columns={'click_times':'click_times_total'})
print(user_click_sum_df)

        user_id  click_times_total
0             1                 14
1             2                 46
2             3                 30
...         ...                ...
899997   899998                 15
899998   899999                 22
899999   900000                 12

[900000 rows x 2 columns]


In [4]:
user_day_count_df = clk_df.groupby(['user_id']).time.nunique().reset_index().rename(columns={'time':'active_days'})
print(user_day_count_df)

        user_id  active_days
0             1           10
1             2           28
2             3           23
...         ...          ...
899997   899998           10
899998   899999           17
899999   900000           12

[900000 rows x 2 columns]


In [5]:
user_log_day_clicks_df = clk_df.groupby(['user_id','time']).creative_id.count().reset_index().rename(columns={'creative_id':'day_clicks'})
print(user_log_day_clicks_df)
user_day_clicks_max_df = user_log_day_clicks_df.groupby(['user_id']).day_clicks.max().reset_index().rename(columns={'day_clicks':'day_clicks_max'})
user_day_clicks_min_df = user_log_day_clicks_df.groupby(['user_id']).day_clicks.min().reset_index().rename(columns={'day_clicks':'day_clicks_min'})
user_day_clicks_mean_df = user_log_day_clicks_df.groupby(['user_id']).day_clicks.mean().reset_index().rename(columns={'day_clicks':'day_clicks_mean'})
user_day_clicks_std_df = user_log_day_clicks_df.groupby(['user_id']).day_clicks.std().reset_index().rename(columns={'day_clicks':'day_clicks_std'})



          user_id  time  day_clicks
0               1    20           3
1               1    39           1
2               1    40           1
...           ...   ...         ...
19825288   900000    72           1
19825289   900000    74           1
19825290   900000    91           1

[19825291 rows x 3 columns]


In [6]:
user_df = user_day_count_df.merge(user_click_sum_df,on='user_id')
user_df = user_df.merge(user_day_clicks_max_df,on='user_id')
user_df = user_df.merge(user_day_clicks_min_df,on='user_id')
user_df = user_df.merge(user_day_clicks_mean_df,on='user_id')
user_df = user_df.merge(user_day_clicks_std_df,on='user_id')
print(user_df)


        user_id  active_days  click_times_total  day_clicks_max  day_clicks_min  day_clicks_mean  day_clicks_std
0             1           10                 14               3               1         1.300000        0.674949
1             2           28                 46               4               1         1.607143        0.994030
2             3           23                 30               3               1         1.304348        0.634950
...         ...          ...                ...             ...             ...              ...             ...
899997   899998           10                 15               3               1         1.400000        0.699206
899998   899999           17                 22               2               1         1.294118        0.469668
899999   900000           12                 12               1               1         1.000000        0.000000

[900000 rows x 7 columns]


In [7]:
user_day_count_df = clk_df.groupby(['user_id']).time.nunique().reset_index().rename(columns={'time':'active_days'})
print(user_day_count_df)

        user_id  active_days
0             1           10
1             2           28
2             3           23
...         ...          ...
899997   899998           10
899998   899999           17
899999   900000           12

[900000 rows x 2 columns]


In [8]:
clk_test_file= 'click_log_test.pkl'
clk_test_file_path = f'{preprocess_path}/{clk_test_file}'
if not os.path.exists(clk_test_file_path):
    clk_test_df = pd.read_csv(f'{data_path}/test/click_log.csv' )
    clk_test_df.to_pickle(clk_test_file_path)
else: 
    clk_test_df = pd.read_pickle(clk_test_file_path)
print(clk_test_df)

          time  user_id  creative_id  click_times
0           20  3131989       645764            1
1           20  3131989      1027422            1
2           20  3131989      1106443            1
...        ...      ...          ...          ...
33585509    12  3131989       589027            1
33585510    61  3131989      2404831            1
33585511    14  3131989       665822            1

[33585512 rows x 4 columns]


In [9]:
user_click_sum_test_df = clk_test_df.groupby(['user_id']).click_times.sum().reset_index().rename(columns={'click_times':'click_times_total'})
print(user_click_sum_test_df)

        user_id  click_times_total
0       3000001                 11
1       3000002                 65
2       3000003                 36
...         ...                ...
999997  3999998                 20
999998  3999999                 86
999999  4000000                 67

[1000000 rows x 2 columns]


In [10]:
user_day_count_test_df = clk_test_df.groupby(['user_id']).time.nunique().reset_index().rename(columns={'time':'active_days'})
print(user_day_count_test_df)

        user_id  active_days
0       3000001            7
1       3000002           37
2       3000003           23
...         ...          ...
999997  3999998           18
999998  3999999           45
999999  4000000           39

[1000000 rows x 2 columns]


In [11]:
user_log_day_clicks_test_df = clk_test_df.groupby(['user_id','time']).creative_id.count().reset_index().rename(columns={'creative_id':'day_clicks'})
print(user_log_day_clicks_df)
user_day_clicks_max_test_df = user_log_day_clicks_test_df.groupby(['user_id']).day_clicks.max().reset_index().rename(columns={'day_clicks':'day_clicks_max'})
user_day_clicks_min_test_df = user_log_day_clicks_test_df.groupby(['user_id']).day_clicks.min().reset_index().rename(columns={'day_clicks':'day_clicks_min'})
user_day_clicks_mean_test_df = user_log_day_clicks_test_df.groupby(['user_id']).day_clicks.mean().reset_index().rename(columns={'day_clicks':'day_clicks_mean'})
user_day_clicks_std_test_df = user_log_day_clicks_test_df.groupby(['user_id']).day_clicks.std().reset_index().rename(columns={'day_clicks':'day_clicks_std'})



          user_id  time  day_clicks
0               1    20           3
1               1    39           1
2               1    40           1
...           ...   ...         ...
19825288   900000    72           1
19825289   900000    74           1
19825290   900000    91           1

[19825291 rows x 3 columns]


In [12]:
user_test_df = user_day_count_test_df.merge(user_click_sum_test_df,on='user_id')
print(user_test_df)

        user_id  active_days  click_times_total
0       3000001            7                 11
1       3000002           37                 65
2       3000003           23                 36
...         ...          ...                ...
999997  3999998           18                 20
999998  3999999           45                 86
999999  4000000           39                 67

[1000000 rows x 3 columns]


In [13]:
user_final_test_df = user_day_count_test_df.merge(user_click_sum_test_df,on='user_id')
user_final_test_df = user_final_test_df.merge(user_day_clicks_max_test_df,on='user_id')
user_final_test_df = user_final_test_df.merge(user_day_clicks_min_test_df,on='user_id')
user_final_test_df = user_final_test_df.merge(user_day_clicks_mean_test_df,on='user_id')
user_final_test_df = user_final_test_df.merge(user_day_clicks_std_test_df,on='user_id')
print(user_final_test_df)


        user_id  active_days  click_times_total  day_clicks_max  day_clicks_min  day_clicks_mean  day_clicks_std
0       3000001            7                 11               3               1         1.571429        0.786796
1       3000002           37                 65               4               1         1.648649        0.823820
2       3000003           23                 36               4               1         1.565217        0.727767
...         ...          ...                ...             ...             ...              ...             ...
999997  3999998           18                 20               2               1         1.055556        0.235702
999998  3999999           45                 86               7               1         1.777778        1.312335
999999  4000000           39                 67               4               1         1.615385        0.814842

[1000000 rows x 7 columns]


In [14]:
user_total_df = pd.concat([user_df, user_final_test_df], axis=0).reset_index().drop(['index'], axis=1)
user_total= 'user_total_df.pkl'
user_total_path = f'{preprocess_path}/{user_total}'
user_total_df.to_pickle(user_total_path)