In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.metrics import roc_auc_score
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

import os
from tqdm import tqdm
import lightgbm as lgb

from sklearn.model_selection import StratifiedKFold
from lightgbm.sklearn import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
def reduce_mem(df):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that fits.

    Signed-integer columns are moved to the first of int8/int16/int32/int64
    whose range contains the column's min and max (bounds inclusive). Float
    columns are moved to float16 or float32 when their range fits, otherwise
    float64. Every other dtype (object/string, bool, unsigned int, datetime,
    categorical and other extension dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    The float choice looks only at the value range, so float16 is selected
    for small-magnitude columns even though it stores far fewer significant
    digits than float32/float64.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        # Dispatch on the NumPy dtype kind instead of the dtype's string
        # prefix: bool ('b') and unsigned ('u') columns must not fall into
        # the float branch, and extension dtypes have no NumPy kind at all.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            # An empty column yields NaN bounds, no candidate matches and
            # the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN or infinite bounds fail every comparison -> float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    # Guard the percentage against a zero-byte frame.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(start_mem, end_mem, saved_pct))
    gc.collect()
    return df

In [3]:
from numba import njit
from scipy.stats import rankdata

@njit
def _auc(actual, pred_ranks):
    """Rank-sum AUC from labels and the ranks of the predictions.

    Returns ``(R - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)`` where ``R`` is
    the sum of ``pred_ranks`` at positions where ``actual == 1``, ``n_pos`` is
    ``sum(actual)`` and ``n_neg`` is ``len(actual) - n_pos``.

    Assumes ``actual`` holds only 0/1 values and contains both classes
    (otherwise the denominator is zero) -- TODO confirm at call sites.
    """
    labels = np.asarray(actual)
    ranks = np.asarray(pred_ranks)
    pos_total = np.sum(labels)
    neg_total = len(labels) - pos_total
    pos_rank_sum = np.sum(ranks[labels == 1])
    return (pos_rank_sum - pos_total * (pos_total + 1) / 2) / (pos_total * neg_total)

def auc(actual, predicted):
    """Return the AUC of ``predicted`` scores against ``actual`` labels.

    The scores are converted to ranks with ``scipy.stats.rankdata`` and the
    rank-sum formula in ``_auc`` does the rest.
    """
    return _auc(actual, rankdata(predicted))

In [4]:
def uAUC(labels, preds):
    """Return the unweighted mean of per-user AUC values.

    Relies on two module-level names that must be set before the call:
    ``usefulid`` (the user ids to score) and ``userid_dict`` (user id ->
    indexer into ``labels`` / ``preds`` selecting that user's rows).

    Raises ZeroDivisionError when ``usefulid`` is empty.
    """
    per_user_scores = [
        auc(labels[userid_dict[uid]], preds[userid_dict[uid]])
        for uid in list(usefulid)
    ]
    return float(sum(per_user_scores)) / float(len(per_user_scores))

def custom_uAUC_eval(y_true, y_pred):
    """Wrap ``uAUC`` as a ``(name, value, is_higher_better)`` evaluation tuple."""
    score = uAUC(y_true, y_pred)
    is_higher_better = True
    return "uAUC", score, is_higher_better



In [5]:
def get_history_stats(data,ycols,gap):
    """Attach look-back statistics per (userid, authorid, feedid) to ``data``.

    For every ``day_id`` in ``1 + gap .. 15`` the rows with
    ``day_id - gap <= day < day_id`` are grouped by
    ``['userid', 'authorid', 'feedid']`` and the following columns are
    computed for the groups that occur on ``day_id`` (``P`` stands for
    ``'userid_authorid_feedid_prev_{gap}day'``):

    * ``P_count``          -- number of rows in the window
    * ``P_stay_mean``      -- mean of ``stay`` in the window
    * ``P_play_mean``      -- mean of ``play`` in the window
    * ``P_{ycol}_count``   -- rows in the window with ``ycol == 1`` (0 if none)
    * ``P_{ycol}_ratio``   -- ``P_{ycol}_count / (P_count + m)`` where ``m`` is
      the mean of ``P_count`` over that day's groups; NaN when the group has
      no rows in the window

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``userid``, ``authorid``, ``feedid``, ``day``, ``id``,
        ``stay``, ``play`` and every column named in ``ycols``.
    ycols : list of str
        Binary target columns to count.
    gap : int
        Length of the look-back window in days.

    Returns
    -------
    pandas.DataFrame
        ``data`` left-merged with the new columns on the group keys and
        ``day``. Rows of days that have no full look-back window get NaN.
        If ``gap`` leaves no day to process, ``data`` is returned unchanged.
    """
    for f in [['userid','authorid','feedid']]:
        prefix = '_'.join(f) + '_prev_{}day'.format(gap)
        count_col = prefix + '_count'
        daily_frames = []

        for day_id in range(1+gap,16):
            print('*************************** start {} ***************************'.format(day_id))
            in_window = (data['day'] < day_id) & (data['day'] >= day_id - gap)
            tmp_data = data.loc[in_window, f + ['id','day','stay','play'] + ycols]
            res_data = data.loc[data['day'] == day_id, f + ['day']].drop_duplicates()

            # Aggregate then rename: the dict-renaming form
            # ``SeriesGroupBy.agg({'new_name': 'count'})`` was removed in pandas 1.0.
            # Every row of res_data belongs to day_id, so joining on the group
            # keys alone is equivalent to joining on keys + day.
            exposure = tmp_data.groupby(f, as_index=False)['id'].count().rename(columns={'id': count_col})
            res_data = res_data.merge(exposure, on=f, how='left')

            for src in ('stay', 'play'):
                mean_col = '{}_{}_mean'.format(prefix, src)
                averages = tmp_data.groupby(f, as_index=False)[src].mean().rename(columns={src: mean_col})
                res_data = res_data.merge(averages, on=f, how='left')

            # Smoothing term of the ratio; it does not depend on the target.
            smoothing = res_data[count_col].mean()

            for ycol in ycols:
                hit_col = '{}_{}_count'.format(prefix, ycol)
                ratio_col = '{}_{}_ratio'.format(prefix, ycol)

                click_df = tmp_data[tmp_data[ycol] == 1]
                hits = click_df.groupby(f, as_index=False)['id'].count().rename(columns={'id': hit_col})
                res_data = res_data.merge(hits, on=f, how='left')

                # Groups without a positive row in the window count as 0.
                res_data[hit_col] = res_data[hit_col].fillna(0)
                res_data[ratio_col] = res_data[hit_col] / (res_data[count_col] + smoothing)

            daily_frames.append(res_data)

        # gap >= 15 leaves nothing to compute; keep data as is instead of
        # failing on an empty merge.
        if not daily_frames:
            continue

        # Concatenate once instead of growing a frame inside the loop.
        res = pd.concat(daily_frames)
        data = data.merge(res,on = f + ['day'],how = 'left')
    return data

In [6]:
def tfidf(input_values, output_num, output_prefix, seed=1024):
    """TF-IDF encode ``input_values`` and reduce to ``output_num`` SVD components.

    Returns a DataFrame with one row per input document and columns
    ``'{output_prefix}_tfidf_{i}'`` for ``i`` in ``range(output_num)``.
    ``seed`` is passed to ``TruncatedSVD`` as ``random_state``.
    """
    term_matrix = TfidfVectorizer().fit_transform(input_values)
    reducer = TruncatedSVD(n_components=output_num, n_iter=20, random_state=seed)
    column_names = ['{}_tfidf_{}'.format(output_prefix, i) for i in range(output_num)]
    return pd.DataFrame(reducer.fit_transform(term_matrix), columns=column_names)


def count2vec(input_values, output_num, output_prefix, seed=1024):
    """Count-vectorize ``input_values`` and reduce to ``output_num`` SVD components.

    Returns a DataFrame with one row per input document and columns
    ``'{output_prefix}_countvec_{i}'`` for ``i`` in ``range(output_num)``.
    ``seed`` is passed to ``TruncatedSVD`` as ``random_state``.
    """
    term_matrix = CountVectorizer().fit_transform(input_values)
    reducer = TruncatedSVD(n_components=output_num, n_iter=20, random_state=seed)
    column_names = ['{}_countvec_{}'.format(output_prefix, i) for i in range(output_num)]
    return pd.DataFrame(reducer.fit_transform(term_matrix), columns=column_names)

def get_tfidf_count_vec(feed,group_id,group_target,num):
    """Build TF-IDF and count-vector SVD features for one text column.

    Parameters
    ----------
    feed : pandas.DataFrame
        Frame holding the ``group_id`` and ``group_target`` columns.
    group_id : str
        Key column copied into the result.
    group_target : str
        Text column to vectorize; also used as the feature-name prefix.
    num : int
        Number of SVD components produced by each of ``tfidf`` and
        ``count2vec`` (previously ignored in favour of a hard-coded 4).

    Returns
    -------
    pandas.DataFrame
        ``group_id`` followed by ``num`` tfidf and ``num`` countvec columns,
        one row per row of ``feed``.
    """
    # tfidf()/count2vec() return frames indexed 0..n-1; reset the index here
    # so the column-wise concat lines rows up by position even when ``feed``
    # does not carry a default index.
    tmp = feed[[group_id,group_target]].reset_index(drop=True)
    tfidf_tmp = tfidf(tmp[group_target], num, group_target)
    count_tmp = count2vec(tmp[group_target], num, group_target)
    return pd.concat([tmp[group_id], tfidf_tmp, count_tmp], axis=1)



In [7]:
print("*********************{}***********************".format('load_data'))
PATH = 'wx/wechat_algo_data1/'

# Read every input CSV from PATH, in the same order as before.
feed_embeddings, feed_info, sample, test_a, user_action = (
    pd.read_csv(PATH + csv_name)
    for csv_name in (
        "feed_embeddings.csv",
        "feed_info.csv",
        "submit_demo_初赛a.csv",
        "test_a.csv",
        "user_action.csv",
    )
)

# Column names are assigned positionally, so the CSV must have exactly these
# 13 columns in this order.
user_action.columns = [
    'userid', 'feedid', 'day', 'device', 'read_comment', 'comment', 'like',
    'play', 'stay', 'click_avatar', 'forward', 'follow', 'favorite',
]

# Test rows are tagged as day 15 and stacked below the logged actions; the
# columns they lack come out as NaN.
test_a['day'] = 15
data = pd.concat([user_action, test_a], ignore_index=True)
data['id'] = data.index + 1  # 1-based row id

# Attach author / duration / bgm metadata of each feed.
data = data.merge(
    feed_info[['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id']],
    on='feedid',
    how='left',
)

print("*********************{}***********************".format('Finish'))

*********************load_data***********************
*********************Finish***********************


In [8]:
print("*********************{}***********************".format('get_history_fea'))
# Targets whose past positives are counted in each look-back window.
history_targets = ['read_comment', 'comment', 'like', 'click_avatar', 'forward', 'follow', 'favorite']
# NOTE: this reassigns `data`, so re-running the cell applies the merges a second time.
for gap in (1, 3, 7):
    data = get_history_stats(data, history_targets, gap)

print("*********************{}***********************".format('Finish'))

*********************get_history_fea***********************
*************************** start 2 ***************************
*************************** start 3 ***************************
*************************** start 4 ***************************
*************************** start 5 ***************************
*************************** start 6 ***************************
*************************** start 7 ***************************
*************************** start 8 ***************************
*************************** start 9 ***************************
*************************** start 10 ***************************
*************************** start 11 ***************************
*************************** start 12 ***************************
*************************** start 13 ***************************
*************************** start 14 ***************************
*************************** start 15 ***************************
*************************** start 4 **

In [9]:
# For each ';'-separated list column keep only its first entry as an int id
# (-1 when the list is missing or starts with an empty entry).
# 'manual_keword' (sic) is the spelling later cells rely on.
for banner, list_col, first_col in [
    ('get_first_keyword', 'manual_keyword_list', 'manual_keword'),
    ('get_first_tag', 'manual_tag_list', 'manual_tag'),
]:
    print("*********************{}***********************".format(banner))
    feed_info[list_col] = feed_info[list_col].fillna(';')
    first_entry = feed_info[list_col].apply(lambda x: x.split(';')[0]).fillna(-1)
    feed_info[first_col] = first_entry.replace('', '-1').astype(int)

data = data.merge(feed_info[['feedid', 'manual_keword', 'manual_tag']], on='feedid', how='left')
print("*********************{}***********************".format('Finish'))

*********************get_first_keyword***********************
*********************get_first_tag***********************
*********************Finish***********************


In [10]:
print("*********************{}***********************".format('tfidf_fea'))

# Text columns to embed; the *_char variants are not used.
for group_target in ('description', 'ocr', 'asr'):
    print(group_target)
    feed_info[group_target] = feed_info[group_target].fillna('')
    text_svd = get_tfidf_count_vec(feed_info, 'feedid', group_target, 4)
    data = data.merge(text_svd, on='feedid', how='left')

print("*********************{}***********************".format('Finish'))

*********************tfidf_fea***********************
description
ocr
asr
*********************Finish***********************


In [11]:
print("*********************{}***********************".format('count_fea'))
cate_cols = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id', 'videoplayseconds', 'manual_keword', 'manual_tag']
for col in cate_cols:
    # Missing ids become -1 so the column can be cast to int, then each value
    # is mapped to how many rows of `data` carry it.
    as_int = data[col].fillna(-1).astype(int)
    data[col] = as_int
    data[col + '_count'] = as_int.map(as_int.value_counts())

print("*********************{}***********************".format('Finish'))


print("*********************{}***********************".format('cross_fea'))
# Every categorical column except 'userid' itself.
for col in cate_cols[1:]:
    # Distinct values of `col` seen per user ...
    data['user_{}_nunique'.format(col)] = data.groupby('userid')[col].transform('nunique')
    # ... and distinct users seen per value of `col`.
    data['{}_user_nunique'.format(col)] = data.groupby(col)['userid'].transform('nunique')


print("*********************{}***********************".format('Finish'))

*********************count_fea***********************
*********************Finish***********************
*********************cross_fea***********************
*********************Finish***********************


In [12]:
print('========================================================================================================')
# Rows with a 'read_comment' label are training data; rows without one are the test set.
is_labeled = data['read_comment'].notna()
train_df = data[is_labeled].reset_index(drop=True)
test_df = data[data['read_comment'].isna()].reset_index(drop=True)

labels = train_df[['read_comment', 'like', 'click_avatar', 'forward']]

# Time-based hold-out: days before 14 train, days 14 and later validate.
train_idx = train_df.index[train_df['day'] < 14].tolist()
val_idx = train_df.index[train_df['day'] >= 14].tolist()

gc.collect()
print('========================================================================================================')
print('========================================================================================================')



In [14]:
# Columns that are targets, post-hoc signals or bookkeeping, not model inputs.
non_feature_cols = {
    'day', 'read_comment', 'comment', 'like', 'play', 'stay',
    'click_avatar', 'forward', 'follow', 'favorite', 'id',
}
used_feat = [name for name in train_df.columns if name not in non_feature_cols]
print(len(used_feat))
print(used_feat)


106
['userid', 'feedid', 'device', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id', 'userid_authorid_feedid_prev_1day_count', 'userid_authorid_feedid_prev_1day_stay_mean', 'userid_authorid_feedid_prev_1day_play_mean', 'userid_authorid_feedid_prev_1day_read_comment_count', 'userid_authorid_feedid_prev_1day_read_comment_ratio', 'userid_authorid_feedid_prev_1day_comment_count', 'userid_authorid_feedid_prev_1day_comment_ratio', 'userid_authorid_feedid_prev_1day_like_count', 'userid_authorid_feedid_prev_1day_like_ratio', 'userid_authorid_feedid_prev_1day_click_avatar_count', 'userid_authorid_feedid_prev_1day_click_avatar_ratio', 'userid_authorid_feedid_prev_1day_forward_count', 'userid_authorid_feedid_prev_1day_forward_ratio', 'userid_authorid_feedid_prev_1day_follow_count', 'userid_authorid_feedid_prev_1day_follow_ratio', 'userid_authorid_feedid_prev_1day_favorite_count', 'userid_authorid_feedid_prev_1day_favorite_ratio', 'userid_authorid_feedid_prev_3day_count', 'userid_aut

In [15]:
# Columns LightGBM is told to treat as categorical (each name listed once).
cate_cols = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id', 'manual_keword', 'manual_tag']

# Validation rows (day >= 14) with their labels; out-of-fold predictions are
# stored next to them as '<target>_prob'.
df_oof = train_df[['userid', 'read_comment', 'comment', 'like', 'click_avatar', 'forward', 'follow', 'favorite']].iloc[val_idx].reset_index(drop = True)
df_oof['id'] = df_oof.index
# userid -> array of that user's row positions inside the validation set.
# uAUC() reads this module-level dict.
temp = df_oof.groupby(['userid'])['id'].apply(lambda x:np.array(x)).reset_index()
userid_dict =  dict(zip(temp['userid'].to_list(), temp['id'].to_list()))

best_rounds_list = []
df_importance_list = []

# The feature matrices do not depend on the target, so build them once
# instead of re-slicing train_df for every target.
train_x = train_df[used_feat].iloc[train_idx].reset_index(drop=True)
val_x = train_df[used_feat].iloc[val_idx].reset_index(drop=True)

for ycol in ['read_comment', 'like', 'click_avatar', 'forward']:
    train_y = labels[ycol][train_idx]
    val_y = labels[ycol][val_idx]

    # Users whose validation labels are all 0 or all 1 have no defined AUC;
    # uAUC() only scores the users kept in the module-level `usefulid`.
    df_oof[ycol] = df_oof[ycol].astype(int)
    tt = df_oof.groupby(['userid'])[ycol].agg(['mean']).reset_index()
    usefulid = set(tt['userid']) - set(tt[tt['mean'].isin([0,1])]['userid'])

    # n_estimators is an effectively unlimited cap; early stopping decides
    # the real number of trees.
    clf = LGBMClassifier(
        learning_rate = 0.05,
        n_estimators = 10000000,
        num_leaves = 32,
        subsample = 0.9,
        colsample_bytree = 0.8,
        random_state = 2019,
        metric = 'auc'
    )

    # train
    clf.fit(
        train_x,train_y,
        eval_set = [(val_x,val_y)],
        eval_metric=lambda y_true, y_pred: [custom_uAUC_eval(y_true, y_pred)],
        categorical_feature = cate_cols,
        early_stopping_rounds = 200,
        verbose = 50
    )

    # val predict
    best_rounds = clf.best_iteration_

    val_pred = clf.predict_proba(val_x)[:, 1]
    df_oof['{}_prob'.format(ycol)] = val_pred

    uAUC_score = uAUC(df_oof[ycol].values,df_oof['{}_prob'.format(ycol)].values)

    df_importance = pd.DataFrame({
            'column': used_feat,
            'importance': clf.feature_importances_,
        })
    df_importance_list.append(df_importance)

    print('uAUC_score',uAUC_score)
    print(best_rounds)
    best_rounds_list.append(best_rounds)

    del clf

Training until validation scores don't improve for 200 rounds
[50]	valid_0's auc: 0.908964	valid_0's uAUC: 0.59723
[100]	valid_0's auc: 0.919483	valid_0's uAUC: 0.610154
[150]	valid_0's auc: 0.924215	valid_0's uAUC: 0.616733
[200]	valid_0's auc: 0.92663	valid_0's uAUC: 0.622169
[250]	valid_0's auc: 0.928259	valid_0's uAUC: 0.627538
[300]	valid_0's auc: 0.929194	valid_0's uAUC: 0.630371
[350]	valid_0's auc: 0.929778	valid_0's uAUC: 0.631716
[400]	valid_0's auc: 0.930285	valid_0's uAUC: 0.633311
[450]	valid_0's auc: 0.93063	valid_0's uAUC: 0.634656
[500]	valid_0's auc: 0.930916	valid_0's uAUC: 0.635532
[550]	valid_0's auc: 0.931086	valid_0's uAUC: 0.635864
[600]	valid_0's auc: 0.931278	valid_0's uAUC: 0.636127
[650]	valid_0's auc: 0.93143	valid_0's uAUC: 0.636631
[700]	valid_0's auc: 0.931492	valid_0's uAUC: 0.637257
[750]	valid_0's auc: 0.931561	valid_0's uAUC: 0.638254
[800]	valid_0's auc: 0.931573	valid_0's uAUC: 0.63906
[850]	valid_0's auc: 0.931667	valid_0's uAUC: 0.639348
[900]	val

In [22]:
# Display `best_rounds` as left by the last iteration of the validation
# training loop (the 'forward' target).
best_rounds

251

In [21]:
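# Scratch notes from earlier runs; presumably one value per target in loop order
# (read_comment, like, click_avatar, forward) -- TODO confirm what the first two rows measure.
#[6404,6329,7219,6997]
#[0.6406,0.6318,0.7228,0.7076]

# Presumably best_rounds_list from the run above (its last entry, 251, equals the best_rounds shown).
#[1063,693,575,251]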


In [16]:
# Top 60 features by importance for the first target trained ('read_comment').
df_importance_list[0].sort_values(by='importance', ascending=False).head(60)

Unnamed: 0,column,importance
0,userid,17263
3,authorid,4964
1,feedid,3730
58,manual_keword,1072
6,bgm_singer_id,1035
5,bgm_song_id,942
59,manual_tag,447
102,user_manual_keword_nunique,272
2,device,169
84,userid_count,150


In [23]:
# target -> early-stopped round count; the target order must match the order
# in which the validation loop appended to best_rounds_list.
n_estimator_dict = {
    target: rounds
    for target, rounds in zip(['read_comment', 'like', 'click_avatar', 'forward'], best_rounds_list)
}

In [18]:
# Submission template: the same CSV that the data-loading cell reads into `sample`.
sub = pd.read_csv("wx/wechat_algo_data1/submit_demo_初赛a.csv")

In [None]:
# Columns LightGBM is told to treat as categorical (each name listed once).
cate_cols = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id', 'manual_keword', 'manual_tag']

# Feature frames are the same for every target; slice them once.
full_x = train_df[used_feat]
test_x = test_df[used_feat]

for ycol in ['read_comment', 'like', 'click_avatar', 'forward']:

    # Refit on all labelled rows with the round count found by early
    # stopping on the hold-out for this target.
    clf = LGBMClassifier(
        learning_rate = 0.05,
        n_estimators = n_estimator_dict[ycol],
        num_leaves = 32,
        subsample = 0.9,
        colsample_bytree = 0.8,
        random_state = 2019,
        metric = None
    )
    # train; the eval_set is the training data itself, so the log shows the
    # training metric only.
    clf.fit(
        full_x,labels[ycol],
        eval_set = [(full_x,labels[ycol])],
        categorical_feature = cate_cols,
        verbose = 50
    )

    # NOTE(review): predictions are written into `sub` by position, which
    # assumes the submission template has the same row order as test_df --
    # confirm against the (userid, feedid) columns.
    res = clf.predict_proba(test_x)[:,1]
    sub[ycol] = res
    
    

In [32]:
# Display the filled-in submission frame (one probability column per target).
sub

Unnamed: 0,userid,feedid,read_comment,like,click_avatar,forward
0,14298,67227,0.000007,0.004045,0.009690,0.004760
1,68356,91864,0.002715,0.001992,0.001203,0.000858
2,49925,104657,0.015007,0.001351,0.002040,0.003640
3,60529,23738,0.033674,0.009555,0.010546,0.001543
4,131482,69038,0.000003,0.029216,0.009201,0.000558
...,...,...,...,...,...,...
421980,133812,56450,0.019016,0.005787,0.003526,0.000932
421981,231669,76501,0.000006,0.139555,0.003746,0.001096
421982,179168,70550,0.025822,0.026078,0.001700,0.001916
421983,92546,49432,0.013522,0.012677,0.002055,0.000202


In [33]:
# Write the submission to 'baseline.csv' in the working directory, without the index column.
sub.to_csv('baseline.csv',index = False)