In [1]:
import pandas as pd
import gc
import numpy as np

In [2]:
train_path = 'train_format1.csv'
log_path = 'user_log_format1.csv'
info_path = 'user_info_format1.csv'
test_path = 'test_format1.csv'

In [3]:
raw_train_data = pd.read_csv(train_path)
raw_test_data = pd.read_csv(test_path)
raw_log_data = pd.read_csv(log_path).rename(columns = {'seller_id':'merchant_id'})
raw_info_data = pd.read_csv(info_path)

### Necessary Features

In [4]:
#商家有多少商品：merchant_id + item_id
#商家有多少品类：merchant_id + cat_id
#商家有多少品牌：merchant_id + brand_id
#商家的商品被点击、收藏、加车、购买了多少次：merchant_id + action_type
#商家被活跃天数：merchant_id + time_stamp
#商家吸引的用户数：merchant_id + user_id

#用户年龄段：age_range
#用户性别：gender
#用户活跃天数：days_count
#用户活跃月数：months_count
#用户所有点击、收藏、加车、购买了多少次：action_counts
#用户活跃的商家数：merchant_id

#用户在目标店铺点击、收藏、加车、购买了多少次：action_counts
#用户在目标店铺活跃天数：days_count of specified user and merchant
#用户在目标店铺活跃月数：months_count of specified user and merchant

### Data cleaning

In [5]:
raw_log_data['brand_id'].fillna(0, inplace=True)
raw_info_data['gender'].fillna(2, inplace=True)
raw_info_data['age_range'].fillna(2, inplace=True)
del raw_test_data['prob']

In [6]:
%%time
groups = raw_log_data.groupby(['merchant_id'])
feature = groups['item_id'].agg('nunique').reset_index().rename(columns = {'item_id':'item_count'})
temp = groups['cat_id'].agg('nunique').reset_index().rename(columns = {'cat_id':'cat_count'})
feature = feature.merge(temp,on='merchant_id',how='left')
temp = groups['brand_id'].agg('nunique').reset_index().rename(columns = {'brand_id':'brand_count'})
feature = feature.merge(temp,on='merchant_id',how='left')
temp = raw_log_data.groupby(['merchant_id','action_type']).size().reset_index().rename(columns={0:'cnt'})
temp = pd.get_dummies(temp,columns = ['action_type'],prefix = ['actions'])
columns = [i for i in temp.columns.tolist() if 'actions' in i]
for i in columns:
    temp[i] *= temp['cnt']
temp = temp.groupby('merchant_id').agg('sum').reset_index()
del temp['cnt']
feature = feature.merge(temp,on = 'merchant_id',how = 'left')

CPU times: user 54.2 s, sys: 3.48 s, total: 57.7 s
Wall time: 57.7 s


In [7]:
%%time
temp = groups['time_stamp'].agg('nunique').reset_index().rename(columns = {'time_stamp':'days_count'})
feature = feature.merge(temp,on = 'merchant_id',how = 'left')
temp = groups['user_id'].agg('nunique').reset_index().rename(columns = {'user_id':'user_count'})
feature = feature.merge(temp,on = 'merchant_id',how = 'left')

CPU times: user 22.9 s, sys: 1.27 s, total: 24.2 s
Wall time: 24.2 s


In [8]:
groups = raw_log_data.groupby(['user_id'])
user_feature = raw_info_data
temp = groups['time_stamp'].agg('nunique').reset_index().rename(columns = {'time_stamp':'days_count'})
user_feature = user_feature.merge(temp,on = 'user_id',how = 'left')
group = raw_log_data.copy()
group['time_stamp'] //= 100
group = group.groupby(['user_id'])
temp = group['time_stamp'].agg('nunique').reset_index().rename(columns = {'time_stamp':'month_count'})
user_feature = user_feature.merge(temp,on = 'user_id',how = 'left')
temp = raw_log_data.groupby(['user_id','action_type']).size().reset_index().rename(columns={0:'cnt'})
temp = pd.get_dummies(temp,columns = ['action_type'],prefix = ['action'])
columns = [i for i in temp.columns.tolist() if 'action' in i]
for i in columns:
    temp[i] *= temp['cnt']
temp = temp.groupby('user_id').agg('sum').reset_index()
del temp['cnt']
user_feature = user_feature.merge(temp,on = 'user_id',how = 'left')
temp = groups['merchant_id'].agg('nunique').reset_index().rename(columns = {'merchant_id':'merchant_count'})

In [9]:
groups = raw_log_data.groupby(['user_id','merchant_id'])
mutual_feature_train = raw_train_data
mutual_feature_pred = raw_test_data
temp = groups['time_stamp'].agg('nunique').reset_index().rename(columns = {'time_stamp':'days'})
mutual_feature_train = mutual_feature_train.merge(temp,on = ['user_id','merchant_id'],how = 'left')
mutual_feature_pred = mutual_feature_pred.merge(temp,on = ['user_id','merchant_id'],how = 'left')
group = raw_log_data.copy()
group['time_stamp'] //= 100
group = group.groupby(['user_id','merchant_id'])
temp = group['time_stamp'].agg('nunique').reset_index().rename(columns = {'time_stamp':'month'})
mutual_feature_train = mutual_feature_train.merge(temp,on = ['user_id','merchant_id'],how = 'left')
mutual_feature_pred = mutual_feature_pred.merge(temp,on = ['user_id','merchant_id'],how = 'left')
temp = raw_log_data.groupby(['user_id','merchant_id','action_type']).size().reset_index().rename(columns={0:'cnt'})
temp = pd.get_dummies(temp,columns = ['action_type'],prefix = ['act'])
columns = [i for i in temp.columns.tolist() if 'act' in i]
for i in columns:
    temp[i] *= temp['cnt']
temp = temp.groupby(['user_id','merchant_id']).agg('sum').reset_index()
del temp['cnt']
mutual_feature_train = mutual_feature_train.merge(temp,on = ['user_id','merchant_id'],how = 'left')
mutual_feature_pred = mutual_feature_pred.merge(temp,on = ['user_id','merchant_id'],how = 'left')

In [10]:
mutual_feature_pred =mutual_feature_pred.merge(feature,on = 'merchant_id',how = 'left')
mutual_feature_pred =mutual_feature_pred.merge(user_feature,on = 'user_id',how = 'left')
mutual_feature_train = mutual_feature_train.merge(feature,on = 'merchant_id',how = 'left')
mutual_feature_train = mutual_feature_train.merge(user_feature,on = 'user_id',how = 'left')

In [11]:
from sklearn.model_selection import train_test_split
import xgboost as xgb  
from sklearn.metrics import roc_auc_score, roc_curve, auc

In [12]:
%%time
def xgb_train(X_train, y_train, X_valid, y_valid, verbose=True):
    model_xgb = xgb.XGBClassifier(
        max_depth=10, # raw8
        n_estimators=1000,
        min_child_weight=300, 
        colsample_bytree=0.8, 
        subsample=0.8, 
        eta=0.3,    
        seed=42        
    )

    model_xgb.fit(
        X_train, 
        y_train,
        eval_metric='auc',
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=verbose,
        early_stopping_rounds=10 # 早停法，如果auc在10epoch没有进步就stop
    )
    print(model_xgb.best_score)
    return model_xgb

CPU times: user 6 µs, sys: 1e+03 ns, total: 7 µs
Wall time: 12.6 µs


In [13]:
# train
train_y = mutual_feature_train.pop('label')
train_X = mutual_feature_train.iloc[:,2:]
pred_X = mutual_feature_pred.iloc[:,2:]
X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=.2, random_state=42) # test_size=.3

In [14]:
model_xgb = xgb_train(X_train,y_train, X_valid, y_valid)



[0]	validation_0-auc:0.63287	validation_1-auc:0.63348
[1]	validation_0-auc:0.64954	validation_1-auc:0.64167
[2]	validation_0-auc:0.65497	validation_1-auc:0.64389
[3]	validation_0-auc:0.65790	validation_1-auc:0.64385
[4]	validation_0-auc:0.66194	validation_1-auc:0.64832
[5]	validation_0-auc:0.66319	validation_1-auc:0.65072
[6]	validation_0-auc:0.66422	validation_1-auc:0.65079
[7]	validation_0-auc:0.66499	validation_1-auc:0.65248
[8]	validation_0-auc:0.66602	validation_1-auc:0.65298
[9]	validation_0-auc:0.66758	validation_1-auc:0.65433
[10]	validation_0-auc:0.66745	validation_1-auc:0.65422
[11]	validation_0-auc:0.66903	validation_1-auc:0.65687
[12]	validation_0-auc:0.67034	validation_1-auc:0.65765
[13]	validation_0-auc:0.67104	validation_1-auc:0.65900
[14]	validation_0-auc:0.67360	validation_1-auc:0.66204
[15]	validation_0-auc:0.67485	validation_1-auc:0.66302
[16]	validation_0-auc:0.67609	validation_1-auc:0.66354
[17]	validation_0-auc:0.67681	validation_1-auc:0.66420
[18]	validation_0-au

In [16]:
%%time
# predict
prob = model_xgb.predict_proba(pred_X)
submission = mutual_feature_pred.iloc[:,:2]
submission['prob'] = pd.Series(prob[:,1])
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('ans_oct.csv', index=False)

CPU times: user 4.49 s, sys: 73 ms, total: 4.57 s
Wall time: 628 ms


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
import lightgbm as lgb
import catboost as cat