## Feature Engineering(Manual)
### Import, Read data

In [28]:
#package load
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ParameterSampler
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
import pandas as pd
import numpy as np
import os
dicpath = os.getcwd()


def len_consec_zeros(a):
    a = np.array(list(a))    # convert elements to `str`
    rr = np.argwhere(a == '0').ravel()  # find out positions of `0`
    if not rr.size:  # if there are no zeros, return 0
        return 0

    full = np.arange(rr[0], rr[-1]+1)  # get the range of spread of 0s

    # get the indices where `0` was flipped to something else
    diff = np.setdiff1d(full, rr)
    if not diff.size:     # if there are no bit flips, return the 
        return len(full)  # size of the full range

    # break the array into pieces wherever there's a bit flip
    # and the result is the size of the largest chunk
    pos, difs = full[0], []
    for el in diff:
        difs.append(el - pos)
        pos = el + 1

    difs.append(full[-1]+1 - pos)

    # return size of the largest chunk
    res = max(difs) if max(difs) != 1 else 0

    return res



#data load
train_activity = pd.read_csv(dicpath+'/train/train_activity.csv',encoding='EUC-kr')
train_label = pd.read_csv(dicpath+'/train/train_label.csv',encoding='EUC-kr')
train_guild = pd.read_csv(dicpath+'/train/train_guild.csv',encoding='EUC-kr')
train_party = pd.read_csv(dicpath+'/train/train_party.csv',encoding='EUC-kr')
train_payment = pd.read_csv(dicpath+'/train/train_payment.csv',encoding='EUC-kr')
train_trade = pd.read_csv(dicpath+'/train/train_trade.csv',encoding='EUC-kr')

test_activity = pd.read_csv(dicpath+'/test/test_activity.csv',encoding='EUC-kr')
test_guild = pd.read_csv(dicpath+'/test/test_guild.csv',encoding='EUC-kr')
test_party = pd.read_csv(dicpath+'/test/test_party.csv',encoding='EUC-kr')
test_payment = pd.read_csv(dicpath+'/test/test_payment.csv',encoding='EUC-kr')
test_trade = pd.read_csv(dicpath+'/test/test_trade.csv',encoding='EUC-kr')
test_label = pd.DataFrame(test_payment["acc_id"].unique())
test_label.columns = ["acc_id"]

train_label = train_label.append(test_label)
train_activity = train_activity.append(test_activity)
train_guild = train_guild.append(test_guild)
train_party = train_party.append(test_party)
train_payment = train_payment.append(test_party)
train_trade = train_trade.append(test_trade)

### Feature Engineering(Manual)

In [33]:
#생성된 파생변수를 담을 데이터 프레임 생성
act_ft_eng=pd.DataFrame({"acc_id":train_activity["acc_id"],"wk":train_activity["wk"]})

#play_time related
act_ft_eng["npc_exp_div_play_time"] = np.exp(train_activity["npc_exp"])/np.exp(train_activity["play_time"])
act_ft_eng["npc_hongmun_div_play_time"] = np.exp(train_activity["npc_hongmun"])/np.exp(train_activity["play_time"])
act_ft_eng["quest_exp_div_play_time"] = np.exp(train_activity["quest_exp"])/np.exp(train_activity["play_time"])
act_ft_eng["quest_hongmun_div_play_time"] = np.exp(train_activity["quest_hongmun"])/np.exp(train_activity["play_time"])
act_ft_eng["item_hongmun_div_play_time"] = np.exp(train_activity["item_hongmun"])/np.exp(train_activity["play_time"])
act_ft_eng["game_combat_time_div_play_time"] = np.exp(train_activity["game_combat_time"])/np.exp(train_activity["play_time"])
act_ft_eng["get_money_div_play_time"] = np.exp(train_activity["get_money"])/np.exp(train_activity["play_time"])
act_ft_eng["normal_chat_div_play_time"] = np.exp(train_activity["normal_chat"])/np.exp(train_activity["play_time"])
act_ft_eng["whisper_chat_div_play_time"] = np.exp(train_activity["whisper_chat"])/np.exp(train_activity["play_time"])
act_ft_eng["district_chat_div_play_time"] = np.exp(train_activity["district_chat"])/np.exp(train_activity["play_time"])
act_ft_eng["party_chat_div_play_time"] = np.exp(train_activity["party_chat"])/np.exp(train_activity["play_time"])
act_ft_eng["guild_chat_div_play_time"] = np.exp(train_activity["guild_chat"])/np.exp(train_activity["play_time"])
act_ft_eng["faction_chat_div_play_time"] = np.exp(train_activity["faction_chat"])/np.exp(train_activity["play_time"])
act_ft_eng["gathering_cnt_div_play_time"] = np.exp(train_activity["gathering_cnt"])/np.exp(train_activity["play_time"])
act_ft_eng["making_cnt_div_play_time"] = np.exp(train_activity["making_cnt"])/np.exp(train_activity["play_time"])

#cnt_dt related
act_ft_eng["play_time_div_cnt_dt"] = np.exp(train_activity["play_time"])/np.exp(train_activity["cnt_dt"])

#game_combat_time related
act_ft_eng["npc_exp_div_game_combat_time"] = np.exp(train_activity["npc_exp"])/np.exp(train_activity["game_combat_time"])
act_ft_eng["npc_hongmun_div_game_combat_time"] = np.exp(train_activity["npc_hongmun"])/np.exp(train_activity["game_combat_time"])
act_ft_eng["quest_exp_div_game_combat_time"] = np.exp(train_activity["quest_exp"])/np.exp(train_activity["game_combat_time"])
act_ft_eng["quest_hongmun_div_game_combat_time"] = np.exp(train_activity["quest_hongmun"])/np.exp(train_activity["game_combat_time"])
act_ft_eng["item_hongmun_div_game_combat_time"] = np.exp(train_activity["item_hongmun"])/np.exp(train_activity["game_combat_time"])
act_ft_eng["get_money_div_game_combat_time"] = np.exp(train_activity["get_money"])/np.exp(train_activity["game_combat_time"])

#duel related
act_ft_eng["duel_win_div_duel_cnt"] = np.exp(train_activity["duel_win"])/np.exp(train_activity["duel_cnt"])
act_ft_eng["partyb_win_div_partyb_cnt"] = np.exp(train_activity["partybattle_win"])/np.exp(train_activity["partybattle_cnt"])

#raid related
act_ft_eng["cnt_enter_inzone_solo_div_cnt_enter_raid"] = np.exp(train_activity["cnt_enter_inzone_solo"])/np.exp(train_activity["cnt_enter_raid"])
act_ft_eng["cnt_clear_raid_div_cnt_enter_raid"] = np.exp(train_activity["cnt_clear_raid"])/np.exp(train_activity["cnt_enter_raid"])

#train_activity 데이터에 새로 생성한 파생변수 붙여넣기
train_activity = train_activity.merge(act_ft_eng,how="left")

#수동으로 aggregate + 가중 평균 + 시차 변수 생성 with count
tmp_cnt_agg = train_activity[train_activity.wk == 8][['acc_id','cnt_dt']]\
.merge(train_activity[train_activity.wk == 7][['acc_id','cnt_dt']],how='left',on="acc_id")\
.merge(train_activity[train_activity.wk == 6][['acc_id','cnt_dt']],how='left',on="acc_id")\
.merge(train_activity[train_activity.wk == 5][['acc_id','cnt_dt']],how='left',on="acc_id")\
.merge(train_activity[train_activity.wk == 4][['acc_id','cnt_dt']],how='left',on="acc_id")\
.merge(train_activity[train_activity.wk == 3][['acc_id','cnt_dt']],how='left',on="acc_id")\
.merge(train_activity[train_activity.wk == 2][['acc_id','cnt_dt']],how='left',on="acc_id")\
.merge(train_activity[train_activity.wk == 1][['acc_id','cnt_dt']],how='left',on="acc_id")
tmp_cnt_agg.columns = ["acc_id",'wk8','wk7','wk6','wk5','wk4','wk3','wk2','wk1']
tmp_cnt_agg = tmp_cnt_agg.fillna(0)
for column in tmp_cnt_agg:
    if column != 'acc_id':
        tmp_cnt_agg[column] = tmp_cnt_agg[column].astype(int)
tmp_cnt_agg["mean_wk"] = tmp_cnt_agg.drop(['acc_id'],axis=1).mean(axis=1)
tmp_cnt_agg["std_wk"] = tmp_cnt_agg.drop(['acc_id'],axis=1).std(axis=1)
tmp_cnt_agg["wgt_mean_wk"] = (tmp_cnt_agg["wk8"]*8+tmp_cnt_agg["wk7"]*7+tmp_cnt_agg["wk6"]*6+tmp_cnt_agg["wk5"]*5+tmp_cnt_agg["wk4"]*4+tmp_cnt_agg["wk3"]*3+tmp_cnt_agg["wk2"]*2+tmp_cnt_agg["wk1"])/36
tmp_cnt_agg["cnt_lag_78"] = tmp_cnt_agg["wk8"]-tmp_cnt_agg["wk7"]
tmp_cnt_agg["cnt_lag_68"] = tmp_cnt_agg["wk8"]-tmp_cnt_agg["wk6"]

#수동으로 aggregate + 가중 평균 + 시차 변수 생성 with time
tmp_time_agg = train_activity[train_activity.wk == 8][['acc_id','play_time']]\
.merge(train_activity[train_activity.wk == 7][['acc_id','play_time']],how='left',on="acc_id")\
.merge(train_activity[train_activity.wk == 6][['acc_id','play_time']],how='left',on="acc_id")\
.merge(train_activity[train_activity.wk == 5][['acc_id','play_time']],how='left',on="acc_id")\
.merge(train_activity[train_activity.wk == 4][['acc_id','play_time']],how='left',on="acc_id")\
.merge(train_activity[train_activity.wk == 3][['acc_id','play_time']],how='left',on="acc_id")\
.merge(train_activity[train_activity.wk == 2][['acc_id','play_time']],how='left',on="acc_id")\
.merge(train_activity[train_activity.wk == 1][['acc_id','play_time']],how='left',on="acc_id")
tmp_time_agg.columns = ["acc_id",'play_time_wk8','play_time_wk7','play_time_wk6','play_time_wk5','play_time_wk4','play_time_wk3','play_time_wk2','play_time_wk1']
tmp_time_agg = tmp_time_agg.fillna(-0.6617)

tmp_time_agg["mean_wk_pt"] = tmp_time_agg.drop(['acc_id'],axis=1).mean(axis=1)
tmp_time_agg["std_wk_pt"] = tmp_time_agg.drop(['acc_id'],axis=1).std(axis=1)
tmp_time_agg["wgt_mean_wk_pt"] = (tmp_time_agg["play_time_wk8"]*8+tmp_time_agg["play_time_wk7"]*7+tmp_time_agg["play_time_wk6"]*6+tmp_time_agg["play_time_wk5"]*5+tmp_time_agg["play_time_wk4"]*4+tmp_time_agg["play_time_wk3"]*3+tmp_time_agg["play_time_wk2"]*2+tmp_time_agg["play_time_wk1"])/36
tmp_time_agg["time_lag_78"] = tmp_time_agg["play_time_wk8"]-tmp_time_agg["play_time_wk7"]
tmp_time_agg["time_lag_68"] = tmp_time_agg["play_time_wk8"]-tmp_time_agg["play_time_wk6"]


#수동으로 aggregate + 가중 평균 + 시차 변수 생성 with time/count
tmp_div_time_agg = pd.merge(tmp_cnt_agg,tmp_time_agg,how='left',on='acc_id').drop(['mean_wk','std_wk','wgt_mean_wk','mean_wk_pt','std_wk_pt','wgt_mean_wk_pt'],axis=1)
tmp_dv_time_agg = pd.DataFrame(tmp_div_time_agg['acc_id'])
tmp_dv_time_agg["dv_pt_cnt_wk8"] = np.exp(tmp_time_agg["play_time_wk8"])/tmp_cnt_agg["wk8"]
tmp_dv_time_agg["dv_pt_cnt_wk7"] = np.exp(tmp_time_agg["play_time_wk7"])/tmp_cnt_agg["wk7"]
tmp_dv_time_agg["dv_pt_cnt_wk6"] = np.exp(tmp_time_agg["play_time_wk6"])/tmp_cnt_agg["wk6"]
tmp_dv_time_agg["dv_pt_cnt_wk5"] = np.exp(tmp_time_agg["play_time_wk5"])/tmp_cnt_agg["wk5"]
tmp_dv_time_agg["dv_pt_cnt_wk4"] = np.exp(tmp_time_agg["play_time_wk4"])/tmp_cnt_agg["wk4"]
tmp_dv_time_agg["dv_pt_cnt_wk3"] = np.exp(tmp_time_agg["play_time_wk3"])/tmp_cnt_agg["wk3"]
tmp_dv_time_agg["dv_pt_cnt_wk2"] = np.exp(tmp_time_agg["play_time_wk2"])/tmp_cnt_agg["wk2"]
tmp_dv_time_agg["dv_pt_cnt_wk1"] = np.exp(tmp_time_agg["play_time_wk1"])/tmp_cnt_agg["wk1"]
tmp_dv_time_agg = tmp_dv_time_agg.replace([np.inf],0)

tmp_dv_time_agg['mean_dv_pt_cnt'] = tmp_dv_time_agg.drop(['acc_id'],axis=1).mean(axis=1)
tmp_dv_time_agg['std_dv_pt_cnt'] = tmp_dv_time_agg.drop(['acc_id'],axis=1).std(axis=1)
tmp_dv_time_agg['wgt_dv_pt_cnt'] = (tmp_dv_time_agg["dv_pt_cnt_wk8"]*8+tmp_dv_time_agg["dv_pt_cnt_wk7"]*7+tmp_dv_time_agg["dv_pt_cnt_wk6"]*6+tmp_dv_time_agg["dv_pt_cnt_wk5"]*5+tmp_dv_time_agg["dv_pt_cnt_wk4"]*4+tmp_dv_time_agg["dv_pt_cnt_wk3"]*3+tmp_dv_time_agg["dv_pt_cnt_wk2"]*2+tmp_dv_time_agg["dv_pt_cnt_wk1"])/36
tmp_dv_time_agg['dv_tc_lag_78'] = tmp_dv_time_agg["dv_pt_cnt_wk8"]-tmp_dv_time_agg["dv_pt_cnt_wk7"]
tmp_dv_time_agg['dv_tc_lag_68'] = tmp_dv_time_agg["dv_pt_cnt_wk8"]-tmp_dv_time_agg["dv_pt_cnt_wk6"]


# feature_activity
act_ft_eng_01 = act_ft_eng[act_ft_eng.wk == 8].drop('wk',axis=1)

for i in range(7):
    temp = act_ft_eng[act_ft_eng.wk == i].drop(['acc_id','wk'],axis=1)
    temp.columns = temp.columns + str(i+1)
    temp["acc_id"] = act_ft_eng[act_ft_eng.wk == i].drop('wk',axis=1)['acc_id']
    
    act_ft_eng_01 = pd.merge(act_ft_eng_01,temp,how='left',on='acc_id')
    
#마지막 연속 미접속일 지속 날짜
tmp_train_01 = tmp_cnt_agg[["wk1",'wk2','wk3','wk4','wk5','wk6','wk7','wk8']]

tmp_train_01['concated'] = tmp_train_01.astype(str).apply(lambda x: ''.join(x), axis=1)
tmp_train_01['consecutive_zeros'] = tmp_train_01.concated.apply(lambda x: len_consec_zeros(x))
tmp_cnt_agg["consecutive_zero"] = tmp_train_01['consecutive_zeros'] 


In [36]:
train_all = train_label.merge(tmp_cnt_agg,how='left',on='acc_id')\
.merge(tmp_time_agg,how='left',on='acc_id')\
.merge(tmp_dv_time_agg,how='left',on='acc_id')\
.merge(act_ft_eng_01,how='left',on='acc_id')\
.fillna(0)

In [37]:
test_all_n = train_all[train_all.label == 0]
train_all_n = train_all[train_all.label != 0]
print(train_all_n.shape,test_all_n.shape)

test_all_n=test_all_n.drop('label',axis=1)
train_all_n.to_csv("fin_train_02.csv")
test_all_n.to_csv("fin_test_02.csv")

(100000, 250) (40000, 250)


### Modeling

In [44]:
le = LabelEncoder()
train_all_n["label"] = le.fit_transform(train_all_n["label"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [45]:
%%time

num_folds = 5

# Create arrays and dataframes to store results
feats = [f for f in train_all_n.columns if f not in ['label','acc_id']]
folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)

result_c = []


num_trials = 1
num_rounds = 3000
early_stopping_rounds = 200
params = {
    'objective':['multiclass'],
    'num_class':[4],
    'class_weight':['balanced'],
    'boosting':['gbdt'],
    #'min_child_weight': list(np.arange(1,20,1)),
    'colsample_bytree': [0.6],
    #'max_depth': list(np.arange(3,15,1)),
    #'subsample': list(np.arange(0.4,1,0.1))
    'reg_alpha': [1],
    'reg_lambda': [10]
    #'learning_rate': [0.005,0.01,0.05,0.1],
    #'num_leaves' :list(np.arange(20,55,3)),
}

for trial, param in zip(np.arange(num_trials),list(ParameterSampler(params, n_iter=num_trials))):
    result_b = []
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_all_n[feats], train_all_n['label'])):
        train_x, train_y = train_all_n[feats].iloc[train_idx], train_all_n['label'].iloc[train_idx]
        valid_x, valid_y = train_all_n[feats].iloc[valid_idx], train_all_n['label'].iloc[valid_idx] 


        lgb_train = lgb.Dataset(train_x, train_y)
        lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)
        
        regressor = lgb.train(param,
                              lgb_train,
                              valid_sets = lgb_eval,
                              num_boost_round = num_rounds,
                              verbose_eval=50,
                              early_stopping_rounds=early_stopping_rounds
                             )
        y_pred = regressor.predict(valid_x)


        max_pred = []
        for i in range(len(y_pred)):
            max_pred.append(np.argmax(y_pred[i]))

        comp_lab = pd.DataFrame({'pred':max_pred,'true':valid_y})

        pr_rc = pd.DataFrame()
        for i in range(len(comp_lab['pred'].value_counts())):
            prec = len(comp_lab[(comp_lab["pred"]==i) & (comp_lab["true"]==i)])/len(comp_lab["pred"][comp_lab["pred"]==i])
            recl = len(comp_lab[(comp_lab["pred"]==i) & (comp_lab["true"]==i)])/len(comp_lab["true"][comp_lab["true"]==i])

            if i == 0:
                pr_rc = pd.DataFrame({"pred":[prec],"recl":[recl]})
            else:
                pr_rc = pr_rc.append(pd.DataFrame({"pred":[prec],"recl":[recl]}))

        f_score = list()
        for i in range(len(pr_rc.index)):
            f_score.append(1/(pr_rc["pred"].tolist()[i]))
            f_score.append(1/(pr_rc["recl"].tolist()[i]))
        ls_result_lgb = 8/sum(f_score)
        
        result_b.append(ls_result_lgb)
        
    result_c.append(result_b)




Training until validation scores don't improve for 200 rounds.
[50]	valid_0's multi_logloss: 0.702753
[100]	valid_0's multi_logloss: 0.666265
[150]	valid_0's multi_logloss: 0.653588
[200]	valid_0's multi_logloss: 0.643878
[250]	valid_0's multi_logloss: 0.636598
[300]	valid_0's multi_logloss: 0.630786
[350]	valid_0's multi_logloss: 0.62593
[400]	valid_0's multi_logloss: 0.622167
[450]	valid_0's multi_logloss: 0.619361
[500]	valid_0's multi_logloss: 0.616986
[550]	valid_0's multi_logloss: 0.614708
[600]	valid_0's multi_logloss: 0.612878
[650]	valid_0's multi_logloss: 0.611564
[700]	valid_0's multi_logloss: 0.609974
[750]	valid_0's multi_logloss: 0.608787
[800]	valid_0's multi_logloss: 0.607869
[850]	valid_0's multi_logloss: 0.60693
[900]	valid_0's multi_logloss: 0.606132
[950]	valid_0's multi_logloss: 0.60533
[1000]	valid_0's multi_logloss: 0.604629
[1050]	valid_0's multi_logloss: 0.604406
[1100]	valid_0's multi_logloss: 0.604013
[1150]	valid_0's multi_logloss: 0.603751
[1200]	valid_0's 

In [46]:
###lgb model   feature :alpha 1, lambda 10
print('사용된 feature 개수 : ',len(feats))
print('5-fold CV f-score :',result_c[0],'average score :',np.average(result_c))
print('predict value ratio : ',sum(comp_lab["pred"]==0),sum(comp_lab["pred"]==1),sum(comp_lab["pred"]==2),sum(comp_lab["pred"]==3))

pr_rc = pd.DataFrame()
for i in range(len(comp_lab['pred'].value_counts())):
    prec = len(comp_lab[(comp_lab["pred"]==i) & (comp_lab["true"]==i)])/len(comp_lab["pred"][comp_lab["pred"]==i])
    recl = len(comp_lab[(comp_lab["pred"]==i) & (comp_lab["true"]==i)])/len(comp_lab["true"][comp_lab["true"]==i])
    print("precision : ",prec,", recall : ",recl)
    if i == 0:
        pr_rc = pd.DataFrame({"pred":[prec],"recl":[recl]})
    else:
        pr_rc = pr_rc.append(pd.DataFrame({"pred":[prec],"recl":[recl]}))

f_score = list()
for i in range(len(pr_rc.index)):
    f_score.append(1/(pr_rc["pred"].tolist()[i]))
    f_score.append(1/(pr_rc["recl"].tolist()[i]))
ls_result_lgb = 8/sum(f_score)

사용된 feature 개수 :  248
5-fold CV f-score : [0.7191539760973668, 0.7157350381340298, 0.7182029981998694, 0.7223985117459301, 0.7233840290865586] average score : 0.7197749106527509
predict value ratio :  5257 4190 5191 5362
precision :  0.6431424766977364 , recall :  0.6713661636219221
precision :  0.6699284009546539 , recall :  0.5730910575745202
precision :  0.7927181660566365 , recall :  0.8220135836995606
precision :  0.8453935098843715 , recall :  0.8958498023715415


In [48]:
ft_imp = pd.DataFrame({'name':train_all_n.columns[2:],'fi':regressor.feature_importance()}).sort_values("fi",ascending=False)
fi_zero_list = ft_imp[ft_imp.fi == 0].name.tolist()
ft_imp.to_csv('ft_imp_02.csv')


In [189]:
import pickle
feature = 'feature_02_248'
pickle.dump(regressor,open('new_feature_'+feature+'.pickle','wb'))