In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

In [58]:
# dir = 'C:/Users/SAMSUNG/Desktop/new/빅콘테스트/2019빅콘테스트_챔피언스리그_데이터_수정/train_pledge.csv'

def preprocessing_pledge(pledge_dir, label_dir, pay_dir):
    # file load
    tr_pledge = pd.read_csv(pledge_dir)
    tr_label = pd.read_csv(label_dir)
    pay = pd.read_csv(pay_dir)
    
    # 공지에 올라온 error 부분 제거
    del tr_pledge['non_combat_play_time']
    
    # 가입한 혈맹의 순위
    bbb = tr_pledge.pledge_id.value_counts().to_frame().reset_index()
    bbb.columns = ['pledge_id','count']
    bbb['rank'] = bbb['count'].rank(ascending = False, method = 'min')
    pledge_rank = {}
    # dictionary를 활용해 원데이터의 pledge_id에 rank값을 mapping
    for i, j in enumerate(list(bbb['rank'])):
        pledge_rank[bbb['pledge_id'][i]] = j
    tr_pledge['pledge_rank'] = tr_pledge['pledge_id'].map(pledge_rank)


    # 혈맹원의 합
    pledge_member_num = {}
    for i in tr_pledge.pledge_id:
        if i not in pledge_member_num.keys():
            pledge_member_num[i] = 0
        pledge_member_num[i] += 1
    tr_pledge['pledge_member_num'] = tr_pledge['pledge_id'].map(pledge_member_num)


    # acc_id 기준으로 데이터 압축
    group = tr_pledge.groupby(['acc_id', 'day']).sum().reset_index()
    groups = group.groupby(['acc_id']).sum().reset_index()
    # label 데이터 merge
    merge_df = pd.merge(groups, tr_label, how = 'left', on = 'acc_id')
    # merge로 인해 생성되는 columns 제거
    del merge_df['Unnamed: 0']


    # 접속일 변수 log_in_freq 생성
    freq = []
    for i in group.acc_id.unique():
        freq.append([i,group[group.acc_id == i].shape[0]])
    new = pd.DataFrame(sorted(freq))
    new.columns = ['acc_id', 'log_in_freq']
    merge_df = pd.merge(merge_df, new, how = 'left', on = 'acc_id')


    # 유저별 가입한 혈맹 수
    act_pledge_num = {}
    for i in tr_pledge.acc_id.unique():
        act_pledge_num[i] = tr_pledge[tr_pledge.acc_id == i].pledge_id.nunique()
    merge_df['join_pledge_num'] = merge_df['acc_id'].map(act_pledge_num)


    # payment 데이터 곃합
    pay = pay.groupby(pay.acc_id).sum().reset_index().drop('day', axis=1)
    merge_pay = pd.merge(merge_df, pay, on = 'acc_id', how = 'left')
    merge_pay = merge_pay.fillna(0)

    
    # 유저별 a서버, b서버 접속 횟수
    acc_id = []
    a_server_num = []
    b_server_num = []

    for i in tr_pledge.acc_id.unique():
        a_count = 0
        b_count = 0
        li = list(tr_pledge[tr_pledge.acc_id == i].server)
        for j in li:
            if j[0] == 'a':
                a_count += 1
            else:
                b_count += 1
        acc_id.append(i)
        a_server_num.append(a_count)
        b_server_num.append(b_count)
    
    df = pd.DataFrame({'acc_id' : acc_id,
                  'a_server_num' : a_server_num,
                  'b_server_num' : b_server_num})
    merge_pay = pd.merge(merge_pay, df, on = 'acc_id')
    

    # 일자별 혈맹 활동 내역 flatten (pledge_rank는 flatten을 안시키는게 퍼포먼스 향상에 더 좋음)
    df = tr_pledge[[col for col in tr_pledge.columns if col not in ['server', 'char_id','pledge_id',
                                                                   'pledge_rank']]
                  ].groupby(['day', 'acc_id']).sum().reset_index()
    df_grouped = df.groupby('day')
    p = df_grouped.get_group(1)
    for i in range(2, 29):
        p = pd.merge(p, df_grouped.get_group(i), on='acc_id', how='outer',
                     suffixes=('_'+str(i-1), '_'+str(i)))
    p = p[[col for col in p.columns if ('day' not in col) & ('combat_char_cnt' not in col)]]
    p = p.fillna(0).set_index('acc_id')
    df = p.reset_index()
    merge_flatten_df = pd.merge(merge_pay, df, on = 'acc_id')
    
    return merge_flatten_df


###################################################################
# 참고사항
# return값 출력시 columns 중 amount_spent_x와 amount_spent_y가 존재
# amount_spent_x : label 데이터에 있는 amount_spent
# amount_spent_y : pay 데이터에 있는 amount_spent
###################################################################

In [3]:
# log1p 가중치 적용 함수
def log_weight(df):
    df_weight = df.copy()
    day = list(map(str,list(range(1,29,1))))
    weight = []
    for i in day:
        weight.append(np.log1p(int(i)))
    
    for j in df_weight.columns:
        if j[-1] in day: # column의 맨 뒷글자(str)가 day에 있으면
            index = int(j[-1]) - 1 # weight의 index로 사용하기 위해 1을 배줌
            df_weight[j] = df_weight[j].apply(lambda x: x * weight[index])
    
    return df_weight

In [4]:
pledge_dir = 'C:/Users/SAMSUNG/Desktop/new/빅콘테스트/2019빅콘테스트_챔피언스리그_데이터_수정/train_pledge.csv'
label_dir ='C:/Users/SAMSUNG/Desktop/new/빅콘테스트/train_label_add.csv'
pay_dir = 'C:/Users/SAMSUNG/Desktop/new/빅콘테스트/2019빅콘테스트_챔피언스리그_데이터_수정/train_payment.csv'

merge_pledge = preprocessing_pledge(pledge_dir, label_dir, pay_dir)

In [59]:
# flatten 부분 주석 처리 후 return값을 merge_pay로 설정

merge_pledge_non_flatten = preprocessing_pledge(pledge_dir, label_dir, pay_dir)

In [60]:
# label과 payment의 같은 컬럼명 amount_spent가 존재하여 각각 이름이
# amount_spent_x와 amount_spent_y로 지정되어 있다.
# 모델 저장 후 test set 돌릴 시 변수명 일치하지 않으면 안돌아가므로 변경해줌
merge_pledge = merge_pledge.rename(columns = {'amount_spent_y':'amount_spent'})
merge_pledge_non_flatten = merge_pledge_non_flatten.rename(columns = {'amount_spent_y':'amount_spent'})

In [27]:
merge_pledge_weight = log_weight(merge_pledge)

In [28]:
merge_pledge.shape, merge_pledge_weight.shape

((33854, 274), (33854, 274))

# 일별 가중치 적용 (log1p 사용), 전후비교

### 이탈, 비이탈 이진분류 - 가중치 적용 전

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(merge_pledge.drop(['acc_id',
                                                                           'char_id',
                                                                           'pledge_id',
                                                                           'amount_spent_x',
                                                                           'total_spent',
                                                                          'secession',
                                                                          'survival_time'], axis=1),
                                                    merge_pledge.secession,
                                                    test_size = 0.3, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.25, random_state=42)

X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape, y_test.shape

((17772, 267), (5925, 267), (10157, 267), (17772,), (5925,), (10157,))

In [17]:
import xgboost as xgb

params = {'n_estimators' : 475,
         'learning_rate' : 0.1,
          'nthread' : 15,
          'num_boost_round' : 300,
         'max_depth' : 6,
          'eval_metric' : 'auc',
          'gamma' : 1,
          'subsample' : 0.9}

VALID = True
np.random.seed(42)
if VALID == True:
    
    tr_data = xgb.DMatrix(X_train, y_train)
    va_data = xgb.DMatrix(X_valid, y_valid)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model = xgb.train(params, tr_data, 500, watchlist, verbose_eval = 50, early_stopping_rounds=30)

  if getattr(data, 'base', None) is not None and \


[0]	train-auc:0.746162	valid-auc:0.728914
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 30 rounds.
[50]	train-auc:0.87957	valid-auc:0.782579
[100]	train-auc:0.918569	valid-auc:0.789515
[150]	train-auc:0.942401	valid-auc:0.792197
[200]	train-auc:0.952209	valid-auc:0.793354
Stopping. Best iteration:
[195]	train-auc:0.951208	valid-auc:0.793525



In [18]:
from sklearn.metrics import accuracy_score, f1_score
y_pred = np.where(model.predict(xgb.DMatrix(X_test)) > 0.5, 1, 0)
accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

(0.7140888057497292, 0.6954059156702328)

### 64 category multi class - 가중치 적용 전

#### OneVsRest Classifier

In [37]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(merge_pledge.drop(['acc_id',
                                                                           'char_id',
                                                                           'pledge_id',
                                                                           'amount_spent_x',
                                                                           'total_spent',
                                                                          'secession',
                                                                          'survival_time'], axis=1),
                                                    merge_pledge.survival_time,
                                                    test_size = 0.3, random_state=42)
# X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.25, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((23697, 267), (10157, 267), (23697,), (10157,))

In [38]:
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MultiLabelBinarizer

np.random.seed(42)

OVR = OneVsRestClassifier(XGBClassifier(n_estimator = 1000,n_jobs=-1, max_depth=8, silent = 1,
                                       gamma = 1, subsample = 0.8 ))


# You may need to use MultiLabelBinarizer to encode your variables from arrays [[x, y, z]] to a multilabel 
# format before training.


OVR.fit(X_train, y_train)

print('One vs Rest accuracy : %f' % OVR.score(X_train, y_train))

One vs Rest accuracy : 0.908258


In [39]:
from sklearn.metrics import accuracy_score

y_pred = OVR.predict(X_test)
accuracy_score(y_test, y_pred)

0.6077581963178104

In [40]:
from joblib import dump, load

dump(OVR, '0001.joblib')

['0001.joblib']

### amount spent 예측 rmse - 가중치 적용 전

In [47]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(merge_pledge.drop(['acc_id',
                                                                           'char_id',
                                                                           'pledge_id',
                                                                           'amount_spent_x',
                                                                           'total_spent',
                                                                          'secession',
                                                                          'survival_time'], axis=1),
                                                    merge_pledge.amount_spent_x,
                                                    test_size = 0.3, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.25, random_state=42)

X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape, y_test.shape

((17772, 267), (5925, 267), (10157, 267), (17772,), (5925,), (10157,))

In [50]:
import xgboost as xgb

params = {'n_estimators' : 475,
         'learning_rate' : 0.1,
          'nthread' : 15,
          'num_boost_round' : 300,
         'max_depth' : 6,
          'objective': 'reg:linear',
          'booster' : 'gblinear',
          'gamma' : 0,
          'subsample' : 0.9}

VALID = True
np.random.seed(42)
if VALID == True:
    
    tr_data = xgb.DMatrix(X_train, y_train)
    va_data = xgb.DMatrix(X_valid, y_valid)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model = xgb.train(params, tr_data, 1000, watchlist, verbose_eval = 50, early_stopping_rounds=30)

[0]	train-rmse:0.738745	valid-rmse:0.531206
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 30 rounds.
[50]	train-rmse:0.703645	valid-rmse:0.476474
[100]	train-rmse:0.703173	valid-rmse:0.475741
[150]	train-rmse:0.702997	valid-rmse:0.475417
[200]	train-rmse:0.702897	valid-rmse:0.47519
[250]	train-rmse:0.702833	valid-rmse:0.475028
[300]	train-rmse:0.702789	valid-rmse:0.474906
[350]	train-rmse:0.702756	valid-rmse:0.474816
[400]	train-rmse:0.702731	valid-rmse:0.474749
[450]	train-rmse:0.702711	valid-rmse:0.474699
[500]	train-rmse:0.702694	valid-rmse:0.474661
[550]	train-rmse:0.70268	valid-rmse:0.474632
[600]	train-rmse:0.702669	valid-rmse:0.474609
[650]	train-rmse:0.702659	valid-rmse:0.474594
[700]	train-rmse:0.70265	valid-rmse:0.474583
[750]	train-rmse:0.702643	valid-rmse:0.474575
[800]	train-rmse:0.702637	valid-rmse:0.474569
[850]	train-rmse:0.702631	valid-rmse:0.474566
Stopping. Best iteration:
[869]	t

In [51]:
from sklearn.metrics import mean_squared_error

y_pred = model.predict(xgb.DMatrix(pd.DataFrame(X_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %f" % (rmse))

RMSE: 0.587075


In [32]:
model.save_model('0002.model')

## 가중치 적용

### 이탈, 비이탈 이진분류 - 가중치 적용 후

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(merge_pledge_weight.drop(['acc_id',
                                                                           'char_id',
                                                                           'pledge_id',
                                                                           'amount_spent_x',
                                                                           'total_spent',
                                                                          'secession',
                                                                          'survival_time'], axis=1),
                                                    merge_pledge_weight.secession,
                                                    test_size = 0.3, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.25, random_state=42)

X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape, y_test.shape

((17772, 267), (5925, 267), (10157, 267), (17772,), (5925,), (10157,))

In [20]:
import xgboost as xgb

params = {'n_estimators' : 475,
         'learning_rate' : 0.1,
          'nthread' : 15,
          'num_boost_round' : 300,
         'max_depth' : 6,
          'eval_metric' : 'auc',
          'gamma' : 1,
          'subsample' : 0.9}

VALID = True
np.random.seed(42)
if VALID == True:
    
    tr_data = xgb.DMatrix(X_train, y_train)
    va_data = xgb.DMatrix(X_valid, y_valid)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model = xgb.train(params, tr_data, 500, watchlist, verbose_eval = 50, early_stopping_rounds=30)

[0]	train-auc:0.746162	valid-auc:0.728914
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 30 rounds.
[50]	train-auc:0.879572	valid-auc:0.782578
[100]	train-auc:0.918016	valid-auc:0.790052
[150]	train-auc:0.942326	valid-auc:0.794499
[200]	train-auc:0.954298	valid-auc:0.795656
[250]	train-auc:0.960356	valid-auc:0.796608
[300]	train-auc:0.963853	valid-auc:0.796946
Stopping. Best iteration:
[308]	train-auc:0.965047	valid-auc:0.797268



In [21]:
from sklearn.metrics import accuracy_score, f1_score
y_pred = np.where(model.predict(xgb.DMatrix(X_test)) > 0.5, 1, 0)
accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

(0.7130058088018115, 0.696953945316561)

### 64 category multi class - 가중치 적용 후

#### OneVsRest Classifier

In [41]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(merge_pledge_weight.drop(['acc_id',
                                                                           'char_id',
                                                                           'pledge_id',
                                                                           'amount_spent_x',
                                                                           'total_spent',
                                                                          'secession',
                                                                          'survival_time'], axis=1),
                                                    merge_pledge_weight.survival_time,
                                                    test_size = 0.3, random_state=42)
# X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.25, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((23697, 267), (10157, 267), (23697,), (10157,))

In [42]:
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MultiLabelBinarizer

np.random.seed(42)

OVR = OneVsRestClassifier(XGBClassifier(n_estimator = 1000,n_jobs=-1, max_depth=8, silent = 1,
                                       gamma = 1, subsample = 0.8 ))


# You may need to use MultiLabelBinarizer to encode your variables from arrays [[x, y, z]] to a multilabel 
# format before training.


OVR.fit(X_train, y_train)

print('One vs Rest accuracy : %f' % OVR.score(X_train, y_train))

One vs Rest accuracy : 0.907794


In [43]:
from sklearn.metrics import accuracy_score

y_pred = OVR.predict(X_test)
accuracy_score(y_test, y_pred)

0.6070690164418627

In [44]:
from joblib import dump, load

dump(OVR, '0003.joblib')

['0003.joblib']

#### Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier


clf = RandomForestClassifier(n_estimators=130,
                            criterion = 'gini',
                              random_state=42)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=130,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [11]:
from sklearn.metrics import accuracy_score, f1_score

y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.6037215713301172

In [15]:
# clf.save_model('0003.model')
import dill

with open("./0003.obj", "wb") as f:
    dill.dump(clf, f)

### amount spent 예측 rmse - 가중치 적용 후

In [53]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(merge_pledge_weight.drop(['acc_id',
                                                                           'char_id',
                                                                           'pledge_id',
                                                                           'amount_spent_x',
                                                                           'total_spent',
                                                                          'secession',
                                                                          'survival_time'], axis=1),
                                                    merge_pledge_weight.amount_spent_x,
                                                    test_size = 0.3, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.25, random_state=42)

X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape, y_test.shape

((17772, 267), (5925, 267), (10157, 267), (17772,), (5925,), (10157,))

In [55]:
import xgboost as xgb

params = {'n_estimators' : 475,
         'learning_rate' : 0.1,
          'nthread' : 15,
          'num_boost_round' : 300,
         'max_depth' : 6,
          'objective': 'reg:linear',
          'booster' : 'gblinear',
          'gamma' : 0,
          'subsample' : 0.9}

VALID = True
np.random.seed(42)
if VALID == True:
    
    tr_data = xgb.DMatrix(X_train, y_train)
    va_data = xgb.DMatrix(X_valid, y_valid)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model = xgb.train(params, tr_data, 1000, watchlist, verbose_eval = 50, early_stopping_rounds=30)

[0]	train-rmse:0.738101	valid-rmse:0.530641
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 30 rounds.
[50]	train-rmse:0.703647	valid-rmse:0.476503
[100]	train-rmse:0.703174	valid-rmse:0.475753
[150]	train-rmse:0.702997	valid-rmse:0.475418
[200]	train-rmse:0.702898	valid-rmse:0.475197
[250]	train-rmse:0.702833	valid-rmse:0.475031
[300]	train-rmse:0.702789	valid-rmse:0.47491
[350]	train-rmse:0.702756	valid-rmse:0.474822
[400]	train-rmse:0.70273	valid-rmse:0.474753
[450]	train-rmse:0.70271	valid-rmse:0.474704
[500]	train-rmse:0.702694	valid-rmse:0.474665
[550]	train-rmse:0.70268	valid-rmse:0.474636
[600]	train-rmse:0.702669	valid-rmse:0.474615
[650]	train-rmse:0.702659	valid-rmse:0.474601
[700]	train-rmse:0.70265	valid-rmse:0.474589
[750]	train-rmse:0.702643	valid-rmse:0.474581
[800]	train-rmse:0.702636	valid-rmse:0.474576
[850]	train-rmse:0.702631	valid-rmse:0.474572
Stopping. Best iteration:
[868]	tra

In [56]:
from sklearn.metrics import mean_squared_error

y_pred = model.predict(xgb.DMatrix(pd.DataFrame(X_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %f" % (rmse))

RMSE: 0.587074


In [36]:
model.save_model('0004.model')

# catboost - 성능 떨어짐

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(merge_pledge.drop(['acc_id',
                                                                           'char_id',
                                                                           'pledge_id',
                                                                           'amount_spent_x',
                                                                           'total_spent',
                                                                          'secession',
                                                                          'survival_time'], axis=1),
                                                    merge_pledge.survival_time,
                                                    test_size = 0.3, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.25, random_state=42)

X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape, y_test.shape

((17772, 267), (5925, 267), (10157, 267), (17772,), (5925,), (10157,))

In [27]:
from catboost import CatBoostClassifier, Pool

np.random.seed(42)


categorical_var = np.where(X_train.dtypes != np.float)[0]

model = CatBoostClassifier(iterations=250, use_best_model = True, od_type = 'Iter',
                          random_seed = 42)
# Fit model
model.fit(X_train, y_train, cat_features = categorical_var, plot = False, eval_set = (X_valid, y_valid))

y_pred = model.predict(X_test)

0:	learn: 3.6436040	test: 3.6512151	best: 3.6512151 (0)	total: 28.9s	remaining: 1h 59m 44s
1:	learn: 3.4553108	test: 3.4634199	best: 3.4634199 (1)	total: 59.3s	remaining: 2h 2m 37s
2:	learn: 3.3244946	test: 3.3328779	best: 3.3328779 (2)	total: 1m 24s	remaining: 1h 55m 29s
3:	learn: 3.2244418	test: 3.2338854	best: 3.2338854 (3)	total: 1m 50s	remaining: 1h 52m 45s
4:	learn: 3.1427841	test: 3.1528130	best: 3.1528130 (4)	total: 2m 19s	remaining: 1h 53m 42s
5:	learn: 3.0741553	test: 3.0845124	best: 3.0845124 (5)	total: 2m 47s	remaining: 1h 53m 41s
6:	learn: 3.0163761	test: 3.0271194	best: 3.0271194 (6)	total: 3m 17s	remaining: 1h 53m 59s
7:	learn: 2.9652535	test: 2.9765536	best: 2.9765536 (7)	total: 3m 45s	remaining: 1h 53m 56s
8:	learn: 2.9217859	test: 2.9339522	best: 2.9339522 (8)	total: 4m 13s	remaining: 1h 52m 56s
9:	learn: 2.8815491	test: 2.8944088	best: 2.8944088 (9)	total: 4m 40s	remaining: 1h 52m 15s
10:	learn: 2.8437348	test: 2.8572158	best: 2.8572158 (10)	total: 5m 9s	remaining: 1

88:	learn: 2.2627211	test: 2.3252592	best: 2.3252592 (88)	total: 40m 2s	remaining: 1h 12m 26s
89:	learn: 2.2599732	test: 2.3235759	best: 2.3235759 (89)	total: 40m 27s	remaining: 1h 11m 56s
90:	learn: 2.2576924	test: 2.3209900	best: 2.3209900 (90)	total: 40m 57s	remaining: 1h 11m 34s
91:	learn: 2.2572641	test: 2.3206205	best: 2.3206205 (91)	total: 41m 3s	remaining: 1h 10m 30s
92:	learn: 2.2544283	test: 2.3181323	best: 2.3181323 (92)	total: 41m 34s	remaining: 1h 10m 11s
93:	learn: 2.2522634	test: 2.3163901	best: 2.3163901 (93)	total: 42m 3s	remaining: 1h 9m 48s
94:	learn: 2.2483397	test: 2.3140132	best: 2.3140132 (94)	total: 42m 29s	remaining: 1h 9m 19s
95:	learn: 2.2466865	test: 2.3129498	best: 2.3129498 (95)	total: 43m 1s	remaining: 1h 9m 1s
96:	learn: 2.2440726	test: 2.3113818	best: 2.3113818 (96)	total: 43m 31s	remaining: 1h 8m 39s
97:	learn: 2.2427675	test: 2.3107618	best: 2.3107618 (97)	total: 44m 1s	remaining: 1h 8m 16s
98:	learn: 2.2406898	test: 2.3099573	best: 2.3099573 (98)	tot

175:	learn: 2.1322874	test: 2.2433131	best: 2.2433131 (175)	total: 1h 25m 8s	remaining: 35m 47s
176:	learn: 2.1314664	test: 2.2431818	best: 2.2431818 (176)	total: 1h 25m 41s	remaining: 35m 20s
177:	learn: 2.1309725	test: 2.2429876	best: 2.2429876 (177)	total: 1h 26m 12s	remaining: 34m 52s
178:	learn: 2.1305034	test: 2.2428083	best: 2.2428083 (178)	total: 1h 26m 41s	remaining: 34m 23s
179:	learn: 2.1292425	test: 2.2420766	best: 2.2420766 (179)	total: 1h 27m 10s	remaining: 33m 54s
180:	learn: 2.1282661	test: 2.2417446	best: 2.2417446 (180)	total: 1h 27m 38s	remaining: 33m 24s
181:	learn: 2.1275788	test: 2.2415107	best: 2.2415107 (181)	total: 1h 28m 3s	remaining: 32m 54s
182:	learn: 2.1271097	test: 2.2413969	best: 2.2413969 (182)	total: 1h 28m 36s	remaining: 32m 26s
183:	learn: 2.1263841	test: 2.2409687	best: 2.2409687 (183)	total: 1h 29m 1s	remaining: 31m 56s
184:	learn: 2.1254901	test: 2.2406587	best: 2.2406587 (184)	total: 1h 29m 31s	remaining: 31m 27s
185:	learn: 2.1246707	test: 2.240

In [28]:
from sklearn.metrics import accuracy_score, f1_score

accuracy_score(y_test, y_pred)

0.5577434281776115

# flatten전 과 비교

### 이진분류

In [63]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(merge_pledge_non_flatten.drop(['acc_id',
                                                                           'char_id',
                                                                           'pledge_id',
                                                                           'amount_spent_x',
                                                                           'total_spent',
                                                                          'secession',
                                                                          'survival_time'], axis=1),
                                                    merge_pledge_non_flatten.secession,
                                                    test_size = 0.3, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.25, random_state=42)

X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape, y_test.shape

((17772, 15), (5925, 15), (10157, 15), (17772,), (5925,), (10157,))

In [64]:
import xgboost as xgb

params = {'n_estimators' : 475,
         'learning_rate' : 0.1,
          'nthread' : 15,
          'num_boost_round' : 300,
         'max_depth' : 6,
          'eval_metric' : 'auc',
          'gamma' : 1,
          'subsample' : 0.9}

VALID = True
np.random.seed(42)
if VALID == True:
    
    tr_data = xgb.DMatrix(X_train, y_train)
    va_data = xgb.DMatrix(X_valid, y_valid)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model = xgb.train(params, tr_data, 500, watchlist, verbose_eval = 50, early_stopping_rounds=30)

[0]	train-auc:0.741184	valid-auc:0.720395
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 30 rounds.
[50]	train-auc:0.8409	valid-auc:0.772741
[100]	train-auc:0.868687	valid-auc:0.776566
[150]	train-auc:0.886052	valid-auc:0.778128
Stopping. Best iteration:
[154]	train-auc:0.887367	valid-auc:0.778332



In [67]:
from sklearn.metrics import accuracy_score, f1_score
y_pred = np.where(model.predict(xgb.DMatrix(X_test)) > 0.5, 1, 0)
accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

(0.7055232844343803, 0.6889882499740044)

이진 분류는 정확도 0.5~1.0정도 떨어짐

### 64category 분류  

In [69]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(merge_pledge_non_flatten.drop(['acc_id',
                                                                           'char_id',
                                                                           'pledge_id',
                                                                           'amount_spent_x',
                                                                           'total_spent',
                                                                          'secession',
                                                                          'survival_time'], axis=1),
                                                    merge_pledge_non_flatten.survival_time,
                                                    test_size = 0.3, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.25, random_state=42)

X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape, y_test.shape

((17772, 15), (5925, 15), (10157, 15), (17772,), (5925,), (10157,))

In [70]:
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MultiLabelBinarizer

np.random.seed(42)

OVR = OneVsRestClassifier(XGBClassifier(n_estimator = 1000,n_jobs=-1, max_depth=8, silent = 1,
                                       gamma = 1, subsample = 0.8 ))


# You may need to use MultiLabelBinarizer to encode your variables from arrays [[x, y, z]] to a multilabel 
# format before training.


OVR.fit(X_train, y_train)

print('One vs Rest accuracy : %f' % OVR.score(X_train, y_train))

One vs Rest accuracy : 0.824555


In [72]:
from sklearn.metrics import accuracy_score

y_pred = OVR.predict(X_test)
accuracy_score(y_test, y_pred)

0.5937776902628729

64 카테고리 또한 정확도 약 1.5% 떨어짐

# Regression으로 예측 후 카테고리화 하여 정확도 측정

In [147]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(merge_pledge.drop(['acc_id',
                                                                           'char_id',
                                                                           'pledge_id',
                                                                           'amount_spent_x',
                                                                           'total_spent',
                                                                          'secession',
                                                                          'survival_time'], axis=1),
                                                    merge_pledge.survival_time,
                                                    test_size = 0.3, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.25, random_state=42)

X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape, y_test.shape

((17772, 267), (5925, 267), (10157, 267), (17772,), (5925,), (10157,))

In [148]:
import xgboost as xgb

np.random.seed(42)


xg_reg = xgb.XGBClassifier(n_estimators = 475, learning_rate = 0.1, nthread = 15, max_depth = 8,
                          objective = 'multi:softprob', gamma = 0, subsample=0.9)


eval_set = [(X_train, y_train), (X_valid, y_valid)]

xg_reg.fit(X_train,y_train, eval_set = eval_set, early_stopping_rounds=20)
y_pred = xg_reg.predict(X_test)

[0]	validation_0-merror:0.430677	validation_1-merror:0.449283
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.420268	validation_1-merror:0.440169
[2]	validation_0-merror:0.411152	validation_1-merror:0.435781
[3]	validation_0-merror:0.4053	validation_1-merror:0.432068
[4]	validation_0-merror:0.398548	validation_1-merror:0.427173
[5]	validation_0-merror:0.392415	validation_1-merror:0.423291
[6]	validation_0-merror:0.386226	validation_1-merror:0.420084
[7]	validation_0-merror:0.381274	validation_1-merror:0.417722
[8]	validation_0-merror:0.376435	validation_1-merror:0.416371
[9]	validation_0-merror:0.371877	validation_1-merror:0.415359
[10]	validation_0-merror:0.366194	validation_1-merror:0.414177
[11]	validation_0-merror:0.36113	validation_1-merror:0.412827
[12]	validation_0-merror:0.356235	validation_1-merror:0.412321
[13]	validation_0-merror:0.3514

In [149]:
print('accuracy :', accuracy_score(y_test, y_pred))

accuracy : 0.602933937186177


In [150]:
# 카테고리로 설정했을 떄 차이의 합
de = 0
for i in range(len(y_test)):
    aa = y_test.tolist()[i]-y_pred[i]
    if aa < 0:
        de += -aa
    else:
        de += aa
        
de

144344

In [151]:
import xgboost as xgb

params = {'n_estimators' : 475,
         'learning_rate' : 0.1,
          'nthread' : 15,
          'num_boost_round' : 300,
         'max_depth' : 6,
          'objective': 'reg:linear',
          'booster' : 'gblinear',
          'gamma' : 0,
          'subsample' : 0.9}

VALID = True
np.random.seed(42)
if VALID == True:
    
    tr_data = xgb.DMatrix(X_train, y_train)
    va_data = xgb.DMatrix(X_valid, y_valid)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model = xgb.train(params, tr_data, 1000, watchlist, verbose_eval = 50, early_stopping_rounds=20)

[0]	train-rmse:30.813	valid-rmse:30.5893
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 20 rounds.
[50]	train-rmse:20.7248	valid-rmse:20.9068
[100]	train-rmse:20.6382	valid-rmse:20.87
[150]	train-rmse:20.601	valid-rmse:20.8566
[200]	train-rmse:20.5779	valid-rmse:20.8492
[250]	train-rmse:20.5619	valid-rmse:20.8449
[300]	train-rmse:20.55	valid-rmse:20.8422
[350]	train-rmse:20.5407	valid-rmse:20.8401
[400]	train-rmse:20.5332	valid-rmse:20.8386
[450]	train-rmse:20.5271	valid-rmse:20.8377
[500]	train-rmse:20.522	valid-rmse:20.837
[550]	train-rmse:20.5176	valid-rmse:20.8366
[600]	train-rmse:20.514	valid-rmse:20.8363
[650]	train-rmse:20.5108	valid-rmse:20.8361
Stopping. Best iteration:
[637]	train-rmse:20.5116	valid-rmse:20.836



In [152]:
y_pred = model.predict(xgb.DMatrix(X_test))
y_pred

array([20.321339, 42.772808, 51.716362, ..., 48.04666 , 50.168293,
       41.209946], dtype=float32)

In [156]:
y_pred = y_pred.round()

In [157]:
for i, x in enumerate(y_pred):
    if x < 0:
        y_pred[i] = 0
    elif x > 64:
        y_pred[i] = 64

In [159]:
# 카테고리로 설정했을 떄 차이의 합
de = 0
for i in range(len(y_test)):
    aa = y_test.tolist()[i]-y_pred[i]
    if aa < 0:
        de += -aa
    else:
        de += aa
        
de

182023.0

 64category classify : 144344
 
 linear classify : 182023
 
 굉장히 큰 차이를 보이므로 잔존 vs 이탈로 분류 후 linear classify 진행

In [162]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(merge_pledge.drop(['acc_id',
                                                                           'char_id',
                                                                           'pledge_id',
                                                                           'amount_spent_x',
                                                                           'total_spent',
                                                                          'secession',
                                                                          'survival_time'], axis=1),
                                                    merge_pledge.secession,
                                                    test_size = 0.3, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.25, random_state=42)

X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape, y_test.shape

((17772, 267), (5925, 267), (10157, 267), (17772,), (5925,), (10157,))

In [163]:
import xgboost as xgb

params = {'n_estimators' : 475,
         'learning_rate' : 0.1,
          'nthread' : 15,
          'num_boost_round' : 300,
         'max_depth' : 6,
          'eval_metric' : 'auc',
          'gamma' : 1,
          'subsample' : 0.9}

VALID = True
np.random.seed(42)
if VALID == True:
    
    tr_data = xgb.DMatrix(X_train, y_train)
    va_data = xgb.DMatrix(X_valid, y_valid)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model = xgb.train(params, tr_data, 500, watchlist, verbose_eval = 50, early_stopping_rounds=25)

[0]	train-auc:0.746162	valid-auc:0.728914
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 25 rounds.
[50]	train-auc:0.87957	valid-auc:0.782579
[100]	train-auc:0.918569	valid-auc:0.789515
[150]	train-auc:0.942401	valid-auc:0.792197
[200]	train-auc:0.952209	valid-auc:0.793354
Stopping. Best iteration:
[195]	train-auc:0.951208	valid-auc:0.793525



In [165]:
from sklearn.metrics import accuracy_score, f1_score
y_pred = np.where(model.predict(xgb.DMatrix(X_test)) > 0.5, 1, 0)
accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

(0.7145810770896919, 0.6960897368696929)

In [171]:
len(y_pred), X_test.shape

(10157, (10157, 267))