In [None]:
# 모델 학습에 필요한 라이브러리
import lightgbm as lgbm
import xgboost as xgb
from scipy import sparse as ssp
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from time import time
import datetime


In [None]:
# Colab-only setup: mount Google Drive and switch the working directory to the
# notebook folder, so the relative file names passed to pd.read_csv / to_csv
# further down resolve inside that folder.
import os, sys
from google.colab import drive
drive.mount('/gdrive')
os.chdir('/gdrive/My Drive/Colab Notebooks')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
def Gini(y_true, y_pred):
    """Return the normalized Gini coefficient of `y_pred` with respect to `y_true`.

    Both arguments must expose `.shape` and have identical shapes (one value
    per sample). The result is the Gini area obtained when the labels are
    ranked by prediction, divided by the area obtained when they are ranked by
    the labels themselves, so a perfect ranking yields 1.0.
    """
    # Labels and predictions must line up one-to-one.
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # Column 0 holds the label, column 1 the prediction for the same sample.
    pairs = np.array([y_true, y_pred]).T

    # Labels re-ordered from the highest to the lowest sort key:
    # once keyed by the label itself (ideal ranking), once keyed by the prediction.
    labels_by_truth = pairs[pairs[:, 0].argsort()][::-1, 0]
    labels_by_pred = pairs[pairs[:, 1].argsort()][::-1, 0]

    # Diagonal reference values 1/n, 2/n, ..., 1 that the Lorenz curves are compared with.
    diagonal = np.linspace(1 / n_samples, 1, n_samples)

    def _gini_area(ordered_labels):
        # Lorenz curve = cumulative share of the label total; area = summed gap to the diagonal.
        lorenz = np.cumsum(ordered_labels) * 1. / np.sum(ordered_labels)
        return np.sum(diagonal - lorenz)

    ideal_area = _gini_area(labels_by_truth)
    model_area = _gini_area(labels_by_pred)

    # Normalize by the area of the ideal ranking.
    return model_area * 1. / ideal_area

# Custom evaluation function handed to LightGBM during training.
def evalerror(preds, dtrain):
    """Score `preds` against the labels stored in `dtrain` with the normalized Gini.

    Returns the tuple ('gini', score, True); the trailing True is the flag the
    LightGBM feval contract reads as "higher is better".
    """
    score = Gini(dtrain.get_label(), preds)
    return 'gini', score, True

## DATA

In [None]:
# Read the training and test sets from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep the identifiers and the label aside before they are removed from the features.
train_id = train['id']
test_id = test['id']
train_label = train['target']

# Separate the target and drop 'id' / 'target' so that the training features
# carry the same kind of columns as the test data.
y = train_label.values
drop_feature = ['id', 'target']
X = train.drop(columns=drop_feature)

## feature engineering

In [None]:
# Split the column names into categorical and numeric groups.
# Columns whose name contains 'calc' are deliberately left out of the numeric group.
feature_names = list(X.columns)
cat_features = [name for name in feature_names if 'cat' in name if 'count' not in name]
num_features = [name for name in feature_names if not ('cat' in name or 'calc' in name)]

파생 1. 결측값 개수 (행별로 값이 -1인 변수의 개수)

In [None]:
# Derived feature 1: per-row count of cells equal to -1 (the value this
# notebook treats as "missing"), stored as float.
# The training count is computed on the raw `train` frame, the test count on `test`.
X['missing'] = (train == -1).sum(axis=1).astype(float)
test['missing'] = (test == -1).sum(axis=1).astype(float)

# Register the new column as a numeric feature. The guard keeps the cell
# idempotent: without it, every re-run appended 'missing' again and the
# column was duplicated in the feature matrix built later.
if 'missing' not in num_features:
    num_features.append('missing')

파생 2. 범주형 변수 One-Hot Encoding

In [None]:
# Derived feature 2: one-hot encoding of the categorical columns.
# Step 1 - map every categorical column to integer codes, using an encoder
# fitted on the training column and applied to both train and test.
# NOTE(review): the encoder only sees training values; a category that occurs
# only in `test` would presumably make transform() fail - confirm on the data.
for col in cat_features:
    label_encoder = LabelEncoder().fit(X[col])
    X[col] = label_encoder.transform(X[col])
    test[col] = label_encoder.transform(test[col])

# Step 2 - one-hot encode the integer codes; the encoder is again fitted on train only.
enc = OneHotEncoder().fit(X[cat_features])
X_cat = enc.transform(X[cat_features])
X_t_cat = enc.transform(test[cat_features])

파생 3. 범주형 변수 및 'new_ind' 고유값의 빈도

In [None]:
# Build 'new_ind': the values of every column whose name contains 'ind',
# each converted to a string and followed by '_', concatenated in column order.
ind_features = [name for name in feature_names if 'ind' in name]
for frame in (X, test):
    for position, name in enumerate(ind_features):
        piece = frame[name].astype(str) + '_'
        if position == 0:
            # First 'ind' column starts the combined key.
            frame['new_ind'] = piece
        else:
            # Every further column is appended to the key built so far.
            frame['new_ind'] += piece

In [None]:
# Frequency features: for each categorical column and for 'new_ind', add a
# '<column>_count' column holding how often the row's value occurs in
# train and test combined.
cat_count_features = []
for col in cat_features + ['new_ind']:
    count_col = '%s_count' % col
    # value -> number of occurrences over the concatenated train/test column
    frequency = pd.concat([X[col], test[col]]).value_counts().to_dict()
    lookup = lambda value: frequency.get(value, 0)
    X[count_col] = X[col].apply(lookup)
    test[count_col] = test[col].apply(lookup)
    cat_count_features.append(count_col)

In [None]:
# Model inputs: the numeric columns and the frequency columns as a dense
# block, followed by the one-hot encoded categorical block.
dense_columns = num_features + cat_count_features
train_list = [X[dense_columns].values, X_cat]
test_list = [test[dense_columns].values, X_t_cat]

# Stack the blocks side by side and store them in CSR form to save memory and
# speed up training.
# NOTE: from here on `X` is the sparse training matrix, no longer the
# DataFrame, so the feature-engineering cells above cannot be re-run without
# reloading the data first.
X = ssp.hstack(train_list).tocsr()
X_test = ssp.hstack(test_list).tocsr()

## lgbm

In [None]:
# LightGBM settings.
# Upper bound on boosting rounds; the training loop relies on early stopping
# to end well before this.
num_boost_round = 10000

# NOTE(review): 'drop_rate' and 'max_drop' look like DART-only options while
# boosting_type is 'gbdt' - confirm whether they have any effect here.
# NOTE(review): 'subsample' is set without a bagging frequency - confirm that
# row subsampling is actually active with this configuration.
params = dict(
    objective="binary",
    boosting_type="gbdt",
    learning_rate=0.1,
    num_leaves=15,
    max_bin=256,
    feature_fraction=0.6,
    verbosity=0,
    drop_rate=0.1,
    is_unbalance=False,
    max_drop=50,
    min_child_samples=10,
    min_child_weight=150,
    min_split_gain=0,
    subsample=0.9,
)

In [None]:
# Internal cross-validation: stratified 5-fold split, shuffled with a fixed
# random_state so the fold assignment is reproducible.
NFOLDS = 5
kfold = StratifiedKFold(
    random_state=218,
    shuffle=True,
    n_splits=NFOLDS,
)

In [None]:
# Number of differently seeded repetitions of the K-fold training. Only the
# LightGBM seed changes between repetitions; `kfold` has a fixed random_state,
# so the fold assignment is the same every time. Averaging over more seeds
# reduces the variance introduced by the model's random components.
# (The original comment said 20 repetitions while the loop ran 22; the value
# actually used - and assumed by the submission cell - is 22.)
NUM_SEEDS = 22

x_score = []                                  # full out-of-fold Gini per seed
final_cv_train = np.zeros(len(train_label))   # out-of-fold predictions, summed over seeds
final_cv_pred = np.zeros(len(test_id))        # test predictions, summed over seeds

for s in range(NUM_SEEDS):
    cv_train = np.zeros(len(train_label))
    cv_pred = np.zeros(len(test_id))

    params['seed'] = s

    kf = kfold.split(X, train_label)

    best_trees = []
    fold_scores = []

    for i, (train_fold, validate) in enumerate(kf):
        X_train, X_validate = X[train_fold, :], X[validate, :]
        label_train, label_validate = train_label[train_fold], train_label[validate]
        dtrain = lgbm.Dataset(X_train, label_train)
        dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
        # Train on the fold and early-stop on the validation fold; evalerror()
        # reports the normalized Gini of the validation predictions.
        # NOTE(review): `params` sets no 'metric', and the training log shows
        # binary_logloss being evaluated next to gini, so early stopping may be
        # triggered by either metric - confirm whether Gini-only stopping
        # (e.g. 'metric': 'None') is intended.
        bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid, feval=evalerror, verbose_eval=100, early_stopping_rounds=100)
        best_trees.append(bst.best_iteration)
        # Accumulate the test-set prediction of this fold's model.
        cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
        # Out-of-fold prediction, using the same iteration as the test prediction above.
        cv_train[validate] += bst.predict(X_validate, num_iteration=bst.best_iteration)

        # Report the validation score of this fold.
        score = Gini(label_validate, cv_train[validate])
        print(score)
        fold_scores.append(score)

    # Average the fold models' test predictions, then add this seed to the running totals.
    cv_pred /= NFOLDS
    final_cv_train += cv_train
    final_cv_pred += cv_pred

    # Per-seed report: this seed's out-of-fold score, the score of the
    # seed-averaged out-of-fold predictions so far, fold scores and tree counts.
    print("cv score:")
    print(Gini(train_label, cv_train))
    print("current score:", Gini(train_label, final_cv_train / (s + 1.)), s+1)
    print(fold_scores)
    print(best_trees, np.mean(best_trees))

    x_score.append(Gini(train_label, cv_train))

print(x_score)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.15159	valid_0's gini: 0.291863
[200]	valid_0's binary_logloss: 0.151468	valid_0's gini: 0.29491
Early stopping, best iteration is:
[189]	valid_0's binary_logloss: 0.151457	valid_0's gini: 0.295075
0.2950745960037473
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.152123	valid_0's gini: 0.272679
[200]	valid_0's binary_logloss: 0.152038	valid_0's gini: 0.275581
[300]	valid_0's binary_logloss: 0.152089	valid_0's gini: 0.276195
Early stopping, best iteration is:
[224]	valid_0's binary_logloss: 0.152015	valid_0's gini: 0.276851
0.27685121636649
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.151974	valid_0's gini: 0.277978
[200]	valid_0's binary_logloss: 0.151926	valid_0's gini: 0.280307
Early stopping, best iteration is:
[139]	valid_0's binary_logloss: 0.151902	valid_0's gini: 0.280395
0.2803950421

In [None]:
# `final_cv_pred` holds the SUM of the per-seed test predictions, and the
# training loop appends one entry to `x_score` per completed seed. Dividing by
# that count (instead of a hard-coded 22) keeps the average correct even if the
# number of seeds is changed or the loop is interrupted early.
n_seeds_done = len(x_score)
if n_seeds_done == 0:
    raise RuntimeError("No completed seed found: run the training cell before writing the submission.")
pd.DataFrame({'id': test_id, 'target': final_cv_pred / n_seeds_done}).to_csv('lgbm_delcalc22.csv', index=False)

다시... (LightGBM과 NN 예측값을 조화 평균으로 앙상블)

In [None]:

from scipy.stats.mstats import hmean

# Load the two submissions to blend.
# The frames are deliberately not named `lgbm`: that name is the lightgbm
# module alias from the import cell, and rebinding it to a DataFrame made the
# training cell fail on any later re-run.
lgbm_sub = pd.read_csv('lgbm_delcalc22.csv')
nn_sub = pd.read_csv('nn_model_with_cv(del calc + inter).csv')

# Stack both prediction columns; rows that share an index label are blended.
# NOTE(review): rows are paired by their position in each file, not by 'id' -
# assumes both files list the ids in the same order; confirm.
preds = pd.concat([lgbm_sub['target'], nn_sub['target']])

# Apply the harmonic mean per index label.
preds = preds.groupby(level=0).apply(hmean)

# Create the submission.
# head() must be called; the original printed the bound method object instead.
print(preds.head())
sub = pd.DataFrame()
sub['id'] = lgbm_sub['id']
sub['target'] = preds

sub.to_csv('sub_harmonic(lgbm+nn)2.csv', index = False)

# Alternative blend, kept disabled: equal-weight average of the percentile ranks.
#def get_rank(x):
#    return pd.Series(x).rank(pct=True).values

#pd.DataFrame({'id':lgbm_sub['id'],'target':get_rank(lgbm_sub['target'])*0.5 + get_rank(nn_sub['target'])*0.5}).to_csv('simple_average4.csv',index=False)

<bound method NDFrame.head of 0         0.054134
1         0.043964
2         0.043060
3         0.021783
4         0.066621
            ...   
892811    0.167522
892812    0.084535
892813    0.062941
892814    0.039807
892815    0.051672
Name: target, Length: 892816, dtype: float64>
