## 1. Data Loading 

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data_path = '../dataset/'
train_df = pd.read_csv(data_path + 'train.csv')
test_df = pd.read_csv(data_path+ 'test.csv')

print("[TRAIN] : {}".format(train_df.shape))
print("[TEST]  : {}".format(test_df.shape))

[TRAIN] : (595212, 59)
[TEST]  : (892816, 58)


## 2. Feature Selection

In [3]:
feature_list = [
'ps_ind_06_bin',
'ps_ind_07_bin',
'ps_ind_08_bin',
'ps_ind_09_bin',
'ps_ind_16_bin',
'ps_ind_17_bin',
'ps_ind_18_bin',
'ps_ind_04_cat',
'ps_ind_05_cat',
'ps_ind_01',
'ps_ind_03',
'ps_ind_14',
'ps_ind_15',
'ps_reg_01',
'ps_reg_02',
'ps_reg_03',
'ps_car_01_cat',
'ps_car_02_cat',
'ps_car_03_cat',
'ps_car_04_cat',
'ps_car_05_cat',
'ps_car_06_cat',
'ps_car_07_cat',
'ps_car_08_cat',
'ps_car_09_cat',
'ps_car_10_cat',
'ps_car_11',
'ps_car_12',
'ps_car_13',
'ps_car_14',
'ps_car_15',
]

In [4]:
tr_df = train_df[feature_list + ['target']].copy()

for feature in feature_list:
    if 'bin' in feature or 'cat' in feature:
        temp_df = pd.get_dummies(tr_df[feature], prefix=feature) # , drop_first=True
        tr_df = pd.concat([tr_df, temp_df], axis=1)
        tr_df.drop(columns=feature, inplace=True)
print(tr_df.shape)

(595212, 102)


In [5]:
ts_df = test_df[feature_list].copy()

for feature in feature_list:
    if 'bin' in feature or 'cat' in feature:
        temp_df = pd.get_dummies(ts_df[feature], prefix=feature) #, drop_first=True
        ts_df = pd.concat([ts_df, temp_df], axis=1)
        ts_df.drop(columns=feature, inplace=True)
print(ts_df.shape)

(892816, 101)


## 3. Modeling - Stacking

- 참고 : [https://github.com/ishuca/Nested-CV-Stacking/blob/master/Nested_CV_Stacking.iynb]

In [35]:
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegressionCV
import numpy as np

In [14]:
tr_all_y = tr_df['target']
tr_all_X = tr_df.drop(columns='target', axis=1)
ts_X = ts_df

In [31]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1234)

folds = [x[1] for x in list(skf.split(tr_all_X, tr_all_y))]
folds

[array([    10,     11,     13, ..., 595202, 595205, 595208]),
 array([     0,      2,      4, ..., 595203, 595204, 595209]),
 array([     1,      3,      6, ..., 595207, 595210, 595211])]

In [32]:
clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        #RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        #ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)]

In [33]:
from tqdm import tqdm
class_num = len(np.unique(tr_all_y))
print(class_num)

dataset_blend_train_not_nested = np.zeros((tr_all_X.shape[0], len(clfs)*class_num))
dataset_blend_test = np.zeros((ts_X.shape[0], len(clfs)*class_num))
print(dataset_blend_train_not_nested)
print(tr_all_X.index)
print(np.concatenate(inner_folds).ravel())

for k, clf in tqdm(enumerate(clfs)):
    for i in tqdm(range(0,len(folds))):
        target_fold = folds[i]
        inner_folds = folds[0:i]+folds[i+1:]
        print(inner_folds)
        print(target_fold)
        
        clf.fit(tr_all_X.loc[np.concatenate(inner_folds).ravel()], tr_all_y.loc[np.concatenate(inner_folds).ravel()])
        pred = clf.predict_proba(tr_all_X.loc[target_fold])
        dataset_blend_train_not_nested[target_fold, k*class_num:(k*class_num+class_num)] = pred

2
[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
RangeIndex(start=0, stop=595212, step=1)
[     4     11     15 ... 595207 595210 595211]




0it [00:00, ?it/s]


  0%|                                                    | 0/3 [00:00<?, ?it/s]

[array([     0,      2,      4, ..., 595203, 595204, 595209]), array([     1,      3,      6, ..., 595207, 595210, 595211])]
[    10     11     13 ... 595202 595205 595208]





 33%|██████████████▋                             | 1/3 [01:37<03:15, 97.69s/it]

[array([    10,     11,     13, ..., 595202, 595205, 595208]), array([     1,      3,      6, ..., 595207, 595210, 595211])]
[     0      2      4 ... 595203 595204 595209]





 67%|█████████████████████████████▎              | 2/3 [03:22<01:39, 99.72s/it]

[array([    10,     11,     13, ..., 595202, 595205, 595208]), array([     0,      2,      4, ..., 595203, 595204, 595209])]
[     1      3      6 ... 595207 595210 595211]





100%|████████████████████████████████████████████| 3/3 [05:01<00:00, 99.68s/it]




1it [05:01, 301.75s/it]


  0%|                                                    | 0/3 [00:00<?, ?it/s]

[array([     0,      2,      4, ..., 595203, 595204, 595209]), array([     1,      3,      6, ..., 595207, 595210, 595211])]
[    10     11     13 ... 595202 595205 595208]





 33%|██████████████▎                            | 1/3 [02:11<04:21, 131.00s/it]

[array([    10,     11,     13, ..., 595202, 595205, 595208]), array([     1,      3,      6, ..., 595207, 595210, 595211])]
[     0      2      4 ... 595203 595204 595209]





 67%|████████████████████████████▋              | 2/3 [04:30<02:13, 133.42s/it]

[array([    10,     11,     13, ..., 595202, 595205, 595208]), array([     0,      2,      4, ..., 595203, 595204, 595209])]
[     1      3      6 ... 595207 595210 595211]





100%|███████████████████████████████████████████| 3/3 [06:49<00:00, 135.14s/it]




2it [11:50, 333.99s/it]


  0%|                                                    | 0/3 [00:00<?, ?it/s]

[array([     0,      2,      4, ..., 595203, 595204, 595209]), array([     1,      3,      6, ..., 595207, 595210, 595211])]
[    10     11     13 ... 595202 595205 595208]





 33%|██████████████▎                            | 1/3 [05:38<11:17, 338.79s/it]

[array([    10,     11,     13, ..., 595202, 595205, 595208]), array([     1,      3,      6, ..., 595207, 595210, 595211])]
[     0      2      4 ... 595203 595204 595209]





 67%|████████████████████████████▋              | 2/3 [11:14<05:38, 338.01s/it]

[array([    10,     11,     13, ..., 595202, 595205, 595208]), array([     0,      2,      4, ..., 595203, 595204, 595209])]
[     1      3      6 ... 595207 595210 595211]





100%|███████████████████████████████████████████| 3/3 [17:22<00:00, 346.77s/it]




3it [29:13, 546.46s/it]



In [36]:
stacker = LogisticRegressionCV(refit=False)
stacker.fit(dataset_blend_train_not_nested, tr_all_y)



LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l2',
           random_state=None, refit=False, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0)

## 4. Score 

In [51]:
def gini(actual, pred, cmpcol=0, sortcol=1):
    assert (len(actual) == len(pred))
    all = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=np.float)
    all = all[np.lexsort((all[:, 2], -1 * all[:, 1]))]
    totalLosses = all[:, 0].sum()
    giniSum = all[:, 0].cumsum().sum() / totalLosses

    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)


def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

In [52]:
pred_y = stacker.predict(dataset_blend_train_not_nested)
print(gini_normalized(tr_all_y, pred_y))

0.0005808707910432705


In [None]:
pd.DataFrame({'target':y_pred, 'id': test_df['id']}).to_csv('../dataset/submission.csv', index=False)