In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, KFold
import lightgbm as lgb
import hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import precision_score
from skopt import BayesSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve, log_loss, f1_score

In [2]:
def f1_scorer(pred, y):
    """Weighted-F1 evaluation callback returning ``(name, value, is_higher_better)``.

    The previous body referenced two names this notebook never defines
    (``labels`` and a ``metrics`` module) and handed the reshaped score matrix
    straight to ``f1_score``; both are corrected here.

    Parameters
    ----------
    pred : array-like
        Model scores. Either one score per sample (thresholded at 0.5), or a
        flat array of ``n_classes * n_samples`` scores in the class-major
        layout that the original ``reshape(n_classes, -1)`` implied.
    y : array-like or object exposing ``get_label()``
        True labels, or a dataset wrapper that returns them via
        ``get_label()``.

    Returns
    -------
    tuple
        ``("F1_scorer", weighted F1, True)``.
    """
    # Accept both a plain label array and a dataset wrapper.
    y_true = np.asarray(y.get_label() if hasattr(y, "get_label") else y)
    scores = np.asarray(pred)

    if scores.size == y_true.size:
        # One score per sample: treat it as the positive-class probability.
        y_hat = (scores.ravel() > 0.5).astype(int)
    else:
        # Several scores per sample: pick the best-scoring class per column.
        # NOTE(review): argmax yields 0..k-1, so this assumes the labels are
        # encoded as consecutive integers starting at 0 — confirm.
        n_classes = scores.size // y_true.size
        y_hat = scores.reshape(n_classes, -1).argmax(axis=0)

    return "F1_scorer", f1_score(y_true, y_hat, average="weighted"), True

In [8]:
# Peek at the first rows of the per-image label table.
# NOTE: `unique` is only assigned by the read_csv cell further down, so on a
# fresh kernel that cell has to run before this one.
unique.head(n=5)

Unnamed: 0,ImageId,ships,has_ship,has_ship_vec,path
0,00003e153.jpg,0,0,[0.0],../input/train/00003e153.jpg
1,000155de5.jpg,1,1,[1.0],../input/train/000155de5.jpg
2,00021ddc3.jpg,9,1,[1.0],../input/train/00021ddc3.jpg
3,0002756f7.jpg,2,1,[1.0],../input/train/0002756f7.jpg
4,0002d0f32.jpg,0,0,[0.0],../input/train/0002d0f32.jpg


In [27]:
# --- Labels, per-image statistics and the submission template ---------------
unique = pd.read_csv('../input/unique_img_ids.csv')
unique['has_ship'] = unique['has_ship'].astype(np.int8)
oof_stat = pd.read_csv('../result/oof_stat.csv')
sub_stat = pd.read_csv('../result/suball_area_stat.csv')

submit = pd.read_csv('../input/sample_submission.csv.zip')

# One (column name, out-of-fold csv, test-prediction csv) triple per base model.
pres = [
    ('xception', '../result/Xception_oof.csv', '../result/Xception_pre_0.801.csv'),
    ('inception', '../result/InceptionV3_oof.csv', '../result/InceptionV3_pre_0.824.csv'),
    ('resent', '../result/RESNET52_oof.csv', '../result/RESNET52_pre_0.836.csv'),
    ('vgg', '../result/VGG16_oof.csv', '../result/VGG16_pre_0.791.csv'),
    ('densenet', '../result/DenseNet169_oof.csv', '../result/DenseNet169_pre_0.843.csv'),
]

# --- Collect every model's predictions side by side, one column per model ---
df_oof = pd.DataFrame()
df_pre = pd.DataFrame()
for name, oofpath, prepath in pres:
    # OOF files are read without a header row; their second column is the score.
    oof = pd.read_csv(oofpath, header=None)
    # Test-prediction files carry a named 'pre' column.
    pre = pd.read_csv(prepath)
    df_oof[name] = oof.iloc[:, 1]
    df_pre[name] = pre['pre']

# --- Key both tables by image id --------------------------------------------
# The ids are attached by row label, so this relies on the prediction files
# being in the same row order as `unique` / `submit`.
df_oof['ImageId'] = unique['ImageId']
df_oof = df_oof.set_index('ImageId', drop=True)

df_pre['ImageId'] = submit['ImageId']
df_pre = df_pre.set_index('ImageId', drop=True)

In [28]:
# Log loss of each base model's out-of-fold predictions against the labels.
for column in df_oof.columns:
    column_scores = df_oof.loc[:, column]
    print(column, log_loss(unique['has_ship'], column_scores))

xception 0.6735712612350175
inception 0.6029786069805989
resent 0.36940746929589796
vgg 0.35670423332891193
densenet 0.42878190827219037


In [29]:
# Pairwise correlation between the base models' out-of-fold predictions,
# displayed as a colour-graded table with precision 2.
df_corr = df_oof.corr()
(
    df_corr.style
    .background_gradient()
    .set_precision(2)
)

Unnamed: 0,xception,inception,resent,vgg,densenet
xception,1.0,0.38,0.24,0.17,0.18
inception,0.38,1.0,0.25,0.19,0.14
resent,0.24,0.25,1.0,0.77,0.76
vgg,0.17,0.19,0.77,1.0,0.66
densenet,0.18,0.14,0.76,0.66,1.0


## Average

In [12]:
# Baseline blend: log loss of the plain row-wise mean over all model columns.
log_loss(unique['has_ship'], df_oof.mean(axis='columns'))

0.39924689281326403

## Linear weights

In [13]:
from scipy.optimize import nnls

def linear_weights(oof, train_y, sub):
    """Blend model columns using non-negative least-squares weights.

    The weights are fitted with ``nnls`` on the out-of-fold matrix against
    ``train_y`` and then applied to both ``oof`` and ``sub``. Each blend is
    clipped to ``[0, 1]``. The log loss of the out-of-fold blend is printed.

    Parameters
    ----------
    oof : DataFrame
        Out-of-fold predictions, one column per model.
    train_y : Series
        Target values aligned row-by-row with ``oof``.
    sub : DataFrame
        Test predictions with the same column order as ``oof``.

    Returns
    -------
    tuple of ndarray
        ``(blended oof, blended sub)``.
    """
    coefs, _residual_norm = nnls(oof.values, train_y.values)

    def _blend(frame):
        # Weighted sum across the model columns, clipped into [0, 1].
        return (frame.values * coefs).sum(axis=1).clip(0, 1)

    weight_out = _blend(oof)
    print('log_loss', log_loss(train_y, weight_out))
    weight_sub = _blend(sub)
    return weight_out, weight_sub

# Fit the blend weights on the OOF predictions and apply them to the test set.
weight_out, weight_sub = linear_weights(
    oof=df_oof,
    train_y=unique.has_ship,
    sub=df_pre,
)

log_loss 0.2949904154986796


## Ranked Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

def rank_trans(oof, oof_y):
    """Replace values by their column-wise rank over both frames combined.

    The two frames are stacked, every column is ranked over the stacked rows,
    and the ranks are divided by the total number of stacked rows. The result
    is then split back at ``len(oof)``.

    Parameters
    ----------
    oof : DataFrame
        First frame (the hold-out / out-of-fold predictions).
    oof_y : DataFrame
        Second frame stacked underneath ``oof``. Despite its name, the caller
        in this notebook passes the test-set predictions here.

    Returns
    -------
    tuple of DataFrame
        ``(ranked rows of oof, ranked rows of oof_y)``.
    """
    n_hold = len(oof)
    stacked = pd.concat([oof, oof_y])

    # Rank within each column, scaled by the number of stacked rows.
    scaled_rank = stacked.rank(axis=0) / stacked.shape[0]

    return scaled_rank[:n_hold], scaled_rank[n_hold:]
    
# Rank-transform the OOF and test predictions together.
ranked_hold, ranked_test = rank_trans(oof=df_oof, oof_y=df_pre)

In [15]:
def logistic_fit(oof, train_y, sub):
    """Fit a ``LogisticRegression(C=0.1)`` stacker and score it on its inputs.

    Two log-loss values are printed for the training rows: one for the raw
    positive-class probabilities and one after min-max stretching them to
    ``[0, 1]``. The stretched values are used only for that printout.

    Parameters
    ----------
    oof : DataFrame
        Training features (one row per training sample).
    train_y : Series
        Binary target aligned with ``oof``.
    sub : DataFrame
        Features to predict on, with the same columns as ``oof``.

    Returns
    -------
    tuple
        ``(fitted classifier, probabilities for oof, probabilities for sub)``.
    """
    clf = LogisticRegression(C=0.1)
    clf.fit(oof.values, train_y.values)

    # Column 1 of predict_proba is the positive class.
    log_oof, log_sub = (
        clf.predict_proba(frame.values)[:, 1] for frame in (oof, sub)
    )

    print('none streach score:', log_loss(train_y, log_oof))

    lowest, highest = log_oof.min(), log_oof.max()
    stretch_oof = (log_oof - lowest) / (highest - lowest)
    print('streach score:', log_loss(train_y, stretch_oof))

    return clf, log_oof, log_sub

In [16]:
# Stack the rank-transformed predictions with logistic regression.
clf, rank_oof, rank_sub = logistic_fit(
    oof=ranked_hold,
    train_y=unique.has_ship,
    sub=ranked_test,
)

none streach score: 0.26740341040823395
streach score: 0.2649396811592294


In [17]:
def hist_it(ensemble, submit):
    """Overlay density-normalised histograms of two score arrays.

    ``submit`` is drawn first and ``ensemble`` second, both half-transparent
    on the current pyplot axes, so the two distributions can be compared.

    Parameters
    ----------
    ensemble : array-like
        First set of scores (drawn on top).
    submit : array-like
        Second set of scores (drawn underneath).
    """
    # `normed=` was removed from Matplotlib's hist(); `density=` is the
    # supported keyword for the same normalisation.
    for scores in (submit, ensemble):
        plt.hist(scores, density=True, alpha=0.5)
    
def predict_hist(clf, train, test):
    """Predict on two feature sets, min-max stretch each, and plot both.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba``
    train : array-like or DataFrame
        Features of the hold-out rows.
    test : array-like or DataFrame
        Features of the test rows.

    Returns
    -------
    tuple of ndarray
        ``(stretched hold-out probabilities, stretched test probabilities)``.
    """
    def _stretched_proba(features):
        # Positive-class probability rescaled so min -> 0 and max -> 1.
        proba = clf.predict_proba(features)[:, 1]
        lowest, highest = proba.min(), proba.max()
        return (proba - lowest) / (highest - lowest)

    hold_out = _stretched_proba(train)
    pre = _stretched_proba(test)

    hist_it(hold_out, pre)

    return hold_out, pre

In [None]:
# Positive-class probabilities on the ranked hold-out rows, min-max stretched
# to [0, 1] (np.ptp is max - min).
hold_out = clf.predict_proba(ranked_hold)[:, 1]
hold_out = (hold_out - hold_out.min()) / np.ptp(hold_out)

## Quadratic Linear Stacking

In [None]:
# First rows of the per-model out-of-fold prediction table.
df_oof.head(n=5)

In [21]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures

def preprocessing(X, degree):
    """Expand ``X`` into standardised polynomial features.

    Pipeline: ``MinMaxScaler`` -> ``PolynomialFeatures(degree)`` ->
    ``StandardScaler``, each fitted on the data passed in.

    Parameters
    ----------
    X : DataFrame
        Input features; its index supplies the row labels of the result.
    degree : int
        Polynomial degree handed to ``PolynomialFeatures``.

    Returns
    -------
    DataFrame
        Transformed features with integer column labels and an index named
        ``ImageId`` holding the values of ``X.index``.
    """
    unit_scaled = MinMaxScaler().fit_transform(X)
    expanded = PolynomialFeatures(degree).fit_transform(unit_scaled)
    standardised = StandardScaler().fit_transform(expanded)

    poly_df = pd.DataFrame(standardised)
    # Re-attach the original row labels as the index.
    poly_df['ImageId'] = X.index
    return poly_df.set_index('ImageId', drop=True)

# Compute poly features:
def poly_trans(oof, sub, degree=3):
    """Build polynomial features for the hold-out and test frames together.

    Both frames are stacked and passed through ``preprocessing`` in one go, so
    the transform is fitted on the combined rows. The result is then split
    back at ``len(oof)``.

    Parameters
    ----------
    oof : DataFrame
        Hold-out (training) features.
    sub : DataFrame
        Test features with the same columns as ``oof``.
    degree : int, default 3
        Polynomial degree forwarded to ``preprocessing``. The default keeps
        the value that used to be hard-coded here.

    Returns
    -------
    tuple of DataFrame
        ``(polynomial features for oof, polynomial features for sub)``.
    """
    n_hold = len(oof)
    poly_all = pd.concat([oof, sub])

    poly_hold_test = preprocessing(poly_all, degree)

    # Split the stacked result back into its two parts.
    poly_hold = poly_hold_test[:n_hold]
    poly_test = poly_hold_test[n_hold:]
    return poly_hold, poly_test

In [22]:
# Polynomial expansion of the rank-transformed predictions.
poly_hold, poly_test = poly_trans(oof=ranked_hold, sub=ranked_test)

In [23]:
# Stack the polynomial features with logistic regression.
clf, poly_oof, poly_sub = logistic_fit(
    oof=poly_hold,
    train_y=unique.has_ship,
    sub=poly_test,
)

none streach score: 0.23580027476680418
streach score: 0.23581117833462922


In [None]:
# Compare the stretched hold-out and test score distributions.
predict_hist(clf=clf, train=poly_hold, test=poly_test)

## LGBM

In [33]:
# Attach the per-image statistics to the polynomial features (left join on
# ImageId), then restore ImageId as the index.
poly_stat = (
    poly_hold
    .merge(oof_stat, on='ImageId', how='left')
    .set_index('ImageId')
)
sub_poly_stat = (
    poly_test
    .merge(sub_stat, on='ImageId', how='left')
    .set_index('ImageId')
)

In [32]:
# First rows of the merged test feature table.
sub_poly_stat.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,54,55,mean,std,max,min,median,kurt,skew,cnt
ImageId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001124c7.jpg,0.0,-0.620059,0.333775,0.223734,0.120687,1.083771,-0.769711,-0.270454,-0.335039,-0.348766,...,0.536547,1.012752,0.0,0.0,0,0,0.0,0.0,0.0,0.0
000194a2d.jpg,0.0,-0.620059,0.333775,0.949365,1.058403,-0.262593,-0.769711,-0.270454,-0.058866,0.01567,...,-0.246338,-0.612652,521.5,606.299843,1728,60,328.0,5.019171,2.174685,6.0
0001b1832.jpg,0.0,-0.620059,0.333775,0.148247,-0.948412,1.083771,-0.769711,-0.270454,-0.363769,-0.764263,...,-0.229361,1.012752,0.0,0.0,0,0,0.0,0.0,0.0,0.0
00052ed46.jpg,0.0,0.451515,-0.547421,-0.832555,-0.128764,0.281663,0.21471,-0.174142,-0.408701,0.151937,...,-0.203334,-0.188957,0.0,0.0,0,0,0.0,0.0,0.0,0.0
000532683.jpg,0.0,-0.217191,-1.651302,0.810661,0.808592,0.528073,-0.47447,-0.994166,0.237316,0.274641,...,0.381855,0.097828,170.0,114.551299,251,89,170.0,0.0,0.0,2.0


In [34]:
# Earlier parameter sets, kept for reference (the bare numbers look like the
# scores the author noted for each set).
#lgb_params = {'colsample_bytree': 0.9212458857042737, 'learning_rate': 0.005631424718051923, 'max_bin': 210, 'max_depth': 546, 'min_child_samples': 15, 'min_child_weight': 21.3893618197026, 'num_leaves': 27, 'reg_alpha': 2.7341188915075043, 'reg_lambda': 0.000771903984301836, 'subsample': 0.5499556995458353, 'subsample_freq': 19}

# 0.2284
#{'colsample_bytree': 0.4, 'learning_rate': 0.03807991186694423, 'max_bin': 1200, 'max_depth': 40, 'min_child_samples': 100, 'min_child_weight': 0.0001, 'num_leaves': 10, 'reg_alpha': 0.19411163899114828, 'reg_lambda': 183.7497880427182, 'subsample': 1.0, 'subsample_freq': 8}

# 0.156
lgb_params = {
    'colsample_bytree': 0.8797320649643039,
    'learning_rate': 0.007517239253342655,
    'max_bin': 634,
    'max_depth': 573,
    'min_child_samples': 90,
    'min_child_weight': 35.85158585487673,
    'num_leaves': 91,
    'reg_alpha': 2.2710865846131765,
    'reg_lambda': 0.023189858048358762,
    'subsample': 0.8297131143495482,
    'subsample_freq': 11,
}
# Disabled override (was parked inside a string literal):
# lgb_params.update(
#     {
#         'boosting_type': 'gbdt',
#         'objective': 'binary',
#         'metric': 'binary_error'
#     })

# Accumulator for the fold-averaged test prediction.
submit['combine'] = 0

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for idx, (trn_idx, val_idx) in enumerate(folds.split(poly_stat, unique.has_ship)):
    print('Fold', idx)

    clf = lgb.LGBMClassifier(
        objective='binary',
        boosting_type='gbdt',
        n_estimators=2000,
        nthread=4,
        verbose=200,
        # is_unbalance=True,
        **lgb_params
    )
    # Train on the fold's training rows; the validation rows drive early stopping.
    clf.fit(
        poly_stat.iloc[trn_idx],
        unique.has_ship.iloc[trn_idx],
        eval_set=(poly_stat.iloc[val_idx], unique.has_ship.iloc[val_idx]),
        eval_metric='binary_logloss',
        early_stopping_rounds=100,
        verbose=200,
    )

    # Each fold contributes 1/n_splits of its positive-class probability.
    submit['combine'] = (
        submit['combine']
        + clf.predict_proba(sub_poly_stat)[:, 1] / folds.n_splits
    )

# Persist the averaged prediction under the column name 'pre'.
(
    submit[['ImageId', 'combine']]
    .rename(columns={'combine': 'pre'})
    .to_csv('../result/stack_poly_stat.csv', index=False)
)

Fold 0
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.238553
[400]	valid_0's binary_logloss: 0.171276
[600]	valid_0's binary_logloss: 0.15877
[800]	valid_0's binary_logloss: 0.156335
[1000]	valid_0's binary_logloss: 0.155616
[1200]	valid_0's binary_logloss: 0.155463
Early stopping, best iteration is:
[1150]	valid_0's binary_logloss: 0.155448
Fold 1
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.23844
[400]	valid_0's binary_logloss: 0.170474
[600]	valid_0's binary_logloss: 0.157567
[800]	valid_0's binary_logloss: 0.154842
[1000]	valid_0's binary_logloss: 0.154243
Early stopping, best iteration is:
[1001]	valid_0's binary_logloss: 0.154239
Fold 2
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.241118
[400]	valid_0's binary_logloss: 0.174978
[600]	valid_0's binary_logloss: 0.162946
[800]	valid_0's binary_logloss: 0.160847
[1000]	valid_0's bin

In [None]:
# Two-column view of the stacked prediction: image id as 'img', score as 'pre'.
stack_sub = pd.DataFrame({
    'img': submit.ImageId,
    'pre': submit['combine'],
})

In [None]:
# First rows of the stacked submission frame.
stack_sub.head(n=5)

In [37]:
def status_print(optim_result):
    """Progress callback for the Bayesian hyper-parameter search.

    Reads the module-level ``bayes_cv_tuner``. It prints how many
    configurations have been evaluated, the best score so far and the
    parameters that achieved it, then writes every result to
    ``<EstimatorClass>_cv_results.csv``.

    The score label is taken from the tuner's ``scoring`` setting; the
    previous hard-coded "binary_error" did not match the metric being
    optimised.

    Parameters
    ----------
    optim_result : object
        Value passed in by the search on every iteration; not used here.
    """
    # All configurations tested so far, as a DataFrame.
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)

    # Label the score with the metric the search optimises; fall back to a
    # generic word when `scoring` is not a plain string.
    scoring = getattr(bayes_cv_tuner, 'scoring', None)
    metric_name = scoring if isinstance(scoring, str) else 'score'

    print('Model #{}\nBest {}: {}\nBest params: {}\n'.format(
        len(all_models),
        metric_name,
        np.round(bayes_cv_tuner.best_score_, 8),
        bayes_cv_tuner.best_params_
    ))

    # Save all model results, overwriting the file on every call.
    clf_name = bayes_cv_tuner.estimator.__class__.__name__
    all_models.to_csv(clf_name + "_cv_results.csv")
    
# Base estimator whose hyper-parameters are tuned below.
# NOTE(review): `eval_metric` is normally an argument of fit(); confirm that
# passing it to the constructor has any effect.
estimator = lgb.LGBMClassifier(objective='binary', 
                                  boosting_type='gbdt', 
                                  n_estimators = 1000,
                                  nthread= 4,
                                  verbose = -1,
                                  eval_metric='binary_logloss',
                                 )

# Search ranges: (low, high[, prior]) per hyper-parameter.
search_spaces = {
        'learning_rate': (0.001, 0.1, 'log-uniform'),
        'num_leaves': (10, 200),      
        'max_depth': (3, 800), 
        'min_child_samples': (1, 100),
        'max_bin': (5, 1200),
        'subsample_freq': (1, 20),
        'subsample':(0.5,1),
        'min_child_weight': (1e-4, 50),
        'reg_lambda': (1e-4, 500, 'log-uniform'),
        'reg_alpha': (1e-4, 500, 'log-uniform'),
        'colsample_bytree':(0.4, 1.0),
    }

# 100 iterations of Bayesian search, scored by negative log loss under 5-fold
# CV; refit=True retrains the best configuration on all rows afterwards.
bayes_cv_tuner = BayesSearchCV(
    estimator = estimator,
    search_spaces = search_spaces, 
    scoring = 'neg_log_loss',
    cv = KFold(
        n_splits=5,
        shuffle=True,
        random_state=42
    ),
    n_jobs = 1,
    n_iter = 100,   
    verbose = 0,
    refit = True,
    random_state = 42
)

result = bayes_cv_tuner.fit(poly_stat, unique.has_ship, callback=status_print)
with open('best_params.txt', 'w+') as fo:
    fo.write(str(bayes_cv_tuner.best_params_))

# Store positive-class probabilities, not hard labels: the search optimises
# log loss and the fold-averaged LGBM cell fills `combine` with probabilities
# too, whereas predict() would collapse everything to 0/1.
pred = bayes_cv_tuner.predict_proba(sub_poly_stat)[:, 1]
submit['combine'] = pred

Model #1
Best binary_error: -0.15793285
Best params: {'colsample_bytree': 0.6460623753119883, 'learning_rate': 0.02853983686604182, 'max_bin': 1120, 'max_depth': 255, 'min_child_samples': 67, 'min_child_weight': 20.705990212413674, 'num_leaves': 77, 'reg_alpha': 8.993250342705343, 'reg_lambda': 0.010954450268049988, 'subsample': 0.8238957941734388, 'subsample_freq': 11}

Model #2
Best binary_error: -0.15793285
Best params: {'colsample_bytree': 0.6460623753119883, 'learning_rate': 0.02853983686604182, 'max_bin': 1120, 'max_depth': 255, 'min_child_samples': 67, 'min_child_weight': 20.705990212413674, 'num_leaves': 77, 'reg_alpha': 8.993250342705343, 'reg_lambda': 0.010954450268049988, 'subsample': 0.8238957941734388, 'subsample_freq': 11}

Model #3
Best binary_error: -0.15793285
Best params: {'colsample_bytree': 0.6460623753119883, 'learning_rate': 0.02853983686604182, 'max_bin': 1120, 'max_depth': 255, 'min_child_samples': 67, 'min_child_weight': 20.705990212413674, 'num_leaves': 77, 'r

Model #23
Best binary_error: -0.15661503
Best params: {'colsample_bytree': 0.8797320649643039, 'learning_rate': 0.007517239253342655, 'max_bin': 634, 'max_depth': 573, 'min_child_samples': 90, 'min_child_weight': 35.85158585487673, 'num_leaves': 91, 'reg_alpha': 2.2710865846131765, 'reg_lambda': 0.023189858048358762, 'subsample': 0.8297131143495482, 'subsample_freq': 11}

Model #24
Best binary_error: -0.15661503
Best params: {'colsample_bytree': 0.8797320649643039, 'learning_rate': 0.007517239253342655, 'max_bin': 634, 'max_depth': 573, 'min_child_samples': 90, 'min_child_weight': 35.85158585487673, 'num_leaves': 91, 'reg_alpha': 2.2710865846131765, 'reg_lambda': 0.023189858048358762, 'subsample': 0.8297131143495482, 'subsample_freq': 11}

Model #25
Best binary_error: -0.15661503
Best params: {'colsample_bytree': 0.8797320649643039, 'learning_rate': 0.007517239253342655, 'max_bin': 634, 'max_depth': 573, 'min_child_samples': 90, 'min_child_weight': 35.85158585487673, 'num_leaves': 91, 

Model #47
Best binary_error: -0.15591875
Best params: {'colsample_bytree': 0.4183327770923152, 'learning_rate': 0.014038707423002264, 'max_bin': 1200, 'max_depth': 572, 'min_child_samples': 100, 'min_child_weight': 0.0001, 'num_leaves': 53, 'reg_alpha': 0.0001, 'reg_lambda': 0.8343267995233301, 'subsample': 1.0, 'subsample_freq': 16}

Model #48
Best binary_error: -0.15591875
Best params: {'colsample_bytree': 0.4183327770923152, 'learning_rate': 0.014038707423002264, 'max_bin': 1200, 'max_depth': 572, 'min_child_samples': 100, 'min_child_weight': 0.0001, 'num_leaves': 53, 'reg_alpha': 0.0001, 'reg_lambda': 0.8343267995233301, 'subsample': 1.0, 'subsample_freq': 16}

Model #49
Best binary_error: -0.15591875
Best params: {'colsample_bytree': 0.4183327770923152, 'learning_rate': 0.014038707423002264, 'max_bin': 1200, 'max_depth': 572, 'min_child_samples': 100, 'min_child_weight': 0.0001, 'num_leaves': 53, 'reg_alpha': 0.0001, 'reg_lambda': 0.8343267995233301, 'subsample': 1.0, 'subsample_f

Model #72
Best binary_error: -0.15591875
Best params: {'colsample_bytree': 0.4183327770923152, 'learning_rate': 0.014038707423002264, 'max_bin': 1200, 'max_depth': 572, 'min_child_samples': 100, 'min_child_weight': 0.0001, 'num_leaves': 53, 'reg_alpha': 0.0001, 'reg_lambda': 0.8343267995233301, 'subsample': 1.0, 'subsample_freq': 16}

Model #73
Best binary_error: -0.15591875
Best params: {'colsample_bytree': 0.4183327770923152, 'learning_rate': 0.014038707423002264, 'max_bin': 1200, 'max_depth': 572, 'min_child_samples': 100, 'min_child_weight': 0.0001, 'num_leaves': 53, 'reg_alpha': 0.0001, 'reg_lambda': 0.8343267995233301, 'subsample': 1.0, 'subsample_freq': 16}

Model #74
Best binary_error: -0.15591875
Best params: {'colsample_bytree': 0.4183327770923152, 'learning_rate': 0.014038707423002264, 'max_bin': 1200, 'max_depth': 572, 'min_child_samples': 100, 'min_child_weight': 0.0001, 'num_leaves': 53, 'reg_alpha': 0.0001, 'reg_lambda': 0.8343267995233301, 'subsample': 1.0, 'subsample_f

Model #96
Best binary_error: -0.15516743
Best params: {'colsample_bytree': 0.4, 'learning_rate': 0.009261881426846122, 'max_bin': 1200, 'max_depth': 800, 'min_child_samples': 1, 'min_child_weight': 0.0001, 'num_leaves': 106, 'reg_alpha': 0.8588205815399963, 'reg_lambda': 0.0001, 'subsample': 1.0, 'subsample_freq': 20}

Model #97
Best binary_error: -0.15516743
Best params: {'colsample_bytree': 0.4, 'learning_rate': 0.009261881426846122, 'max_bin': 1200, 'max_depth': 800, 'min_child_samples': 1, 'min_child_weight': 0.0001, 'num_leaves': 106, 'reg_alpha': 0.8588205815399963, 'reg_lambda': 0.0001, 'subsample': 1.0, 'subsample_freq': 20}

Model #98
Best binary_error: -0.15516743
Best params: {'colsample_bytree': 0.4, 'learning_rate': 0.009261881426846122, 'max_bin': 1200, 'max_depth': 800, 'min_child_samples': 1, 'min_child_weight': 0.0001, 'num_leaves': 106, 'reg_alpha': 0.8588205815399963, 'reg_lambda': 0.0001, 'subsample': 1.0, 'subsample_freq': 20}

Model #99
Best binary_error: -0.15516

  if diff:


In [38]:
# Write the tuned model's prediction under the column name 'pre'.
(
    submit[['ImageId', 'combine']]
    .rename(columns={'combine': 'pre'})
    .to_csv('../result/stack_stat_bayes.csv', index=False)
)

In [None]:
# Two-column frame (image id + stacked prediction). The explicit copy keeps
# the column assignment below from targeting a slice of `submit`.
stack_sub = submit[['ImageId', 'combine']].copy()
# NOTE(review): the original line was left unfinished (`submit[]`) and did not
# parse; 'ImageId' is the presumed source column — confirm the intent.
stack_sub['ImageId'] = submit['ImageId']