In [2]:
%%time
RANDOM_STATE = 42
import pandas as pd
train_data = pd.read_csv('train.csv', index_col='ID')
test_data = pd.read_csv('test.csv', index_col='ID')

y_train = train_data['TARGET']
X_train = train_data.drop(['TARGET'], axis=1)
print(X_train.shape, y_train.shape)
print(y_train.value_counts())
assert not (X_train.count() != y_train.shape[0]).any() # ensure that there is no missing data

(76020, 369) (76020,)
0    73012
1     3008
Name: TARGET, dtype: int64
CPU times: user 8.81 s, sys: 780 ms, total: 9.59 s
Wall time: 10.1 s


In [3]:
%%time
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
import numpy as np

def remove_constant_features(X):
    s = VarianceThreshold()
    s.fit(X)
    return s.transform

def get_best_features(X, y):
    selectK = SelectKBest(f_classif, k=250)
    selectK.fit(X, y)
    return selectK.transform

def replace_outliers_with_mean(X):
    for col in X.columns:
        mean, std = X[col].mean(), X[col].std()
        outliers = (X[col] - mean).abs() > 3 * std
        X[outliers] = X[col][~outliers].mean()
    return X

def remove_dup_cols(X):
    remove = []
    df = pd.DataFrame(X)
    c = df.columns
    for i in range(len(c)-1):
        v = df[c[i]].values
        for j in range(i+1,len(c)):
            if np.array_equal(v, df[c[j]].values):
                remove.append(c[j])
                
    def tr(x):
        return pd.DataFrame(x).drop(remove, axis=1)
    return tr
    
def data_fit_transform(X, y):
#    X = X.replace(-999999, 2)#9999999999
    
    cf = remove_constant_features(X)
    X = cf(X)
    
    rdc = remove_dup_cols(X)
    X = rdc(X)
 
# XXX: it decreses auc on train set, don't use
# BUT: one time it increased auc on test set for 0.003
#    bf = get_best_features(X, y)
#    X = bf(X) 

# XXX: it dramatically decreses auc on train set, don't use
#df = pd.DataFrame(x)
#return replace_outliers_with_mean(df)
    
    def tr(x):
        return rdc(cf(x))
    
    return tr, X

data_transformer, X_train_transformed = data_fit_transform(X_train, y_train)
print('data transformation removed {}/{} features'.format(X_train.shape[1] - X_train_transformed.shape[1], X_train.shape[1]))
X_test_transformed = data_transformer(test_data)

data transformation removed 63/369 features
CPU times: user 7.45 s, sys: 100 ms, total: 7.55 s
Wall time: 7.61 s


In [8]:
%%time
def report(grid_scores, n_top=3):
    import numpy as np
    from operator import itemgetter
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

def find_best_xgb_params(X, y, n_iter):
    import xgboost as xgb
    import numpy as np
    from sklearn.grid_search import RandomizedSearchCV

    clf = xgb.XGBClassifier(missing=np.nan, seed=RANDOM_STATE)
    param_distributions = {
        'max_depth': np.arange(2, 8),
        'n_estimators': [1000], # early stop helps us
        'learning_rate': np.random.uniform(0.001, 0.2, 30),
        'nthread': [4],
        'subsample': np.arange(0.05, 1.05, 0.05),
        'colsample_bytree': np.arange(0.05, 1.05, 0.05),
#        'colsample_bylevel': np.arange(0.1, 1.1, 0.1),
        'seed': [RANDOM_STATE],
#        'scale_pos_weight': np.arange(0, 1.1, 0.1),
#        'reg_lambda': np.arange(0, 1.1, 0.1),
#        'reg_alpha': np.arange(0, 1.1, 0.1),
#        'gamma': np.arange(0, 1.1, 0.1),
    }
    fit_params = {'early_stopping_rounds': 30, 'eval_metric': "auc", 'verbose': False, 'eval_set': [(X, y)]}
    rs = RandomizedSearchCV(estimator=clf, param_distributions=param_distributions,
                            n_iter=n_iter, n_jobs=1, refit=False,
                            cv=3, random_state=RANDOM_STATE,
                            scoring='roc_auc', fit_params=fit_params, verbose=1)

    rs.fit(X, y)
    return rs

subsamples_n = 200
rs = find_best_xgb_params(X_train_transformed.sample(subsamples_n, random_state=RANDOM_STATE),
                          y_train.sample(subsamples_n, random_state=RANDOM_STATE), 100)
report(rs.grid_scores_, n_top=5)
# on 5k:
#Model with rank: 1
#Mean validation score: 0.823 (std: 0.008)
#Parameters: {'max_depth': 7, 'reg_lambda': 0.75000000000000011, 'seed': 4242, 'colsample_bytree': 0.15000000000000002, 'n_estimators': 700, 'gamma': 0.80000000000000004, 'scale_pos_weight': 0.65000000000000013, 'nthread': 4, 'learning_rate': 0.16590859024207197, 'colsample_bylevel': 0.90000000000000013, 'subsample': 0.45000000000000001, 'reg_alpha': 0.95000000000000007}

# 1k:
# ===
#Model with rank: 1
#Mean validation score: 0.810 (std: 0.040)
#Parameters: {'max_depth': 2, 'reg_lambda': 0.75000000000000011, 'seed': 4242, 'colsample_bytree': 0.55000000000000004, 'n_estimators': 700, 'gamma': 0.90000000000000002, 'scale_pos_weight': 0.40000000000000002, 'nthread': 4, 'learning_rate': 0.18509478318796457, 'colsample_bylevel': 0.65000000000000013, 'subsample': 0.95000000000000007, 'reg_alpha': 1.0}


#Model with rank: 1
#Mean validation score: 0.818 (std: 0.041)
#Parameters: {'max_depth': 1, 'reg_lambda': 0.0, 'seed': 4242, 'colsample_bytree': 0.90000000000000002, 'n_estimators': 230, 'gamma': 0.80000000000000004, 'scale_pos_weight': 0.60000000000000009, 'nthread': 4, 'learning_rate': 0.16432489935422318, 'colsample_bylevel': 0.40000000000000002, 'subsample': 0.90000000000000002, 'reg_alpha': 0.90000000000000002}

#Model with rank: 4
#Mean validation score: 0.815 (std: 0.035)
#Parameters: {'max_depth': 1, 'reg_lambda': 0.30000000000000004, 'seed': 4242, 'colsample_bytree': 0.59999999999999998, 'n_estimators': 830, 'gamma': 0.60000000000000009, 'scale_pos_weight': 0.30000000000000004, 'nthread': 4, 'learning_rate': 0.16106270282244134, 'colsample_bylevel': 0.70000000000000007, 'subsample': 1.0, 'reg_alpha': 0.20000000000000001}

#Model with rank: 3
#Mean validation score: 0.850 (std: 0.034)
#Parameters: {'n_estimators': 1000, 'seed': 42, 'nthread': 4, 'subsample': 0.90000000000000013, 'max_depth': 5, 'learning_rate': 0.11214482728762599, 'colsample_bytree': 0.95000000000000007}

Model with rank: 1
Mean validation score: 0.858 (std: 0.085)
Parameters: {'n_estimators': 1000, 'seed': 42, 'nthread': 4, 'subsample': 0.60000000000000009, 'max_depth': 6, 'learning_rate': 0.1979282234579941, 'colsample_bytree': 0.65000000000000013}

Model with rank: 2
Mean validation score: 0.855 (std: 0.064)
Parameters: {'n_estimators': 1000, 'seed': 42, 'nthread': 4, 'subsample': 0.29999999999999999, 'max_depth': 6, 'learning_rate': 0.1979282234579941, 'colsample_bytree': 0.95000000000000007}

Model with rank: 3
Mean validation score: 0.850 (std: 0.034)
Parameters: {'n_estimators': 1000, 'seed': 42, 'nthread': 4, 'subsample': 0.90000000000000013, 'max_depth': 5, 'learning_rate': 0.11214482728762599, 'colsample_bytree': 0.95000000000000007}

Model with rank: 4
Mean validation score: 0.850 (std: 0.039)
Parameters: {'n_estimators': 1000, 'seed': 42, 'nthread': 4, 'subsample': 0.90000000000000013, 'max_depth': 5, 'learning_rate': 0.049480389963296491, 'colsample_bytree': 0.6000000000000

In [4]:
%%time
import xgboost as xgb
import numpy as np
# also working params: missing=9999999999, max_depth = 4, n_estimators=200, learning_rate=0.001,
# nthread=4, subsample=0.75, colsample_bytree=0.75, seed=4242

clf_params_from_kaggle_scripts = {'max_depth': 5, 'learning_rate': 0.03,
                                  'subsample': 0.95, 'colsample_bytree': 0.85}

clf_params = dict(clf_params_from_kaggle_scripts)
clf_params['missing'] = np.nan
clf_params['seed'] = RANDOM_STATE
clf_params['nthread'] = 4
clf_params['n_estimators'] = 1000

def do_xgb_cv(X, y):
    dtrain = xgb.DMatrix(X, label=y, missing=clf_params['missing'])
    xgb.cv(clf_params, dtrain, clf_params['n_estimators'], nfold=5, metrics='auc',
           early_stopping_rounds=20, verbose_eval=True, seed=RANDOM_STATE)
 
def xgb_predict(X_train, y_train, X_test, **add_params):
    params = dict(clf_params)
    params.update(add_params)
    clf = xgb.XGBClassifier(**clf_params)
    clf.fit(X_train, y_train, early_stopping_rounds=20, eval_metric="auc", verbose=True,
            eval_set=[(X_train, y_train)])
    
    from sklearn.metrics import roc_auc_score
    print('xgb auc:', clf.best_iteration,
          roc_auc_score(y_train, clf.predict_proba(X_train, ntree_limit=clf.best_iteration)[:,1]))
    
    return clf.predict_proba(X_test, ntree_limit=clf.best_iteration)[:,1]

# ipython crashes often on full collection, so run on subcollection
do_xgb_cv(X_train_transformed[:40000], y_train[:40000])

#y_pred = xgb_predict(X_train_transformed, y_train, X_test_transformed)
#submission = pd.DataFrame({"ID": test_data.index, "TARGET": y_pred})
#submission.to_csv("submission.csv", index=False)

Will train until cv error hasn't decreased in 20 rounds.
[0]	cv-test-auc:0.7925904+0.003893049221368766	cv-train-auc:0.8062403999999999+0.0041545556007833145
[1]	cv-test-auc:0.8049076+0.002897922952737004	cv-train-auc:0.8225979999999999+0.0055345190576959555
[2]	cv-test-auc:0.8114684000000001+0.006108599728251982	cv-train-auc:0.834536+0.0040347851987435285
[3]	cv-test-auc:0.8136671999999999+0.007995657556448982	cv-train-auc:0.8415526+0.0038074918568527494
[4]	cv-test-auc:0.8118451999999999+0.0074994640448501505	cv-train-auc:0.8442172000000001+0.0033283632253706774
[5]	cv-test-auc:0.8138852+0.009564554070106967	cv-train-auc:0.8484242+0.003020876389394317
[6]	cv-test-auc:0.8175866+0.0074359206719814	cv-train-auc:0.8536376000000001+0.0022591264329381803
[7]	cv-test-auc:0.8185775999999999+0.0075122929815070565	cv-train-auc:0.8555984000000001+0.0025622172117133087
[8]	cv-test-auc:0.8188704+0.007011005308798471	cv-train-auc:0.8577750000000002+0.002459494175638592
[9]	cv-test-auc:0.8203258+0.

CPU times: user 1min 3s, sys: 1.56 s, total: 1min 4s
Wall time: 1min 5s


Stopping. Best iteration:
[9] cv-mean:0.8203258	cv-std:0.0054596506994495685


In [15]:
def compare_estimators():
    import time
    from sklearn.model_selection import cross_val_score

    from sklearn import svm
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import SGDClassifier, BayesianRidge, ElasticNet, LinearRegression, LogisticRegression, Perceptron
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
    from sklearn.naive_bayes import GaussianNB
    import xgboost as xgb

    estimators = [
                  {'name': 'xgb', 'clf': xgb.XGBClassifier, 'params': {'no_random_state': True}, 'clf_params': {'n_estimators': 40, 'learning_rate': 0.2}},
                  {'name': 'gbt', 'clf': GradientBoostingClassifier, 'clf_params': {'n_estimators': 40, 'learning_rate': 0.2}},
                  {'name': 'random_forest', 'clf': RandomForestClassifier, 'clf_params': {'n_estimators': 100, 'n_jobs': 4}},
                  {'name': 'extra_trees', 'clf': ExtraTreesClassifier, 'clf_params': {'n_estimators': 100, 'n_jobs': 2}},
                  {'name': 'ada_boost', 'clf': AdaBoostClassifier,
                   'clf_params': {'n_estimators': 10}},
                  {'name': 'lin_reg', 'clf': LinearRegression, 'params': {'no_random_state': True}},

                  {'name': 'dec_tree', 'clf': DecisionTreeClassifier},

                  {'name': 'svm', 'clf': svm.SVC, 'clf_params': {'kernel': 'linear', 'tol': 1e-1}, 'params': {'subsamples_n': 0.002}},
                  {'name': 'knn', 'clf': KNeighborsClassifier, 'params': {'no_random_state': True, 'subsamples_n': 0.1}},
                  {'name': 'sgd', 'clf': SGDClassifier},
                  {'name': 'bayes_ridge', 'clf': BayesianRidge, 'params': {'no_random_state': True}},
                  {'name': 'elastic_net', 'clf': ElasticNet},
                  {'name': 'log_reg', 'clf': LogisticRegression},
                  {'name': 'perceptron', 'clf': Perceptron},
                 ]

    for e in estimators:
        start_time = time.time()
        clf_class = e['clf']
        clf_params = dict(e.get('clf_params', {}))
        params = e.get('params', {})
        if not params.get('no_random_state'):
            clf_params['random_state'] = RANDOM_STATE
        if params.get('subsamples_n'):
            subsamples_n = int(X_train.shape[0] * params['subsamples_n'])
            x, y = X_train[:subsamples_n], y_train[:subsamples_n]
        else:
            x, y = X_train, y_train

        clf = clf_class(**clf_params)
        r = cross_val_score(clf, x, y,
                            cv=2, n_jobs=4, scoring='roc_auc').mean()
        print('{}: {:.5f} for {:.1f} sec'.format(e['name'], r, time.time() - start_time))
        
compare_estimators()

xgb: 0.83588 for 29.8 sec
gbt: 0.83175 for 29.0 sec
random_forest: 0.75196 for 32.4 sec
extra_trees: 0.69849 for 38.8 sec
ada_boost: 0.81779 for 4.5 sec
lin_reg: 0.78822 for 4.7 sec
dec_tree: 0.57324 for 6.8 sec
svm: 0.29054 for 2.6 sec
knn: 0.52334 for 13.8 sec
sgd: 0.62868 for 2.1 sec
bayes_ridge: 0.77733 for 13.1 sec




elastic_net: 0.69058 for 48.0 sec
log_reg: 0.60418 for 6.3 sec
perceptron: 0.62868 for 2.1 sec
