In [1]:
%%time
RANDOM_STATE = 42
import pandas as pd
train_data = pd.read_csv('train.csv', index_col='ID')
train_data = train_data.sample(frac=1.0)

test_data = pd.read_csv('test.csv', index_col='ID')

y_train = train_data['TARGET']
train_data.drop(['TARGET'], axis=1, inplace=True)
X_train = train_data
print(X_train.shape, y_train.shape)
print(y_train.value_counts())
assert not (X_train.count() != y_train.shape[0]).any() # ensure that there is no missing data

(76020, 369) (76020,)
0    73012
1     3008
Name: TARGET, dtype: int64
CPU times: user 7.52 s, sys: 590 ms, total: 8.11 s
Wall time: 9.21 s


In [4]:
%%time
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif, VarianceThreshold
import numpy as np

def remove_constant_features(X):
    non_const_cols = (X != X.ix[1]).any()
    return X.loc[:, non_const_cols]

def get_best_features(X, y):
    selectK = SelectPercentile(f_classif, 90)
    selectK.fit(X[:y.shape[0]], y)
    return pd.DataFrame(selectK.transform(X))

def replace_outliers_with_mean(X):
    for col in X.columns:
        mean, std = X[col].mean(), X[col].std()
        outliers = (X[col] - mean).abs() > 3 * std
        X[outliers] = X[col][~outliers].mean()
    return X

from pandas.core.common import array_equivalent

def get_duplicate_columns(frame):
    groups = frame.columns.to_series().groupby(frame.dtypes).groups
    dups = []

    for t, v in groups.items():

        cs = frame[v].columns
        vs = frame[v]
        lcs = len(cs)

        for i in range(lcs):
            ia = vs.iloc[:,i].values
            for j in range(i+1, lcs):
                ja = vs.iloc[:,j].values
                if array_equivalent(ia, ja):
                    dups.append(cs[i])
                    break

    return dups

def remove_dup_cols(X):
    dup_cols = get_duplicate_columns(X)
    print('columns are duplicates: ', dup_cols)

    # too high memory usage
    #return X.T.drop_duplicates().T

    return X.drop(dup_cols, axis=1)

def encode_categorical_columns(X):
    categorical_cols = []
    for col_name in X.loc[:, X.dtypes == np.int64]:
        col = X[col_name]
        if col.value_counts().count() <= 10:
#            print('vc of {}: {}'.format(col_name, col.value_counts()))
            categorical_cols.append(col_name)

    print('categorical columns count: ', len(categorical_cols))
    if not categorical_cols:
        return X
    
    dummies = pd.get_dummies(X[categorical_cols])
    ret = pd.concat([X, dummies], axis=1)
    ret.drop(categorical_cols, axis=1, inplace=True)
    return ret
    
def data_transform(X, y):
    X = remove_dup_cols(X)
    X = remove_constant_features(X)
    X = encode_categorical_columns(X)
    X = get_best_features(X, y)
    return X
 
# XXX: it decreses auc on train set, don't use
# BUT: one time it increased auc on test set for 0.003
#    X = get_best_features(X, y)

# XXX: it dramatically decreses auc on train set, don't use
#df = pd.DataFrame(x)
#return replace_outliers_with_mean(df)

X_test = test_data
X_all = pd.concat([X_train, X_test])
X_all_transformed = data_transform(X_all, y_train)

X_train_transformed, X_test_transformed = X_all_transformed[:X_train.shape[0]], X_all_transformed[X_train.shape[0]:]
assert(X_train_transformed.shape[0] == X_train.shape[0])
assert(X_test_transformed.shape[0] == X_test.shape[0])
assert X_train_transformed.shape[1] == X_test_transformed.shape[1]

print('data transformation: {} -> {} features'.format(X_train.shape[1], X_train_transformed.shape[1]))

columns are duplicates:  ['ind_var2_0', 'ind_var2', 'ind_var6_0', 'ind_var6', 'ind_var13_medio_0', 'ind_var18_0', 'ind_var26_0', 'ind_var25_0', 'ind_var27_0', 'ind_var28_0', 'ind_var28', 'ind_var27', 'ind_var32_0', 'ind_var34_0', 'ind_var37_0', 'ind_var40', 'ind_var41', 'ind_var46_0', 'ind_var46', 'num_var6_0', 'num_var6', 'num_var18_0', 'num_var26_0', 'num_var25_0', 'num_var27_0', 'num_var28_0', 'num_var28', 'num_var27', 'num_var32_0', 'num_var34_0', 'num_var37_0', 'num_var40', 'num_var41', 'num_var46_0', 'num_var46', 'saldo_var28', 'saldo_var27', 'saldo_var41', 'saldo_var46', 'delta_imp_reemb_var13_1y3', 'delta_imp_reemb_var17_1y3', 'delta_imp_reemb_var33_1y3', 'delta_imp_trasp_var17_in_1y3', 'delta_imp_trasp_var17_out_1y3', 'delta_imp_trasp_var33_in_1y3', 'delta_imp_trasp_var33_out_1y3', 'imp_amort_var18_hace3', 'imp_amort_var34_hace3', 'imp_reemb_var13_hace3', 'imp_reemb_var33_hace3', 'imp_trasp_var17_out_hace3', 'imp_trasp_var33_out_hace3', 'num_var2_0_ult1', 'num_var2_ult1', 'num

In [9]:
%%time
import xgboost as xgb
import numpy as np

def report(grid_scores, n_top=3):
    import numpy as np
    from operator import itemgetter
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

def find_best_xgb_params(X, y, X_all, y_all, n_iter):
    from sklearn.grid_search import RandomizedSearchCV

    clf = xgb.XGBClassifier(missing=np.nan, seed=RANDOM_STATE)
    
    param_distributions = {
        'max_depth': np.arange(3, 7),
        'n_estimators': [1000], # early stop helps us
        'learning_rate': np.random.uniform(0.01, 0.1, 30),
        'nthread': [4],
        'subsample': np.arange(0.5, 1.05, 0.05),
        'colsample_bytree': np.arange(0.5, 1.05, 0.05),
        'seed': [RANDOM_STATE],
    }
    fit_params = {'early_stopping_rounds': 30, 'eval_metric': "auc", 'verbose': False, 'eval_set': [(X_all, y_all)]}
    rs = RandomizedSearchCV(estimator=clf, param_distributions=param_distributions,
                            n_iter=n_iter, n_jobs=1, refit=False,
                            cv=2, random_state=RANDOM_STATE,
                            scoring='roc_auc', fit_params=fit_params, verbose=1)

    rs.fit(X, y)
    report(rs.grid_scores_, n_top=3)


    
def find_best_xgb_params_with_hyperopt(X, y, X_check, y_check, n_iter):
    from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
    from sklearn.metrics import roc_auc_score
    
    X_train, y_train = X[:int(X.shape[0]/2)], y[:int(X.shape[0]/2)]
    X_test, y_test = X[int(X.shape[0]/2):], y[int(X.shape[0]/2):]
    
    def objective(s):
        clf = xgb.XGBClassifier(missing=np.nan, n_estimators=10000,
                                nthread=4, seed=RANDOM_STATE,
                                max_depth=int(s['max_depth']),
                                min_child_weight=s['min_child_weight'],
                                learning_rate=s['learning_rate'],
                                subsample=0.7,
                                colsample_bytree=0.7,
                               )
        eval_set = [(X_test, y_test)]

        clf.fit(X_train, y_train,
                eval_set=eval_set, eval_metric="auc", 
                early_stopping_rounds=30)

        pred = clf.predict_proba(X_check)[:,1]
        auc = roc_auc_score(y_check, pred)
        nonlocal iter_no
        print("#{}/{}: SCORE={}, best_tree_n={}, space={}".format(iter_no, n_iter, auc, clf.best_iteration, s))
        iter_no += 1
        return {'loss':1-auc, 'status': STATUS_OK }
    
    iter_no = 1
    space = {
        'max_depth': hp.quniform("max_depth", 2, 6, 1),
        'min_child_weight': hp.quniform('min_child', 1, 20, 1),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.03),
#        'subsample': hp.uniform('subsample', 0.7, 1),
#        'colsample_bytree': hp.uniform('colsample_bytree', 0.7, 1.0),
        'scale_pos_weight': hp.uniform('scale_pos_weight', 0.1, 1.0)
    }


    trials = Trials()
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=n_iter,
                trials=trials)
    print(best)

train_subsamples_n = 40000

#find_best_xgb_params(x_, y_, X_train_transformed, y_train, n_iter=100)
find_best_xgb_params_with_hyperopt(X_train_transformed[:train_subsamples_n], y_train[:train_subsamples_n],
                                   X_train_transformed[train_subsamples_n:], y_train[train_subsamples_n:],
                                   n_iter=400)

#1/400: SCORE=0.8316644668622255, best_tree_n=191, space={'max_depth': 6.0, 'learning_rate': 0.02573836181067011, 'scale_pos_weight': 0.4675681276096899, 'min_child_weight': 16.0}
#2/400: SCORE=0.8125182769769025, best_tree_n=9, space={'max_depth': 3.0, 'learning_rate': 0.011491738571228687, 'scale_pos_weight': 0.45397674265419286, 'min_child_weight': 20.0}
#3/400: SCORE=0.8334464270930491, best_tree_n=286, space={'max_depth': 3.0, 'learning_rate': 0.022239771354419062, 'scale_pos_weight': 0.48717674213153306, 'min_child_weight': 2.0}
#4/400: SCORE=0.8337151002423598, best_tree_n=284, space={'max_depth': 3.0, 'learning_rate': 0.02945331282472169, 'scale_pos_weight': 0.8018829845241376, 'min_child_weight': 3.0}
#5/400: SCORE=0.8338386803579918, best_tree_n=165, space={'max_depth': 5.0, 'learning_rate': 0.029455927779875282, 'scale_pos_weight': 0.9443492346096453, 'min_child_weight': 8.0}
#6/400: SCORE=0.8335988711681477, best_tree_n=194, space={'max_depth': 4.0, 'learning_rate': 0.02491

KeyboardInterrupt: 

In [5]:
%%time
import xgboost as xgb
import numpy as np
# also working params: missing=9999999999, max_depth = 4, n_estimators=200, learning_rate=0.001,
# nthread=4, subsample=0.75, colsample_bytree=0.75, seed=4242

#[125] cv-mean:0.8293896666666667	cv-std:0.003718656507694988
clf_params_from_kaggle_scripts = {'max_depth': 5, 'learning_rate': 0.03,
                                  'subsample': 0.95, 'colsample_bytree': 0.85}

#[48] cv-mean:0.8289763333333333	cv-std:0.004993271561700667
clf_params_from_hyperopt = {'learning_rate': 0.023533308727518153, 'colsample_bytree': 0.7028779789099826,
                            'max_depth': 4, 'subsample': 0.7408078537264539, 'min_child_weight': 10.0}

clf_params = dict(clf_params_from_hyperopt)
clf_params['missing'] = np.nan
clf_params['seed'] = RANDOM_STATE
clf_params['nthread'] = 4
clf_params['n_estimators'] = 10000

def do_xgb_cv(X, y):
    dtrain = xgb.DMatrix(X, label=y, missing=clf_params['missing'])
    xgb.cv(clf_params, dtrain, clf_params['n_estimators'], nfold=3, metrics='auc',
           early_stopping_rounds=20, verbose_eval=True, seed=RANDOM_STATE)

def xgb_predict(X_train, y_train, X_test, n_estimators):
    params = dict(clf_params)
    params['n_estimators'] = n_estimators
    xgb_clf = xgb.XGBClassifier(**params)
    xgb_clf.fit(X_train, y_train, verbose=True)

    return xgb_clf.predict_proba(X_test)[:,1]

# ipython crashes often on full collection, so run on subcollection
cv_limit = int(X_train_transformed.shape[0]/2)
if False:
    do_xgb_cv(X_train_transformed[:cv_limit], y_train[:cv_limit])

n_estimators = 238 # get it from best iteration of xgb.cv

# double check CV score for overfitting check
if True:
    X_train_test, y_train_test = X_train_transformed[:cv_limit], y_train[:cv_limit]
    y_train_test_predicted = xgb_predict(X_train_transformed[cv_limit:], y_train[cv_limit:],
                                         X_train_test, n_estimators=n_estimators)
    from sklearn.metrics import roc_auc_score
    auc = roc_auc_score(y_train_test, y_train_test_predicted)
    print('double checked AUC is ', auc)

if False:
    y_pred = xgb_predict(X_train_transformed, y_train, X_test_transformed, n_estimators=n_estimators)
    submission = pd.DataFrame({"ID": test_data.index, "TARGET": y_pred})
    submission.to_csv("submission.csv", index=False)

double checked AUC is  0.839140157487
CPU times: user 56.2 s, sys: 780 ms, total: 57 s
Wall time: 14.5 s


In [15]:
def compare_estimators():
    import time
    from sklearn.model_selection import cross_val_score

    from sklearn import svm
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import SGDClassifier, BayesianRidge, ElasticNet, LinearRegression, LogisticRegression, Perceptron
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
    from sklearn.naive_bayes import GaussianNB
    import xgboost as xgb

    estimators = [
                  {'name': 'xgb', 'clf': xgb.XGBClassifier, 'params': {'no_random_state': True}, 'clf_params': {'n_estimators': 40, 'learning_rate': 0.2}},
                  {'name': 'gbt', 'clf': GradientBoostingClassifier, 'clf_params': {'n_estimators': 40, 'learning_rate': 0.2}},
                  {'name': 'random_forest', 'clf': RandomForestClassifier, 'clf_params': {'n_estimators': 100, 'n_jobs': 4}},
                  {'name': 'extra_trees', 'clf': ExtraTreesClassifier, 'clf_params': {'n_estimators': 100, 'n_jobs': 2}},
                  {'name': 'ada_boost', 'clf': AdaBoostClassifier,
                   'clf_params': {'n_estimators': 10}},
                  {'name': 'lin_reg', 'clf': LinearRegression, 'params': {'no_random_state': True}},

                  {'name': 'dec_tree', 'clf': DecisionTreeClassifier},

                  {'name': 'svm', 'clf': svm.SVC, 'clf_params': {'kernel': 'linear', 'tol': 1e-1}, 'params': {'subsamples_n': 0.002}},
                  {'name': 'knn', 'clf': KNeighborsClassifier, 'params': {'no_random_state': True, 'subsamples_n': 0.1}},
                  {'name': 'sgd', 'clf': SGDClassifier},
                  {'name': 'bayes_ridge', 'clf': BayesianRidge, 'params': {'no_random_state': True}},
                  {'name': 'elastic_net', 'clf': ElasticNet},
                  {'name': 'log_reg', 'clf': LogisticRegression},
                  {'name': 'perceptron', 'clf': Perceptron},
                 ]

    for e in estimators:
        start_time = time.time()
        clf_class = e['clf']
        clf_params = dict(e.get('clf_params', {}))
        params = e.get('params', {})
        if not params.get('no_random_state'):
            clf_params['random_state'] = RANDOM_STATE
        if params.get('subsamples_n'):
            subsamples_n = int(X_train.shape[0] * params['subsamples_n'])
            x, y = X_train[:subsamples_n], y_train[:subsamples_n]
        else:
            x, y = X_train, y_train

        clf = clf_class(**clf_params)
        r = cross_val_score(clf, x, y,
                            cv=2, n_jobs=4, scoring='roc_auc').mean()
        print('{}: {:.5f} for {:.1f} sec'.format(e['name'], r, time.time() - start_time))
        
compare_estimators()

xgb: 0.83588 for 29.8 sec
gbt: 0.83175 for 29.0 sec
random_forest: 0.75196 for 32.4 sec
extra_trees: 0.69849 for 38.8 sec
ada_boost: 0.81779 for 4.5 sec
lin_reg: 0.78822 for 4.7 sec
dec_tree: 0.57324 for 6.8 sec
svm: 0.29054 for 2.6 sec
knn: 0.52334 for 13.8 sec
sgd: 0.62868 for 2.1 sec
bayes_ridge: 0.77733 for 13.1 sec




elastic_net: 0.69058 for 48.0 sec
log_reg: 0.60418 for 6.3 sec
perceptron: 0.62868 for 2.1 sec
