In [1]:
import xgboost as xgb
import catboost as catb
import lightgbm as lgb

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
%matplotlib notebook
import matplotlib.pyplot as plt

In [3]:
# Load the training and test tables from the local input/ folder
# (training file first, then the test file).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('input/train.csv', 'input/test.csv')
)

In [4]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [5]:
# validation_df = train_df.sample(frac=0.7, random_state=2011, axis=0)  # disabled experiment

In [6]:
# Feature matrix: every training column except the row identifier and the label.
X = train_df.drop(['ID_code', 'target'], axis=1)
# Label column.
y = train_df['target']
# Hold-out split. Later cells reference X_train / X_valid / y_train / y_valid,
# so the split has to actually run; while it was commented out those names
# were never defined and the cells failed on a fresh kernel.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
# Keep the test identifiers on their own and use the remaining columns as features.
X_test_IDs = test_df['ID_code']
X_test = test_df.drop(labels=['ID_code'], axis=1)

## Naive Bayes

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Fit a quantile transformer with a normal output distribution on the training
# features and wrap the transformed array in a DataFrame.
normal_quantiles = QuantileTransformer(output_distribution='normal')
transformed = pd.DataFrame(normal_quantiles.fit_transform(X))

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline

# Two-step model: quantile-normalise the features, then fit Gaussian naive Bayes.
pipeline = make_pipeline(
    QuantileTransformer(output_distribution='normal'),
    GaussianNB(),
)
# Fit on the full training set; the fitted pipeline is the cell's displayed value.
pipeline.fit(X, y)

In [None]:
# predict_proba returns one column per class; keep the second column
# (index 1), i.e. the probability of the second entry in pipeline.classes_.
positive_column = 1
predictions = pipeline.predict_proba(X_test)[:, positive_column]

### LightGBM

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Search space for the random hyper-parameter search.
# List / ndarray values are candidate sets (get_parms draws one element per
# trial); every other value is passed to LightGBM unchanged.
#
# The continuous candidates are drawn from a dedicated, seeded generator so the
# same search space is produced after every kernel restart (they used to come
# from the unseeded global NumPy RNG).
_space_rng = np.random.RandomState(1992)
params = {'num_leaves': 31,
         'min_data_in_leaf': np.arange(2,40,5),
         'objective':'binary',
         'max_depth': [2, 5, 7, 10, 13, -1],
         # 10 log-uniform candidates in [10**-2.5, 1).
         'learning_rate': 10**((_space_rng.random_sample(10)-1)*2.5),
         # NOTE(review): LightGBM documents min_child_samples as an alias of
         # min_data_in_leaf, so these two entries overlap — TODO keep only one.
         "min_child_samples": 20,
         # NOTE(review): LightGBM documents 'gbrt' as an alias of 'gbdt' and
         # 'random_forest' as an alias of 'rf'; gbdt is therefore effectively
         # listed twice — confirm that weighting is intended.
         "boosting": ['gbdt', 'gbrt', 'random_forest'],
         "bagging_freq": 1,
         "bagging_fraction":[0.7, 0.8, 0.9, 0.95],
         "bagging_seed": 11,
         "feature_fraction":[0.7, 0.8, 0.9, 0.95],
         "metric": 'auc',
         # 10 log-uniform candidates each in [10**-3, 10**-1.5).
         "lambda_l1": 10**((.5*_space_rng.random_sample(10)-1)*3),
         "lambda_l2": 10**((.5*_space_rng.random_sample(10)-1)*3),
         "verbosity": -1,
         "nthread": -1,
         "random_state": 1992}

In [None]:
def get_parms(params_dict):
    """Draw one concrete configuration from a search-space dictionary.

    Parameters
    ----------
    params_dict : dict
        Maps a parameter name either to a candidate collection (a ``list`` or
        a ``numpy.ndarray``) or to a fixed value.

    Returns
    -------
    dict
        Same keys as ``params_dict``. For a candidate collection the value is
        one element picked uniformly at random with the global NumPy RNG;
        any other value is copied through unchanged.

    Raises
    ------
    ValueError
        If a candidate collection is empty.
    """
    sampled = {}
    for name, candidates in params_dict.items():
        if isinstance(candidates, (list, np.ndarray)):
            if len(candidates) == 0:
                raise ValueError("no candidates to choose from for parameter {!r}".format(name))
            # Draw an index instead of calling np.random.choice on the values:
            # choice() first converts the list to an array, which turns a mixed
            # list such as [2, 'auto'] into strings and rejects candidates that
            # are themselves tuples or lists.
            sampled[name] = candidates[np.random.randint(len(candidates))]
        else:
            sampled[name] = candidates

    return sampled
    

In [None]:
# 50,000-row subsample of the training data used by the hyper-parameter search.
# The seed is fixed so the same rows are drawn on every run; without it the
# search results were not reproducible.
small_train_df = train_df.sample(n=50000, random_state=1992)
# Features / label of the subsample, split the same way as the full X / y.
small_X = small_train_df.drop(['ID_code', 'target'], axis=1)
small_y = small_train_df['target']

In [10]:
# Shuffled, stratified 5-way splitter with a fixed seed, shared by the CV loops below.
folds = StratifiedKFold(
    n_splits=5,
    random_state=1992,
    shuffle=True,
)

In [15]:
# Zero-filled 1-D buffers: one slot per training label and one per test identifier.
n_train_rows, n_test_rows = len(y), len(X_test_IDs)
oof_predictions = np.zeros(n_train_rows)
test_predictions = np.zeros(n_test_rows)

In [None]:
# Random search: 50 sampled configurations, each scored by the out-of-fold AUC
# obtained with the `folds` splitter on the subsample (small_X / small_y).
scores = []          # (auc, configuration) for every trial
best_score = 0.
best_params = None   # configuration with the highest AUC so far
num_round = 20000    # upper bound on boosting rounds; early stopping ends sooner
for i in range(50):
    print("------------ training the {} th model -------------".format(i))
    param = get_parms(params)
    # Per-trial out-of-fold buffer sized to the subsample. Writing into the
    # full-length oof_predictions made roc_auc_score fail on the length
    # mismatch with small_y and let values leak from one trial into the next.
    search_oof = np.zeros(len(small_y))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(small_X, small_y.values)):
        print("fold {}".format(fold_))
        trn_data = lgb.Dataset(small_X.iloc[trn_idx,:], label=small_y.iloc[trn_idx])
        val_data = lgb.Dataset(small_X.iloc[val_idx,:], label=small_y.iloc[val_idx])

        # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
        # of older LightGBM releases; LightGBM >= 4.0 expects callbacks instead —
        # confirm the installed version.
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=300, early_stopping_rounds=200)
        # split() yields positional indices, hence .iloc and direct buffer indexing.
        search_oof[val_idx] = clf.predict(small_X.iloc[val_idx,:], num_iteration=clf.best_iteration)
    score = roc_auc_score(small_y, search_oof)
    print('----------------- auc score == {} -------------------'.format(score))
    if score > best_score:
        best_score = score
        best_params = param
    scores.append((score, param))

In [None]:
# Rank the trials by AUC only. Sorting the raw (score, dict) tuples falls back
# to comparing the dicts whenever two scores tie, which raises TypeError.
sorted(scores, key=lambda trial: trial[0])

In [16]:
# Earlier configuration. In this cell it used to be bound to `param` and was
# then immediately replaced by the dict below, so it never reached training;
# it is kept under its own name for reference only.
param_baseline = {'num_leaves': 31,
         'min_data_in_leaf': 30,
         'objective':'binary',
         'max_depth': -1,
         'learning_rate': 0.05,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'auc',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         "random_state": 2019}

# Configuration actually used to train the final model.
param = {'num_leaves': 31,
   'min_data_in_leaf': 12,
   'objective': 'binary',
   'max_depth': 13,
   'learning_rate': 0.0053688798975117845,
   'min_child_samples': 20,
   'boosting': 'gbdt',
   'bagging_freq': 1,
   'bagging_fraction': 0.9,
   'bagging_seed': 11,
   'feature_fraction': 0.8,
   'metric': 'auc',
   'lambda_l1': 0.00027574321473024596,
   'lambda_l2': 0.22104802305840615,
   'verbosity': -1,
   'nthread': -1,
   'random_state': 1992}

# Start from clean buffers so the cell is idempotent: test_predictions is
# accumulated with +=, so re-running the cell on a used buffer would stack a
# second set of fold averages on top of the first.
oof_predictions = np.zeros(y.shape)
test_predictions = np.zeros(X_test_IDs.shape)
num_round = 20000  # upper bound on boosting rounds; early stopping ends sooner

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y.values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(X.iloc[trn_idx,:], label=y.iloc[trn_idx])
    val_data = lgb.Dataset(X.iloc[val_idx,:], label=y.iloc[val_idx])

    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
    # of older LightGBM releases; LightGBM >= 4.0 expects callbacks instead —
    # confirm the installed version.
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=300, early_stopping_rounds=200)
    # Out-of-fold predictions for the rows this fold held out.
    oof_predictions[val_idx] = clf.predict(X.iloc[val_idx,:], num_iteration=clf.best_iteration)

    # Each fold's model contributes an equal share to the test prediction.
    test_predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

fold 0
Training until validation scores don't improve for 200 rounds.
[300]	training's auc: 0.838027	valid_1's auc: 0.805029
[600]	training's auc: 0.87684	valid_1's auc: 0.83491
[900]	training's auc: 0.898092	valid_1's auc: 0.850715
[1200]	training's auc: 0.912177	valid_1's auc: 0.860903
[1500]	training's auc: 0.922454	valid_1's auc: 0.868016
[1800]	training's auc: 0.930385	valid_1's auc: 0.87338
[2100]	training's auc: 0.936786	valid_1's auc: 0.877525
[2400]	training's auc: 0.942032	valid_1's auc: 0.880677
[2700]	training's auc: 0.946608	valid_1's auc: 0.883314
[3000]	training's auc: 0.95068	valid_1's auc: 0.885497
[3300]	training's auc: 0.954236	valid_1's auc: 0.887297
[3600]	training's auc: 0.957469	valid_1's auc: 0.888705
[3900]	training's auc: 0.960335	valid_1's auc: 0.889867
[4200]	training's auc: 0.96297	valid_1's auc: 0.890711
[4500]	training's auc: 0.965422	valid_1's auc: 0.891601
[4800]	training's auc: 0.967738	valid_1's auc: 0.892288
[5100]	training's auc: 0.969917	valid_1's 

[3900]	training's auc: 0.959708	valid_1's auc: 0.892172
[4200]	training's auc: 0.962354	valid_1's auc: 0.893215
[4500]	training's auc: 0.964864	valid_1's auc: 0.894168
[4800]	training's auc: 0.967242	valid_1's auc: 0.894914
[5100]	training's auc: 0.969408	valid_1's auc: 0.895539
[5400]	training's auc: 0.971454	valid_1's auc: 0.896142
[5700]	training's auc: 0.973451	valid_1's auc: 0.896618
[6000]	training's auc: 0.975335	valid_1's auc: 0.896958
[6300]	training's auc: 0.977127	valid_1's auc: 0.89727
[6600]	training's auc: 0.978873	valid_1's auc: 0.897469
[6900]	training's auc: 0.980486	valid_1's auc: 0.897692
[7200]	training's auc: 0.982019	valid_1's auc: 0.89786
[7500]	training's auc: 0.983465	valid_1's auc: 0.89807
[7800]	training's auc: 0.984796	valid_1's auc: 0.89814
[8100]	training's auc: 0.986028	valid_1's auc: 0.898263
[8400]	training's auc: 0.987206	valid_1's auc: 0.898347
[8700]	training's auc: 0.988284	valid_1's auc: 0.898413
[9000]	training's auc: 0.989285	valid_1's auc: 0.898

In [None]:
# Number of validation rows in the last fold that was processed.
val_idx.shape[0]

In [None]:
# AUC of the out-of-fold predictions against the training labels.
roc_auc_score(y_true=y, y_score=oof_predictions)

In [None]:
# Work on an independent copy so later edits to `predictions` leave test_predictions intact.
predictions = np.array(test_predictions, copy=True)

In [None]:
# Clamp to [0, 1], then round to the nearest integer (hard 0/1 values).
# NOTE(review): rounding throws away the ranking information that the AUC
# metric used elsewhere in this notebook relies on — confirm it is intended.
clipped_predictions = np.clip(predictions, 0, 1)
predictions = np.round(clipped_predictions)

In [None]:
# Largest value currently held in `predictions`.
np.max(predictions)

In [None]:
# Entropy-criterion random forest with class 1 weighted 0.9 and class 0 weighted 0.1.
rfc = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=7,
    class_weight={1: .9, 0: .1},
    random_state=123,
    n_jobs=-1,
    verbose=1,
)
# Fit on the full training set; the fitted estimator is the cell's displayed value.
rfc.fit(X, y)

In [None]:
# Hard class labels from the random forest for the hold-out features.
predictions = rfc.predict(X=X_valid)

In [None]:
# AUC needs ranking scores, not thresholded labels: score the hold-out set with
# the forest's probability for class 1 (second predict_proba column) instead of
# the hard 0/1 output of predict().
roc_auc_score(y_valid, rfc.predict_proba(X_valid)[:, 1])

In [None]:
from sklearn.preprocessing import StandardScaler, quantile_transform

In [None]:
# Function-style quantile transform of the training features using the
# library's default settings; the transformed data is stored in `qt`.
qt = quantile_transform(X)

In [None]:
# 200-tree random forest (default split criterion) with class 1 weighted 0.9
# and class 0 weighted 0.1.
rfc = RandomForestClassifier(
    n_estimators=200,
    max_depth=7,
    class_weight={1: .9, 0: .1},
    random_state=123,
    n_jobs=-1,
    verbose=1,
)
# Fit on the full training set; the fitted estimator is the cell's displayed value.
rfc.fit(X, y)

In [None]:
# Hard class labels from the random forest for the test features.
predictions = rfc.predict(X=X_test)

In [None]:
# `predictions` holds labels predicted for X_test at this point, which cannot be
# scored against y_valid (different rows, and the test set has no labels).
# Score the forest on the hold-out features instead, using the probability of
# class 1 so the AUC is computed from ranking scores.
roc_auc_score(y_valid, rfc.predict_proba(X_valid)[:, 1])

In [None]:
# Default-configured logistic regression fitted on the training part of the split.
# fit() returns the estimator itself, so `lr` is the fitted model.
lr = LogisticRegression().fit(X_train, y_train)
lr

In [None]:
# Hard class labels from the logistic regression for the hold-out features.
predictions = lr.predict(X=X_valid)

In [None]:
# AUC needs ranking scores, not thresholded labels: use the logistic
# regression's probability for class 1 (second predict_proba column) on the
# hold-out set. `predictions` keeps the hard labels for the accuracy check.
roc_auc_score(y_valid, lr.predict_proba(X_valid)[:, 1])

In [None]:
# Fraction of hold-out rows whose predicted label equals the true label.
label_matches = (y_valid == predictions)
label_matches.mean()

In [None]:
# Number of rows in the training feature matrix.
X.shape[0]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate values sampled by the randomized search over the random forest.
params_dist = dict(
    # Number of trees.
    n_estimators=[10, 30, 50, 75, 100, 120],
    # Per-class weights, all favouring class 1 to varying degrees.
    class_weight=[
        {1: 0.9, 0: 0.1},
        {1: 0.99, 0: 0.01},
        {1: 0.8, 0: 0.2},
        {1: 0.95, 0: 0.05},
    ],
    # Maximum tree depth.
    max_depth=[2, 3, 5, 7, 9, 11, 13, 15, 20],
)

In [None]:
# Base estimator for the search. The seed is fixed so repeated runs grow the
# same forests.
rfc = RandomForestClassifier(n_jobs=-1, verbose=1, random_state=123)
# Randomized search: 20 configurations drawn from params_dist. The search gets
# its own seed so the sampled configurations are the same on every run.
# NOTE(review): the search optimises 'precision' while the rest of the notebook
# evaluates models with ROC AUC — confirm which metric is intended.
randomized_grid_search = RandomizedSearchCV(rfc, param_distributions=params_dist, n_iter=20,
                                            scoring='precision', n_jobs=-1, random_state=123)

In [None]:
# Run the randomized search on the full training set.
randomized_grid_search.fit(X=X, y=y)

In [None]:
# Test-set predictions from the fitted search object.
predictions = randomized_grid_search.predict(X=X_test)

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Unfitted scaler objects for the cells below.
standard_scaler = StandardScaler()
# quantile_transform is a function that needs the data as an argument, so calling
# it with no arguments raises TypeError; the estimator class is what gives a
# reusable scaler object.
quantile_scaler = QuantileTransformer()

In [None]:
# Learn the scaling statistics from the training features, then apply them.
X_scaled = standard_scaler.fit(X).transform(X)

In [None]:
# Apply the scaling fitted on the training features to the test features.
X_test_scaled = standard_scaler.transform(X=X_test)

In [None]:
# Logistic regression with the library's default settings, for the standardised features.
log_reg = LogisticRegression()

In [None]:
# Fit the logistic regression on the standardised training features.
log_reg.fit(X=X_scaled, y=y)

In [None]:
# Hard class labels from the logistic regression for the standardised test features.
predictions = log_reg.predict(X=X_test_scaled)

In [None]:
# Read back a previously written naive Bayes submission file from the working directory.
naive_bayes_preds = pd.read_csv(filepath_or_buffer='submission_naive_bayes.csv')

In [None]:
# Empty frame; its columns are assigned in the next cell.
submission_df = pd.DataFrame()

In [None]:
submission_df['ID_code'] = X_test_IDs
# Blend the LightGBM fold-averaged probabilities with the naive Bayes scores.
# test_predictions is used directly because `predictions` is reassigned by many
# intermediate cells (most recently to logistic-regression labels), so it no
# longer holds the LightGBM output that the submission file name refers to.
# Assumes submission_naive_bayes.csv rows are in the same order as test.csv —
# TODO confirm.
submission_df['target'] = (test_predictions + naive_bayes_preds['target'].values)/2

In [None]:
# Write the blended submission without the index column.
submission_df.to_csv(path_or_buf='submission_lgbm+naive_bayes.csv', index=False)

In [None]:
# Best cross-validated score found by the randomized search, measured with the
# scoring it was configured with ('precision').
randomized_grid_search.best_score_