# Setup

In [None]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.8.1-py2.py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 3.0 MB/s 
Collecting pyaml>=16.9
  Downloading pyaml-21.8.3-py2.py3-none-any.whl (17 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-21.8.3 scikit-optimize-0.8.1


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, f1_score, precision_recall_curve, confusion_matrix
import matplotlib.pyplot as plt
from skopt import gp_minimize
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import backend as K
import joblib as jb
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
import warnings
warnings.filterwarnings("ignore")

# Import Data

In [None]:
PATH = "/content/drive/MyDrive/Colab Notebooks/Porto_Seguro_competition/"
APPROACH = 9

In [None]:
df = pd.read_csv(PATH + "datasets/train.csv")
test = pd.read_csv(PATH + "datasets/test.csv")
submission_sample = pd.read_csv(PATH + "datasets/submission_sample.csv")
metadata = pd.read_csv(PATH + "datasets/metadata.csv")
metadata.columns = ["cod", "type"]

In [None]:
# Column codes grouped by the variable type recorded in the metadata table.
num_dis = metadata.loc[metadata["type"] == "Quantitativo discreto", "cod"].to_list()
num_dis.remove("y")  # "y" is taken out of the discrete group; it becomes the target below
num_con = metadata.loc[metadata["type"] == "Quantitativo continua", "cod"].to_list()
cat_nom = metadata.loc[metadata["type"] == "Qualitativo nominal", "cod"].to_list()
cat_nom.remove("id")  # "id" is taken out of the nominal group
cat_ord = metadata.loc[metadata["type"] == "Qualitativo ordinal", "cod"].to_list()

# Target column of the raw training frame, as float32.
y = df["y"].astype(np.float32)

In [None]:
import sys
sys.path.append(PATH + "pipeline")
from approach6 import make_pipeline
pipeline = make_pipeline(cat_nom, cat_ord, num_con, num_dis)

In [None]:
test_id = test["id"]
test.drop(["id"], axis=1, inplace=True)
X_prepared = df.drop(["id", "y"], axis=1)

In [None]:
X = pd.concat([X_prepared, test])

In [None]:
X = pipeline.fit_transform(X)
X_tv, test = X.iloc[:X_prepared.shape[0], :], X.iloc[X_prepared.shape[0]:,:]
train, valid, y_train, y_valid = train_test_split(X_tv, y, test_size=0.2, random_state=42)

In [None]:
train = pd.read_csv(PATH + "/preprocessed_data/approach{}/train.csv".format(str(APPROACH)))
valid = pd.read_csv(PATH + "/preprocessed_data/approach{}/valid.csv".format(str(APPROACH)))
test = pd.read_csv(PATH + "/preprocessed_data/approach{}/test.csv".format(str(APPROACH)))

In [None]:
# Separate the label / id columns from the cached frames.
# DataFrame.pop returns the column and removes it from the frame in place.
y_train = train.pop("y").astype(np.float32)
y_valid = valid.pop("y").astype(np.float32)
test_id = test.pop("id")

In [None]:
X_train0, X_train1, y_train0, y_train1 = train_test_split(train, y_train, test_size=0.4, random_state=42)

In [None]:
train.head()

Unnamed: 0,var24,var25,var27,var40,var44,var45,var46,var47,var48,var49,var50,var51,var52,var53,var54,var55,var56,var57,var58,var59,var60,var61,var62,var63,var64,var65,var66,var67,var68,NA_var1,NA_var2,NA_var3,NA_var4,NA_var5,NA_var6,NA_var7,NA_var8,NA_var9,NA_var10,NA_var11,...,bias_var1_Categorify_var7_Categorify_var29_Categorify,bias_factor_weighted_var1_Categorify_var7_Categorify_var29_Categorify,bias_var1_Categorify_var7_Categorify_var31_Categorify,bias_factor_weighted_var1_Categorify_var7_Categorify_var31_Categorify,bias_var1_Categorify_var7_Categorify_var39_Categorify,bias_factor_weighted_var1_Categorify_var7_Categorify_var39_Categorify,bias_var1_Categorify_var8_Categorify_var20_Categorify,bias_factor_weighted_var1_Categorify_var8_Categorify_var20_Categorify,bias_var1_Categorify_var8_Categorify_var23_Categorify,bias_factor_weighted_var1_Categorify_var8_Categorify_var23_Categorify,bias_var66_var54,bias_factor_weighted_var66_var54,bias_var65_var54,bias_factor_weighted_var65_var54,bias_var24_var50,bias_factor_weighted_var24_var50,bias_var48_var54,bias_factor_weighted_var48_var54,bias_var60_var54,bias_factor_weighted_var60_var54,bias_var64_var53,bias_factor_weighted_var64_var53,bias_var63_var54,bias_factor_weighted_var63_var54,bias_var53_var54,bias_factor_weighted_var53_var54,bias_var47_var54,bias_factor_weighted_var47_var54,bias_var45_var54,bias_factor_weighted_var45_var54,bias_var46_var54,bias_factor_weighted_var46_var54,bias_var60_var24,bias_factor_weighted_var60_var24,bias_var51_var54,bias_factor_weighted_var51_var54,bias_var44_var54,bias_factor_weighted_var44_var54,bias_var61_var54,bias_factor_weighted_var61_var54
0,-1.046907,-1.498576,1.309163,0.411784,-2.502184,-0.300472,-0.274428,-0.18555,-0.21816,1.376749,-0.418231,-0.24014,-1.066913,0.487018,0.670718,-0.235996,-1.332219,1.881402,0.281828,-1.097194,-0.803626,-1.011941,0.67477,0.896935,1.696334,-0.078804,-0.063458,0.003499,1.322693,0,0,0,0,0,0,0,0,0,0,0,...,1.567322,1.178184,1.532041,1.075414,1.713648,1.192627,0.401717,0.665445,0.351477,0.61636,-0.334837,-0.231403,-0.309,-0.188226,0.78556,0.716992,-0.357112,-0.197858,-0.349131,-0.232658,-0.090737,0.148606,-0.224291,-0.04546,-0.348134,-0.232107,-0.345025,-0.238277,-0.332933,-0.243419,-0.331873,-0.238256,0.638257,0.631999,-0.378618,-0.272538,-0.560423,-0.297595,-0.345818,-0.233885
1,1.137791,-0.777314,-0.763847,-0.421072,0.399651,-0.300472,-0.274428,-0.18555,-0.21816,-0.726349,-0.418231,4.164242,1.92316,0.487018,0.670718,-0.21288,-1.130095,1.490978,0.290465,-0.31894,-0.237453,1.98987,-0.62792,-0.630618,-0.344967,-0.078804,-0.063458,1.839389,-0.388244,0,0,0,0,0,0,0,0,0,0,0,...,-0.739506,-0.05796,-0.657983,0.037659,-0.626134,0.055325,-0.749764,-0.340907,-0.691544,-0.321511,-0.334837,-0.231403,-0.309,-0.188226,0.432795,0.473694,-0.357112,-0.197858,-0.349131,-0.232658,-0.16679,0.117168,-0.224291,-0.04546,-0.348134,-0.232107,-0.345025,-0.238277,-0.332933,-0.243419,-0.331873,-0.238256,0.195188,0.203331,0.378917,0.216226,-0.312671,-0.210965,-0.345818,-0.233885
2,0.045442,1.386473,-0.763847,-0.421072,0.399651,-0.300472,-0.274428,-0.18555,-0.21816,1.376749,-0.418231,-0.24014,2.306502,0.487018,0.670718,-0.437014,-0.905151,0.382172,-0.721049,0.361361,0.395376,0.926923,-0.309475,0.459265,-0.230012,-0.078804,-0.063458,-1.220427,-1.046297,0,0,0,0,0,0,0,0,0,1,0,...,-0.716635,-0.488369,-0.583836,-0.268016,-0.511075,-0.225777,-0.332935,-0.291279,-0.449449,-0.491089,-0.334837,-0.231403,-0.309,-0.188226,-1.670567,-1.635711,-0.357112,-0.197858,-0.349131,-0.232658,-0.16679,0.117168,-0.224291,-0.04546,-0.348134,-0.232107,-0.345025,-0.238277,-0.332933,-0.243419,-0.331873,-0.238256,-2.241979,-2.245051,-0.378618,-0.272538,-0.312671,-0.210965,-0.345818,-0.233885
3,-1.046907,1.386473,1.309163,-1.045714,0.399651,0.810005,0.993664,-0.18555,1.858312,1.376749,-0.418231,-0.24014,-0.606901,-0.847204,-0.727395,-0.418376,0.780303,-0.636054,-0.677277,-0.853282,-0.797448,2.017026,0.256251,-1.759119,0.164508,0.215689,0.062441,0.737855,-1.441129,0,0,0,0,0,0,0,0,0,0,0,...,-0.192028,0.165137,-0.220864,0.170702,-0.159407,0.186395,-0.593912,-0.756844,-0.553663,-0.688561,1.637171,1.514905,1.588454,1.387027,0.78556,0.716992,1.559757,0.912904,1.647589,1.520546,2.167618,1.80963,1.620593,1.450251,1.534959,1.442729,1.617849,1.531156,1.631536,1.434676,1.867328,1.673412,0.638257,0.631999,1.549878,1.48236,1.56731,1.457503,1.647188,1.517002
4,0.045442,1.386473,-0.763847,0.828212,0.399651,-0.300472,-0.274428,-0.18555,-0.21816,-0.726349,2.391026,-0.24014,-0.223559,1.821241,2.068831,-0.509371,0.754222,-0.636054,1.564065,0.020271,-0.237453,-0.37943,-0.217411,0.364866,-0.25654,-0.078804,-0.063458,-0.363679,-0.059218,0,0,0,0,0,1,1,0,0,0,0,...,-0.716635,-0.488369,-1.728224,-2.639646,-1.766034,-2.744641,-0.332935,-0.291279,-0.449449,-0.491089,-0.582881,0.039908,-0.570693,0.041112,-2.44706,-3.25365,-0.388168,0.115785,-0.349131,-0.232658,0.760166,0.390317,-0.224291,-0.04546,-0.348134,-0.232107,-0.65778,0.020021,-0.418543,0.089702,-0.448038,0.07781,-2.241979,-2.245051,-0.773528,-0.035856,-0.312671,-0.210965,-0.586744,0.046565


## save y and id

In [None]:
# Destination files for the targets of the train1 / valid splits and for the test ids.
name_ytrain1 = PATH + "./preds_train1/approach{}/y.pkl.z".format(APPROACH)
name_yvalid = PATH + "./preds_val1/approach{}/y.pkl.z".format(APPROACH)
name_id = PATH + "./preds_test/approach{}/id.pkl.z".format(APPROACH)

# joblib compresses according to the ".z" extension; the last call's return value is the cell output.
jb.dump(y_train1, name_ytrain1)
jb.dump(y_valid, name_yvalid)
jb.dump(test_id, name_id)

['/content/drive/MyDrive/Colab Notebooks/Porto_Seguro_competition/./preds_test/approach9/id.pkl.z']

# Evaluate Validation F1_score

In [None]:
def evaluate(y_pred, y_true, plot_matrix=True):
    """Score hard predictions against the true labels.

    Args:
        y_pred: predicted class labels.
        y_true: ground-truth class labels.
        plot_matrix: when true, also draw the confusion matrix with
            ``plt.matshow`` (gray colormap) and show it.

    Returns:
        ``(precision, f1)`` as computed by ``precision_score`` and ``f1_score``.
    """
    metrics = (precision_score(y_true, y_pred), f1_score(y_true, y_pred))
    if not plot_matrix:
        return metrics

    plt.matshow(confusion_matrix(y_true, y_pred), cmap=plt.cm.gray)
    plt.show()
    return metrics

# Precision/Recall Trade-off

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds, thrh):
    """Plot precision and recall against the decision threshold and mark ``thrh``.

    Args:
        precisions, recalls: curves with one more element than ``thresholds``
            (the trailing element is dropped when plotting).
        thresholds: increasing decision thresholds, e.g. from
            ``precision_recall_curve``.
        thrh: threshold to highlight with dotted guide lines and red markers.

    The figure is created but not shown; call ``plt.show()`` afterwards.
    """
    plt.figure(figsize=(8, 4))
    plt.axis([0, 1.1, 0, 1])

    # Locate the first threshold >= thrh. Exact float equality would silently
    # fall back to index 0 whenever thrh is not literally present in the array;
    # for an exact match on increasing thresholds the index is unchanged.
    at_or_above = np.asarray(thresholds) >= thrh
    idx = np.argmax(at_or_above) if at_or_above.any() else len(thresholds) - 1
    precision_by_thrs = precisions[idx]
    recall_by_thrs = recalls[idx]

    # Dotted guide lines from the axes to the highlighted points.
    plt.plot([thrh, thrh], [0., precision_by_thrs], "r:")
    plt.plot([thrh, thrh], [0., recall_by_thrs], "r:")
    plt.plot([0, thrh], [precision_by_thrs, precision_by_thrs], "r:")
    plt.plot([0, thrh], [recall_by_thrs, recall_by_thrs], "r:")
    plt.plot([thrh], [precision_by_thrs], "ro")
    plt.plot([thrh], [recall_by_thrs], "ro")

    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.legend(loc="center right", fontsize=14)
    plt.xlabel("Threshold")
    plt.ylabel("Value")
    plt.grid(True)
    
def better_threshold(precisions, recalls, thresholds):
    """Return the threshold with the highest F1 and that F1 value.

    Args:
        precisions, recalls: precision/recall values, aligned with ``thresholds``
            (they may carry one extra trailing element, as the output of
            ``precision_recall_curve`` does).
        thresholds: candidate decision thresholds.

    Returns:
        ``(threshold, f1_best)``; ``(0, 0)`` when no candidate has a positive F1.
    """
    f1_best = 0
    threshold = 0
    # zip stops at the shortest input, so the extra trailing precision/recall
    # point (which has no threshold) can never index past `thresholds`.
    for precision, recall, candidate in zip(precisions, recalls, thresholds):
        if precision != 0 and recall != 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0
        if f1 > f1_best:
            f1_best = f1
            threshold = candidate

    return threshold, f1_best

# Models

## Neural nets

In [None]:
def recall_m(y_true, y_pred):
    """Keras metric: recall computed from rounded predictions.

    True positives are counted as the rounded, clipped product of labels and
    predictions; the denominator is the count of rounded, clipped labels plus
    ``K.epsilon()`` to avoid division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Keras metric: precision computed from rounded predictions.

    True positives are counted as the rounded, clipped product of labels and
    predictions; the denominator is the count of rounded, clipped predictions
    plus ``K.epsilon()`` to avoid division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged_positives + K.epsilon())

def f1_m(y_true, y_pred):
    """Keras metric: the complement of ``f1_loss``, i.e. the soft F1 score."""
    return 1 - f1_loss(y_true, y_pred)

def f1_loss(y_true, y_pred):
    """Soft F1 loss: ``1 - mean(F1)`` computed from un-rounded predictions.

    True/false positives and false negatives are accumulated along axis 0 as
    products of the labels and the predicted values, so the result stays
    differentiable with respect to ``y_pred``.

    Returns:
        ``1 - K.mean(f1)``, where NaN entries of ``f1`` are replaced by zero.
    """
    # True negatives are not part of precision, recall or F1, so they are not computed.
    tp = K.sum(K.cast(y_true*y_pred, tf.float32), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, tf.float32), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), tf.float32), axis=0)

    # K.epsilon() keeps every denominator strictly positive.
    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)

def plot_graphs(history, metric):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: object exposing a ``history`` mapping (as returned by
            ``model.fit``) that contains ``metric`` and ``'val_' + metric``.
        metric: name of the series to plot, e.g. ``'loss'``.
    """
    val_metric = 'val_'+metric
    series = history.history

    plt.plot(series[metric])
    plt.plot(series[val_metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])
    plt.grid(True)
    plt.show()

In [None]:
# Baseline MLP: two ReLU hidden layers (100 and 10 units) and a sigmoid output.
model = keras.models.Sequential([
    # `shape` is documented as a tuple; a bare int is not accepted by every Keras version.
    keras.Input(shape=(X_train0.shape[1],)),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(10, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid")
])

In [None]:
# Optimise the soft F1 loss directly and track F1 / precision / recall per epoch.
# `learning_rate` replaces the deprecated `lr` keyword of the Adam optimizer.
model.compile(loss=f1_loss,
              optimizer=keras.optimizers.Adam(learning_rate=0.01),
              metrics=[f1_m, precision_m, recall_m])

In [None]:
keras.utils.plot_model(model=model, show_shapes=True, dpi=76)

In [None]:
model.summary()

In [None]:
 history = model.fit(X_train0, 
                    y_train0, 
                    epochs=20, 
                    validation_data=(valid, y_valid),
                    verbose=0)

In [None]:
# Keep the predictions in `p`: the following cell thresholds `p`, which would
# otherwise be undefined after Restart & Run All.
p = model.predict(X_train1)
p

In [None]:
(p >= 0.5).astype(int)

In [None]:
plot_graphs(history, 'loss')

### Tuning

In [None]:
def tune_nn(params):
  """Objective for ``gp_minimize``: train a two-hidden-layer MLP and score it on ``valid``.

  Args:
    params: ``[hidden1, hidden2, epoch, learning_rate]`` - widths of the two
      hidden layers, number of training epochs and Adam learning rate.

  The network is fit on ``X_train0``/``y_train0``. Its sigmoid outputs for
  ``X_train1``, ``valid`` and ``test`` are dumped with joblib under
  ``preds_train1``, ``preds_val1`` and ``preds_test``.

  Returns:
    The negated F1 of the 0.5-thresholded validation predictions (negated
    because ``gp_minimize`` minimises its objective).
  """
  hidden1, hidden2, epoch, learning_rate = params
  model = keras.models.Sequential([
    # `shape` is documented as a tuple; a bare int is not accepted by every Keras version.
    keras.Input(shape=(X_train0.shape[1],)),
    keras.layers.Dense(hidden1, activation="relu"),
    keras.layers.Dense(hidden2, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid")
  ])
  # `learning_rate` replaces the deprecated `lr` keyword of the Adam optimizer.
  model.compile(loss=f1_loss,
                optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                metrics=[f1_m, precision_m, recall_m])

  with tf.device('GPU:0'):
    model.fit(X_train0,
              y_train0,
              epochs=epoch,
              validation_data=(valid, y_valid),
              verbose=0)

  # Store raw probabilities for train1, exactly as is done for valid and test
  # below; dumping thresholded 0/1 labels here would make the three files
  # inconsistent with each other.
  p = model.predict(X_train1)
  model_name_train1 = PATH + "./preds_train1/approach{}/nn_{}_{}_{}_{}.pkl.z".format(APPROACH,hidden1, hidden2, epoch, learning_rate)
  jb.dump(p, model_name_train1)

  p = model.predict(valid)
  y_pred = (p >= 0.5).astype(int)
  _, metric = evaluate(y_pred, y_valid, plot_matrix=False)
  model_name_val1 = PATH + "/preds_val1/approach{}/nn_{}_{}_{}_{}.pkl.z".format(APPROACH,hidden1, hidden2, epoch, learning_rate)
  jb.dump(p, model_name_val1)

  p = model.predict(test)
  model_name_test = PATH + "./preds_test/approach{}/nn_{}_{}_{}_{}.pkl.z".format(APPROACH,hidden1, hidden2, epoch, learning_rate)
  jb.dump(p, model_name_test)
  print(params, metric)
  print()
  return -metric
    

In [None]:
space = [(10, 600),
         (10, 600),
         (10, 100),
         (1e-5, 1e-3, 'log-uniform')]


res = gp_minimize(tune_nn, space, random_state=42, verbose=1, n_calls=70)

Iteration No: 1 started. Evaluating function at random point.
[480, 118, 80, 0.0001562069367563987] 0.44326617179215266

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 57.0678
Function value obtained: -0.4433
Current minimum: -0.4433
Iteration No: 2 started. Evaluating function at random point.
[273, 69, 51, 4.649617447336329e-05] 0.4895522388059702

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 31.7539
Function value obtained: -0.4896
Current minimum: -0.4896
Iteration No: 3 started. Evaluating function at random point.
[94, 394, 15, 0.0002779697551526683] 0.5111706881143879

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 10.1937
Function value obtained: -0.5112
Current minimum: -0.5112
Iteration No: 4 started. Evaluating function at random point.
[564, 10, 99, 0.00017177621112338383] 0.46555323590814196

Iteration No: 4 ended. Evaluation done at random point.
Time taken: 63.0641
Function value obtained: -0.4656
Current m

### Notes

files:
* *approach{}/nn_1_layer_{}_{}.pkl.z* -> 1 hidden layer; the parameters are the number of neurons in the hidden layer and the learning rate

* *approach{}/nn_{}_{}_{}.pkl.z* -> 2 hidden layers; the parameters are the number of neurons in each hidden layer and the learning rate

Best:

* Approach 4: *approach{}/nn_{}_{}_{}.pkl.z* -> 200, 79, 1e-5 -> f1_score = 0.5563

* Approach 5: [10, 200, 5.708255341556299e-05] -> 0.5454545454545454

* Approach 7: [200, 170, 7.348397051356418e-05] -> 0.6229177183240788

* [600, 76, 39, 2.4268007090778586e-05] -> 0.6225234619395204

* Approach 8:



## Logistic Regression

In [None]:
def tune_logistic(params):
    """Objective for ``gp_minimize``: fit an L1 logistic regression and score it on ``valid``.

    Args:
        params: ``[tol, max_iter, C]`` passed to ``LogisticRegression``
            (liblinear solver, L1 penalty, fixed seed).

    The model is fit on ``X_train0``/``y_train0``. Positive-class probabilities
    for ``X_train1``, ``valid`` and ``test`` are dumped with joblib under
    ``preds_train1``, ``preds_val1`` and ``preds_test``.

    Returns:
        The negated validation F1 at the threshold chosen by
        ``better_threshold`` on the same validation data.
    """
    tol, max_iter, C = params
    clf = LogisticRegression(random_state=42,
                             solver='liblinear',
                             max_iter=max_iter,
                             tol=tol,
                             C=C,
                             penalty='l1')

    clf.fit(X_train0, y_train0)

    # Files carry a "logreg_" prefix: "rf_" is the prefix tune_trees uses for
    # random-forest predictions in the same directories, so reusing it here
    # would label logistic-regression outputs as random-forest ones.
    proba_train1 = clf.predict_proba(X_train1)[:, 1]
    model_name_train1 = PATH + "./preds_train1/approach{}/logreg_{}_{}_{}.pkl.z".format(APPROACH, tol, max_iter, C)
    jb.dump(proba_train1, model_name_train1)

    proba_valid = clf.predict_proba(valid)[:, 1]
    model_name_val1 = PATH + "/preds_val1/approach{}/logreg_{}_{}_{}.pkl.z".format(APPROACH, tol, max_iter, C)
    jb.dump(proba_valid, model_name_val1)

    # Pick the F1-maximising threshold on the validation curve, then score with it.
    precisions, recalls, thresholds = precision_recall_curve(y_valid, proba_valid)
    thrs, _ = better_threshold(precisions, recalls, thresholds)
    y_pred = (proba_valid >= thrs).astype(int)
    _, metric = evaluate(y_pred, y_valid, plot_matrix=False)

    proba_test = clf.predict_proba(test)[:, 1]
    model_name_test = PATH + "./preds_test/approach{}/logreg_{}_{}_{}.pkl.z".format(APPROACH, tol, max_iter, C)
    jb.dump(proba_test, model_name_test)

    print(params, metric)
    print()

    return -metric

In [None]:
space = [(1e-8, 1e-6, 'log-uniform'),
         (100, 500),
         (1, 15)]

res = gp_minimize(tune_logistic, space, random_state=42, verbose=1, n_calls=20)

Iteration No: 1 started. Evaluating function at random point.
[3.918194347141745e-07, 173, 12] 0.5003182686187142

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 616.1198
Function value obtained: -0.5003
Current minimum: -0.5003
Iteration No: 2 started. Evaluating function at random point.
[1.5620693675639854e-07, 278, 2] 0.5030674846625768

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 951.4299
Function value obtained: -0.5031
Current minimum: -0.5031
Iteration No: 3 started. Evaluating function at random point.
[8.288916866885136e-08, 233, 3] 0.5057142857142857

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 841.9044
Function value obtained: -0.5057
Current minimum: -0.5057
Iteration No: 4 started. Evaluating function at random point.
[2.0034427927560744e-07, 123, 11] 0.5

Iteration No: 4 ended. Evaluation done at random point.
Time taken: 385.5537
Function value obtained: -0.5000
Current minimum: -0.5057
Iteration No: 5

Best:

* Approach4 -> tol, max_iter, C: [9.272695017719055e-07, 475, 1] -> 0.5448343079922028

* Approach5 -> tol, max_iter, C: [9.272695017719055e-07, 475, 1] -> 0.5429943256219991

* Approach6 -> tol, max_iter, C: [1.2951873050264239e-08, 103, 1] -> 0.5403001667593107

* Approach6 -> tol, max_iter, C: [1.5620693675639854e-07, 278, 2] -> 0.6258426966292134

## LGBMClassifier

In [None]:
def tune_lgbm(params):
    """Objective for ``gp_minimize``: fit one LightGBM classifier and score it on ``valid``.

    Args:
        params: ``[num_leaves, min_data_in_leaf, n_estimators, learning_rate]``;
            ``min_data_in_leaf`` is passed as ``min_child_samples``.

    The model is fit on ``X_train0``/``y_train0``. Positive-class probabilities
    for ``X_train1``, ``valid`` and ``test`` are dumped with joblib under
    ``preds_train1``, ``preds_val1`` and ``preds_test``.

    Returns:
        The negated validation F1 at the threshold chosen by
        ``better_threshold`` on the same validation data.
    """
    num_leaves, min_data_in_leaf, n_estimators, learning_rate = params
    hp = (num_leaves, min_data_in_leaf, n_estimators, learning_rate)

    booster = LGBMClassifier(
        num_leaves=num_leaves,
        min_child_samples=min_data_in_leaf,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        random_state=42,
    )
    booster.fit(X_train0, y_train0)

    proba_train1 = booster.predict_proba(X_train1)[:, 1]
    jb.dump(proba_train1,
            PATH + "/preds_train1/approach{}/lgbm_{}_{}_{}_{}.pkl.z".format(APPROACH, *hp))

    proba_valid = booster.predict_proba(valid)[:, 1]
    jb.dump(proba_valid,
            PATH + "/preds_val1/approach{}/lgbm_{}_{}_{}_{}.pkl.z".format(APPROACH, *hp))

    # Threshold with the best F1 on the validation precision/recall curve.
    precisions, recalls, thresholds = precision_recall_curve(y_valid, proba_valid)
    cutoff, _ = better_threshold(precisions, recalls, thresholds)
    _, metric = evaluate((proba_valid >= cutoff).astype(int), y_valid, plot_matrix=False)

    proba_test = booster.predict_proba(test)[:, 1]
    jb.dump(proba_test,
            PATH + "/preds_test/approach{}/lgbm_{}_{}_{}_{}.pkl.z".format(APPROACH, *hp))

    print(params, metric)
    print()

    return -metric

In [None]:
space = [(200, 1000),
         (1, 1000),
         (100, 800),
         (1e-4, 1e-1, 'log-uniform')]

res = gp_minimize(tune_lgbm, space, random_state=42, verbose=1, n_calls=70)

Iteration No: 1 started. Evaluating function at random point.
[837, 184, 646, 0.006173770394704579] 0.5559380378657487

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 6.2396
Function value obtained: -0.5559
Current minimum: -0.5559
Iteration No: 2 started. Evaluating function at random point.
[557, 101, 421, 0.0010025956902289567] 0.5174129353233832

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 5.3429
Function value obtained: -0.5174
Current minimum: -0.5559
Iteration No: 3 started. Evaluating function at random point.
[314, 651, 139, 0.014655354118727714] 0.5511450381679389

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 1.1000
Function value obtained: -0.5511
Current minimum: -0.5559
Iteration No: 4 started. Evaluating function at random point.
[951, 2, 795, 0.007119418600172993] 0.5036764705882353

Iteration No: 4 ended. Evaluation done at random point.
Time taken: 99.7660
Function value obtained: -0.5037
Current minim

In [None]:
# Baseline: LightGBM with library defaults (only the seed is fixed), fit on the train0 split.
mdl = LGBMClassifier(random_state=42)
mdl.fit(X_train0, y_train0)
    
# Positive-class probabilities for the train1 split, saved under preds_train1.
y_pred = mdl.predict_proba(X_train1)[:, 1]
model_name_train1 = PATH + "/preds_train1/approach{}/lgbm_default.pkl.z".format(APPROACH)
jb.dump(y_pred, model_name_train1)
    
# Positive-class probabilities for the validation split, saved under preds_val1.
p = mdl.predict_proba(valid)[:,1]
model_name_val1 = PATH + "/preds_val1/approach{}/lgbm_default.pkl.z".format(APPROACH)
jb.dump(p, model_name_val1)

# Choose the F1-maximising threshold on the validation curve and score with it.
# Note: `y_pred` is reused here and now holds hard validation labels.
precisions, recalls, thresholds = precision_recall_curve(y_valid, p)
thrs, _ = better_threshold(precisions, recalls, thresholds)
y_pred = (p >= thrs).astype(int)
_, metric = evaluate(y_pred, y_valid, plot_matrix=False)

# Positive-class probabilities for the test set, saved under preds_test (`p` is overwritten).
p = mdl.predict_proba(test)[:,1]
model_name_test = PATH + "/preds_test/approach{}/lgbm_default.pkl.z".format(APPROACH)
jb.dump(p, model_name_test)

# Validation F1 of the default model.
print(metric)

0.5462114904246461


Best:

* Approach 4: num_leaves, min_data_in_leaf, n_estimators, learning_rate: [200, 338, 800, 0.01979897628808835] -> 0.6058666666666668

* Approach 5: num_leaves, min_data_in_leaf, n_estimators, learning_rate: [200, 268, 800, 0.007705699106497642] -> 0.6074313408723748

* Approach 6: num_leaves, min_data_in_leaf, n_estimators, learning_rate: [1000, 249, 393, 0.012735565972107158] -> 0.5954035275253875

* Approach 7: num_leaves, min_data_in_leaf, n_estimators, learning_rate: [1000, 192, 800, 0.005747479446525644] -> 0.6367713004484306

* 0.5536

## XGBClassifier

In [None]:
def tune_xgboost(params):
    """Objective for ``gp_minimize``: fit an XGBoost classifier and score it on the train1 split.

    Args:
        params: ``[learning_rate, n_estimators]``.

    The model is fit on ``X_train0``/``y_train0``. Positive-class probabilities
    for ``X_train1``, ``valid`` and ``test`` are dumped with joblib under
    ``preds_train1``, ``preds_val1`` and ``preds_test``; file names list
    ``n_estimators`` before ``learning_rate``.

    Returns:
        The negated F1 on ``X_train1``/``y_train1`` at the threshold chosen by
        ``better_threshold`` on that same split.
    """
    learning_rate, n_estimators = params
    name_fields = (APPROACH, n_estimators, learning_rate)

    booster = XGBClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        random_state=42,
        gpu_id=0,
    )
    booster.fit(X_train0, y_train0)

    proba_train1 = booster.predict_proba(X_train1)[:, 1]
    jb.dump(proba_train1,
            PATH + "./preds_train1/approach{}/xgb_{}_{}.pkl.z".format(*name_fields))

    # The reported metric comes from the train1 hold-out, not from `valid`.
    precisions, recalls, thresholds = precision_recall_curve(y_train1, proba_train1)
    cutoff, _ = better_threshold(precisions, recalls, thresholds)
    _, metric = evaluate((proba_train1 >= cutoff).astype(int), y_train1, plot_matrix=False)

    proba_valid = booster.predict_proba(valid)[:, 1]
    jb.dump(proba_valid,
            PATH + "/preds_val1/approach{}/xgb_{}_{}.pkl.z".format(*name_fields))

    proba_test = booster.predict_proba(test)[:, 1]
    jb.dump(proba_test,
            PATH + "./preds_test/approach{}/xgb_{}_{}.pkl.z".format(*name_fields))

    print(params, metric)
    print()

    return -metric

In [None]:
space = [(1e-4, 9e-1, 'log-uniform'),
         (300, 1300)]

res = gp_minimize(tune_xgboost, space, random_state=42, verbose=1, n_calls=50)

Iteration No: 1 started. Evaluating function at random point.
[0.1411640595733612, 483] 0.6072548141513658

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 8.4221
Function value obtained: -0.6073
Current minimum: -0.6073
Iteration No: 2 started. Evaluating function at random point.
[0.12108415002621291, 897] 0.6022232962783954

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 15.4093
Function value obtained: -0.6022
Current minimum: -0.6073
Iteration No: 3 started. Evaluating function at random point.


KeyboardInterrupt: ignored

## Kaggle notebook parameters

In [None]:
def tune_xgboost_(params):
    """Objective for ``gp_minimize``: XGBoost with an extended search space and early stopping.

    Args:
        params: sequence whose first seven entries are ``learning_rate``,
            ``max_depth``, ``min_child_weight``, ``subsample``,
            ``colsample_bynode``, ``num_parallel_tree`` and ``n_estimators``.

    The model is fit on ``X_train0``/``y_train0`` with early stopping
    (100 rounds, AUC) monitored on ``valid``/``y_valid``. Positive-class
    probabilities for ``X_train1``, ``valid`` and ``test`` are dumped with
    joblib under ``preds_train1``, ``preds_val1`` and ``preds_test``.

    Returns:
        The negated validation F1 at the threshold chosen by
        ``better_threshold`` on the same validation data.
    """
    # Positional reads keep the original tolerance for extra trailing entries.
    hp = tuple(params[i] for i in range(7))
    (learning_rate, max_depth, min_child_weight, subsample,
     colsample_bynode, num_parallel_tree, n_estimators) = hp

    booster = XGBClassifier(
        n_jobs=-1,
        eval_metric='auc',
        random_state=42,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bynode=colsample_bynode,
        num_parallel_tree=num_parallel_tree,
    )
    booster.fit(
        X_train0,
        y_train0,
        early_stopping_rounds=100,
        eval_metric='auc',
        eval_set=[(valid, y_valid)],
        verbose=False,
    )

    proba_train1 = booster.predict_proba(X_train1)[:, 1]
    jb.dump(proba_train1,
            PATH + "./preds_train1/approach{}/xgb_{}_{}_{}_{}_{}_{}_{}.pkl.z".format(APPROACH, *hp))

    proba_valid = booster.predict_proba(valid)[:, 1]
    jb.dump(proba_valid,
            PATH + "/preds_val1/approach{}/xgb_{}_{}_{}_{}_{}_{}_{}.pkl.z".format(APPROACH, *hp))

    # Threshold with the best F1 on the validation precision/recall curve.
    precisions, recalls, thresholds = precision_recall_curve(y_valid, proba_valid)
    cutoff, _ = better_threshold(precisions, recalls, thresholds)
    _, metric = evaluate((proba_valid >= cutoff).astype(int), y_valid, plot_matrix=False)

    proba_test = booster.predict_proba(test)[:, 1]
    jb.dump(proba_test,
            PATH + "./preds_test/approach{}/xgb_{}_{}_{}_{}_{}_{}_{}.pkl.z".format(APPROACH, *hp))

    print(params, metric)
    print()

    return -metric

In [None]:
space = [
    (1e-3, 9e-1, 'log-uniform'),  # learning_rate
    (3, 30),  # max_depth
    (0.01, 20.0, 'log-uniform'),  # min_child_weight
    (0.2, 1.0),  # subsample
    (0.2, 1.0),  # colsample_bynode
    [1, 2, 3],  # num_parallel_tree
    (100, 1000)
]

res = gp_minimize(tune_xgboost_, space, random_state=42, verbose=1, n_calls=70)

Iteration No: 1 started. Evaluating function at random point.
[0.22551796937629706, 8, 3.7478601695703286, 0.6774801263571897, 0.556666202282873, 1, 513] 0.5422153369481022

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 7.5056
Function value obtained: -0.5422
Current minimum: -0.5422
Iteration No: 2 started. Evaluating function at random point.
[0.009679572227018539, 7, 1.4079990864907204, 0.24512926322168022, 0.77759901781346, 3, 101] 0.558102766798419

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 9.8677
Function value obtained: -0.5581
Current minimum: -0.5581
Iteration No: 3 started. Evaluating function at random point.
[0.8535590410250283, 20, 1.044925745499645, 0.20565304417577393, 0.21844994003313262, 2, 460] 0.49186483103879847

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 3.2708
Function value obtained: -0.4919
Current minimum: -0.5581
Iteration No: 4 started. Evaluating function at random point.
[0.00137360444

In [None]:
# Baseline: XGBoost with library defaults (all cores, fixed seed), fit on the train0 split.
mdl = XGBClassifier(n_jobs=-1, random_state=42)
mdl.fit(X_train0, y_train0)
    
# Positive-class probabilities for the train1 split, saved under preds_train1.
y_pred = mdl.predict_proba(X_train1)[:, 1]
model_name_train1 = PATH + "/preds_train1/approach{}/xgboost_default.pkl.z".format(APPROACH)
jb.dump(y_pred, model_name_train1)
    
# Positive-class probabilities for the validation split, saved under preds_val1.
p = mdl.predict_proba(valid)[:,1]
model_name_val1 = PATH + "/preds_val1/approach{}/xgboost_default.pkl.z".format(APPROACH)
jb.dump(p, model_name_val1)

# Choose the F1-maximising threshold on the validation curve and score with it.
# Note: `y_pred` is reused here and now holds hard validation labels.
precisions, recalls, thresholds = precision_recall_curve(y_valid, p)
thrs, _ = better_threshold(precisions, recalls, thresholds)
y_pred = (p >= thrs).astype(int)
_, metric = evaluate(y_pred, y_valid, plot_matrix=False)

# Positive-class probabilities for the test set, saved under preds_test (`p` is overwritten).
p = mdl.predict_proba(test)[:,1]
model_name_test = PATH + "/preds_test/approach{}/xgboost_default.pkl.z".format(APPROACH)
jb.dump(p, model_name_test)

# Validation F1 of the default model.
print(metric)

0.5663082437275986


### Notes

Best:

* Approach 4: learning_rate, n_estimators: [0.05429007944240353, 300] -> 0.5993071593533489

* Approach 5: learning_rate, n_estimators: [0.026187309786660864, 683] -> 0.6017391304347827

* Approach 6: learning_rate, n_estimators: [0.026187309786660864, 683] -> 0.6017391304347827

* Approach 7: [0.13716021653659308, 3, 0.01, 1.0, 0.2, 2] -> 0.6415525114155252 overfitting

* approach 9: 

# Random Forest

In [None]:
def tune_trees(params):
    """Objective for ``gp_minimize``: fit a random forest and score it on the train1 split.

    Args:
        params: ``[min_samples_leaf, weight, max_depth, n_estimators]``;
            ``weight`` is the class weight of class 0 (class 1 is fixed at 1).

    The model is fit on ``X_train0``/``y_train0``. Positive-class probabilities
    for ``X_train1``, ``valid`` and ``test`` are dumped with joblib under
    ``preds_train1``, ``preds_val1`` and ``preds_test``.

    Returns:
        The negated F1 on ``X_train1``/``y_train1`` at the threshold chosen by
        ``better_threshold`` on that same split.
    """
    min_samples_leaf, weight, max_depth, n_estimators = params
    rf = RandomForestClassifier(n_estimators=n_estimators,
                                min_samples_leaf=min_samples_leaf,
                                max_depth=max_depth,
                                class_weight={0: weight, 1: 1},
                                random_state=42)

    rf.fit(X_train0, y_train0)

    # Every tuned hyper-parameter, max_depth included, goes into the file name
    # so that each file identifies its model and two configurations differing
    # only in depth cannot overwrite each other.
    name_fields = (APPROACH, min_samples_leaf, weight, max_depth, n_estimators)

    y_pred = rf.predict_proba(X_train1)[:, 1]
    model_name_train1 = PATH + "./preds_train1/approach{}/rf_{}_{}_{}_{}.pkl.z".format(*name_fields)
    jb.dump(y_pred, model_name_train1)

    # The reported metric comes from the train1 hold-out, not from `valid`.
    precisions, recalls, thresholds = precision_recall_curve(y_train1, y_pred)
    thrs, _ = better_threshold(precisions, recalls, thresholds)
    y_pred = (y_pred >= thrs).astype(int)
    _, metric = evaluate(y_pred, y_train1, plot_matrix=False)

    p = rf.predict_proba(valid)[:, 1]
    model_name_val1 = PATH + "/preds_val1/approach{}/rf_{}_{}_{}_{}.pkl.z".format(*name_fields)
    jb.dump(p, model_name_val1)

    p = rf.predict_proba(test)[:, 1]
    model_name_test = PATH + "./preds_test/approach{}/rf_{}_{}_{}_{}.pkl.z".format(*name_fields)
    jb.dump(p, model_name_test)

    print(params, metric)
    print()

    return -metric

In [None]:
space = [(1, 30),
         (1.0, 5.5),
         (1, 30),
         (100, 1300)]

res = gp_minimize(tune_trees, space, random_state=42, verbose=1, n_calls=50)

Iteration No: 1 started. Evaluating function at random point.
[24, 1.8254565543977372, 24, 816] 0.5755782678859602

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 17.8917
Function value obtained: -0.5756
Current minimum: -0.5756
Iteration No: 2 started. Evaluating function at random point.
[14, 1.4498871211810131, 14, 500] 0.5766541151156536

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 11.9253
Function value obtained: -0.5767
Current minimum: -0.5767
Iteration No: 3 started. Evaluating function at random point.
[5, 3.9289981282698383, 3, 966] 0.5500273373428103

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 8.6613
Function value obtained: -0.5500
Current minimum: -0.5767
Iteration No: 4 started. Evaluating function at random point.
[28, 1.0035044462845646, 30, 841] 0.5747006767308692

Iteration No: 4 ended. Evaluation done at random point.
Time taken: 17.7129
Function value obtained: -0.5747
Current minimum: -0.5767
Ite

In [None]:
# Baseline: random forest with library defaults (only the seed is fixed), fit on the train0 split.
mdl = RandomForestClassifier( random_state=42)
mdl.fit(X_train0, y_train0)
    
# Positive-class probabilities for the train1 split, saved under preds_train1.
y_pred = mdl.predict_proba(X_train1)[:, 1]
model_name_train1 = PATH + "/preds_train1/approach{}/rf_default.pkl.z".format(APPROACH)
jb.dump(y_pred, model_name_train1)
    
# Positive-class probabilities for the validation split, saved under preds_val1.
p = mdl.predict_proba(valid)[:,1]
model_name_val1 = PATH + "/preds_val1/approach{}/rf_default.pkl.z".format(APPROACH)
jb.dump(p, model_name_val1)

# Choose the F1-maximising threshold on the validation curve and score with it.
# Note: `y_pred` is reused here and now holds hard validation labels.
precisions, recalls, thresholds = precision_recall_curve(y_valid, p)
thrs, _ = better_threshold(precisions, recalls, thresholds)
y_pred = (p >= thrs).astype(int)
_, metric = evaluate(y_pred, y_valid, plot_matrix=False)

# Positive-class probabilities for the test set, saved under preds_test (`p` is overwritten).
p = mdl.predict_proba(test)[:,1]
model_name_test = PATH + "/preds_test/approach{}/rf_default.pkl.z".format(APPROACH)
jb.dump(p, model_name_test)

# Validation F1 of the default model.
print(metric)

0.5234113712374582


### Notes

Best:

* min_samples_leaf, min_samples_split, n_estimators: [2, 10, 379] -> 0.58719646799117

* approach 5: min_samples_leaf, weight, n_estimators: [9, 5.5, 1300] -> 0.577240241891149

* approach 6: min_samples_leaf, weight, n_estimators: [1, 1.0, 698] -> 0.5343000557724484

* approach 7: min_samples_leaf, weight, n_estimators: [30, 1.0, 100] -> 0.6177083333333333


# KNN

In [None]:
def tune_knn(params):
    """Objective for ``gp_minimize``: fit a k-nearest-neighbours classifier and score it on train1.

    Args:
        params: one-element sequence ``[n_neighbors]``.

    The model is fit on ``X_train0``/``y_train0``. Positive-class probabilities
    for ``X_train1``, ``valid`` and ``test`` are dumped with joblib under
    ``preds_train1``, ``preds_val1`` and ``preds_test``.

    Returns:
        The negated F1 on ``X_train1``/``y_train1`` at the threshold chosen by
        ``better_threshold`` on that same split.
    """
    n_neighbors = params[0]
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=-1)
    knn.fit(X_train0, y_train0)

    proba_train1 = knn.predict_proba(X_train1)[:, 1]
    jb.dump(proba_train1,
            PATH + "./preds_train1/approach{}/knn_{}.pkl.z".format(APPROACH, n_neighbors))

    # The reported metric comes from the train1 hold-out, not from `valid`.
    precisions, recalls, thresholds = precision_recall_curve(y_train1, proba_train1)
    cutoff, _ = better_threshold(precisions, recalls, thresholds)
    _, metric = evaluate((proba_train1 >= cutoff).astype(int), y_train1, plot_matrix=False)

    proba_valid = knn.predict_proba(valid)[:, 1]
    jb.dump(proba_valid,
            PATH + "/preds_val1/approach{}/knn_{}.pkl.z".format(APPROACH, n_neighbors))

    proba_test = knn.predict_proba(test)[:, 1]
    jb.dump(proba_test,
            PATH + "./preds_test/approach{}/knn_{}.pkl.z".format(APPROACH, n_neighbors))

    print(params, metric)
    print()

    return -metric

In [None]:
space = [(1, 21)]

res = gp_minimize(tune_knn, space, random_state=42, verbose=1, n_calls=30)

Iteration No: 1 started. Evaluating function at random point.
[17] 0.5043988269794721

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 46.7063
Function value obtained: -0.5044
Current minimum: -0.5044
Iteration No: 2 started. Evaluating function at random point.
[5] 0.4874274661508704

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 46.5940
Function value obtained: -0.4874
Current minimum: -0.5044
Iteration No: 3 started. Evaluating function at random point.
[17] 0.5043988269794721

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 46.6999
Function value obtained: -0.5044
Current minimum: -0.5044
Iteration No: 4 started. Evaluating function at random point.
[13] 0.5002409638554217

Iteration No: 4 ended. Evaluation done at random point.
Time taken: 46.8036
Function value obtained: -0.5002
Current minimum: -0.5044
Iteration No: 5 started. Evaluating function at random point.
[10] 0.4896675651392633

Iteration No: 5 ended. Evaluat

In [None]:
neigh = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)

neigh.fit(X_train0, y_train0)
neigh.predict_proba(X_train1)[:,1]

KeyboardInterrupt: ignored