# Setup

In [None]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.8.1-py2.py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 3.9 MB/s 
Collecting pyaml>=16.9
  Downloading pyaml-21.8.3-py2.py3-none-any.whl (17 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-21.8.3 scikit-optimize-0.8.1


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, f1_score, precision_recall_curve, confusion_matrix
import sklearn.tree as tree
from skopt import gp_minimize
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import warnings
import os
from scipy.stats import binom
from lightgbm import LGBMClassifier
from sklearn.cluster import KMeans  
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
warnings.filterwarnings("ignore")

## Important parameters

In [None]:
# Root folder holding the datasets and intermediate artifacts read/written below.
PATH = "/content/drive/MyDrive/Colab Notebooks/Porto_Seguro_competition/"

# Numeric suffixes of the features to discard; turned into "var<N>" names below.
# Earlier candidate list, kept for reference:
# list_remove = [31, 44, 45, 46, 47, 48, 49, 50, 51, 55, 62, 63, 64, 65, 66]
list_remove = [
    10, 11, 12, 13, 15, 16, 17, 18, 22, 24, 25, 26, 27,
    30, 31, 32, 33, 34, 36, 37, 38, 39, 40, 41, 42, 43,
    44, 45, 46, 47, 48, 49, 50, 51, 53, 57, 59, 60, 61,
    62, 63, 64, 65, 66, 67, 68,
]
list_remove = [f"var{suffix}" for suffix in list_remove]

def return_var_names(x):
  """Translate feature indices into their categorified column names.

  Args:
    x: sequence of feature numbers, e.g. ``(1, 7)``.

  Returns:
    Tuple of strings of the form ``'var<N>_Categorify'``, in the same order
    as ``x``.
  """
  return tuple(f"var{idx}_Categorify" for idx in x)

# Index combinations (pairs and triples) of categorical features to cross.
# The hand-written list repeats several triples. Duplicates are dropped, keeping
# first-seen order, because every entry becomes a column name: a repeated name
# makes the suffix-less merge in CountEncoding emit `CE_<name>_x` / `CE_<name>_y`
# columns instead of a single `CE_<name>` feature.
list_combined_idx_cat = list(dict.fromkeys([
    (1,7), (1,20), (7,8), (7,20), (7,23), (7,28), (7,29), (7,39),
    (1,7,8), (1,7,14), (1,7,20), (1,7,23), (1,7,28), (1,7,29), (1,7,31), (1,7,39),
    (1,8,20), (1,8,23), (1,20,23), (1,20,28), (1,20,29), (1,20,39), (1,23,28),
    (1,7,14), (1,7,23), (1,7,28), (1,7,29), (1,7,31), (1,7,39), (1,8,20), (1,8,23),
]))
list_combined_cat = [return_var_names(x) for x in list_combined_idx_cat]
# Name of each crossed column: the member column names joined by "_".
combined_columns_cat = ["_".join(names) for names in list_combined_cat]

# Numerical pairs are taken from the file names found in trees/kmeans: each name
# is split on "_" into [first, second, third-with-extension].
# NOTE(review): `[:-1]` drops whichever entry os.listdir returns last, and the
# listing order is not guaranteed - confirm which file was meant to be skipped.
list_comb = [file.split("_") for file in os.listdir(PATH + "trees/kmeans")][:-1]
# Strip the last four characters (presumably a ".csv"-style extension) from the third part.
list_comb = [lt[:2] + [lt[2][:-4]] for lt in list_comb]
# Keep the 15 entries with the largest third part.
# NOTE(review): the third part is compared as a string, not as a number - confirm
# that the file names make lexicographic order equal to the intended ranking.
list_comb = sorted(list_comb, key= lambda x: x[2], reverse=True)[:15]
list_combined_num = [lt[:2] for lt in list_comb]
combined_columns_num = [f"{x[0]}_{x[1]}" for x in list_combined_num]

## Loading datasets

In [None]:
# Read the raw tables from the project folder.
df = pd.read_csv(PATH + "datasets/train.csv")
test = pd.read_csv(PATH + "datasets/test.csv")
submission_sample = pd.read_csv(PATH + "datasets/submission_sample.csv")
# metadata is renamed to (cod, type) and used below to group column codes by
# variable type; the type labels are in Portuguese.
metadata = pd.read_csv(PATH + "datasets/metadata.csv")
metadata.columns = ["cod", "type"]
num_dis = metadata[metadata.type == "Quantitativo discreto"].cod.to_list()  # discrete quantitative
num_dis.remove("y")  # the target is listed among the discrete codes; it is not a feature
num_con = metadata[metadata.type == "Quantitativo continua"].cod.to_list()  # continuous quantitative
cat_nom = metadata[metadata.type == "Qualitativo nominal"].cod.to_list()  # nominal categorical
cat_nom.remove("id")  # the row identifier is listed among the nominal codes; it is not a feature
cat_ord = metadata[metadata.type == "Qualitativo ordinal"].cod.to_list()  # ordinal categorical
# Separate target and ids from the feature frames.
y = df["y"]
test_id = test["id"]
test.drop(["id"], axis=1, inplace=True)  # mutates `test` in place
X_prepared = df.drop(["id", "y"], axis=1)
# Every column treated as categorical: raw categoricals plus the crossed /
# clustered columns whose names were built in the parameters cell.
all_cat_columns = cat_nom+cat_ord+combined_columns_cat+combined_columns_num
cat_columns_without_remove = [col for col in all_cat_columns if col not in list_remove]

# Functions for finding the best threshold

In [None]:
def evaluate(y_pred, y_true, plot_matrix=True):
    """Score hard predictions against the ground truth.

    Note the argument order: predictions first, true labels second.

    Args:
        y_pred: predicted class labels.
        y_true: true class labels.
        plot_matrix: when True, also draw the confusion matrix as a grayscale image.

    Returns:
        Tuple ``(precision, f1)``.
    """
    metrics = (precision_score(y_true, y_pred), f1_score(y_true, y_pred))
    if plot_matrix:
        plt.matshow(confusion_matrix(y_true, y_pred), cmap=plt.cm.gray)
        plt.show()
    return metrics

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds, thrh):
    """Plot precision and recall against the decision threshold and mark ``thrh``.

    Args:
        precisions: precision values; one element longer than ``thresholds``
            (the trailing element is not plotted).
        recalls: recall values, same length as ``precisions``.
        thresholds: decision thresholds.
        thrh: threshold to highlight with red guide lines and markers.

    The marker is placed at the entry of ``thresholds`` closest to ``thrh``.
    An exact match gives the same index as an equality lookup would. A value
    that is not an element of ``thresholds`` (e.g. after rounding) no longer
    silently falls back to index 0.
    """
    marker_idx = int(np.argmin(np.abs(np.asarray(thresholds) - thrh)))
    precision_by_thrs = precisions[marker_idx]
    recall_by_thrs = recalls[marker_idx]

    plt.figure(figsize=(8, 4))
    plt.axis([0, 1.1, 0, 1])

    # Red dotted guides from the axes to the highlighted precision/recall points.
    plt.plot([thrh, thrh], [0., precision_by_thrs], "r:")
    plt.plot([thrh, thrh], [0., recall_by_thrs], "r:")
    plt.plot([0, thrh], [precision_by_thrs, precision_by_thrs], "r:")
    plt.plot([0, thrh], [recall_by_thrs, recall_by_thrs], "r:")
    plt.plot([thrh], [precision_by_thrs], "ro")
    plt.plot([thrh], [recall_by_thrs], "ro")

    # Drop the trailing element so both curves line up with `thresholds`.
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.legend(loc="center right", fontsize=14)
    plt.xlabel("Threshold")
    plt.ylabel("Value")
    plt.grid(True)
    
def better_threshold(precisions, recalls, thresholds):
    """Return the threshold with the highest F1 score.

    Args:
        precisions: precision per candidate threshold.
        recalls: recall per candidate threshold.
        thresholds: candidate thresholds. May be shorter than ``precisions`` /
            ``recalls`` (``precision_recall_curve`` returns one extra trailing
            precision/recall pair); entries without a matching threshold are
            ignored instead of raising ``IndexError``.

    Returns:
        Tuple ``(threshold, f1_best)``; ``(0, 0)`` when no candidate has a
        positive F1. On ties the first (lowest-index) candidate wins.
    """
    f1_best = 0
    threshold = 0
    # zip stops at the shortest input, so only pairs that have a threshold are scored.
    for precision, recall, candidate in zip(precisions, recalls, thresholds):
        if precision != 0 and recall != 0:
            f1 = 2*(precision*recall)/(precision + recall)
        else:
            # F1 is defined as 0 here to avoid a 0/0 division.
            f1 = 0
        if f1 > f1_best:
            f1_best = f1
            threshold = candidate

    return threshold, f1_best

# Pipelines

## Classes

In [None]:
class ProcessMissingValues(BaseEstimator, TransformerMixin):
  """Flag and fill missing values, which are encoded as ``-999`` in the data.

  For every processed column a companion ``NA_<col>`` indicator (int8) is
  added, then the gaps are filled: with the string ``'UNKNOWN'`` for
  categorical columns, or with the column median / ``num_imputer`` for
  numerical ones.

  Args:
    columns: columns handled in categorical mode. NOTE: the default is
      evaluated once, when the class is defined, so later reassignment of
      ``cat_nom`` / ``cat_ord`` does not change it - pass ``columns``
      explicitly after filtering those lists.
    type_columns: ``'categorical'`` selects the categorical treatment; any
      other value selects the numerical one.
    num_imputer: fill value for numerical columns; ``None`` means "use each
      column's own median".
  """
  def __init__(self, 
               columns=cat_nom + cat_ord,
               type_columns='categorical',
               num_imputer=None):
    self.columns = columns
    self.type_columns = type_columns
    # Store the argument itself (it used to be discarded and replaced by None).
    self.num_imputer = num_imputer

  def fit(self, X, y=None):
    """Nothing is learned; present for the scikit-learn estimator API."""
    return self

  def transform(self, X):
    """Return a processed copy of ``X``; the caller's frame is left untouched."""
    # Copy first: the helpers below assign columns in place, and the same raw
    # frame may be sent through a pipeline several times.
    X = X.copy()
    if self.type_columns == 'categorical':
      return self.transform_categorical(X)
    return self.transform_numerical(X)

  def transform_categorical(self, X):
    """Add ``NA_<col>`` flags and fill gaps with ``'UNKNOWN'`` (modifies ``X``)."""
    for col in self.columns:
      X[col] = X[col].replace(-999, np.nan)
      X['NA_' + col] = X[col].isna().astype(np.int8)
      # Assign the result back: an in-place fillna on a selected column is not
      # guaranteed to write through to the frame.
      X[col] = X[col].fillna('UNKNOWN')

    return X

  def transform_numerical(self, X):
    """Add ``NA_<col>`` flags and impute every non-categorical column (modifies ``X``).

    The columns to process are derived from ``X`` itself: everything that is
    neither an ``NA_`` indicator nor listed in the current global
    ``cat_nom + cat_ord``. ``self.columns`` is overwritten with that list.
    """
    categorical = set(cat_nom + cat_ord)
    self.columns = [col for col in X.columns if 'NA_' not in col and col not in categorical]
    for col in self.columns:
      X[col] = X[col].replace(-999, np.nan)
      imputer = X[col].median() if self.num_imputer is None else self.num_imputer
      X['NA_' + col] = X[col].isna().astype(np.int8)
      # Fill this column only. Filling the whole frame would push this column's
      # median into the gaps of columns that have not been processed yet.
      X[col] = X[col].fillna(imputer)

    return X

In [None]:
class Categorify(BaseEstimator, TransformerMixin):
  """Encode categories as integer ids ranked by frequency.

  For every column ``c`` a new column ``c_Categorify`` is appended. Ids are
  assigned from the value counts of the data seen in ``fit``: the most
  frequent value gets 2, the next 3, and so on. Values seen fewer than
  ``freq_treshhold`` times share ``lowfrequency_id`` and the ``'UNKNOWN'``
  marker gets ``unkown_id``.

  The mapping is learned in ``fit`` and reused by ``transform``, so the same
  category receives the same id on train, validation and test data.
  Rebuilding it from whatever frame is being transformed would give every
  data set its own, unrelated ids. Values that were not seen during ``fit``
  (including NaN) are mapped to ``lowfrequency_id``.

  Args:
    columns: columns to encode. NOTE: the default is evaluated once, when the
      class is defined.
    freq_treshhold: minimum count for a value to keep its own id.
    lowfrequency_id: id shared by rare and unseen values.
    unkown_id: id of the ``'UNKNOWN'`` missing-value marker.
  """
  def __init__(self, 
               columns=cat_nom+cat_ord, 
               freq_treshhold=5,
               lowfrequency_id=0,
               unkown_id=1):
    self.columns = columns
    self.freqs = []
    self.freq_treshhold = freq_treshhold
    self.lowfrequency_id = lowfrequency_id
    self.unkown_id = unkown_id

  def fit(self, X, y=None):
    """Learn the value -> id tables from ``X``."""
    self.make_columns(X)
    return self

  def transform(self, X):
    """Return a copy of ``X`` (with a fresh 0..n-1 index) plus the id columns."""
    if len(self.freqs) != len(self.columns):
      # Not fitted: fall back to deriving the tables from X itself.
      self.make_columns(X)
    # reset_index returns a new frame, so the caller's data is not modified.
    X = X.reset_index(drop=True)
    for col, freq in zip(self.columns, self.freqs):
      col_name = col + '_Categorify'
      mapping = dict(zip(freq[col], freq[col_name]))
      # Element-wise dict lookup on plain objects: independent of the column
      # dtype (mixed numbers/strings, categoricals) and tolerant of unseen values.
      codes = X[col].astype(object).map(
          lambda value: mapping.get(value, self.lowfrequency_id))
      if col_name in X.columns:
        # Replace a previous encoding; the new column goes to the end.
        X = X.drop(columns=col_name)
      X[col_name] = codes

    return X

  def make_columns(self, X):
    """Build one lookup table per column and store them in ``self.freqs``.

    Each table has the columns ``[<col>_Categorify, <col>]``.
    """
    self.freqs = []
    for col in self.columns:
      freq = X[col].value_counts()
      freq = freq.reset_index()
      freq.columns = [col, 'count']
      # The second reset_index exposes the frequency rank (0 = most frequent).
      freq = freq.reset_index()
      freq.columns = [col + '_Categorify', col, 'count']
      # Ids 0 and 1 are the defaults of the two special ids, so ranks start at 2.
      freq[col + '_Categorify'] = freq[col + '_Categorify'] + 2
      freq.loc[freq['count'] < self.freq_treshhold, col + '_Categorify'] = self.lowfrequency_id
      freq.loc[freq[col]=='UNKNOWN', col + '_Categorify'] = self.unkown_id
      freq = freq.drop('count', axis=1)
      self.freqs.append(freq)

In [None]:
class Likelihood_pvalue(BaseEstimator, TransformerMixin):
  """Target-likelihood features for categorified columns.

  For every column ``c`` the values of ``c_Categorify`` are grouped and these
  statistics are learned per group in ``fit``:

  * ``like``: mean of the target in the group;
  * ``bias``: ``like`` divided by the overall target mean;
  * ``p_value``: the smaller of the two one-sided binomial tail probabilities
    of the group's positive count under the overall target mean;
  * ``logit``: ``max(0, log((1 - p) / p)) / bias_denominator`` with ``p``
    floored at ``min_pvalue``;
  * ``bias_factor_weighted``: ``bias ** logit``.

  ``transform`` joins the statistics named in ``features`` onto the data as
  ``<feature>_<c>`` columns.

  Args:
    columns: base column names (without the ``_Categorify`` suffix). NOTE: the
      default is evaluated once, when the class is defined.
    bias_denominator: divisor applied to the logit.
    min_pvalue: lower bound applied to p-values before taking the logit.
    imputer_value: ``'mean'`` fills groups that were not seen in ``fit`` with
      the mean of the joined feature in the frame being transformed; any other
      value is used directly as the fill value.
    features: names of the statistics to emit.
  """
  def __init__(self, 
               columns=cat_nom+cat_ord,
               bias_denominator=12,
               min_pvalue=1e-5,
               imputer_value='mean',
               features=["bias_factor_weighted"]):
    self.columns = columns
    self.cat_features = []
    self.bias_denominator = bias_denominator
    self.min_pvalue = min_pvalue
    self.imputer_value = imputer_value
    self.features = features

  def fit(self, X, y):
    """Learn the per-group statistics; ``y`` is aligned with ``X`` by position."""
    # Start from scratch so that fitting again does not keep stale tables,
    # which transform() would otherwise pick up by position.
    self.cat_features = []
    target = y.to_numpy()
    global_bias = target.sum()/X.shape[0]
    df_X = X.copy()
    df_X["y"] = target
    for col in self.columns:
      col_name = col + '_Categorify'
      # One aggregation per column provides everything the statistics need.
      stats = df_X[[col_name, 'y']].groupby(col_name)['y'] \
                                   .agg(['mean', 'sum', 'count']) \
                                   .reset_index()
      features = pd.DataFrame({col_name: stats[col_name],
                               'like': stats['mean'],
                               'bias': stats['mean']/global_bias})
      k = stats['sum'].to_numpy()
      n = stats['count'].to_numpy()
      features['p_value'] = np.minimum(binom.cdf(k=k, n=n, p=global_bias),
                                       binom.sf(k=k-1, n=n, p=global_bias))
      features['logit'] = features['p_value'].map(self.logit)
      features['bias_factor_weighted'] = features['bias']**features['logit']
      self.cat_features.append(features)
    return self

  def transform(self, X):
    """Join the selected statistics onto ``X`` and fill unmatched rows."""
    for idx, col in enumerate(self.columns):
      col_name = col + '_Categorify'
      features = self.cat_features[idx].copy()
      features.columns = [col_name] + [f"{ftr}_{col}" for ftr in features.columns[1:]]
      col_name_ = [f"{ftr}_{col}" for ftr in self.features]
      # Existing columns with the same name get the "_" suffix and are dropped
      # below, so the freshly joined values win.
      X = X.merge(features[[col_name] + col_name_], how='left', on=col_name, suffixes=("_", ""))
      for ftr_col in col_name_:
        if ftr_col + "_" in X.columns:
          X.drop(ftr_col + "_", axis=1, inplace=True)
      # Impute every joined feature, for both imputer modes.
      for ftr_col in col_name_:
        if self.imputer_value == 'mean':
          fill_value = X[ftr_col].dropna().mean()
        else:
          fill_value = self.imputer_value
        X[ftr_col] = X[ftr_col].fillna(fill_value)

    return X

  def p_value(self, X, col_name, row, global_bias):
    """Two-sided-style binomial p-value for the group named by ``row``.

    Args:
      X: frame holding ``col_name`` and the target column ``'y'``.
      col_name: grouping column.
      row: row whose first element is the group key.
      global_bias: success probability under the null hypothesis.

    Returns:
      ``min(P(K <= k), P(K >= k))`` for the ``k`` positives among the ``n``
      rows of the group.
    """
    key = row.iloc[0] if hasattr(row, 'iloc') else row[0]
    stats = X[[col_name, 'y']].groupby(col_name)['y'] \
                              .agg(['sum', 'count']) \
                              .reset_index()
    match = stats[stats[col_name] == key]
    k = match['sum'].to_numpy()[0]
    n = match['count'].to_numpy()[0]
    return np.min([binom.cdf(k=k, n=n, p=global_bias), binom.sf(k=k-1, n=n, p=global_bias)])

  def logit(self, pvalue):
    """Map a p-value to a non-negative weight: small p-values give large weights."""
    # Floor the p-value (configurable through min_pvalue) to bound the logit.
    pvalue = np.max([pvalue, self.min_pvalue])
    inverse_pvalue = 1 - pvalue
    logit_ = np.log(inverse_pvalue) - np.log(1 - inverse_pvalue)
    return np.max([0, logit_])/self.bias_denominator



In [None]:
class GetRidCategoricalFeatures(BaseEstimator, TransformerMixin):
  """Remove raw categorical columns and their ``_Categorify`` companions.

  Columns that are not present are skipped. The frame passed to ``transform``
  is modified in place and returned.

  Args:
    columns: base column names to remove. The default is evaluated once, when
      the class is defined.
  """
  def __init__(self, columns=cat_nom+cat_ord):
    self.columns = columns

  def fit(self, X, y=None):
    """Nothing is learned; present for the scikit-learn estimator API."""
    return self

  def transform(self, X):
    """Drop ``<col>`` and ``<col>_Categorify`` for every configured column."""
    for base in self.columns:
      for candidate in (base, base + '_Categorify'):
        if candidate in X.columns:
          X.drop([candidate], axis=1, inplace=True)

    return X

In [None]:
class CountEncoding(BaseEstimator, TransformerMixin):
  """Add ``CE_<col>``: how often each ``<col>_Categorify`` id occurs.

  The counts are taken from the frame handed to ``transform``; ``fit`` learns
  nothing.
  NOTE(review): counts therefore depend on the size of the frame being
  transformed - confirm this is intended for validation/test data.

  Args:
    columns: base column names. The default is evaluated once, when the class
      is defined.
  """
  def __init__(self, 
               columns=cat_nom + cat_ord):
    self.columns = columns
    self.ces = []

  def fit(self, X, y=None):
    """Nothing is learned; present for the scikit-learn estimator API."""
    return self

  def transform(self, X):
    """Left-join the per-id occurrence counts onto ``X`` and return the result."""
    for base in self.columns:
      key = base + '_Categorify'
      counts = X[key].value_counts().reset_index()
      counts.columns = [key, 'CE_' + base]
      X = X.merge(counts, how='left', on=key)

    return X

In [None]:
class StandardNumerical(BaseEstimator, TransformerMixin):
  """Median-impute and standardize every column whose name lacks ``"NA"``.

  Per column: ``-999`` is treated as missing, missing values are replaced by
  the column median, and the column is rescaled to zero mean and unit
  (population) standard deviation. Columns without any valid value are
  dropped. All statistics come from the frame handed to ``transform``;
  ``fit`` learns nothing.
  NOTE(review): validation/test frames are therefore scaled with their own
  statistics rather than those of the training data - confirm this is intended.
  """
  def __init__(self):
    self.columns = []

  def fit(self, X, y=None):
    """Nothing is learned; present for the scikit-learn estimator API."""
    return self

  def transform(self, X):
    """Return an imputed and standardized copy of ``X``."""
    # Copy so the caller's frame is not modified by the assignments below.
    X = X.copy()
    self.columns = [col for col in X.columns if "NA" not in col]
    for col in self.columns:
      X[col] = X[col].replace(-999, np.nan)
      median = X[col].dropna().median()
      if median != median:
        # NaN median (NaN != NaN): the column holds no valid value at all.
        X = X.drop(columns=col)
      else:
        # isna()/fillna is required here: `X[col] == np.nan` is never True,
        # so a comparison-based mask would leave every gap unfilled.
        X[col] = X[col].fillna(median)
        mean = np.mean(X[col])
        std = np.std(X[col])
        # A constant column has std 0; divide by 1 so it becomes all zeros
        # instead of NaN/inf.
        scale = std if std != 0 else 1.0
        X[col] = (X[col] - mean)/scale

    return X

In [None]:
class CombiningColumns(BaseEstimator, TransformerMixin):
  """Create interaction columns from groups of existing columns.

  * ``col_type='categorical'``: the string forms of the member columns are
    concatenated with ``"_"``; nothing is learned.
  * any other ``col_type``: for every pair of columns a KMeans model is tuned
    with ``gp_minimize`` and fitted in ``fit``; ``transform`` adds the cluster
    label as a ``category`` column.

  New columns are named after their members joined by ``"_"`` (numerical
  groups use the first two members only).

  Args:
    list_combined: iterable of column-name groups. The default is evaluated
      once, when the class is defined.
    col_type: ``'categorical'`` or ``'numerical'``.
  """
  def __init__(self, list_combined=list_combined_cat, col_type='categorical'):
    self.list_combined = list_combined
    self.col_type = col_type
    # Search space for (n_clusters, max_iter, n_init, tol).
    self.space = [(2, 20),
                  (200, 1000),
                  (2, 20),
                  (1e-5, 1e-3, 'log-uniform')]

    self.clusters = {}
    self.names_col = []

  def _build_kmeans(self, params):
    """Create a KMeans model from an ``(n_clusters, max_iter, n_init, tol)`` point."""
    n_clusters, max_iter, n_init, tol = params
    # n_jobs is not passed: scikit-learn deprecated it for KMeans in 0.23 and
    # removed it in 1.0, where passing it raises a TypeError.
    return KMeans(n_clusters=int(n_clusters),
                  max_iter=int(max_iter),
                  tol=tol,
                  n_init=int(n_init),
                  random_state=42)

  def fit(self, X, y):
    """Tune and fit one KMeans model per column group (numerical mode only)."""
    # Start from scratch so that fitting again does not accumulate entries.
    self.clusters = {}
    self.names_col = []
    if self.col_type == 'numerical':
      target = y.to_numpy()
      for cols in self.list_combined:
        points = X[list(cols)].copy()

        def tune_kmeans(params, points=points):
          # Objective: maximise the spread of the target rate across clusters.
          # gp_minimize minimises, hence the sign flip.
          cluster = points.copy()
          cluster["Cluster"] = self._build_kmeans(params).fit_predict(cluster)
          cluster["Cluster"] = cluster["Cluster"].astype("category")
          cluster["y"] = target
          return -cluster[["Cluster", "y"]].groupby("Cluster").mean().var()["y"]

        res = gp_minimize(tune_kmeans, self.space, random_state=42, verbose=0, n_calls=30)

        name_col = cols[0] + "_" + cols[1]
        kmeans = self._build_kmeans(res.x)
        kmeans.fit(points)
        self.clusters[name_col] = kmeans
        self.names_col.append(name_col)

    return self

  def transform(self, X):
    """Return a copy of ``X`` with the interaction columns added."""
    # Copy so the caller's frame is not modified by the column assignments.
    X = X.copy()
    if self.col_type == 'categorical':
      return self.transform_cat(X)
    return self.transform_num(X)

  def transform_cat(self, X):
    """Add one string column per group (modifies ``X``); any group size >= 1 works."""
    for cols in self.list_combined:
      name_col = "_".join(cols)
      combined = X[cols[0]].astype(str)
      for member in cols[1:]:
        combined = combined + "_" + X[member].astype(str)
      X[name_col] = combined

    return X

  def transform_num(self, X):
    """Add one cluster-label column per fitted group (modifies ``X``)."""
    for idx, cols in enumerate(self.list_combined):
      name_col = self.names_col[idx]
      X[name_col] = self.clusters[name_col].predict(X[list(cols)].copy())
      X[name_col] = X[name_col].astype("category")

    return X

## Feature engineering pipeline (7th approach)

In [None]:
def remove_feature(list_remove, columns):
    """Return the entries of ``columns`` that are not in ``list_remove``.

    Order is preserved and a new list is returned; neither argument is modified.
    """
    return list(filter(lambda name: name not in list_remove, columns))

# Remove the discarded features from the column lists and from both frames.
num_dis = remove_feature(list_remove, num_dis)
num_con = remove_feature(list_remove, num_con)
cat_nom = remove_feature(list_remove, cat_nom)
cat_ord = remove_feature(list_remove, cat_ord)
# errors="ignore" keeps the cell re-runnable: columns that were already
# dropped by an earlier run are skipped instead of raising KeyError.
test.drop(list_remove, axis=1, inplace=True, errors="ignore")
X_prepared.drop(list_remove, axis=1, inplace=True, errors="ignore")
# The transformers' default `columns` were bound when the classes were defined,
# i.e. before the lists above were filtered, so they still name dropped columns.
# The filtered lists are therefore passed explicitly.
pipeline = Pipeline([
    ('missing_values_categorical', ProcessMissingValues(columns=cat_nom + cat_ord)),
    ('missing_values_numerical', ProcessMissingValues(type_columns='numerical')),
    ('categorify', Categorify(columns=cat_nom + cat_ord)),
    ('Likelihood_pvalue', Likelihood_pvalue(columns=cat_nom + cat_ord)),
    ('get_rid_categorical_features', GetRidCategoricalFeatures(columns=cat_nom + cat_ord)),
    ('standard', StandardNumerical())
])

TypeError: ignored

In [None]:
# Hold out 20% of the rows for validation (fixed seed), fit the pipeline on the
# training part only, then apply it to the validation and test frames.
X_train, X_valid, y_train, y_valid = train_test_split(X_prepared, y, test_size=0.2, random_state=42)
X_train = pipeline.fit_transform(X_train, y_train)
X_valid = pipeline.transform(X_valid)
X_test = pipeline.transform(test)

In [None]:
# Preview the first rows of the transformed test features.
X_test.head()

Unnamed: 0,var24,var25,var27,var40,var52,var53,var54,var56,var57,var58,var59,var60,var61,var67,var68,NA_var1,NA_var2,NA_var3,NA_var4,NA_var5,NA_var6,NA_var7,NA_var8,NA_var9,NA_var10,NA_var11,NA_var12,NA_var13,NA_var14,NA_var15,NA_var16,NA_var17,NA_var18,NA_var19,NA_var20,NA_var21,NA_var22,NA_var23,NA_var28,NA_var29,...,like_var42,like_var43,p_value_var1,p_value_var2,p_value_var3,p_value_var4,p_value_var5,p_value_var6,p_value_var7,p_value_var8,p_value_var9,p_value_var10,p_value_var11,p_value_var12,p_value_var13,p_value_var14,p_value_var15,p_value_var16,p_value_var17,p_value_var18,p_value_var19,p_value_var20,p_value_var21,p_value_var22,p_value_var23,p_value_var28,p_value_var29,p_value_var30,p_value_var33,p_value_var34,p_value_var35,p_value_var36,p_value_var37,p_value_var38,p_value_var39,p_value_var41,p_value_var26,p_value_var32,p_value_var42,p_value_var43
0,1.155203,0.630702,-0.77861,-0.001453,1.36725,-0.846231,-0.733072,-0.654262,0.08627,-0.346804,-0.160526,1.176273,-0.75806,-0.24424,1.351308,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0.100563,-2.499992,-0.231916,1.78577,-1.04165,0.267321,-0.630208,0.841754,-0.383585,-0.175001,0.492056,2.346147,0.374049,0.883663,1.004633,-0.445082,-0.343095,-0.342473,-0.192069,0.208177,-0.830186,-0.414548,0.791972,-0.234215,-0.343474,-0.468657,-0.350385,0.24802,-1.250792,-0.931362,0.858825,0.751969,-1.354195,0.108853,0.080568,0.306708,-0.309606,-0.468004,1.451772,-1.327064
1,0.061737,-0.088553,-0.77861,-1.258146,-1.032297,-0.846231,-0.733072,1.232952,-0.812894,0.516503,-1.31736,-0.613102,-0.385951,-0.489065,-1.459963,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,4.654372,0.43528,-0.240458,1.78577,-0.296491,0.267321,2.809911,1.20332,-0.384714,-0.175001,-0.35844,-0.403062,0.374049,-0.759319,-0.773512,-0.44519,1.498994,1.502976,-0.625504,-0.927205,0.855513,-0.414605,1.551438,-0.234215,-0.343474,-0.468657,-0.350385,4.602822,0.375061,-1.050024,-1.11459,-0.94512,-0.903707,-0.981156,0.080568,-0.888561,-0.14493,-1.507442,-1.469059,0.118615
2,0.061737,-1.527064,-0.77861,0.836341,0.748012,-0.846231,-0.733072,0.515096,-0.677403,0.772377,0.93584,1.374976,0.453818,0.735056,-0.790612,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,-0.378078,1.516464,-0.231916,-0.668868,0.578966,0.267321,1.869187,1.339339,-0.318679,1.075023,0.492056,-0.403062,0.374049,0.883663,0.15789,-0.445131,-0.510867,-0.510551,-0.617625,-0.927205,-0.670003,0.111358,-0.145347,-0.234215,2.912403,1.158297,2.850326,-0.697423,0.845741,1.04053,-1.334923,-0.109351,1.187735,0.108853,0.080568,0.306708,-0.14493,-0.778172,-0.888076,-1.116322
3,1.155203,1.349958,-0.77861,0.626893,-1.032297,-0.846231,-0.733072,-1.157735,1.712156,-0.769066,-0.791675,-0.236917,-0.17964,0.735056,0.748893,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,-0.315595,-0.119394,-0.241684,-0.794005,1.352715,0.267321,-0.630208,1.032033,-0.384714,-0.175001,-0.35844,1.366477,0.374049,-0.759319,0.2221,-0.44519,0.744758,0.747364,-0.624698,1.331623,1.327245,-0.414605,1.349267,-0.234215,-0.343474,-0.468658,-0.350385,-0.697423,0.375061,0.97992,-0.873486,-0.952106,-0.216091,0.108853,0.080568,0.306708,4.585331,-0.37877,-0.229043,0.963775
4,-1.031728,-0.807808,-0.77861,-0.001453,1.909084,-0.846231,-0.733072,-1.281167,1.804536,-0.633528,0.047518,-0.706701,-0.121583,1.224704,-1.192223,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,-0.315595,0.893177,-0.241684,1.78577,-1.092443,0.267321,-0.630208,-0.709591,0.421276,-0.175001,-0.35844,-0.403062,0.374049,2.499596,-0.003472,-0.252662,1.045697,0.113756,2.385892,-0.927205,-0.84521,0.077712,-0.451072,-0.27701,-0.343474,0.095752,-0.350386,0.24802,0.293108,-0.286166,-1.012337,-0.109351,-0.216091,0.108853,0.080568,0.306708,-0.309606,-1.302924,-0.229043,-1.037134


In [None]:
# Re-attach the target (train/valid) and the ids (test) by position, then
# persist the three frames for approach 7.
# PATH already ends with "/", so the leading "/" below doubles the separator
# in the resulting path string.
X_train["y"] = y_train.to_numpy()
X_valid["y"] = y_valid.to_numpy()
X_test["id"] = test_id.to_numpy()
X_train.to_csv(PATH + "/preprocessed_data/approach7/train.csv", index=False)
X_valid.to_csv(PATH + "/preprocessed_data/approach7/valid.csv", index=False)
X_test.to_csv(PATH + "/preprocessed_data/approach7/test.csv", index=False)

## Feature engineering pipeline (8th approach)

In [None]:
def remove_feature(list_remove, columns):
    """Filter ``columns`` down to the names absent from ``list_remove``.

    Same definition as in the 7th-approach cell. A new list is built, the
    original order is kept and the inputs are left untouched.
    """
    kept = []
    for name in columns:
        if name not in list_remove:
            kept.append(name)
    return kept

# Filter the column lists and drop the discarded features from both frames.
num_dis = remove_feature(list_remove, num_dis)
num_con = remove_feature(list_remove, num_con)
cat_nom = remove_feature(list_remove, cat_nom)
cat_ord = remove_feature(list_remove, cat_ord)
# errors="ignore": the 7th-approach cell performs the same drops, so on a
# top-to-bottom run these columns are already gone. Without the flag the
# second drop raises KeyError.
test.drop(list_remove, axis=1, inplace=True, errors="ignore")
X_prepared.drop(list_remove, axis=1, inplace=True, errors="ignore")

In [None]:
# Parameterised version of the feature-engineering pipeline.
# NOTE(review): `freq_treshhold`, `bias_denominator`, `imputer_value` and
# `features` are not assigned in any cell above this one, so on a fresh kernel
# this cell raises NameError - define them first or remove this cell.
pipeline = Pipeline([
      ('missing_values_categorical', ProcessMissingValues()),
      ('missing_values_numerical', ProcessMissingValues(type_columns='numerical')),
      ('categorify', Categorify(freq_treshhold=freq_treshhold)),
      ('Likelihood_pvalue', Likelihood_pvalue(bias_denominator=bias_denominator,
                                              imputer_value=imputer_value, 
                                              features=features)),
      ('get_rid_categorical_features', GetRidCategoricalFeatures()),
      ('standard', StandardNumerical())               
    ])

In [None]:
# Same 80/20 split (fixed seed) as in the previous approach.
X_train, X_valid, y_train, y_valid = train_test_split(X_prepared, y, test_size=0.2, random_state=42)
# Names of the crossed columns created by CombiningColumns(). Its default
# groups are `list_combined_cat`, so the names are derived from that list
# (`list_combined` is not defined anywhere above this cell).
combined_columns = ["_".join(names) for names in list_combined_cat]

In [None]:
def _validation_f1(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` and return its F1 on the evaluation data at the F1-optimal threshold."""
    model.fit(X_fit, y_fit)
    y_score = model.predict_proba(X_eval)[:, 1]
    precisions, recalls, thresholds = precision_recall_curve(y_eval, y_score)
    thrs, _ = better_threshold(precisions, recalls, thresholds)
    _, f1 = evaluate((y_score >= thrs).astype(int), y_eval, plot_matrix=False)
    return f1


def tune_featuresEngineering(params):
    """Objective for ``gp_minimize``: score one feature-engineering configuration.

    Args:
        params: ``[freq_treshhold1, freq_treshhold2, bias_denominator,
            imputer_value, features1]`` - frequency thresholds for the raw and
            the crossed categorical columns, the logit divisor, the fill value
            for unseen groups, and the likelihood statistic to emit.

    Returns:
        The negated mean of the LightGBM and XGBoost validation F1 scores
        (negated because ``gp_minimize`` minimises).

    Reads the globals ``X_train``, ``X_valid``, ``y_train``, ``y_valid``,
    ``cat_nom``, ``cat_ord`` and ``combined_columns``.
    """
    freq_treshhold1, freq_treshhold2, bias_denominator, imputer_value, features1 = params
    features = [features1]
    # Pass the current (filtered) column lists explicitly. The transformers'
    # defaults were bound when the classes were defined.
    base_columns = cat_nom + cat_ord
    pipeline = Pipeline([
      ('missing_values_categorical', ProcessMissingValues(columns=base_columns)),
      ('missing_values_numerical', ProcessMissingValues(type_columns='numerical')),
      ('categorify', Categorify(columns=base_columns, freq_treshhold=freq_treshhold1)),
      ('combiningColumns', CombiningColumns()),
      ('categorify_combined', Categorify(columns=combined_columns, freq_treshhold=freq_treshhold2)),
      ('count_encoder', CountEncoding(columns=base_columns)),
      ('Likelihood_pvalue', Likelihood_pvalue(columns=base_columns + combined_columns,
                                              bias_denominator=bias_denominator,
                                              imputer_value=imputer_value,
                                              features=features)),
      # The crossed columns hold strings; they and their ids must be removed
      # too, otherwise the models receive object-typed columns.
      ('get_rid_categorical_features', GetRidCategoricalFeatures(columns=base_columns + combined_columns)),
    ])

    # Copies keep the global split intact: the objective is called many times
    # and the transformers may modify the frame they receive.
    X_train_ = pipeline.fit_transform(X_train.copy(), y_train)
    X_valid_ = pipeline.transform(X_valid.copy())

    metric1 = _validation_f1(LGBMClassifier(random_state=42), X_train_, y_train, X_valid_, y_valid)
    metric3 = _validation_f1(XGBClassifier(random_state=42), X_train_, y_train, X_valid_, y_valid)

    metric = np.mean([metric1, metric3])
    print(params, metric)
    print()

    return -metric

In [None]:
# Search space; one entry per value unpacked from `params` in
# tune_featuresEngineering: two frequency thresholds, the bias denominator,
# the imputer value and the likelihood statistic to emit.
space = [(5, 500),
         (5, 500),
         (1, 20),
         (0.0, 1.0),
         ['like','bias','p_value','logit','bias_factor_weighted']]
        #  ['like','bias','p_value','logit','bias_factor_weighted'],
        #  ['like','bias','p_value','logit','bias_factor_weighted']]


# Bayesian optimisation of the objective: 700 evaluations, fixed seed.
res = gp_minimize(tune_featuresEngineering, space, random_state=42, verbose=1, n_calls=700)

Iteration No: 1 started. Evaluating function at random point.


KeyError: ignored

### Notes

* Without the remove list: `[39, 1, 1.0, 'bias', 'logit']` → 0.5758531107625263

* With the remove list: `[170, 20, 1.0, 'like']` → 0.5717528552174221

In [None]:
# Re-evaluate the configuration recorded in the notes above.
# NOTE(review): four values are passed, while the tune_featuresEngineering
# defined above in this section unpacks five - the recorded output presumably
# comes from an earlier version of the function; confirm before re-running.
tune_featuresEngineering([170, 20, 1.0, 'like'])

[170, 20, 1.0, 'like'] 0.5717528552174221



-0.5717528552174221

In [None]:
# Approach-8 pipeline with the values from the notes above
# (frequency threshold 170, bias denominator 20, fill value 1.0, 'like' feature).
pipeline = Pipeline([
  ('missing_values_categorical', ProcessMissingValues()),
  ('missing_values_numerical', ProcessMissingValues(type_columns='numerical')),
  ('categorify', Categorify(freq_treshhold=170)),
  ('count_encoder', CountEncoding()),
  ('Likelihood_pvalue', Likelihood_pvalue(bias_denominator=20,
                                              imputer_value=1.0, 
                                              features=['like'])),
  ('get_rid_categorical_features', GetRidCategoricalFeatures()),
  ('standard', StandardNumerical())               
])

In [None]:
# Same 80/20 split (fixed seed); fit on the training part only, then apply the
# fitted pipeline to the validation and test frames.
X_train, X_valid, y_train, y_valid = train_test_split(X_prepared, y, test_size=0.2, random_state=42)
X_train = pipeline.fit_transform(X_train, y_train)
X_valid = pipeline.transform(X_valid)
X_test = pipeline.transform(test)

In [None]:
# Preview the first rows of the transformed test features.
X_test.head()

Unnamed: 0,var24,var40,var52,var53,var54,var56,var57,var58,var59,var60,var62,var63,var67,NA_var1,NA_var2,NA_var3,NA_var4,NA_var5,NA_var6,NA_var7,NA_var8,NA_var9,NA_var10,NA_var11,NA_var12,NA_var13,NA_var14,NA_var15,NA_var16,NA_var17,NA_var18,NA_var20,NA_var21,NA_var22,NA_var23,NA_var28,NA_var29,NA_var34,NA_var35,NA_var37,...,NA_var54,NA_var56,NA_var57,NA_var58,NA_var59,NA_var60,NA_var62,NA_var63,NA_var67,like_var1,like_var2,like_var3,like_var4,like_var5,like_var6,like_var7,like_var8,like_var9,like_var10,like_var11,like_var12,like_var13,like_var14,like_var15,like_var16,like_var17,like_var18,like_var20,like_var21,like_var22,like_var23,like_var28,like_var29,like_var34,like_var35,like_var37,like_var26,like_var32,like_var42,like_var43
0,1.155203,-0.001453,1.36725,-0.846231,-0.733072,-0.654262,0.08627,-0.346804,-0.160526,1.176273,-0.432125,0.888234,-0.24424,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.935656,0.046119,-0.162144,0.265105,0.375183,0.187412,-0.728177,2.268769,-0.259264,0.508266,0.374049,-0.184384,-0.036897,-1.12082,0.33184,0.33184,0.405594,0.394386,-0.965521,0.030786,-0.323396,0.592052,-1.601942,0.520178,-0.039881,-0.206354,-1.561998,-0.326561,0.764735,0.140211,-2.700098
1,0.061737,-1.258146,-1.032297,-0.846231,-0.733072,1.232952,-0.812894,0.516503,-1.31736,-0.613102,1.28361,0.481284,-0.489065,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.995098,0.046119,-1.954219,0.265105,0.375183,0.187412,0.773546,2.268769,0.582923,0.508266,0.374049,0.462807,1.456736,0.798613,-0.112662,-0.112662,1.651144,0.444059,0.736316,0.030786,-0.323396,0.592052,-1.601942,0.520178,-0.039881,-0.206354,2.056017,0.374966,2.463204,5.568706,0.482422
2,0.061737,0.836341,0.748012,-0.846231,-0.733072,0.515096,-0.677403,0.772377,0.93584,1.374976,-0.004413,0.645796,0.735056,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.935656,-0.524866,-0.162144,0.265105,0.020125,0.187412,0.531123,-0.304996,-0.259264,0.508266,0.374049,-0.184384,-0.036897,-1.440358,0.564893,0.564893,-0.728756,0.444059,0.398654,0.030786,-0.323396,0.429455,0.339694,0.314676,-0.288724,-0.206354,0.206567,0.374966,0.827074,-0.430366,1.654678
3,1.155203,0.626893,-1.032297,-0.846231,-0.733072,-1.157735,1.712156,-0.769066,-0.791675,-0.236917,1.529994,0.888234,0.735056,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1.038497,-1.803865,-0.162144,0.265105,0.375183,0.187412,0.773546,2.268769,0.582923,0.263108,0.374049,0.462807,-0.036897,0.798613,-0.355034,-0.355034,0.967777,0.181966,0.736316,0.030786,-0.323396,0.592052,0.686691,0.520178,-0.039881,-0.206354,0.373564,-0.014411,-0.779307,-0.355881,-0.118974
4,-1.031728,-0.001453,1.909084,-0.846231,-0.733072,-1.281167,1.804536,-0.633528,0.047518,-0.706701,0.131352,0.706405,1.224704,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.038497,0.046119,-0.162144,0.265105,0.375183,0.187412,0.610436,2.268769,0.582923,0.508266,0.374049,0.462807,-0.036897,0.798613,0.524828,0.524828,0.941881,0.444059,-0.719086,0.030786,-1.820946,-1.978872,-0.980191,-2.167714,-0.039881,-1.90047,0.373564,-0.326561,-1.916204,-0.355881,0.978889


In [None]:
# Re-attach the target (train/valid) and the ids (test) by position, then
# persist the three frames for approach 8.
# PATH already ends with "/", so the leading "/" below doubles the separator
# in the resulting path string.
X_train["y"] = y_train.to_numpy()
X_valid["y"] = y_valid.to_numpy()
X_test["id"] = test_id.to_numpy()
X_train.to_csv(PATH + "/preprocessed_data/approach8/train.csv", index=False)
X_valid.to_csv(PATH + "/preprocessed_data/approach8/valid.csv", index=False)
X_test.to_csv(PATH + "/preprocessed_data/approach8/test.csv", index=False)

## Feature engineering pipeline (9th approach)

In [None]:
# Approach-9 pipeline: missing-value handling, three rounds of categorification
# (raw categoricals, crossed categoricals, clustered numerical pairs), count
# and likelihood encoding over all of them, removal of the categorical source
# columns, and a final standardisation.
pipeline = Pipeline([
      ('missing_values_cat', ProcessMissingValues()),                       
      ('missing_values_num', ProcessMissingValues(columns=num_dis+num_con, 
                                                  type_columns='numerical', 
                                                  num_imputer=None)),
      ('categorify_1', Categorify(freq_treshhold=5)),
      # Default groups: list_combined_cat (crossed categorical columns).
      ('combining_columns_cat', CombiningColumns()),
      ('categorify_2', Categorify(columns=combined_columns_cat, 
                                  freq_treshhold=5)),
      # KMeans cluster labels for the numerical pairs in list_combined_num.
      ('combining_columns_num', CombiningColumns(list_combined=list_combined_num, 
                                                 col_type='numerical')),
      ('categorify_3', Categorify(columns=combined_columns_num, 
                                  freq_treshhold=5)),
      ('count_encoder', CountEncoding(columns=all_cat_columns)),
      ('Likelihood_pvalue', Likelihood_pvalue(columns=all_cat_columns,
                                              bias_denominator=12,
                                              imputer_value='mean', 
                                              features=['like', 'bias'])), 
                           
      ('get_rid_categorical_features', GetRidCategoricalFeatures(columns=all_cat_columns)),
      ('standard', StandardNumerical()) 

    ])

In [None]:
# Same 80/20 split (fixed seed). The transformed training frame is not
# assigned; as the last expression of the cell it is only displayed.
X_train, X_valid, y_train, y_valid = train_test_split(X_prepared, y, test_size=0.2, random_state=42)
pipeline.fit_transform(X_train, y_train)

In [None]:
def _score_classifier(model, X_tr, y_tr, X_va, y_va):
    """Fit ``model`` and return its validation metric at the selected threshold.

    The model is trained on ``(X_tr, y_tr)``. Its positive-class probabilities
    on ``X_va`` are binarised at the threshold returned by ``better_threshold``
    and scored with ``evaluate`` (both helpers are defined elsewhere in this
    notebook). The second value returned by ``evaluate`` is the metric.
    """
    model.fit(X_tr, y_tr)
    proba = model.predict_proba(X_va)[:, 1]
    precisions, recalls, thresholds = precision_recall_curve(y_va, proba)
    thrs, _ = better_threshold(precisions, recalls, thresholds)
    labels = (proba >= thrs).astype(int)
    _, metric = evaluate(labels, y_va, plot_matrix=False)
    return metric


def tune_featuresEngineering(params, return_separated=False):
    """Objective function for tuning the feature-engineering pipeline.

    Builds the pipeline with the given hyper-parameters, fits it on the global
    training split, and scores an LGBMClassifier and an XGBClassifier on the
    transformed validation split.

    NOTE: this function reads the module-level ``X_train``, ``X_valid``,
    ``y_train`` and ``y_valid``. They must hold the *untransformed* split
    (i.e. the direct output of ``train_test_split``) when it is called.

    Parameters
    ----------
    params : sequence of 6 values
        ``(freq_treshhold1, freq_treshhold2, freq_treshhold3,
        bias_denominator, features1, features2)``. The three thresholds go to
        the three Categorify steps; ``features1``/``features2`` are the
        Likelihood_pvalue feature names (a repeated name is passed only once).
    return_separated : bool, default False
        If True, return the individual scores instead of the objective value.

    Returns
    -------
    float or tuple
        ``-mean(metric_lgbm, metric_xgb)`` by default (negated so that a
        minimiser such as ``gp_minimize`` maximises the metric), or
        ``(metric_lgbm, metric_xgb, mean)`` when ``return_separated`` is True.
    """
    (freq_treshhold1, freq_treshhold2, freq_treshhold3,
     bias_denominator, features1, features2) = params

    # Pass each feature name once even if the optimiser proposes it twice.
    features = [features1] if features1 == features2 else [features1, features2]

    pipeline = Pipeline([
        ('missing_values_cat', ProcessMissingValues()),
        ('missing_values_num', ProcessMissingValues(columns=num_dis+num_con,
                                                    type_columns='numerical',
                                                    num_imputer=None)),
        ('categorify_1', Categorify(freq_treshhold=freq_treshhold1)),
        ('combining_columns_cat', CombiningColumns()),
        ('categorify_2', Categorify(columns=combined_columns_cat,
                                    freq_treshhold=freq_treshhold2)),
        ('combining_columns_num', CombiningColumns(list_combined=list_combined_num,
                                                   col_type='numerical')),
        ('categorify_3', Categorify(columns=combined_columns_num,
                                    freq_treshhold=freq_treshhold3)),
        ('count_encoder', CountEncoding(columns=all_cat_columns)),
        # Columns in list_remove are dropped before the likelihood encoding,
        # which then only covers the remaining categorical columns.
        ('get_rid_categorical_features1', GetRidCategoricalFeatures(columns=list_remove)),
        ('Likelihood_pvalue', Likelihood_pvalue(columns=cat_columns_without_remove,
                                                bias_denominator=bias_denominator,
                                                imputer_value='mean',
                                                features=features)),
        ('get_rid_categorical_features2', GetRidCategoricalFeatures(columns=cat_columns_without_remove)),
        ('standard', StandardNumerical()),
    ])

    # Fit the transformations on the training split only, then apply them to
    # the validation split.
    X_train_ = pipeline.fit_transform(X_train, y_train)
    X_valid_ = pipeline.transform(X_valid)

    # Score both gradient-boosting models with identical seeds and procedure.
    metric1 = _score_classifier(LGBMClassifier(random_state=42),
                                X_train_, y_train, X_valid_, y_valid)
    metric3 = _score_classifier(XGBClassifier(random_state=42),
                                X_train_, y_train, X_valid_, y_valid)

    metric = np.mean([metric1, metric3])
    print(params, metric)
    print()
    if return_separated:
        return metric1, metric3, metric
    return -metric

In [None]:
# Search space, in the order unpacked by tune_featuresEngineering:
#   three frequency thresholds, bias_denominator, two likelihood feature names.
space = [
    *[(2, 100) for _ in range(3)],
    (0.1, 50.0),
    *[['like', 'bias', 'p_value', 'logit', 'bias_factor_weighted'] for _ in range(2)],
]

# The objective reads the split from module-level variables, so (re)create
# the raw split right before optimising.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_prepared, y, random_state=42, test_size=0.2
)

# Each evaluation took roughly 4-5 minutes in the recorded run, so 70 calls
# is a multi-hour job.
res = gp_minimize(
    tune_featuresEngineering,
    space,
    n_calls=70,
    verbose=1,
    random_state=42,
)

Iteration No: 1 started. Evaluating function at random point.
[80, 20, 78, 11.977318143135092, 'like', 'bias'] 0.41867844464045145

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 261.0957
Function value obtained: -0.4187
Current minimum: -0.4187
Iteration No: 2 started. Evaluating function at random point.
[47, 35, 16, 13.052680611682174, 'bias', 'logit'] 0.5516868114049311

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 258.3670
Function value obtained: -0.5517
Current minimum: -0.5517
Iteration No: 3 started. Evaluating function at random point.
[94, 2, 99, 12.38788204159156, 'like', 'bias'] 0.37533966274149

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 318.6568
Function value obtained: -0.3753
Current minimum: -0.5517
Iteration No: 4 started. Evaluating function at random point.
[4, 53, 41, 1.0286466979509472, 'p_value', 'bias_factor_weighted'] 0.43582965595526324

Iteration No: 4 ended. Evaluation done at random point

KeyboardInterrupt: ignored

In [None]:
# Re-create the raw split, then score one hand-picked configuration and show
# the per-model metrics together with their mean.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_prepared, y, random_state=42, test_size=0.2
)
tune_featuresEngineering(
    params=[32, 60, 2, 50, 'bias', 'bias_factor_weighted'],
    return_separated=True,
)

[32, 60, 2, 50, 'bias', 'bias_factor_weighted'] 0.5781939833201497



(0.579732914375491, 0.5766550522648085, 0.5781939833201497)

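# Best parameters:

* [34,66,28,25,'bias', 'p_value'] -> 0.5737000809660419
* [32,60,2,50,'bias', 'bias_factor_weighted'] -> 0.5781939833201497 (manual run above; these are the values used in the final pipeline below)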

In [None]:
    # Final feature-engineering pipeline for the 9th approach, using the
    # hand-picked configuration: frequency thresholds 32 / 60 / 2,
    # bias_denominator=50 and the 'bias' + 'bias_factor_weighted' features.
    # The transformer classes and column lists are defined in earlier cells.
    pipeline = Pipeline(steps=[
        # Missing values: first with the transformer's defaults, then
        # explicitly for the numerical (discrete + continuous) columns.
        ('missing_values_cat', ProcessMissingValues()),
        ('missing_values_num', ProcessMissingValues(
            columns=num_dis + num_con,
            type_columns='numerical',
            num_imputer=None,
        )),

        # Categorify the base columns, then build and categorify the combined
        # categorical and combined numerical columns.
        ('categorify_1', Categorify(freq_treshhold=32)),
        ('combining_columns_cat', CombiningColumns()),
        ('categorify_2', Categorify(
            columns=combined_columns_cat,
            freq_treshhold=60,
        )),
        ('combining_columns_num', CombiningColumns(
            list_combined=list_combined_num,
            col_type='numerical',
        )),
        ('categorify_3', Categorify(
            columns=combined_columns_num,
            freq_treshhold=2,
        )),

        # Encodings computed over every categorical column.
        ('count_encoder', CountEncoding(columns=all_cat_columns)),
        ('Likelihood_pvalue', Likelihood_pvalue(
            columns=all_cat_columns,
            bias_denominator=50,
            imputer_value='mean',
            features=['bias', 'bias_factor_weighted'],
        )),

        # Final clean-up steps.
        ('get_rid_categorical_features', GetRidCategoricalFeatures(columns=all_cat_columns)),
        ('standard', StandardNumerical()),
    ])

In [None]:
# Start from a fresh raw split so this cell can be re-run safely.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_prepared, y, random_state=42, test_size=0.2
)

# Fit the pipeline on the training split only, then apply the fitted
# transformations to the validation and test data. The raw frames are
# replaced by their transformed versions.
X_train = pipeline.fit_transform(X_train, y_train)
X_valid, X_test = pipeline.transform(X_valid), pipeline.transform(test)

In [None]:
# Remove the 'bias_factor_weighted' encoding of var31 from every split.
# NOTE(review): the reason for excluding this column is not recorded here; confirm.
# errors="ignore" keeps the cell idempotent: re-running it once the column is
# already gone no longer raises KeyError.
X_train.drop(columns="bias_factor_weighted_var31", inplace=True, errors="ignore")
X_valid.drop(columns="bias_factor_weighted_var31", inplace=True, errors="ignore")
X_test.drop(columns="bias_factor_weighted_var31", inplace=True, errors="ignore")

In [None]:
# Persist the approach-9 splits as CSV files under PATH/preprocessed_data/approach9.
# PATH already ends with a separator, so the folder is built with os.path.join
# (avoids the doubled "/") and created up front so to_csv cannot fail on a
# missing directory.
out_dir = os.path.join(PATH, "preprocessed_data", "approach9")
os.makedirs(out_dir, exist_ok=True)

# .to_numpy() makes the assignment positional instead of index-aligned.
# NOTE: from here on X_train / X_valid carry the target column "y" and X_test
# carries "id"; do not fit a model on these frames without removing them.
X_train["y"] = y_train.to_numpy()
X_valid["y"] = y_valid.to_numpy()
X_test["id"] = test_id.to_numpy()

X_train.to_csv(os.path.join(out_dir, "train.csv"), index=False)
X_valid.to_csv(os.path.join(out_dir, "valid.csv"), index=False)
X_test.to_csv(os.path.join(out_dir, "test.csv"), index=False)