# Notes

Source: https://www.kaggle.com/c/tabular-playground-series-mar-2021

>Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target.
* https://en.wikipedia.org/wiki/Receiver_operating_characteristic
* `clf.predict_proba(test)[:, 1]`

---

* Cat1 - Cat10 have many categories
* Cont5 is weird
* target class "1" makes only 24%
* think how to combine models
* make pipeline (read the medium article about best practice)
```python
regression = Pipeline(
    steps=[("preprocessor", preprocessor), ("regression", LinearRegression())]
)
regression.fit(X_train, y_train)
```
* read the "sklearn pitfalls"

---
| Model | Optimization | Test AUC | Kaggle AUC | Notes |
| --- | --- | --- | --- | --- |
| LogisticRegression | N | 0.7707 | 0.87 | problems with predicting the underrepresented class "1" |
| LogisticRegression | Y | 0.803 | 0.87 | balanced weights improve test AUC |

# Libraries and helper functions

In [1]:
import math

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn-whitegrid')

import scikitplot as skplt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import roc_curve, auc, confusion_matrix, plot_confusion_matrix, plot_roc_curve

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
import datetime
def timestamp():
    time = f"{datetime.datetime.now().date()}--{datetime.datetime.now().time().replace(microsecond=0)}"
    return time.replace(":", "-")

In [3]:
def print_auc(model):
    global X_train
    global y_train
    global X_test
    global y_test
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    fpr, tpr, threshold = roc_curve(y_train, y_train_pred)
    roc_auc = auc(fpr, tpr)
    print("Training AUC:", roc_auc)

    fpr, tpr, threshold = roc_curve(y_test, y_test_pred)
    roc_auc = auc(fpr, tpr)
    print("Test AUC:", roc_auc)

# Load and Wrangle

In [4]:
df = pd.read_csv("../data/train.csv")
df = df.drop(columns=["id"])

In [None]:
df.info()

In [None]:
df.describe()

# Visualization

In [None]:
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
numerical_features = df.select_dtypes(include="float64").columns
for i in numerical_features:
    sns.histplot(df, x=i)
    plt.show()

In [None]:
sns.histplot(df, x="target")

In [None]:
categorical_features = df.select_dtypes(include="object").columns
for i in categorical_features:
    fig, ax = plt.subplots(1, 2, figsize=(12, 8))
    sns.histplot(df, x=i, ax=ax[0])
    sns.boxplot(data=df, x=i, y="target", ax=ax[1])
    plt.show()

In [None]:
categorical_features = df.select_dtypes(include="object").columns
for i in categorical_features:
    print(i, len(df[i].value_counts()))

In [None]:
categorical_features = df.select_dtypes(include="object").columns
for i in categorical_features:
    print(pd.crosstab(df[i], df["target"], normalize='index'))

# Prepare for Models

In [6]:
df = df.drop(columns=["cont0", "cont7", "cont9", "cont10", "cat5", "cat8", "cat10"])
#df_test = df_test.drop(columns=["cont0", "cont7", "cont9", "cont10", "cat5", "cat8", "cat10"])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="target"),
    df["target"],
    test_size=0.3,
    random_state=42
)

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), make_column_selector(dtype_include=np.number)),
        ("cat", OneHotEncoder(handle_unknown="ignore"), make_column_selector(dtype_include=object)),    
    ],
    sparse_threshold=0,
    n_jobs=-1
)

# Models

## Logistic Regression

In [None]:
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", LogisticRegression())
    ]
)

pipeline.fit(X_train, y_train)

In [None]:
print_auc(pipeline)

### Hyperparameter optimization

In [None]:
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", LogisticRegression())
    ]
)

hyperparameters = {
    "model__penalty": ["l1", "l2"],
    "model__C": np.logspace(0, 4, 10),
    "model__class_weight": ["balanced", None]
}

gridsearch = GridSearchCV(pipeline, hyperparameters, scoring="roc_auc", cv=3, verbose=0, n_jobs=-1)

logistic_regression = gridsearch.fit(X_train, y_train)

In [None]:
logistic_regression.best_estimator_.get_params()

In [None]:
print_auc(logistic_regression)

In [None]:
plot_roc_curve(logistic_regression, X_test, y_test)

In [None]:
matrix = confusion_matrix(y_test, y_test_pred, normalize="true")
matrix

In [None]:
plot_confusion_matrix(logistic_regression, X_test, y_test, normalize="true")
plt.show()

In [None]:
logistic_regression.best_params_

## Decision Trees

In [None]:
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", DecisionTreeClassifier())
    ]
)

pipeline.fit(X_train, y_train)

In [None]:
print_auc(pipeline)

### Hyperparameter optimization

In [None]:
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", DecisionTreeClassifier())
    ]
)

hyperparameters = {
    "model__max_depth": [5, 10, 50],
    "model__min_samples_split": [5, 10, 50],
    "model__class_weight": ["balanced", None]
}

gridsearch = GridSearchCV(pipeline, hyperparameters, scoring="roc_auc", cv=3, verbose=0, n_jobs=-1)

tree_classifier = gridsearch.fit(X_train, y_train)

In [None]:
tree_classifier.best_estimator_.get_params()

In [None]:
print_auc(tree_classifier)

## SGD

In [None]:
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", SGDClassifier())
    ]
)

pipeline.fit(X_train, y_train)

In [None]:
print_auc(pipeline)

### Hyperparameter optimization

In [None]:
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", SGDClassifier())
    ]
)

hyperparameters = {
    "model__loss": ["hinge", "log"],
    "model__penalty": ["l2", "l1"],
    "model__alpha": [0.0001, 0.001, 0.01, 0.1, 0.5, 1, 5, 10],
    "model__class_weight": ["balanced", None]
}

gridsearch = GridSearchCV(pipeline, hyperparameters, scoring="roc_auc", cv=3, verbose=0, n_jobs=-1)

sgd = gridsearch.fit(X_train, y_train)

In [None]:
sgd.best_estimator_.get_params()

In [None]:
print_auc(sgd)

## Neural Networks

### Pipeline without model

In [9]:
val_size = int(len(X_train) * 0.66)
X_val = X_train[-val_size:]
X_train = X_train[:-val_size]
y_val = y_train[-val_size:]
y_train = y_train[:-val_size]

X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)
X_test = preprocessor.transform(X_test)

In [10]:
X_train.shape

(71400, 186)

In [None]:
model = keras.Sequential(
        [
            layers.Dense(50, activation="relu", input_shape=(X_train.shape[1],)),
            layers.Dense(units=1, activation="sigmoid")
        ]
)

model.compile(
    loss="binary_crossentropy", 
    optimizer="adam", 
    metrics=["AUC"]
)

history = model.fit(X_train, y_train, epochs=5, verbose=1, validation_data=(X_val, y_val))

In [None]:
plt.plot(history.history['auc'], label="training_auc")
plt.plot(history.history['val_auc'], label="val_auc")
plt.ylabel('AUC')
plt.xlabel('Epoch')
plt.legend()
plt.show()

In [None]:
#y_test_pred_proba = pipeline.predict(X_test)
y_test_pred_classes = np.where(model.predict(X_test) > 0.5, 1, 0)

In [None]:
fpr, tpr, threshold = roc_curve(y_test, y_test_pred_classes)
roc_auc = auc(fpr, tpr)
print("Test AUC:", roc_auc)

### NN Gridsearch

In [11]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [12]:
def neural_network_wrapper(layers_1_size):
    model = keras.Sequential(
        [
            layers.Dense(layers_1_size, activation="relu", input_shape=(186,)),
            layers.Dense(units=1, activation="sigmoid")
        ]
    )
    model.compile(
        loss="binary_crossentropy", 
        optimizer="adam", 
        metrics=['accuracy']
    )
    
    return model

In [13]:
neural_network = KerasClassifier(build_fn=neural_network_wrapper, epochs=5, verbose=0)

In [15]:
hyperparameters = {
    "layers_1_size": [20, 25, 30, 35, 40],
}

grid = GridSearchCV(estimator=neural_network, param_grid=hyperparameters)
grid_result = grid.fit(X_train, y_train)

In [16]:
grid_result.get_params()

{'cv': None,
 'error_score': nan,
 'estimator__verbose': 0,
 'estimator__build_fn': <function __main__.neural_network_wrapper(layers_1_size)>,
 'estimator': <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier at 0x285747d01f0>,
 'n_jobs': None,
 'param_grid': {'layers_1_size': [20, 25, 30, 35, 40]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': None,
 'verbose': 0}

In [17]:
grid_result.best_params_

{'layers_1_size': 30}

In [18]:
#y_test_pred_proba = pipeline.predict(X_test)
y_test_pred_classes = np.where(grid_result.predict(X_test) > 0.5, 1, 0)



In [19]:
fpr, tpr, threshold = roc_curve(y_test, y_test_pred_classes)
roc_auc = auc(fpr, tpr)
print("Test AUC:", roc_auc)

Test AUC: 0.780880518556413


### Hyperopt

In [None]:
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK, Trials

In [None]:
def neural_network_wrapper(layers_1_size):
    model = keras.Sequential(
        [
            layers.Dense(layers_1_size, activation="relu", input_shape=(200,)),
            layers.Dense(units=1, activation="sigmoid")
        ]
    )
    model.compile(
        loss="binary_crossentropy", 
        optimizer="adam", 
        metrics=["AUC"]
    )
    
    return model

In [None]:
def objective(layers_1_size):
    clf = neural_network_wrapper(layers_1_size)
    
    y_test_pred_classes = np.where(model.predict(X_test) > 0.5, 1, 0)
    fpr, tpr, threshold = roc_curve(y_test, y_test_pred_classes)
    roc_auc = auc(fpr, tpr)
    
    return {'loss': -roc_auc, 'status': STATUS_OK}

In [None]:
search_space = hp.uniform('layers_1_size', 10, 50)

In [None]:
argmin = fmin(
  fn=objective,
  space=search_space,
  algo=tpe.suggest,
  max_evals=100)

In [None]:
print("Best value found: ", argmin)

# Submission

In [None]:
df_test = pd.read_csv("../data/test.csv")

submission_id = df_test["id"]
df_test = df_test.drop(columns="id")

df_test = preprocessor.transform(df_test)


#submission_y = pipeline.predict_proba(df_test)[:, 1]
submission_y = model.predict(df_test)

submission = pd.DataFrame()
submission["id"] = submission_id
submission["target"] = submission_y
submission.to_csv(f"../submissions/submission_{timestamp()}.csv", index=False)