# Notes

Source: https://www.kaggle.com/c/tabular-playground-series-mar-2021

>Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target.
* https://en.wikipedia.org/wiki/Receiver_operating_characteristic
* `clf.predict_proba(test)[:, 1]`

---

* Cat1 - Cat10 have many categories
* Cont5 is weird
* target class "1" makes only 24%
* think how to combine models
* make pipeline (read the medium article about best practice)
```python
regression = Pipeline(
    steps=[("preprocessor", preprocessor), ("regression", LinearRegression())]
)
regression.fit(X_train, y_train)
```
* read the "sklearn pitfalls"

---
| Model | Optimization | Test AUC | Kaggle AUC | Notes |
| --- | --- | --- | --- | --- |
| LogisticRegression | N | 0.7707 | 0.87 | problems with predicting the underrepresented class "1" |
| LogisticRegression | Y | 0.803 | 0.87 | balanced weights improve test AUC |

# Libraries and helper functions

In [77]:
import math

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn-whitegrid')

import scikitplot as skplt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import roc_curve, auc, confusion_matrix, plot_confusion_matrix, plot_roc_curve

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
import datetime
def timestamp():
    time = f"{datetime.datetime.now().date()}--{datetime.datetime.now().time().replace(microsecond=0)}"
    return time.replace(":", "-")

In [3]:
def print_auc(model):
    global X_train
    global y_train
    global X_test
    global y_test
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    fpr, tpr, threshold = roc_curve(y_train, y_train_pred)
    roc_auc = auc(fpr, tpr)
    print("Training AUC:", roc_auc)

    fpr, tpr, threshold = roc_curve(y_test, y_test_pred)
    roc_auc = auc(fpr, tpr)
    print("Test AUC:", roc_auc)

# Load and Wrangle

In [64]:
df = pd.read_csv("../data/train.csv")

df = df.drop(columns=["id"])

In [None]:
df.info()

In [None]:
df.describe()

# Visualization

In [None]:
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
numerical_features = df.select_dtypes(include="float64").columns
for i in numerical_features:
    sns.histplot(df, x=i)
    plt.show()

In [None]:
sns.histplot(df, x="target")

In [None]:
categorical_features = df.select_dtypes(include="object").columns
for i in categorical_features:
    fig, ax = plt.subplots(1, 2, figsize=(12, 8))
    sns.histplot(df, x=i, ax=ax[0])
    sns.boxplot(data=df, x=i, y="target", ax=ax[1])
    plt.show()

In [None]:
categorical_features = df.select_dtypes(include="object").columns
for i in categorical_features:
    print(i, len(df[i].value_counts()))

In [None]:
categorical_features = df.select_dtypes(include="object").columns
for i in categorical_features:
    print(pd.crosstab(df[i], df["target"], normalize='index'))

# Prepare for Models

In [None]:
# df = df.drop(columns=["cont0", "cont7", "cont9", "cont10", "cat5", "cat8", "cat10"])
# df_test = df_test.drop(columns=["cont0", "cont7", "cont9", "cont10", "cat5", "cat8", "cat10"])

In [38]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="target"),
    df["target"],
    test_size=0.3,
    random_state=42
)

In [39]:
numerical_features = X_train.select_dtypes(include="float64").columns
numerical_transformer = StandardScaler()

categorical_features = X_train.select_dtypes(include="object").columns
categorial_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorial_transformer, categorical_features)
    ],
    sparse_threshold=0
)

# ('select_k_best', select_best_features)

# Models

## Logistic Regression

In [26]:
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ('select_k_best', select_best_features),
        ("model", LogisticRegression())
    ]
)

pipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  Index(['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10'],
      dtype='object')),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  Index(['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16',
       'cat17', 'cat18'],
      dtype='object'))])),
                ('select_k_best', SelectKBest(k=150)),
                ('model', LogisticRegression())])

In [27]:
print_auc(pipeline)

Training AUC: 0.7644333719274699
Test AUC: 0.7686323942160777


### Hyperparameter optimization

In [None]:
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", LogisticRegression())
    ]
)

hyperparameters = {
    "model__penalty": ["l1", "l2"],
    "model__C": np.logspace(0, 4, 10),
    "model__class_weight": ["balanced", None]
}

gridsearch = GridSearchCV(pipeline, hyperparameters, scoring="roc_auc", cv=3, verbose=0, n_jobs=-1)

logistic_regression = gridsearch.fit(X_train, y_train)

In [None]:
logistic_regression.best_estimator_.get_params()

In [None]:
print_auc(logistic_regression)

In [None]:
plot_roc_curve(logistic_regression, X_test, y_test)

In [None]:
matrix = confusion_matrix(y_test, y_test_pred, normalize="true")
matrix

In [None]:
plot_confusion_matrix(logistic_regression, X_test, y_test, normalize="true")
plt.show()

In [None]:
logistic_regression.best_params_

## Decision Trees

In [None]:
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", DecisionTreeClassifier())
    ]
)

pipeline.fit(X_train, y_train)

In [None]:
print_auc(pipeline)

### Hyperparameter optimization

In [None]:
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", DecisionTreeClassifier())
    ]
)

hyperparameters = {
    "model__max_depth": [5, 10, 50],
    "model__min_samples_split": [5, 10, 50],
    "model__class_weight": ["balanced", None]
}

gridsearch = GridSearchCV(pipeline, hyperparameters, scoring="roc_auc", cv=3, verbose=0, n_jobs=-1)

tree_classifier = gridsearch.fit(X_train, y_train)

In [None]:
tree_classifier.best_estimator_.get_params()

In [None]:
print_auc(tree_classifier)

## SGD

In [None]:
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", SGDClassifier())
    ]
)

pipeline.fit(X_train, y_train)

In [None]:
print_auc(pipeline)

### Hyperparameter optimization

In [None]:
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", SGDClassifier())
    ]
)

hyperparameters = {
    "model__loss": ["hinge", "log"],
    "model__penalty": ["l2", "l1"],
    "model__alpha": [0.0001, 0.001, 0.01, 0.1, 0.5, 1, 5, 10],
    "model__class_weight": ["balanced", None]
}

gridsearch = GridSearchCV(pipeline, hyperparameters, scoring="roc_auc", cv=3, verbose=0, n_jobs=-1)

sgd = gridsearch.fit(X_train, y_train)

In [None]:
sgd.best_estimator_.get_params()

In [None]:
print_auc(sgd)

## Neural Networks

In [78]:
def neural_network_wrapper():
    model = keras.Sequential(
        [
            layers.Dense(50, activation="relu", input_shape=(200,)),
            layers.Dense(units=20, activation="relu"),
            layers.Dense(units=1, activation="sigmoid")
        ]
    )
    model.compile(
        loss="binary_crossentropy", 
        optimizer="adam", 
        metrics=["AUC"]
    )
    
    return model

In [82]:
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ('select_k_best', SelectKBest(f_classif, k=200)),
        ("model", neural_network_wrapper())
    ]
)

pipeline.fit(X_train, y_train, model__epochs=20, model__verbose=1, model__validation_data=(X_test, y_test))

Epoch 1/20

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [80]:
#y_test_pred_proba = pipeline.predict(X_test)
y_test_pred_classes = np.where(pipeline.predict(X_test) > 0.5, 1, 0)

In [81]:
fpr, tpr, threshold = roc_curve(y_test, y_test_pred_classes)
roc_auc = auc(fpr, tpr)
print("Test AUC:", roc_auc)

Test AUC: 0.777037668159822


# Submission

In [66]:
df_test = pd.read_csv("../data/test.csv")

submission_id = df_test["id"]
df_test = df_test.drop(columns="id")

#submission_y = pipeline.predict_proba(df_test)[:, 1]
submission_y = pipeline.predict(df_test)

submission = pd.DataFrame()
submission["id"] = submission_id
submission["target"] = submission_y
submission.to_csv(f"../submissions/submission_{timestamp()}.csv", index=False)