In [None]:
# -------------------------------------------------------------------
# Step 0: Setup
# -------------------------------------------------------------------

# Uncomment and adjust to install libraries not available on Google Colab
# !pip install packagename

# Load any libraries used for this project

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    roc_curve,
    roc_auc_score
)

# for stepwise selection later
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LogisticRegression


In [None]:
# Load data
# Ask for an upload only when the CSV is not already in the working directory,
# so re-running the notebook does not require repeating the manual step.
from pathlib import Path

# NOTE: this filename must match the one passed to pd.read_csv in the next cell.
if Path("PalmerPenguins.csv").exists():
    uploaded = {}  # placeholder so `uploaded` is always defined; nothing uploaded this run
else:
    from google.colab import files  # so we can upload a CSV to our notebook

    uploaded = files.upload()  # opens a file chooser to select our csv file


In [None]:
# Read the uploaded CSV from the working directory into a DataFrame.
penguins = pd.read_csv(filepath_or_buffer="PalmerPenguins.csv")

In [None]:
### Step 1: Pre-processing ----

# check data structure

# any obvious pre-formatting, do here (e.g., numeric to categorical etc.)

# Initial structure of the data
print("Structure / dtypes:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
penguins.info()

# Count of missing values in each column
print("\nMissing values per column:")
print(penguins.isna().sum())


In [None]:
# Let's drop rows with missing values
penguins = penguins.dropna()

# Let's turn sex into a binary variable (male = 1, female = 0).
# Only encode while the column still holds labels: re-running this cell on the
# already-encoded 0/1 column would otherwise map every value to NaN.
if not pd.api.types.is_numeric_dtype(penguins["sex"]):
    # Normalise case/whitespace so e.g. "Male" or " female" are still recognised.
    sex_labels = penguins["sex"].astype(str).str.strip().str.lower()
    sex_encoded = sex_labels.map({"male": 1, "female": 0})

    # Missing rows were dropped above, so any NaN here is an unrecognised label.
    # Fail loudly instead of passing NaN outcomes on to the model.
    unknown_labels = sex_labels[sex_encoded.isna()].unique()
    if len(unknown_labels) > 0:
        raise ValueError(
            f"Unrecognised values in 'sex' column: {sorted(unknown_labels)}"
        )

    # .assign returns a new frame rather than writing into the dropna() result.
    penguins = penguins.assign(sex=sex_encoded.astype("int64"))

In [None]:
### Step 2: Split data into train and test sets ----

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
train, test = train_test_split(penguins, test_size=0.30, random_state=90210)


In [None]:
### Step 3: Exploratory data analysis ----

# Summarize and visualize your *training* data
# Do so to understand the data set and inform both
# model selection and feature engineering

# summary statistics
print("\nTrain summary:")
print(train.describe())

# From past experience, here are our numeric predictors
predictors = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g'
]

# Pairs plot of the *training* data only, colored by our outcome variable.
# The frame is built from `train` (not the full `penguins` data) so that no
# test rows enter the exploratory plots; sex is cast to a category so the hue
# is treated as two discrete groups.
pairplot_data = train[predictors + ['sex']].assign(
    sex=lambda df: df['sex'].astype("category")
)

sns.pairplot(
    pairplot_data,
    hue='sex',
    diag_kind='kde'
    )

plt.show()



# Instructions

1. Build an initial model of all variables.
2. Plot a ROC curve and calculate AUC in-sample, based on the training data
3. Use stepwise regression (that's both directions) to select variables for a
    second model
4. Get AUC and plot a ROC curve for both models (if they are different)
    on your test data
5. Choose the model you think is better.
6. Calculate exact marginal effects of the odds.
7. Calculate average marginal effects of the probability of 'male'
8. Pick the variable that has the highest MFX in odds and in probabilities.
    Are they the same variable?
9. For each, write a sentence in plain English interpreting the marginal effect
    of that variable on the odds/probability of a penguin being male.

In [None]:
### Step 4: Feature engineering ----

# Any variable transformations, creation of new variables etc.
# goes here. Also imputing missing values etc.
# Don't forget to apply it to your test set too.




In [None]:
### Step 5: Feature and Model selection ----

# Build different models, select different hyperparameters, validate the models
# etc. here. Steps 4 and 5 are usually iterative until you've settled on a model
# or models that you're comfortable with.




In [None]:
### Step 6: Predictions and final model evaluation ----

# Predict on your test set to accurately measure out-of-sample performance
# Past this point, you need to report results back in the real world
