# Mend the Money Marker

### Practicum Overview

In this practicum, we'll be "mending" the money marker. In other words, we're going to create a neural network that can identify individuals who, in 5 years' time, will make more than $50,000. We'll use a real dataset from the 1994 US Census to do so.

<div style="text-align: center;"> <img src = "res/model_building/money_marker_icon.jpg" width="25%"/> </div>

However, there's an issue! Our first neural network sucks! It's diagrammed below. The diagram is accurate except for the input layer. The actual input layer will have over 100 neurons (1 neuron per feature), but 100+ neurons cannot be intelligibly displayed in this sort of diagram. Thus, for the input layer, each neuron drawn represents 10 actual neurons.

<div style="text-align: center;"> <img src = "res/model_building/money_marker_initial_nn.jpg" width="75%"/> </div>

Thus, <strong> you will be tinkering with neural network architecture </strong> in order to fix this neural network. Namely, you'll be engaging in both steps of the neural network diagnosis process, in which you'll combat (i) underfitting and then (ii) overfitting.

<div style="text-align: center;"> <img src = "res/model_building/money_marker_steps_to_fixing_nn.jpg" width="30%"/> </div>

# 0 | Google Colab Code

In [None]:
import os
import shutil
import stat

In [None]:
def copy_safe(src, dst, max_len=200):
    """Recursively copy ``src`` into ``dst``, skipping over-long destination paths.

    Args:
        src: Directory to copy from.
        dst: Directory to copy into (created if missing).
        max_len: Destination paths with ``len(path) >= max_len`` are skipped.

    Returns:
        int: Number of files that were NOT copied, either because the
        destination path was too long or because the copy raised ``OSError``.
    """
    skipped = 0
    for root, _dirs, files in os.walk(src):
        rel_path = os.path.relpath(root, src)
        dst_root = dst if rel_path == '.' else os.path.join(dst, rel_path)

        # Directory path itself is too long: none of its files can be copied.
        if len(dst_root) >= max_len:
            skipped += len(files)
            continue

        os.makedirs(dst_root, exist_ok=True)
        for name in files:
            dst_file = os.path.join(dst_root, name)
            if len(dst_file) >= max_len:
                skipped += 1
                continue
            try:
                shutil.copy2(os.path.join(root, name), dst_file)
            except OSError:
                # Best-effort copy: count filesystem failures, but let
                # KeyboardInterrupt / SystemExit propagate (a bare except hid them).
                skipped += 1
    return skipped

In [None]:
# Setup resources if needed
setup_ran = False
if not os.path.exists('res'):
    print("Setting up resources...")
    setup_ran = True
    
    # Cleanup, clone, copy
    repo = 'deep_learning_resources'
    if os.path.exists(repo):
        shutil.rmtree(repo, onerror=lambda f,p,e: os.chmod(p, stat.S_IWRITE) or f(p))
    
    !git clone --depth=1 https://github.com/jjv31/deep_learning_resources
    
    if os.path.exists(f'{repo}/res'):
        skipped = copy_safe(f'{repo}/res', 'res')
        print(f"Setup complete! {'(' + str(skipped) + ' long filenames skipped)' if skipped else ''}")
    
    shutil.rmtree(repo, onerror=lambda f,p,e: os.chmod(p, stat.S_IWRITE) or f(p))

In [None]:
# Only refresh if we just downloaded resources
# (setup_ran is set to True by the setup cell when 'res' did not exist yet)
if setup_ran:
    from IPython.display import Javascript, display
    import time
    
    print("Refreshing images...")
    
    # Try browser refresh + aggressive image reload
    # The injected script (1) schedules a page reload after 2000 ms and
    # (2) re-assigns the src of every <img> whose URL contains 'res/', appending a
    # '?v=<unix time>_<index>' query string, staggered 50 ms apart per image.
    # Braces are doubled because the script is built with an f-string.
    display(Javascript(f'''
    try {{ setTimeout(() => window.location.reload(true), 2000); }} catch(e) {{}}
    
    const t = {int(time.time())};
    document.querySelectorAll('img').forEach((img, i) => {{
        if (img.src.includes('res/')) {{
            const src = img.src.split('?')[0];
            setTimeout(() => img.src = src + '?v=' + t + '_' + i, i * 50);
        }}
    }});
    '''))
    
    print("If images don't appear, press Ctrl+Shift+R to hard refresh!")
else:
    print("Resources already exist, skipping setup.")

# 1 | Imports & Preprocessing

### 1.0 | Imports & Auxiliary Functions

Just run these. No need to modify them.

In [None]:
#load libraries
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

#Scikit-learn libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

#Set plot styles
%matplotlib inline
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

# Get Pandas to display all rows/columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None) 

In [None]:
# Mutes Pandas' annoying future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Neural Network
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout
from keras.optimizers import Adam
from tensorflow.keras import regularizers
import keras as keras

In [None]:
def print_univariates_metric(data, nameToPrint=None):
    """Print descriptive statistics for a pandas Series.

    Prints a header line followed by one line holding the mean, median, first
    mode, min, max, standard deviation and the 25th/75th percentiles.

    Args:
        data: Series to summarise.
        nameToPrint: Label shown in the header line.
    """
    print(f"Descriptives for {nameToPrint}")

    # mode() can return several values (multimodal) or none; report only the first
    modes = data.mode()
    first_mode = modes[0] if len(modes) != 0 else None

    mean_val = round(data.mean(), 2)
    median_val = round(data.median(), 2)
    lowest, highest = data.min(), data.max()
    spread = round(data.std(), 2)
    q25, q75 = data.quantile(0.25), data.quantile(0.75)

    summary = (f"Mean = {mean_val} | Median = {median_val} | Mode = {first_mode} | "
               f"Min = {lowest} | Max = {highest} | SD = {spread} | "
               f"IQR(25) = {q25} | IQR(75) = {q75}")
    print(summary)

In [None]:
#Function to facilitate evaluating our models
def print_score(clf, X, y_true):
    """Print F1, recall and precision of a binary classifier on ``(X, y_true)``.

    Args:
        clf: A fitted Keras model (its predictions are treated as probabilities
            and thresholded at 0.5) or any other object whose ``predict(X)``
            already returns class labels (e.g. a scikit-learn estimator).
        X: Feature matrix passed to ``clf.predict``.
        y_true: Ground-truth binary labels.
    """
    # Gets predicted labels
    if isinstance(clf, keras.models.Model):
        # Any Keras model (Sequential is a Model subclass), not just Sequential:
        # convert predicted probabilities to 0/1 labels
        y_pred = (clf.predict(X) >= 0.5).astype(int)
    else:  # Normal scikit-learn model
        y_pred = clf.predict(X)

    # Gets key performance indicators (rounded to 4 decimals before display)
    recall = round(recall_score(y_true, y_pred), 4)
    precision = round(precision_score(y_true, y_pred), 4)
    f1 = round(f1_score(y_true, y_pred), 4)

    # Displays them
    print(f"F1 = {f1:.4f} | Recall = {recall* 100:.2f}% | Precision = {precision*100:.2f}%")

In [None]:
# Plots the performance of the neural network
def plot_performance(training_values, validation_values, metric_name = "Recall"):
    """Plot a per-epoch training curve against its validation curve.

    Args:
        training_values: Metric value per epoch on the training set.
        validation_values: Metric value per epoch on the validation set.
        metric_name: Label used in the title, y-axis and legend.
    """
    # Epochs are numbered from 1 on the x-axis
    epochs = range(1, len(training_values) + 1)

    sns.set()
    plt.plot(epochs, training_values, '-', label=f'Training {metric_name}')
    plt.plot(epochs, validation_values, ':', label=f'Validation {metric_name}')

    plt.title(f'Training and Validation {metric_name}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend(loc='lower right')
    # Render the finished figure (the previous bare plt.plot() call drew nothing)
    plt.show()

### 1.1 | Explore Data

In [None]:
# Import the data
df = pd.read_csv("res/model_building/income.csv")
df.head(3)

In [None]:
# Displays the label (income_5y)
df["income_5y"].value_counts()

### 1.2 | Preprocessing: Check for NAs

In [None]:
# Checks to see if there's any NAs.
# Sum the per-column NA counts: the total is 0 only when no cell is missing.
# (The earlier `.sum().all() == 0` passed whenever at least one column was NA-free.)
na_per_column = df.isnull().sum()
assert na_per_column.sum() == 0, f"NAs found in: {na_per_column[na_per_column > 0].to_dict()}"
print("Congratulations. There are no NAs in your dataset.")

### 1.3 | Preprocessing: Handle Numeric Features

In [None]:
# Standardizes age into z-scores (neural networks prefer smaller numbers),
# printing the descriptives before and after the conversion
print_univariates_metric(df["age"], "Age")

u, sd = df["age"].mean(), df["age"].std()

# z = (value - mean) / standard deviation, rounded to 4 decimals
df["age"] = df["age"].sub(u).div(sd).round(4)
print_univariates_metric(df["age"], "Age (z-scores)")

In [None]:
# Standardizes hours per week into z-scores,
# printing the descriptives before and after the conversion
print_univariates_metric(df["hours-per-week"], "Hours per Week")

u, sd = df["hours-per-week"].mean(), df["hours-per-week"].std()

# z = (value - mean) / standard deviation, rounded to 4 decimals
df["hours-per-week"] = df["hours-per-week"].sub(u).div(sd).round(4)
print_univariates_metric(df["hours-per-week"], "Hours per Week (z-scores)")

In [None]:
# HANDLES CAPITAL GAIN

# Descriptives for everyone, then only for people who actually have a capital gain
gain = df["capital-gain"]
gain_nonzero = gain[gain != 0]
print_univariates_metric(gain, "Capital Gain")
print_univariates_metric(gain_nonzero, "Capital Gain (no zeros)")
# Most don't have capital gain. Of those who do, it's usually modest (<$10K).

# Summarize the large raw amount as three binary flags; the median/mean
# thresholds are computed over the non-zero gains only
df["has_capital_gain"] = gain > 0
df["capital_gain_above_median"] = gain > np.median(gain_nonzero)
df["capital_gain_above_mean"] = gain > np.mean(gain_nonzero)

# The flags now stand in for the raw column, so drop it.
df = df.drop(columns=["capital-gain"])

In [None]:
# HANDLES CAPITAL LOSS

# Descriptives for everyone, then only for people who actually have a capital loss
loss_amount = df["capital-loss"]
loss_nonzero = loss_amount[loss_amount != 0]
print_univariates_metric(loss_amount, "Capital Loss")
print_univariates_metric(loss_nonzero, "Capital loss (no zeros)")

# Summarize the large raw amount as binary flags.
# Mean and median are about the same, so only the mean threshold is kept
# (computed over the non-zero losses only)
df["has_capital_loss"] = loss_amount > 0
df["capital_loss_above_mean"] = loss_amount > np.mean(loss_nonzero)

# The flags now stand in for the raw capital-loss column, so drop it.
df = df.drop(columns=["capital-loss"])

### 1.4 | Preprocessing: Handle Nominal Features

We'll handle these by one-hot encoding them

In [None]:
df.head(1)

In [None]:
# One-hot encode every nominal column. Keys are the source columns,
# values are the prefixes given to the generated dummy columns.
nominal_prefixes = {
    'workclass': 'class_of_work',
    'education': 'education',
    "marital-status": "marital_status",
    "occupation": "occupation",
    "relationship": "relationship",
    "race": "race",
    "sex": "sex",
    "native-country": "native_country",
}
df = pd.get_dummies(df, columns=list(nominal_prefixes.keys()),
                    prefix=list(nominal_prefixes.values()))

In [None]:
# Converts all bools to binary integers (0/1).
# Assigning with df[cols] replaces the columns outright, so they take the new
# integer dtype; writing ints into the existing bool columns through
# .loc[:, cols] relied on a deprecated upcast and emitted warnings.
boolean_cols = df.select_dtypes(include='bool').columns
df[boolean_cols] = df[boolean_cols].astype(int)

### 1.5 | View Preprocessed Dataset

In [None]:
df.head(2)

In [None]:
df.dtypes

# 2 | Split Data into Training and Testing Sets

In [None]:
# Separate the output (y) from the inputs (X). The output is what we're hoping to predict.
# In machine learning lingo, the input variable should be named X (capital x) and the output variable should be named y (lowercase y)
y = df['income_5y']
X = df.drop(columns=['income_5y'])

In [None]:
#Displays first rows of features. Confirms (a) they do not contain the label and (b) they look OK
X.head(3)

In [None]:
#Displays first 3 outputs. Confirms we only have the label (i.e. the income_5y column)
y.head(3)

In [None]:
# Splits into train/test: 20% of the rows are held out for testing,
# and the fixed random_state makes the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#Let's examine the sizes of the training and testing sets
print(f"Training set size = {y_train.size}\nTesting set size = {y_test.size}")

# 3 | Create Initial Neural Network

### 3.0 | Section Overview

We'll construct & train the initial neural network (§3.1), evaluate it so that it outputs a score (§3.2), and then evaluate its training (§3.3). In the next section (§4), you'll go through this model and fix it.

However, there's a rule --- <strong> do NOT change the number of epochs or the batch size </strong>. In other words, we're going to make it so you can only train the neural network for 50 passes (epochs) over the training set. 

This means you will be forced to improve the neural network by tinkering with its architecture. Allowing it to train more (via increasing the number of epochs) will certainly improve its performance, but that would be antithetical to the learning objective of understanding neural network architecture.

In [None]:
# Initialize constants. DO NOT CHANGE ANY OF THESE.
EPOCHES = 50       # number of passes over the training set used by every fit() call
BATCH_SIZE = 2048  # number of rows per gradient update

# One input node per column of X
NUMBER_OF_FEATURES = X.shape[1]
print(f"There are {NUMBER_OF_FEATURES} features to be inputted into the neural network. "
      f"Thus, there should be {NUMBER_OF_FEATURES} input nodes.")

### 3.1 | Construct & Train Initial Neural Network

In [None]:
# Create neural network
# This is the deliberately weak starting model: one input per feature,
# a single hidden neuron with a linear activation, and a sigmoid output neuron.
initial_neural_network = Sequential() 
initial_neural_network.add( Input( shape= (NUMBER_OF_FEATURES,) ) ) 
initial_neural_network.add(Dense(1, activation='linear'))
initial_neural_network.add(Dense(1, activation='sigmoid')) 

# Compiles the model
# The learning rate below is 1e-9. The metric names "precision" and "recall"
# become the keys read from hist.history when the training is plotted.
my_learning_rate = 0.000000001
initial_neural_network.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=my_learning_rate), 
              metrics=[keras.metrics.Precision(name="precision"), keras.metrics.Recall(name="recall"),]) 
initial_neural_network.summary()

In [None]:
#Trains the model
hist = initial_neural_network.fit(X_train, y_train, validation_data=(X_test, y_test), 
                 epochs=EPOCHES, batch_size=BATCH_SIZE)

### 3.2 | Evaluates Model

In [None]:
print_score(initial_neural_network, X_test, y_test)

### 3.3 | Evaluates Training

In [None]:
loss, val_loss = hist.history["loss"], hist.history["val_loss"]
plot_performance(loss, val_loss, "Loss (Error)")

In [None]:
prec, val_prec = hist.history["precision"], hist.history["val_precision"]
plot_performance(prec, val_prec, "Precision")

In [None]:
recall, val_recall = hist.history["recall"], hist.history["val_recall"]
plot_performance(recall, val_recall, "Recall")

# 4 | Combat Underfitting: Fix the Neural Network ("Money Marker")

### 4.0 | Section Overview

Our neural network is so bad that it's not even fitting the training data. Thus, the first step should be to combat underfitting.

<div style="text-align: center;"> <img src = "res/model_building/money_marker_steps_to_fixing_nn_underfitting.jpg" width="30%"/> </div>

You're welcome to perform one or more of the following operations to combat underfitting:

<ul>
  <li> Add neurons to (or remove neurons from) the hidden layer. </li>
  <li> Add more hidden layer(s). </li>
  <li> Adjust the activation function. </li>
  <li> Adjust the learning rate. </li>
</ul>

### 4.1 | Create your Neural Network

In [None]:
# ******************************
# EXERCISE
# *******************************

# Fix this neural network

# Create neural network
# Starting point: one linear hidden neuron feeding a sigmoid output neuron.
your_neural_network = Sequential() 
your_neural_network.add( Input( shape= (NUMBER_OF_FEATURES,) ) ) 
your_neural_network.add(Dense(1, activation='linear'))
your_neural_network.add(Dense(1, activation='sigmoid')) 

# Compiles the model
# NOTE(review): this value is 1e-8, whereas the initial network in §3.1 uses 1e-9 — confirm which is intended.
my_learning_rate = 0.00000001 # HINT: This is a very small learning rate. The default is 0.001.
your_neural_network.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=my_learning_rate), 
              metrics=[keras.metrics.Precision(name="precision"), keras.metrics.Recall(name="recall"),]) 
your_neural_network.summary()

In [None]:
#Trains the model.
hist = your_neural_network.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=EPOCHES, batch_size=BATCH_SIZE)

### 4.2 | Evaluates Model

In [None]:
# Initial Model
print("Score of your model.")
print_score(your_neural_network, X_test, y_test)

In [None]:
print("Here's the score of the initial neural network. Does your F1 score exceed this one?")
print_score(initial_neural_network, X_test, y_test)

### 4.3 | Evaluates Training

In [None]:
loss, val_loss = hist.history["loss"], hist.history["val_loss"]
plot_performance(loss, val_loss, "Loss (Error)")

In [None]:
prec, val_prec = hist.history["precision"], hist.history["val_precision"]
plot_performance(prec, val_prec, "Precision")

In [None]:
recall, val_recall = hist.history["recall"], hist.history["val_recall"]
plot_performance(recall, val_recall, "Recall")

# 5 | Combat Overfitting: Fix the Neural Network ("Money Marker")

### 5.0 | Section Overview

Good news! I managed to create a neural network that works! The bad news is that it's overfitted, something we'll need to fix.

<div style="text-align: center;"> <img src = "res/model_building/money_marker_steps_to_fixing_nn_overfitting.jpg" width="30%"/> </div>

To combat overfitting, you will do one of the following:

<ul>
  <li> Add L1 or L2 Regularization. </li>
  <li> Add a dropout layer </li>
</ul>

### 5.1 | Create and Display Overfitted Model

In [None]:
# Let's create an overfitted neural network.
# One hidden layer of 64 ReLU neurons feeding a sigmoid output neuron.
overfitted_neural_network = Sequential() 
overfitted_neural_network.add( Input( shape= (NUMBER_OF_FEATURES,) ) ) 
overfitted_neural_network.add(Dense(64, activation='relu'))
overfitted_neural_network.add(Dense(1, activation='sigmoid')) 

# Compiles the model
# The metric names "precision" and "recall" become keys in hist.history.
my_learning_rate = 0.01
overfitted_neural_network.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=my_learning_rate), 
              metrics=[keras.metrics.Precision(name="precision"), keras.metrics.Recall(name="recall"),]) 

# Trains the overfitted model
# The test split doubles as validation data; `hist` is overwritten with this run's history.
hist = overfitted_neural_network.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=EPOCHES, batch_size=BATCH_SIZE)

In [None]:
# Let's see the scores
print("Score of overfitted model")
print_score(overfitted_neural_network, X_test, y_test)

In [None]:
# The score isn't bad, but let's look at the loss function.
# Validation loss should go down with training loss, but it's not. That's evidence of overfitting.

loss, val_loss = hist.history["loss"], hist.history["val_loss"]
plot_performance(loss, val_loss, "Loss (Error)")

### 5.2 | Your Fix: Compile & Train Your Model

In [None]:
# ***************************
# EXERCISE 
# ***************************

# The overfitted model is pasted below. 
# Change the model by either adding L1/L2 regularization OR a dropout layer.

# To add a dropout layer, insert this line after the hidden Dense layer:
# your_fix.add(Dropout(0.XX)) 
# Replace 0.XX with a decimal between 0 and 1. This is the fraction of neurons in the previous layer that should be dropped.

# To add regularization, pass kernel_regularizer to the hidden Dense layer:
# your_fix.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001) ) )
# " kernel_regularizer=regularizers.l2(0.001) " is the regularization technique you're adding
# l1 regularization uses absolute value, whereas l2 regularization uses the square of the weights.
# The number is the penalty term, where the higher the term, the greater the regularization

your_fix = Sequential() 
your_fix.add( Input( shape= (NUMBER_OF_FEATURES,) ) ) 
your_fix.add(Dense(64, activation='relu') )
your_fix.add(Dense(1, activation='sigmoid')) 

# Compiles the model
# Same loss, optimizer, learning rate and metrics as the overfitted model in §5.1
my_learning_rate = 0.01
your_fix.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=my_learning_rate), 
              metrics=[keras.metrics.Precision(name="precision"), keras.metrics.Recall(name="recall"),]) 
your_fix.summary()

In [None]:
# Trains your model
hist = your_fix.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=EPOCHES, batch_size=BATCH_SIZE)

### 5.3 | Evaluate Your Fix

Let's start by graphing the loss function. Does the validation loss closely track the training loss? If so, congratulations! You've reduced overfitting!

In [None]:
loss, val_loss = hist.history["loss"], hist.history["val_loss"]
plot_performance(loss, val_loss, "Loss (Error)")

In [None]:
# Let's see the scores
print("Score of your model.")
print_score(your_fix, X_test, y_test)

In [None]:
# Let's see the scores
print("Score of the original overfitted model.")
print_score(overfitted_neural_network, X_test, y_test)