# **Over-Fitting Desing**

### ***Loading Libraries***

In [1]:
# Operating Systems
import os
import shutil

# Numerical Computing
import numpy as np

# Data Manipuation
import pandas as pd

# SciPy
import scipy
from scipy import stats

# Data Visualization
import itertools
import seaborn as sns
import matplotlib.pyplot as plt

# BigQuery
from google.cloud import bigquery
from google.colab import auth

# Scikit-Learn
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer

# Extreme Gradient Boosting
import xgboost as xgb

# TensorFlow
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras import Model
import tensorflow_datasets as tfds
from tensorflow_hub import KerasLayer
from tensorflow import feature_column as fc
from tensorflow.keras.preprocessing import text
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks, layers, models, utils
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, Input, Flatten, Conv2D, MaxPooling2D

### ***Loading Data***

In [3]:
df = pd.read_csv("/content/babyweight_train.csv")

#### ***Model Features Set-Up***

In [16]:
df.is_male = df.is_male.astype(str)
df.mother_race.fillna(0, inplace=True)
df.mother_race = df.mother_race.astype(str)

FEATURES = ['is_male', 'mother_age', 'plurality', 'gestation_weeks', 'mother_race']
LABEL = ['weight_pounds']

N_TRAIN = int(df.shape[0] * 0.80)

X_train = df[FEATURES][:N_TRAIN]
X_valid = df[FEATURES][N_TRAIN:]
y_train = df[LABEL][:N_TRAIN]
y_valid = df[LABEL][N_TRAIN:]

X_train = X_train.apply(pd.to_numeric, errors='coerce').fillna(0).astype('float32')
X_valid = X_valid.apply(pd.to_numeric, errors='coerce').fillna(0).astype('float32')
y_train = y_train.astype('float32')
y_valid = y_valid.astype('float32')

X_train_array = X_train.to_numpy()
y_train_array = y_train.to_numpy()
X_valid_array = X_valid.to_numpy()
y_valid_array = y_valid.to_numpy()

#### Input Pipeline

In [17]:
# Train/Validation Dataset
trainds = tf.data.Dataset.from_tensor_slices((X_train_array, y_train_array))

evalds = tf.data.Dataset.from_tensor_slices((X_valid_array, y_valid_array))

In [21]:
BATCH_SIZE = 32

trainds = trainds.shuffle(buffer_size=len(X_train)).batch(BATCH_SIZE)
evalds = evalds.batch(BATCH_SIZE)

In [22]:
# Examine the datasets
for dict_slice in trainds.take(1):
    print("{}\n".format(dict_slice))

(<tf.Tensor: shape=(32, 5), dtype=float32, numpy=
array([[ 0., 19.,  0., 33.,  0.],
       [ 0., 20.,  0., 37.,  1.],
       [ 0., 19.,  0., 39.,  0.],
       [ 0., 19.,  0., 41.,  2.],
       [ 0., 20.,  0., 36.,  1.],
       [ 0., 20.,  0., 37.,  0.],
       [ 0., 20.,  0., 39.,  1.],
       [ 0., 18.,  0., 38.,  1.],
       [ 0., 17.,  0., 40.,  1.],
       [ 0., 18.,  0., 40.,  1.],
       [ 0., 17.,  0., 38.,  0.],
       [ 0., 19.,  0., 39.,  2.],
       [ 0., 20.,  0., 40.,  1.],
       [ 0., 19.,  0., 41.,  0.],
       [ 0., 19.,  0., 37.,  1.],
       [ 0., 20.,  0., 39.,  0.],
       [ 0., 20.,  0., 41.,  1.],
       [ 0., 18.,  0., 38.,  1.],
       [ 0., 20.,  0., 40.,  1.],
       [ 0., 19.,  0., 41.,  0.],
       [ 0., 20.,  0., 38.,  1.],
       [ 0., 18.,  0., 31.,  0.],
       [ 0., 20.,  0., 45.,  1.],
       [ 0., 20.,  0., 31.,  1.],
       [ 0., 17.,  0., 36.,  1.],
       [ 0., 18.,  0., 41.,  1.],
       [ 0., 20.,  0., 39.,  1.],
       [ 0., 18.,  0., 40.,  1.]

#### Set up the feature columns for the model.

In [25]:
# create feature columns to handle categorical variables
numeric_columns = [
    fc.numeric_column("mother_age"),
    fc.numeric_column("gestation_weeks")
]

CATEGORIES = {
    'plurality': list(df.plurality.unique()),
    'is_male': list(df.is_male.unique()),
    'mother_race': list(df.mother_race.unique())
}

categorical_columns = []
for feature, vocab in CATEGORIES.items():
    cat_col = fc.categorical_column_with_vocabulary_list(
        key=feature, vocabulary_list=vocab, dtype=tf.string)
    categorical_columns.append(fc.indicator_column(cat_col))

#### SNN Model Build-Up

In [28]:
# Create Inputs for model
inputs = {colname: tf.keras.layers.Input(name=colname, shape=(), dtype=tf.float32)
          for colname in ["mother_age", "gestation_weeks"]}
inputs.update({colname: tf.keras.layers.Input(name=colname, shape=(), dtype=tf.string)
               for colname in ["plurality", "is_male", "mother_race"]})

# Ensure all inputs are in the correct type before combining
for key in inputs:
    if inputs[key].dtype == tf.string:
        inputs[key] = tf.expand_dims(inputs[key], -1)

# Build DenseFeatures for the model
feature_layer = layers.DenseFeatures(categorical_columns + numeric_columns)
dnn_inputs = feature_layer(inputs)

# Create hidden layers
h1 = layers.Dense(20, activation="relu")(dnn_inputs)
h2 = layers.Dense(10, activation="relu")(h1)

# Create model
output = layers.Dense(1, activation="relu")(h2)
model = tf.keras.models.Model(inputs=inputs, outputs=output)
model.compile(optimizer='Adam',
              loss=tf.keras.losses.MeanSquaredError(),
              metrics=['mse'])

### ***Overfit on a Batch***

In [None]:
for feature_batch, label_batch in trainds.batch(5).take(1):
    print("'babyweight': {}".format(label_batch))
    print("features:")
    for key, value in feature_batch.items():
      print("  {!r:20s}: {}".format(key, value))

In [None]:
BATCH_SIZE = 256

single_batch = trainds.batch(BATCH_SIZE).take(1)

In [None]:
# train the classifcation model
tf.random.set_seed(33)

NUM_TRAINING_EXAMPLES = BATCH_SIZE  # total number of training examples
NUM_VALID_EXAMPLES = X_valid.shape[0]
NUM_EPOCHS = 100
TOTAL_TRAINING_EXAMPLES = int(NUM_EPOCHS * NUM_TRAINING_EXAMPLES)

steps_per_epoch = (TOTAL_TRAINING_EXAMPLES // (BATCH_SIZE * NUM_EPOCHS))

evalds = evalds.batch(X_valid.shape[0]).take(int(NUM_VALID_EXAMPLES//BATCH_SIZE))

# train the model
history = model.fit(single_batch.repeat(),
                    validation_data=evalds,
                    epochs=NUM_EPOCHS,
                    steps_per_epoch=steps_per_epoch,
                    verbose=0
                   )

In [None]:
def plot_curves(history, metrics):
    nrows = 1
    ncols = 2
    fig = plt.figure(figsize=(10, 5))

    for idx, key in enumerate(metrics):
        ax = fig.add_subplot(nrows, ncols, idx+1)
        plt.plot(history.history[key])
        plt.plot(history.history['val_{}'.format(key)])
        plt.title('model {}'.format(key))
        plt.ylabel(key)
        plt.xlabel('epoch')
        plt.legend(['train', 'validation'], loc='upper left');

In [None]:
plot_curves(history, ['loss'])