In [10]:
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

import matplotlib.pyplot as plt
import tensorflow.keras as keras

from keras.layers import Dense
from keras.models import Sequential
from keras.regularizers import L2

from collections import defaultdict
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures

You can call this for the binary classification problem if you set `from_logits=False` 
when declaring the loss. With this, you will also not need to call the `tf.math.sigmoid()` function in the loop because the model output is already a probability.

In [11]:
def build_models(rows_len, regu):
    """Build three fully connected binary classifiers of increasing depth.

    Each model is a ``Sequential`` stack of ReLU ``Dense`` layers followed by
    a single sigmoid unit, so the model output is already a probability.
    Every ``Dense`` layer carries an L2 kernel regularizer of strength
    ``regu``. The models are returned uncompiled.

    Args:
        rows_len: Input shape handed to ``keras.Input``. A plain integer
            (the number of input features) is wrapped as ``(rows_len,)``,
            because ``keras.Input`` expects an iterable shape; any other
            value is forwarded unchanged.
        regu: L2 regularization factor used for every layer's kernel.

    Returns:
        dict mapping ``'Model_1'``, ``'Model_2'`` and ``'Model_3'`` to the
        corresponding ``Sequential`` model.
    """
    print("build_models")
    # Seed TensorFlow's global generator before any layer is created.
    tf.random.set_seed(20)

    # Accept a bare feature count as well as a ready-made shape tuple.
    if isinstance(rows_len, (int, np.integer)):
        input_shape = (int(rows_len),)
    else:
        input_shape = rows_len

    # Hidden-layer widths per model; the sigmoid output unit is appended below.
    hidden_units = {
        'Model_1': (14, 12),
        'Model_2': (20, 12, 12, 20),
        'Model_3': (32, 16, 8, 4, 12),
    }

    models = dict()
    for model_name, units in hidden_units.items():
        layers = [keras.Input(shape=input_shape)]
        layers += [
            Dense(n_units, activation='relu', kernel_regularizer=L2(regu))
            for n_units in units
        ]
        layers.append(Dense(1, activation='sigmoid', kernel_regularizer=L2(regu)))
        models[model_name] = Sequential(layers, name=model_name)

    return models

In [20]:
def poly_optimizer(degrees, datasets):
    """Fit one linear regression per polynomial degree and record its errors.

    For every degree the inputs are expanded with ``PolynomialFeatures``
    (no bias column) and rescaled with a ``MinMaxScaler``. Both transformers
    are fitted on the training split only and then applied to the
    cross-validation and test splits.

    Args:
        degrees: Iterable of polynomial degrees to evaluate.
        datasets: Sequence unpacked as
            ``(X_train, y_train, X_cv, y_cv, X_test, y_test)``.

    Returns:
        Tuple ``(maps, mses)`` of ``defaultdict(list)`` objects with one
        entry per degree:
        ``maps`` has keys ``'train'``, ``'cv'``, ``'test'`` (the mapped and
        scaled feature matrices) and ``'models'`` (the fitted regressors);
        ``mses`` has keys ``'train'``, ``'cv'``, ``'test'`` holding the mean
        squared error divided by two.
    """
    X_train, y_train, X_cv, y_cv, X_test, y_test = datasets
    mses = defaultdict(list)
    maps = defaultdict(list)

    for degree in degrees:
        # Expand and scale the training split; both transformers learn from it.
        poly = PolynomialFeatures(degree, include_bias=False)
        scaler_poly = MinMaxScaler()
        train_features = scaler_poly.fit_transform(poly.fit_transform(X_train))
        maps['train'].append(train_features)

        # Fit the regressor for this degree.
        regressor = LinearRegression()
        regressor.fit(train_features, y_train)
        maps['models'].append(regressor)

        # Halved MSE on the training split.
        train_error = mean_squared_error(y_train, regressor.predict(train_features)) / 2
        mses['train'].append(train_error)

        # Reuse the fitted transformers for the held-out splits.
        shapes = [train_features.shape]
        held_out = (('cv', X_cv, y_cv), ('test', X_test, y_test))
        for split_name, X_split, y_split in held_out:
            split_features = scaler_poly.transform(poly.transform(X_split))
            maps[split_name].append(split_features)

            split_error = mean_squared_error(y_split, regressor.predict(split_features)) / 2
            mses[split_name].append(split_error)
            shapes.append(split_features.shape)

        print(degree, *shapes)

    return maps, mses


In [14]:
def search_boundary(model, X, y):
    """Search for the decision threshold that yields the most correct labels.

    The model's predictions are turned into 0/1 labels at every threshold
    from 0.10 to 0.89 (step 0.01) and compared with ``y``. The first
    threshold that reaches the highest number of matches wins.

    Args:
        model: Object exposing ``predict(X)``; its output is flattened to 1-D.
        X: Feature matrix passed to ``model.predict`` (must expose ``.shape``).
        y: Actual labels (must expose ``.shape``); flattened to 1-D, so both
            ``(n,)`` and ``(n, 1)`` layouts are accepted.

    Returns:
        Tuple ``(val, max_correct, y_pred, results)``:
        ``val`` is the best threshold (``0`` if no threshold produced a
        correct prediction), ``max_correct`` the number of matches at that
        threshold, ``y_pred`` the flattened predictions and ``results`` the
        list of match counts, one per threshold tried.

    Raises:
        ValueError: If the number of predictions differs from the number of
            labels.
    """
    print(model, X.shape, y.shape)

    # Flatten both sides. reshape(-1) keeps a single sample as a length-1
    # vector, whereas squeeze() would collapse it to a 0-d array.
    y_actual = np.asarray(y).reshape(-1)
    y_pred = np.asarray(model.predict(X)).reshape(-1)

    if y_pred.shape != y_actual.shape:
        raise ValueError(
            f"Got {y_pred.shape[0]} predictions for {y_actual.shape[0]} labels"
        )

    results = []
    max_correct = val = 0

    for step in range(10, 90, 1):
        x_val = step / 100
        # Vectorised equivalent of thresholding and counting matches.
        y_vals = (y_pred >= x_val).astype(int)
        ct = int(np.count_nonzero(y_vals == y_actual))

        results.append(ct)

        # Strict '>' keeps the earliest threshold among ties.
        if ct > max_correct:
            max_correct = ct
            val = x_val

    return val, max_correct, y_pred, results

In [15]:
def split_dataset(x, y, holdout_size=0.40, test_share=0.50, random_state=80):
    """Split a dataset into training, cross-validation and test subsets.

    With the defaults, 60% of the samples go to training and the remaining
    40% are divided evenly between cross-validation and test.

    Args:
        x: Features to split.
        y: Targets to split, aligned with ``x``.
        holdout_size: Fraction of the data withheld from training.
        test_share: Fraction of the withheld data assigned to the test set;
            the rest becomes the cross-validation set.
        random_state: Seed forwarded to both ``train_test_split`` calls so
            the partition is reproducible.

    Returns:
        Tuple ``(x_train, y_train, x_cv, y_cv, x_test, y_test)``.
    """
    # First cut: training set versus everything held out.
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        x, y, test_size=holdout_size, random_state=random_state
    )

    # Second cut: divide the held-out part into cross-validation and test.
    x_cv, x_test, y_cv, y_test = train_test_split(
        x_holdout, y_holdout, test_size=test_share, random_state=random_state
    )

    return x_train, y_train, x_cv, y_cv, x_test, y_test

## Useful Methods for Plotting Graphs

In [21]:
def plot_train_cv_test_mses(degrees, train_mses, cv_mses, test_mses, title):
    """Plot training, cross-validation and test MSE curves against degree.

    Args:
        degrees: X-axis values (polynomial degrees).
        train_mses: Training errors, one per degree (drawn in red).
        cv_mses: Cross-validation errors, one per degree (drawn in blue).
        test_mses: Test errors, one per degree (drawn in green).
        title: Title of the figure.
    """
    curves = (
        (train_mses, 'r', 'training MSEs'),
        (cv_mses, 'b', 'CV MSEs'),
        (test_mses, 'g', 'test MSEs'),
    )
    for errors, colour, legend_label in curves:
        plt.plot(degrees, errors, marker='o', c=colour, label=legend_label)

    plt.title(title)
    plt.xlabel("degree")
    plt.ylabel("MSE")
    plt.legend()
    plt.show()

In [17]:
def visualize_data(diabetes_df, features):
    """Draw a set of exploratory plots for the diabetes dataset.

    Produces, in order: histograms of every column, histograms restricted to
    rows with ``Diagnosis == 1``, an annotated correlation heatmap, a pair
    plot of ``features`` coloured by ``Diagnosis`` and a 3D scatter of
    ``FastingBloodSugar`` / ``HbA1c`` / ``Diagnosis``.

    Args:
        diabetes_df: DataFrame containing at least the columns
            ``Diagnosis``, ``FastingBloodSugar`` and ``HbA1c`` plus every
            column named in ``features``.
        features: Column names to include in the pair plot.
    """
    diabetes_df.hist(bins=10, figsize=(20, 20), color='g')

    diabetes_pos_df = diabetes_df.query('Diagnosis == 1')
    diabetes_pos_df.hist(bins=10, figsize=(20, 20), color='r')

    # Give the heatmap its own axes; otherwise seaborn draws on the current
    # axes, which at this point belongs to the histogram grid above.
    _, corr_ax = plt.subplots()
    sns.heatmap(diabetes_df.corr(), annot=True, ax=corr_ax)

    sns.pairplot(diabetes_df[list(features) + ['Diagnosis']], hue="Diagnosis")

    fig, ax = plt.subplots(subplot_kw={"projection": "3d"})

    x = diabetes_df['FastingBloodSugar']
    y = diabetes_df['HbA1c']
    z = diabetes_df['Diagnosis']

    ax.scatter(x, y, z)
    # Title and labels are set on the 3D axes itself so they cannot end up
    # on a previously created figure.
    ax.set_title("3d Scatterplot")
    ax.set_xlabel('FastingBloodSugar')
    ax.set_ylabel('HbA1c')
    ax.set_zlabel('Diagnosis')

    plt.show()

## Methods unused
Not used in the lab. You can call this for the binary 
classification problem if you set `from_logits=False` 
when declaring the loss. With this, you will also not need
to call the `tf.math.sigmoid()` function in the loop,
because the model output is already a probability.

In [18]:
def plot_dataset(x, y, title):
    """Show a scatter plot of ``y`` against ``x`` using red 'x' markers.

    Note that the function changes the global matplotlib settings
    ``figure.figsize`` and ``lines.markersize``; they stay in effect for
    later plots as well.

    Args:
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        title: Title of the figure.
    """
    global_settings = (
        ("figure.figsize", (12, 8)),
        ("lines.markersize", 12),
    )
    for setting, value in global_settings:
        plt.rcParams[setting] = value

    plt.scatter(x, y, marker='x', c='r')
    plt.title(title)
    plt.xlabel("x")
    plt.ylabel("y")
    plt.show()
    
def plot_train_cv_test(x_train, y_train, x_cv, y_cv, x_test, y_test, title):
    """Scatter the training, cross-validation and test splits on one figure.

    Args:
        x_train, y_train: Training inputs and targets (red 'x' markers).
        x_cv, y_cv: Cross-validation inputs and targets (blue 'o' markers).
        x_test, y_test: Test inputs and targets (green '^' markers).
        title: Title of the figure.
    """
    plt.scatter(x_train, y_train, marker='x', c='r', label='training')
    plt.scatter(x_cv, y_cv, marker='o', c='b', label='cross validation')
    plt.scatter(x_test, y_test, marker='^', c='g', label='test')
    # Honour the caller's title; it used to be replaced by a fixed string.
    plt.title(title)
    plt.xlabel("x")
    plt.ylabel("y")
    plt.legend()
    plt.show()

def plot_bc_dataset(x, y, title):
    """Scatter a two-feature binary classification dataset.

    Samples whose label equals 1 are drawn as red 'x' markers; all other
    samples are drawn as blue 'o' markers.

    Args:
        x: Array-like with at least two columns; column 0 is plotted on the
            horizontal axis and column 1 on the vertical axis.
        y: Labels, one per row of ``x``; flattened to 1-D before use.
        title: Title of the figure.
    """
    points = np.asarray(x)
    labels = np.asarray(y).reshape(-1)
    is_positive = labels == 1

    # One scatter call per class; the labels feed the legend directly, so no
    # proxy legend handles are required.
    plt.scatter(points[is_positive, 0], points[is_positive, 1],
                marker='x', c='r', label='y=1')
    plt.scatter(points[~is_positive, 0], points[~is_positive, 1],
                marker='o', c='b', label='y=0')

    plt.title(title)
    plt.xlabel("x1")
    plt.ylabel("x2")
    plt.legend()
    plt.show()


def train_plot_poly(model, x_train, y_train, x_cv, y_cv, max_degree=10, baseline=None):
    """Plot training and CV error for polynomial degrees 1..``max_degree``.

    For every degree the inputs are expanded with ``PolynomialFeatures``
    (no bias column) and standardised with a ``StandardScaler`` fitted on
    the training split. ``model`` is refitted in place for each degree, so
    after the call it holds the fit for the last degree.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        x_train, y_train: Training inputs and targets.
        x_cv, y_cv: Cross-validation inputs and targets.
        max_degree: Highest polynomial degree to try.
        baseline: Optional reference error drawn as a dashed horizontal
            line; omitted from the plot when ``None``.
    """
    train_mses = []
    cv_mses = []
    degrees = range(1, max_degree + 1)

    # One iteration per polynomial degree.
    for degree in degrees:

        # Expand and scale the training set; both transformers learn from it.
        poly = PolynomialFeatures(degree, include_bias=False)
        scaler_poly = StandardScaler()
        X_train_mapped_scaled = scaler_poly.fit_transform(poly.fit_transform(x_train))

        # Train the model on the mapped features.
        model.fit(X_train_mapped_scaled, y_train)

        # Halved MSE on the training set.
        yhat = model.predict(X_train_mapped_scaled)
        train_mses.append(mean_squared_error(y_train, yhat) / 2)

        # Apply the already fitted transformers to the cross-validation set.
        X_cv_mapped_scaled = scaler_poly.transform(poly.transform(x_cv))

        # Halved MSE on the cross-validation set.
        yhat = model.predict(X_cv_mapped_scaled)
        cv_mses.append(mean_squared_error(y_cv, yhat) / 2)

    # Plot the results
    plt.plot(degrees, train_mses, marker='o', c='r', label='training MSEs')
    plt.plot(degrees, cv_mses, marker='o', c='b', label='CV MSEs')
    if baseline is not None:
        # Without this guard a 'baseline' legend entry appears with no line.
        plt.plot(degrees, np.repeat(baseline, len(degrees)), linestyle='--', label='baseline')
    plt.title("degree of polynomial vs. train and CV MSEs")
    plt.xticks(degrees)
    plt.xlabel("degree")
    plt.ylabel("MSE")
    plt.legend()
    plt.show()
    
def train_plot_reg_params(reg_params, x_train, y_train, x_cv, y_cv, degree= 1, baseline=None):
    """Plot training and CV error of Ridge regression for several alphas.

    The inputs are expanded with ``PolynomialFeatures`` of the given degree
    (no bias column) and standardised with a ``StandardScaler`` fitted on
    the training split. A fresh ``Ridge`` model is trained per value in
    ``reg_params``. The values are shown as evenly spaced categorical ticks.

    Args:
        reg_params: Iterable of regularization strengths (``alpha``).
        x_train, y_train: Training inputs and targets.
        x_cv, y_cv: Cross-validation inputs and targets.
        degree: Polynomial degree used for the feature expansion.
        baseline: Optional reference error drawn as a dashed horizontal
            line; omitted from the plot when ``None``.
    """
    # Materialise once so a one-shot iterable survives both passes below.
    reg_params = list(reg_params)

    train_mses = []
    cv_mses = []

    # The feature mapping does not depend on the regularization strength,
    # so it is computed a single time outside the loop.
    poly = PolynomialFeatures(degree, include_bias=False)
    scaler_poly = StandardScaler()
    X_train_mapped_scaled = scaler_poly.fit_transform(poly.fit_transform(x_train))
    X_cv_mapped_scaled = scaler_poly.transform(poly.transform(x_cv))

    # One Ridge model per regularization strength.
    for reg_param in reg_params:
        model = Ridge(alpha=reg_param)
        model.fit(X_train_mapped_scaled, y_train)

        # Halved MSE on the training set.
        yhat = model.predict(X_train_mapped_scaled)
        train_mses.append(mean_squared_error(y_train, yhat) / 2)

        # Halved MSE on the cross-validation set.
        yhat = model.predict(X_cv_mapped_scaled)
        cv_mses.append(mean_squared_error(y_cv, yhat) / 2)

    # Plot the results; string labels make matplotlib space them evenly.
    reg_labels = [str(x) for x in reg_params]
    plt.plot(reg_labels, train_mses, marker='o', c='r', label='training MSEs')
    plt.plot(reg_labels, cv_mses, marker='o', c='b', label='CV MSEs')
    if baseline is not None:
        # Without this guard a 'baseline' legend entry appears with no line.
        plt.plot(reg_labels, np.repeat(baseline, len(reg_labels)), linestyle='--', label='baseline')
    plt.title("lambda vs. train and CV MSEs")
    plt.xlabel("lambda")
    plt.ylabel("MSE")
    plt.legend()
    plt.show()

def train_plot_diff_datasets(model, files, max_degree=10, baseline=None):
    """Overlay degree-vs-error curves for several datasets on one figure.

    For each entry in ``files`` the dataset is loaded, and for every degree
    from 1 to ``max_degree`` the inputs are expanded with
    ``PolynomialFeatures`` (no bias column), standardised with a
    ``StandardScaler`` fitted on the training split, and ``model`` is
    refitted in place.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        files: Iterable of dicts with the keys ``'filename'``,
            ``'linestyle'`` and ``'label'``.
        max_degree: Highest polynomial degree to try.
        baseline: Optional reference error drawn as a dashed horizontal
            line; omitted from the plot when ``None``.
    """
    # Defined up front so the final plotting code also works for empty `files`.
    degrees = range(1, max_degree + 1)

    for file in files:

        # NOTE(review): prepare_dataset is not defined in the visible part of
        # this notebook; it is assumed to return the six splits unpacked
        # below - confirm it is available before calling this function.
        x_train, y_train, x_cv, y_cv, x_test, y_test = prepare_dataset(file['filename'])

        train_mses = []
        cv_mses = []

        # One iteration per polynomial degree.
        for degree in degrees:

            # Expand and scale the training set; both transformers learn from it.
            poly = PolynomialFeatures(degree, include_bias=False)
            scaler_poly = StandardScaler()
            X_train_mapped_scaled = scaler_poly.fit_transform(poly.fit_transform(x_train))

            # Train the model on the mapped features.
            model.fit(X_train_mapped_scaled, y_train)

            # Halved MSE on the training set.
            yhat = model.predict(X_train_mapped_scaled)
            train_mses.append(mean_squared_error(y_train, yhat) / 2)

            # Apply the already fitted transformers to the cross-validation set.
            X_cv_mapped_scaled = scaler_poly.transform(poly.transform(x_cv))

            # Halved MSE on the cross-validation set.
            yhat = model.predict(X_cv_mapped_scaled)
            cv_mses.append(mean_squared_error(y_cv, yhat) / 2)

        # Plot the curves of this dataset
        plt.plot(degrees, train_mses, marker='o', c='r', linestyle=file['linestyle'], label=f"{file['label']} training MSEs")
        plt.plot(degrees, cv_mses, marker='o', c='b', linestyle=file['linestyle'], label=f"{file['label']} CV MSEs")

    if baseline is not None:
        # Without this guard a 'baseline' legend entry appears with no line.
        plt.plot(degrees, np.repeat(baseline, len(degrees)), linestyle='--', label='baseline')
    plt.title("degree of polynomial vs. train and CV MSEs")
    plt.xticks(degrees)
    plt.xlabel("degree")
    plt.ylabel("MSE")
    plt.legend()
    plt.show()

def train_plot_learning_curve(model, x_train, y_train, x_cv, y_cv, degree= 1, baseline=None):
    """Plot a learning curve: error versus the amount of data used.

    The model is trained on the first 10%, 20%, ..., 100% of the training
    split and evaluated on the same leading fraction of the
    cross-validation split. Inputs are expanded with ``PolynomialFeatures``
    of the given degree (no bias column) and standardised with a
    ``StandardScaler`` fitted on the training subset. ``model`` is refitted
    in place at every step.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        x_train, y_train: Training inputs and targets (sliceable).
        x_cv, y_cv: Cross-validation inputs and targets (sliceable).
        degree: Polynomial degree used for the feature expansion.
        baseline: Optional reference error drawn as a dashed horizontal
            line; omitted from the plot when ``None``.
    """
    train_mses = []
    cv_mses = []
    num_samples_train_and_cv = []
    percents = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

    # One iteration per data fraction, each using a larger leading slice.
    for percent in percents:

        num_samples_train = round(len(x_train) * (percent/100.0))
        num_samples_cv = round(len(x_cv) * (percent/100.0))
        num_samples_train_and_cv.append(num_samples_train + num_samples_cv)

        x_train_sub = x_train[:num_samples_train]
        y_train_sub = y_train[:num_samples_train]
        x_cv_sub = x_cv[:num_samples_cv]
        y_cv_sub = y_cv[:num_samples_cv]

        # Expand and scale the training subset; both transformers learn from it.
        poly = PolynomialFeatures(degree, include_bias=False)
        scaler_poly = StandardScaler()
        X_train_mapped_scaled = scaler_poly.fit_transform(poly.fit_transform(x_train_sub))

        # Train the model on the mapped features.
        model.fit(X_train_mapped_scaled, y_train_sub)

        # Halved MSE on the training subset.
        yhat = model.predict(X_train_mapped_scaled)
        train_mses.append(mean_squared_error(y_train_sub, yhat) / 2)

        # Apply the already fitted transformers to the cross-validation subset.
        X_cv_mapped_scaled = scaler_poly.transform(poly.transform(x_cv_sub))

        # Halved MSE on the cross-validation subset.
        yhat = model.predict(X_cv_mapped_scaled)
        cv_mses.append(mean_squared_error(y_cv_sub, yhat) / 2)

    # Plot the results
    plt.plot(num_samples_train_and_cv, train_mses, marker='o', c='r', label='training MSEs')
    plt.plot(num_samples_train_and_cv, cv_mses, marker='o', c='b', label='CV MSEs')
    if baseline is not None:
        # Without this guard a 'baseline' legend entry appears with no line.
        plt.plot(num_samples_train_and_cv, np.repeat(baseline, len(percents)), linestyle='--', label='baseline')
    plt.title("number of examples vs. train and CV MSEs")
    plt.xlabel("total number of training and cv examples")
    plt.ylabel("MSE")
    plt.legend()
    plt.show()