# A

# Q1: Neural networks do not support missing values

# Q2: The dataset is split into two groups: the training data, which the model is trained on, and the validation set, which the model is tested on

# Q3: Features in a neural network are the variables or attributes in your data set

# Q4: The purpose of the activation functions is to introduce non-linearity into the output of a neuron

# Q5:

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
np.random.seed(123)

def _print_zero_counts(df, title):
    """Print, under the heading *title*, how many rows of each column equal 0."""
    print('----------------------------------------------')
    print(title)
    print("Number of rows with 0 values for each variable")
    for col in df.columns:
        zero_rows = int((df[col] == 0).sum())
        print(col + ": " + str(zero_rows))
    print('----------------------------------------------')


def preprocess(df):
    """Impute zero-coded measurements and standardize the feature columns.

    Zeros in Glucose, BloodPressure, SkinThickness, Insulin and BMI are
    treated as missing and replaced with the mean of the remaining non-zero
    values of that column. Every column is then standardized with
    ``sklearn.preprocessing.scale``, after which the ``Outcome`` column is
    restored to its original, unscaled values.

    A report of the per-column zero counts is printed before and after the
    imputation step.

    Args:
        df: DataFrame containing the five measurement columns listed above
            and an ``Outcome`` column. It is not modified.

    Returns:
        A new DataFrame with the same columns and index as ``df``.

    Raises:
        KeyError: If one of the expected columns is absent.
    """
    zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    _print_zero_counts(df, "Before preprocessing")

    # Replace 0 values with the mean of the existing (non-zero) values.
    for col in zero_as_missing:
        observed = df[col].replace(0, np.nan)
        df[col] = observed.fillna(observed.mean())

    _print_zero_counts(df, "After preprocessing")

    # Standardization. The original index is carried over explicitly: column
    # assignment aligns on index labels, so a fresh RangeIndex would turn
    # 'Outcome' into NaN for any input whose index is not 0..n-1.
    df_scaled = pd.DataFrame(
        preprocessing.scale(df), columns=df.columns, index=df.index
    )
    # The label must stay as the original class values, not a z-score.
    df_scaled['Outcome'] = df['Outcome']

    return df_scaled

In [2]:
import matplotlib
matplotlib.use("TkAgg")
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from keras.models import Sequential
from keras.layers import Dense, Dropout
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(16)

# Load the dataset from the working directory. Only a missing file is
# reported with the hint below; any other failure (e.g. a malformed CSV)
# propagates with its real traceback instead of being mislabelled.
try:
    df = pd.read_csv('diabetes.csv')
except FileNotFoundError:
    print("""
      Dataset not found in your computer.
      Please find proper file path and data set for your BT4221 exercise.
      """)
    # quit() is a helper injected by the site module for interactive use and
    # exits with status 0; raise SystemExit with a non-zero status instead.
    raise SystemExit(1)

# Impute and standardize the raw data with preprocess(), defined earlier in this file.
df = preprocess(df)

# Separate the label column from the features, then hold out 20% of the rows for testing.
y = df['Outcome']
X = df.drop(columns='Outcome')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Build the neural network with the Keras Sequential API (layers are stacked in order).
model = Sequential()
# First hidden layer: 32 nodes with ReLU activation. The first layer must be told
# the number of input variables; it is taken from the training data rather than
# hard-coded so the model still builds if the feature set changes.
model.add(Dense(32, activation='relu', input_dim=X_train.shape[1]))

# Second hidden layer: 16 nodes with ReLU activation. Adding more layers increases
# complexity and can cause over-fitting, so this baseline uses only 2 hidden layers.
model.add(Dense(16, activation='relu'))
# Output layer: a single sigmoid node for the binary prediction
# (1 for patients with diabetes, 0 for patients without diabetes).
model.add(Dense(1, activation='sigmoid'))

# Model compilation defines the training process: the 'adam' optimizer,
# binary cross-entropy as the loss (binary labels), and accuracy as the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model silently for 200 epochs.
model.fit(X_train, y_train, epochs=200, verbose=False)

# Evaluate accuracy on the training set and on the held-out testing set.
# model.evaluate returns [loss, accuracy], so index 1 is the accuracy.
scores = model.evaluate(X_train, y_train, verbose=False)
print(f"Training Accuracy: {scores[1] * 100:.2f}%\n")
scores = model.evaluate(X_test, y_test, verbose=False)
print(f"Testing Accuracy: {scores[1] * 100:.2f}%\n")




----------------------------------------------
Before preprocessing
Number of rows with 0 values for each variable
Pregnancies: 111
Glucose: 5
BloodPressure: 35
SkinThickness: 227
Insulin: 374
BMI: 11
DiabetesPedigreeFunction: 0
Age: 0
Outcome: 500
----------------------------------------------
----------------------------------------------
After preprocessing
Number of rows with 0 values for each variable
Pregnancies: 111
Glucose: 0
BloodPressure: 0
SkinThickness: 0
Insulin: 0
BMI: 0
DiabetesPedigreeFunction: 0
Age: 0
Outcome: 500
----------------------------------------------
Training Accuracy: 89.74%

Testing Accuracy: 79.22%



In [3]:
# Build a deeper variant of the network with the Keras Sequential API.
model = Sequential()
# First hidden layer: 32 nodes with ReLU activation. The first layer must be told
# the number of input variables; it is taken from the training data rather than
# hard-coded so the model still builds if the feature set changes.
model.add(Dense(32, activation='relu', input_dim=X_train.shape[1]))

# Five further hidden layers alternating 16 and 32 nodes, giving 6 hidden layers
# in total (versus 2 in the baseline) to observe the effect of added depth.
model.add(Dense(16, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
# Output layer: a single sigmoid node for the binary prediction
# (1 for patients with diabetes, 0 for patients without diabetes).
model.add(Dense(1, activation='sigmoid'))

# Model compilation defines the training process: the 'adam' optimizer,
# binary cross-entropy as the loss (binary labels), and accuracy as the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model silently for 200 epochs.
model.fit(X_train, y_train, epochs=200, verbose=False)

# Evaluate accuracy on the training set and on the held-out testing set.
# model.evaluate returns [loss, accuracy], so index 1 is the accuracy.
scores = model.evaluate(X_train, y_train, verbose=False)
print(f"Training Accuracy: {scores[1] * 100:.2f}%\n")
scores = model.evaluate(X_test, y_test, verbose=False)
print(f"Testing Accuracy: {scores[1] * 100:.2f}%\n")

Training Accuracy: 98.37%

Testing Accuracy: 75.97%



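# When the number of hidden layers increases from 2 to 6, the testing accuracy decreases from 79.22% to 75.97% while the training accuracy rises from 89.74% to 98.37%, which indicates over-fitting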

# Q6

In [4]:
# Build the 2-hidden-layer network again, this time with Dropout after each hidden layer.
model = Sequential()
# First hidden layer: 32 nodes with ReLU activation. The first layer must be told
# the number of input variables; it is taken from the training data rather than
# hard-coded so the model still builds if the feature set changes.
model.add(Dense(32, activation='relu', input_dim=X_train.shape[1]))
# Dropout with rate 0.5 applied to the outputs of the first hidden layer.
model.add(Dropout(0.5))
# Second hidden layer: 16 nodes with ReLU activation.
model.add(Dense(16, activation='relu'))
# Dropout with rate 0.5 applied to the outputs of the second hidden layer.
model.add(Dropout(0.5))
# Output layer: a single sigmoid node for the binary prediction
# (1 for patients with diabetes, 0 for patients without diabetes).
model.add(Dense(1, activation='sigmoid'))

# Model compilation defines the training process: the 'adam' optimizer,
# binary cross-entropy as the loss (binary labels), and accuracy as the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model silently for 200 epochs.
model.fit(X_train, y_train, epochs=200, verbose=False)

# Evaluate accuracy on the training set and on the held-out testing set.
# model.evaluate returns [loss, accuracy], so index 1 is the accuracy.
scores = model.evaluate(X_train, y_train, verbose=False)
print(f"Training Accuracy: {scores[1] * 100:.2f}%\n")
scores = model.evaluate(X_test, y_test, verbose=False)
print(f"Testing Accuracy: {scores[1] * 100:.2f}%\n")




Training Accuracy: 79.15%

Testing Accuracy: 83.77%



# When a Dropout(0.5) layer is added after each hidden layer, the testing accuracy increases from 79.22% to 83.77% while the training accuracy drops from 89.74% to 79.15%