# Feed-Forward Neural Networks

This notebook provides an introduction to how feed-forward networks are created in tensorflow. The notebook shows how to 
create networks for both classification and regression problems. 

The notebook focuses on the creation of the networks. The training and evaluation is limited. See multilayer-perceptron-for-mnist.ipynb for more details on these aspects.

In [29]:
import numpy as np 
import pandas as pd 
import tensorflow as tf

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input, Softmax, Normalization

## Classification for Heart Disease

In [30]:
# Load our dataset
dataset = pd.read_csv('../data/heart_failure/heart.csv')
dataset.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [31]:
# Separate features from targets
features = dataset.drop('HeartDisease',axis=1)
targets = dataset['HeartDisease']

# Encode our targets using one-hot encoding
targets_onehot = pd.get_dummies(targets)
targets_onehot.head()

Unnamed: 0,0,1
0,True,False
1,False,True
2,True,False
3,False,True
4,True,False


In [32]:
# Check distribution of target values
targets.value_counts()

HeartDisease
1    508
0    410
Name: count, dtype: int64

In [33]:
features.shape

(918, 11)

In [34]:
# One-hot encode categorical features
features_onehot = pd.get_dummies(features)
features_onehot.shape

(918, 20)

In [35]:
# Convert to float (probably not needed (depends on the version of TF), but neural networks use floats 
# internally, and this lets us specify the type explicitly)
features_onehot = features_onehot.astype('float32')
targets_onehot = targets_onehot.astype('float32')
features_onehot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                918 non-null    float32
 1   RestingBP          918 non-null    float32
 2   Cholesterol        918 non-null    float32
 3   FastingBS          918 non-null    float32
 4   MaxHR              918 non-null    float32
 5   Oldpeak            918 non-null    float32
 6   Sex_F              918 non-null    float32
 7   Sex_M              918 non-null    float32
 8   ChestPainType_ASY  918 non-null    float32
 9   ChestPainType_ATA  918 non-null    float32
 10  ChestPainType_NAP  918 non-null    float32
 11  ChestPainType_TA   918 non-null    float32
 12  RestingECG_LVH     918 non-null    float32
 13  RestingECG_Normal  918 non-null    float32
 14  RestingECG_ST      918 non-null    float32
 15  ExerciseAngina_N   918 non-null    float32
 16  ExerciseAngina_Y   918 non

In [36]:
# Split our data into 90/10
train_features, test_features, train_targets, test_targets = train_test_split(features_onehot, targets_onehot, test_size=0.1, stratify=targets)

In [37]:
# Show the shape of the input data i.e. num rows and num columns
train_features.shape

(826, 20)

In [38]:
# Convert the dataframes into tensors
train_features_tensor = tf.convert_to_tensor(train_features)
train_targets_tensor = tf.convert_to_tensor(train_targets)
test_features_tensor = tf.convert_to_tensor(test_features)
test_targets_tensor = tf.convert_to_tensor(test_targets)

In [39]:
# Create our first neural network. Sequential provides the easiest way to create FF networks. It takes a list of layers as input. 
# The first layer should always be an Input layer, which is there mainly to provide an input shape i.e. let out network know how
# many features there are.
model = Sequential([
    Input(shape=(20,)),                 # We have 20 features, so the input must specified accordingly
    Dense(20,activation='sigmoid'),     # We have a single hidden layer with 20 neurons, the sigmoid activation function is a common way to normalize the output
    Dense(2,activation="sigmoid"),      # We have one-hot encoded targets and a binary problem, so we need 2 output neurons,   
    Softmax()])                         # Softmax provides a way to convert the outputs to probabilities i.e. make them sum to 1 

# We finish the model by specifying what algorithm to use to train the network (adam is a stochastic gradient descent variant),
# what loss function to use, and what additional metrics to use to evaluate the model
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [40]:
# Train our network. The number of epochs specify how many iterations to go through the training data (batches),
# and batch_size specifies how many instances to feed through the network before updating the weights according the 
# the backpropagated errors
# The number of epoch should be adjusted to find where the loss performance reaches a minimum. The weights are 
# randomly initialized so each training run can produce different results.
model.fit(train_features_tensor, train_targets_tensor, epochs=20, batch_size=2)

Epoch 1/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5669 - loss: 0.6710
Epoch 2/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5355 - loss: 0.6501
Epoch 3/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5958 - loss: 0.6489
Epoch 4/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6458 - loss: 0.6420
Epoch 5/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6523 - loss: 0.6326
Epoch 6/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6592 - loss: 0.6340
Epoch 7/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6973 - loss: 0.6171
Epoch 8/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6815 - loss: 0.6161
Epoch 9/20
[1m413/413[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x70d3abde7aa0>

In [41]:
# The trained model can be evaluated with the specified metrics by invoking the evaluate method 
# with the desired data. The returned values are the loss value followed by the specified metrics, 
# in this case CCE and accuracy
model.evaluate(test_features, test_targets)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 159ms/step - accuracy: 0.8419 - loss: 0.5251


[0.5141952633857727, 0.8478260636329651]

In [42]:
# Create a slightly simpler network by decreasing the number of neurons in the hidden layer. This
# is an example of hyperparameter tuning; both the number of neurons in each layer and the number 
# of layers should be considered hyperparameters in the context of ANNs
model_simple = Sequential([Input(shape=(20,)), Dense(10,activation='sigmoid'), Dense(2,activation="sigmoid")])
model_simple.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
model_simple.fit(train_features, train_targets, epochs=20, batch_size=2)

Epoch 1/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5812 - loss: 0.6384
Epoch 2/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7058 - loss: 0.5933
Epoch 3/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7149 - loss: 0.5802
Epoch 4/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7119 - loss: 0.5761
Epoch 5/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7289 - loss: 0.5612
Epoch 6/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7544 - loss: 0.5459
Epoch 7/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7385 - loss: 0.5487
Epoch 8/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7519 - loss: 0.5326
Epoch 9/20
[1m413/413[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x70d392b3dfd0>

In [43]:
model_simple.evaluate(test_features, test_targets)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - accuracy: 0.8247 - loss: 0.4676


[0.4460592567920685, 0.8369565010070801]

## Regression for Life Expectancy

In [44]:
# Load and Pre-process the dataset.
# Load the dataset
dataset2 = pd.read_csv('../data/life_expectancy/Life Expectancy Data.csv')
dataset2.head()

# Drop all rows where we are missing the life expectancy. Had to put the key in a list 
# for this to work in this notebook!?
dataset2 = dataset2.dropna(subset=['Life expectancy '])

# Fill the remaining missing values with the means for each column
float_cols_with_nas = ['Alcohol','Hepatitis B',' BMI ','Polio','Total expenditure','Diphtheria ','GDP','Population',' thinness  1-19 years',' thinness 5-9 years','Income composition of resources','Schooling']
dataset2[float_cols_with_nas] = dataset2[float_cols_with_nas].fillna(dataset2[float_cols_with_nas].mean())

# Convert the categorical columns to appropriate types
dataset2[['Country', 'Status']] = dataset2[['Country', 'Status']].astype('category')

# Encode the categorical columns using one-hot encoding. Note that the country category will 
# lead to a large number of one-hot encoded columns. There might be better alternatives e.g. TargetEncoder.
dataset2 = pd.get_dummies(dataset2)

# Separate into features/targets
targets2 = dataset2['Life expectancy ']
features2 = dataset2.drop('Life expectancy ', axis=1)
features2 = features2.astype('float32')
targets2 = targets2.astype('float32')

# Split the data into 75/25 
train_targets2, test_targets2, train_features2, test_features2 = train_test_split(targets2,features2)
 
# Convert to tensors
train_features2_tensor = tf.convert_to_tensor(train_features2)
train_targets2_tensor = tf.convert_to_tensor(train_targets2)
test_features2_tensor = tf.convert_to_tensor(test_features2)
test_targets2_tensor = tf.convert_to_tensor(test_targets2)


In [45]:
# Create a simple FF with one hidden layer with 204 neurons, and one output neuron since this a 
# regression problem. The linear activation function is no activation and the default value if none
# is specified 
model2 = Sequential([Input(shape=(204,)),Dense(units=204,activation='sigmoid'), Dense(units=1,activation="linear")])
# Specify the optimizer algorithm, the loss function and any additional metrics. This is a regression problem so we 
# use MSE for loss and MAE as an additional metric
model2.compile(optimizer='adam',loss='mean_squared_error',metrics=['mean_absolute_error'])
# Train the network using 20 epochs and a batch size of 1. This batch size is probably not the best
# choice since it is likely to produce ineffective traversal of the landscape, and it also require 
# more runtime since the weights are updated after each image
model2.fit(train_features2_tensor,train_targets2_tensor, epochs=20, batch_size=1)

Epoch 1/20
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 1892.3175 - mean_absolute_error: 36.5020
Epoch 2/20
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 89.2478 - mean_absolute_error: 7.6219
Epoch 3/20
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 93.9739 - mean_absolute_error: 7.9544
Epoch 4/20
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 92.2664 - mean_absolute_error: 7.8002
Epoch 5/20
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 93.6394 - mean_absolute_error: 7.9300
Epoch 6/20
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 91.9508 - mean_absolute_error: 7.8082
Epoch 7/20
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 87.9474 - mean_absolute_error: 7.6675
Epoch 8/20
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x70d3d0df3d70>

In [46]:
# Evaluate the network using the training data
model2.evaluate(test_features2_tensor, test_targets2_tensor)

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 86.5278 - mean_absolute_error: 7.5660


[86.6064453125, 7.485737323760986]

In [47]:
# We can add a normalization step to our network by adding a normalization layer. This 
# is similar to how pipelines work in scikit-learn, you can add any number of processing
# steps befor the actual network. 

# Create the normalization layer
normalization = Normalization(axis=None)
# A layer should be trained explicitly using adapt
normalization.adapt(train_features2_tensor)
# Create a new architecture that includes the normalization layer, and has only 40 hidden neurons
model3 = Sequential([Input(shape=(204,)),normalization,Dense(units=40,activation='sigmoid'),Dense(units=1)])
# Optimize using SGD using MSE loss and MAE evaluation
model3.compile(optimizer='sgd',loss='mean_squared_error',metrics=['mean_absolute_error'])
# Train the network using 20 epochs and a batch size of 24. 
model3.fit(train_features2_tensor, train_targets2_tensor, epochs=20, batch_size=24)


Epoch 1/20
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 544.9052 - mean_absolute_error: 15.1686
Epoch 2/20
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 92.8476 - mean_absolute_error: 7.8460
Epoch 3/20
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 92.2737 - mean_absolute_error: 7.8268
Epoch 4/20
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 97.2351 - mean_absolute_error: 8.0968
Epoch 5/20
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 91.3850 - mean_absolute_error: 7.8134
Epoch 6/20
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 93.0944 - mean_absolute_error: 7.9308
Epoch 7/20
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 91.3447 - mean_absolute_error: 7.8203
Epoch 8/20
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - lo

<keras.src.callbacks.history.History at 0x70d37a93d6a0>

In [48]:
# Evaluate using internal function
model3.evaluate(test_features2_tensor, test_targets2_tensor)

# Predict by invoking the model itself
model3(test_features2_tensor[:40])

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 89.0771 - mean_absolute_error: 7.5583


<tf.Tensor: shape=(40, 1), dtype=float32, numpy=
array([[69.57342 ],
       [71.00403 ],
       [69.51534 ],
       [69.70304 ],
       [70.07866 ],
       [69.66648 ],
       [71.00915 ],
       [69.55703 ],
       [69.77534 ],
       [70.23212 ],
       [71.00048 ],
       [69.60717 ],
       [71.12518 ],
       [69.89737 ],
       [69.5137  ],
       [69.679726],
       [69.813805],
       [71.0015  ],
       [69.52632 ],
       [70.99084 ],
       [71.17711 ],
       [71.00151 ],
       [69.68252 ],
       [71.00402 ],
       [70.02606 ],
       [71.000755],
       [70.54343 ],
       [71.008125],
       [69.91454 ],
       [71.00151 ],
       [69.579735],
       [70.52903 ],
       [69.55221 ],
       [70.30667 ],
       [71.313484],
       [69.8637  ],
       [70.556465],
       [70.745285],
       [69.547554],
       [71.00082 ]], dtype=float32)>

In [49]:
# Try to create a deeper neural network (5 hidden layers). We use relu as the activation
# function since this is likely more suitable for deep networks and less likely to cause 
# vanishing gradients
model4 = Sequential()
model4.add(Input(shape=(204,)))
model4.add(normalization)
model4.add(Dense(40, activation='relu'))
model4.add(Dense(40, activation='relu'))
model4.add(Dense(40, activation='relu'))
model4.add(Dense(40, activation='relu'))
model4.add(Dense(40, activation='relu'))
model4.add(Dense(1, activation='linear'))
model4.compile('adam',loss='mean_squared_error',metrics=['mean_absolute_error'])

model4.fit(train_features2,train_targets2, epochs=40, batch_size=20)

Epoch 1/40
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 4160.0532 - mean_absolute_error: 62.0723
Epoch 2/40
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 177.8534 - mean_absolute_error: 10.0342
Epoch 3/40
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 99.8712 - mean_absolute_error: 8.0173 
Epoch 4/40
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 102.1001 - mean_absolute_error: 8.0867
Epoch 5/40
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 92.2253 - mean_absolute_error: 7.6978
Epoch 6/40
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 96.1920 - mean_absolute_error: 7.9855
Epoch 7/40
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 92.5978 - mean_absolute_error: 7.8625
Epoch 8/40
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

<keras.src.callbacks.history.History at 0x70d3ca32f980>

In [50]:
# Create a final FF architecture manualle (without using Sequence). Layers are chained 
# together implicitly through the order of invocation 
inputs = Input(shape=(204,))
print(inputs.shape)
norm_inputs = normalization(inputs)                                             # The normalization layer is fed the inputs
x = Dense(40, activation='relu')(norm_inputs)                                   # The first hidden layer gets its' input from the normalization output
x = Dense(40, activation='relu')(x)                                             # The second from the first
x = Dense(40, activation='relu')(x) + Dense(40,activation='relu')(norm_inputs)  # The third layer features a skip connection (inputs both from previous layer and normalization)
x = Dense(1, activation='linear')(x)                                            # The output 
model5 = Model(inputs=inputs, outputs=x)
print(model5.summary())
model5.compile('adam',loss='mean_squared_error',metrics=['mean_absolute_error'])

model5.fit(train_features2,train_targets2, epochs=20, batch_size=20)


(None, 204)


None
Epoch 1/20
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 4694.9248 - mean_absolute_error: 67.5379
Epoch 2/20
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1971.2081 - mean_absolute_error: 35.9406
Epoch 3/20
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 282.5911 - mean_absolute_error: 10.9633
Epoch 4/20
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 116.3346 - mean_absolute_error: 8.2391
Epoch 5/20
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 93.4112 - mean_absolute_error: 7.8727
Epoch 6/20
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 92.0372 - mean_absolute_error: 7.8405
Epoch 7/20
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 96.3700 - mean_absolute_error: 7.9952
Epoch 8/20
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x70d378bf50d0>