# Regression Model in Keras

In [1]:
import pandas as pd
import numpy as np

In [2]:
concrete_data = pd.read_csv('https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DL0101EN/labs/data/concrete_data.csv')
concrete_data.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
concrete_data.shape

(1030, 9)

In [4]:
concrete_data.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

### Split data into predictors and target
The target variable in this problem is the concrete sample strength. Therefore, our predictors will be all the other columns.

In [5]:
concrete_data_columns = concrete_data.columns

predictors = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] # all columns except Strength
predictors_norm = (predictors - predictors.mean()) / predictors.std() #nomalised version
target = concrete_data['Strength'] # Strength column
n_cols = predictors.shape[1] # number of predictors
n_cols

8

In [6]:
import keras
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


# Part A

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error 

Define regression model

In [8]:
def regression_model(n_hidden = 1):
    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=(n_cols,))) #input layer
    
    for i in range(n_hidden): #number of hidden layers
        model.add(Dense(10, activation='relu'))
        
    model.add(Dense(1)) #output layer
    
    # compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model


### A.1. Splitting the data into a training and test sets by holding 30% of the data for testing
### A.2. Train on training data for 50 epochs
### A.3. return mean_squared error between prediction on test and actual test labels

In [9]:
def calc_mse(model, features, n_epochs=50):
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3) #1
    model.fit(X_train, y_train, epochs=n_epochs, verbose=0) #2
    return mean_squared_error(y_test, model.predict(X_test)) #3

### A.4. Repeat steps 1 - 3, 50 times, i.e., create a list of 50 mean squared errors.

In [11]:
model = regression_model(n_hidden=1) # n_hidden=1  
mse_listA=[]
print('running...')
while len(mse_listA) < 50:
    mse = calc_mse(model=model, features=predictors) #n_epochs=50(default)
    mse_listA.append(mse)
    print('mse{}={}...'.format(len(mse_listA), mse_listA[-1]))
print('calculation completed.')

running...
mse1=148.030467234928...
mse2=112.29026830004891...
mse3=103.84467611266179...
mse4=96.90073327522644...
mse5=98.3575387025331...
mse6=105.24413370069743...
mse7=108.7189146772465...
mse8=70.75999359110634...
mse9=63.860252597177016...
mse10=59.57678219286116...
mse11=47.42248397955145...
mse12=48.478087145572694...
mse13=50.30262110370942...
mse14=55.53506164076601...
mse15=53.079523521127086...
mse16=53.64598687297436...
mse17=51.14009570916117...
mse18=54.657988687787594...
mse19=48.320803389980554...
mse20=59.1025139342195...
mse21=44.088684342967966...
mse22=54.88889349916351...
mse23=48.28428832131031...
mse24=58.4305035836784...
mse25=56.699317372177944...
mse26=45.25708754710977...
mse27=55.89941133439924...
mse28=51.58296950526839...
mse29=49.080247421536676...
mse30=48.75772222499391...
mse31=49.46707657638866...
mse32=58.340317653329315...
mse33=61.34779970489103...
mse34=51.992647343742156...
mse35=66.36630939985271...
mse36=67.9625483992333...
mse37=56.412540195

### A5. Report the mean and the standard deviation of the mean squared errors.

In [14]:
print('count = {}, the mean = {}, the standard deviation={}'.format(len(mse_listA), round(np.array(mse_listA).mean(), 3), round(np.array(mse_listA).std(), 3) ))

count = 50, the mean = 61.75, the standard deviation=21.392


# Part B

Repeat Part A but use a normalized version of the data.

In [13]:
model = regression_model(n_hidden=1) #hidden = 1
   
mse_listB=[]
print('running...')
while len(mse_listB) < 50:
    mse = calc_mse(model=model, features=predictors_norm)# normalized_version + n_epochs=50(default)
    mse_listB.append(mse)
    print('mse{}={}...'.format(len(mse_listB), mse_listB[-1]))
print('calculation completed.')

running...
mse1=131.2346045072095...
mse2=114.01426874525977...
mse3=107.99182782801887...
mse4=95.83878561179156...
mse5=94.28920426234376...
mse6=69.68087397754408...
mse7=48.76363190720589...
mse8=43.40767224339627...
mse9=44.349356964744416...
mse10=36.947603400624345...
mse11=43.124090185285915...
mse12=41.42864403482789...
mse13=31.785523295776308...
mse14=42.0551885446876...
mse15=32.98837226526215...
mse16=37.407547190686344...
mse17=32.97235972108524...
mse18=40.18683108037253...
mse19=37.491855087504035...
mse20=32.482079582511425...
mse21=36.70869033476383...
mse22=37.053666992186564...
mse23=38.17192691743233...
mse24=38.33836278730678...
mse25=31.112279818621623...
mse26=32.82614576460935...
mse27=33.2167716189771...
mse28=33.11861727101451...
mse29=32.36055719685855...
mse30=29.700804065710113...
mse31=33.20517566053811...
mse32=30.468053773063254...
mse33=31.17022301604571...
mse34=33.72728336926591...
mse35=28.53141139664144...
mse36=35.57629851594477...
mse37=31.837815

The mean of the mean squared errors is **decreased** compared to A

In [18]:
print('count = {}, the mean = {}, the standard deviation={}'.format(len(mse_listB), round(np.array(mse_listB).mean(), 3), round(np.array(mse_listB).std(), 3) ))

count = 50, the mean = 41.711, the standard deviation=23.875


# Part C

Repeat Part B but using 100 epochs for training.

In [16]:
model = regression_model(n_hidden=1)  #hidden = 1
mse_listC=[]
print('running...')
while len(mse_listC) < 50:
    mse = calc_mse(model=model, features=predictors_norm, n_epochs=100) #normalized_version + n_epochs=100
    mse_listC.append(mse)
    print('mse{}={}...'.format(len(mse_listC), mse_listC[-1]))
print('calculation completed.')

running...
mse1=105.70219841307689...
mse2=104.53070507361042...
mse3=112.07834057967654...
mse4=50.86922716199953...
mse5=42.4890806807713...
mse6=49.71336303123367...
mse7=41.568944913234496...
mse8=41.28025601590005...
mse9=45.03288021350182...
mse10=39.40225823944532...
mse11=36.08475896598199...
mse12=38.74788333235147...
mse13=38.11454854698084...
mse14=38.199072027327645...
mse15=37.96993990778225...
mse16=35.922543439791454...
mse17=41.1484598856316...
mse18=36.15443560613495...
mse19=32.28358734522494...
mse20=32.77925301441881...
mse21=38.71298606982265...
mse22=41.90725494609976...
mse23=33.317002289015775...
mse24=37.955960558340564...
mse25=31.286443467013335...
mse26=36.24507624916443...
mse27=31.774185895260626...
mse28=25.806365930601523...
mse29=31.4021327214565...
mse30=28.345806008187683...
mse31=29.612896750885806...
mse32=29.587414144983754...
mse33=32.102368960291784...
mse34=29.642488073008522...
mse35=31.273136241595743...
mse36=26.26077924180009...
mse37=30.792

The mean is **decreased** compared to B

In [17]:
print('count = {}, the mean = {}, the standard deviation={}'.format(len(mse_listC), round(np.array(mse_listC).mean(), 3), round(np.array(mse_listC).std(), 3) ))

count = 50, the mean = 38.596, the standard deviation=18.336


# Part D

Repeat part B but using a neural network with three hidden layers

In [19]:
model = regression_model(n_hidden=3)# n_hidden=3  
mse_listD=[]
print('running...')
while len(mse_listD) < 50:
    mse = calc_mse(model=model, features=predictors_norm)# normalized_version + n_epochs=50(default)
    mse_listD.append(mse)
    print('mse{}={}...'.format(len(mse_listD), mse_listD[-1]))
print('calculation completed.')

running...
mse1=127.89582015409574...
mse2=117.1333837344324...
mse3=89.17197625983921...
mse4=58.36644367931834...
mse5=43.713764715781835...
mse6=34.17021955666664...
mse7=33.11926297178395...
mse8=32.83531153404783...
mse9=31.775084064394804...
mse10=25.821547813100256...
mse11=31.138875919837428...
mse12=30.22849147073684...
mse13=26.19357914446754...
mse14=27.929907680288974...
mse15=31.933275482985024...
mse16=28.92343286858267...
mse17=27.161031041610347...
mse18=30.34030696159712...
mse19=24.183867553116478...
mse20=25.802581497064985...
mse21=24.830522978698728...
mse22=21.64533085790765...
mse23=23.777891463187373...
mse24=21.180488215895927...
mse25=23.7303364710776...
mse26=25.395176779917104...
mse27=23.597362155134967...
mse28=25.273495432534894...
mse29=23.03229708205741...
mse30=26.079397440311467...
mse31=20.812013818735586...
mse32=25.29658638737582...
mse33=18.935111658760686...
mse34=21.285069497863272...
mse35=22.516355387271645...
mse36=22.24822979823976...
mse37=

The mean is **decreased** compared to B and C

In [20]:
print('count = {}, the mean = {}, the standard deviation={}'.format(len(mse_listD), round(np.array(mse_listD).mean(), 3), round(np.array(mse_listD).std(), 3) ))

count = 50, the mean = 30.974, the standard deviation=21.741
