#Part A: Build a baseline model

In [1]:
import pandas as pd
import numpy as np

In [2]:
!wget -q https://cocl.us/concrete_data

In [4]:
df = pd.read_csv("concrete_data")
df.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


#Data wrangling and EDA

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Cement              1030 non-null   float64
 1   Blast Furnace Slag  1030 non-null   float64
 2   Fly Ash             1030 non-null   float64
 3   Water               1030 non-null   float64
 4   Superplasticizer    1030 non-null   float64
 5   Coarse Aggregate    1030 non-null   float64
 6   Fine Aggregate      1030 non-null   float64
 7   Age                 1030 non-null   int64  
 8   Strength            1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [6]:
#checking for nulls
df.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

In [7]:
#descriptive statistics
df.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


##Feature selection

In [8]:
x = df.loc[:,df.columns!="Strength"]
x.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360


In [9]:
#target variable
y =df["Strength"]
y.head()

0    79.99
1    61.89
2    40.27
3    41.05
4    44.30
Name: Strength, dtype: float64

In [44]:
n_cols =x.shape[1]
n_cols

8

#Model building

###Import keras, required libraries from keras 

In [14]:
import keras

In [15]:
from keras.models import Sequential
from keras.layers import Dense

In [36]:
#define regression model
def regression_model():
  model = Sequential()

  model.add(Dense(10,activation="relu",input_shape=(n_cols,)))
  model.add(Dense(1))

  #compile model
  model.compile(optimizer="adam",loss="mean_squared_error")
  return model

In [43]:
from sklearn.metrics import mean_squared_error as MSE

model = regression_model()
mse = []  #empty mean squared error

for i in range(0,50):
  xtr,xte,ytr,yte = tts(x,y,test_size=0.3)
  model.fit(xtr,ytr,epochs=50,verbose=0)
  yhat = model.predict(xte)
  error = MSE(yte,yhat)
  mse.append(error)

mse

[208.78719075219684,
 110.21765980852055,
 75.44642882028447,
 57.179537742292446,
 56.47588452747049,
 56.30609398773349,
 46.64302230558537,
 44.6077556154711,
 52.70194513936033,
 58.34934149306482,
 56.212630614337705,
 56.49682386971899,
 50.736910164330446,
 47.067397453212266,
 47.31957236864499,
 52.51699821467591,
 51.634915422363555,
 51.2249242656657,
 50.8234092497561,
 57.833287754181,
 50.94099504406682,
 51.988153800656896,
 46.15085897921341,
 51.28483948004605,
 46.604172953359225,
 47.51488430830116,
 50.601613160477235,
 49.79918686274293,
 49.82773759271403,
 57.93126720184942,
 49.44300303841611,
 55.06164998385387,
 49.40068809302663,
 63.59704391510421,
 47.55153618653243,
 48.058632745777,
 49.23373680839788,
 49.27686962127714,
 49.94821498457236,
 48.995377631904,
 46.97622441870908,
 49.68288942900039,
 55.33990990410741,
 49.41690952812501,
 58.686521942304935,
 48.75190087889846,
 42.968063550531575,
 47.41789116646781,
 45.20853947128713,
 54.8692492903254

In [47]:
len(mse)

50

In [48]:
#mean of mean squared errors.
print("mean off mse = ",np.mean(mse))

mean off mse =  56.02220583081826


In [49]:
#standard deviation of the mean squared errors.
print("standard deviation of the mse = ",np.std(mse))

standard deviation of the mse =  23.932111596695954


#Part B: Normalize the data

In [50]:
#define regression model
def regression_model():
  model = Sequential()

  model.add(Dense(10,activation="relu",input_shape=(n_cols,)))
  model.add(Dense(1))

  #compile model
  model.compile(optimizer="adam",loss="mean_squared_error")
  return model

>We have to normalize data after splitting inorder to avoid data leak

In [51]:
from sklearn.metrics import mean_squared_error as MSE

model2 = regression_model()
mse = []  #empty mean squared error

for i in range(0,50):
  xtr,xte,ytr,yte = tts(x,y,test_size=0.3)

  #normalising train data and test data
  xtr_norm = (xtr-xtr.mean())/xtr.std()
  xte_norm = (xte-xte.mean())/xte.std()

  model2.fit(xtr_norm,ytr,epochs=50,verbose=0)
  yhat = model2.predict(xte_norm)
  error = MSE(yte,yhat)
  mse.append(error)

mse

[439.84951565829823,
 167.2881323805372,
 116.52696255574693,
 83.54628245267295,
 76.60663386104797,
 61.757964141753845,
 56.253277830378764,
 56.64436415803959,
 54.520585508517506,
 45.980364352914584,
 49.14363910108146,
 56.37780249059305,
 46.39790492551496,
 39.16821135704735,
 41.974978936292786,
 36.369992648610655,
 48.216746779332325,
 41.29339616500098,
 41.626030411861464,
 44.85540131943483,
 40.44387756597624,
 37.30751674425388,
 35.29120919049977,
 42.85916715023342,
 36.067394689528435,
 32.37308262335113,
 33.67781526401075,
 41.80650290679822,
 29.221944303021292,
 37.43589107325537,
 28.78095468224503,
 33.88428831292457,
 31.07476143130202,
 39.16864360017536,
 31.385877187884176,
 30.19296198398804,
 29.71447800150181,
 37.6188543020724,
 36.15119229833056,
 35.38366137592524,
 33.790906348089905,
 30.742139197309513,
 31.611501065877704,
 51.31374322062494,
 36.54802304981925,
 36.34287063024487,
 33.746110452582286,
 33.93495880154476,
 40.0080606198992,
 37.8

In [52]:
#mean of mean squared errors.
print("mean off mse = ",np.mean(mse))

mean off mse =  53.402061509465945


In [53]:
#standard deviation of the mean squared errors.
print("standard deviation of the mse = ",np.std(mse))

standard deviation of the mse =  59.87128600220436


Hence the mean of mse in part B is less than that of part A

#Part C: Increate the number of epochs

In [54]:
#define regression model
def regression_model():
  model = Sequential()

  model.add(Dense(10,activation="relu",input_shape=(n_cols,)))
  model.add(Dense(1))

  #compile model
  model.compile(optimizer="adam",loss="mean_squared_error")
  return model

>We have to normalize data after splitting inorder to avoid data leak

In [55]:
from sklearn.metrics import mean_squared_error as MSE

model3 = regression_model()
mse = []  #empty mean squared error

for i in range(0,50):
  xtr,xte,ytr,yte = tts(x,y,test_size=0.3)

  #normalising train data and test data
  xtr_norm = (xtr-xtr.mean())/xtr.std()
  xte_norm = (xte-xte.mean())/xte.std()

  model3.fit(xtr_norm,ytr,epochs=100,verbose=0)  #using 100 epochs for training.
  yhat = model3.predict(xte_norm)
  error = MSE(yte,yhat)
  mse.append(error)

mse

[146.77663814658078,
 93.36411251868697,
 53.60856827550833,
 54.160375988856494,
 50.51217518092866,
 42.72444408334272,
 55.367063618185384,
 52.721157784312624,
 61.00882527227466,
 40.69873948647878,
 60.98705863141358,
 58.68928764528565,
 40.69438765651557,
 48.67502476964363,
 46.67612239466983,
 41.95206048508998,
 51.85971513273133,
 48.0355478780723,
 39.94775579386421,
 49.31739933600077,
 43.25639887985103,
 43.50107340879277,
 38.36548572429632,
 48.24544365234896,
 56.538003419331446,
 38.00388134570902,
 48.68388051667007,
 43.580515103974314,
 55.934734579020464,
 45.840462613160085,
 48.21558522152279,
 57.90777854192687,
 39.84193420736204,
 36.00487964896491,
 52.767859032660624,
 45.70420930506415,
 47.252787767547495,
 48.21848723054029,
 43.63666131184858,
 39.89076369166843,
 46.330738199374245,
 42.68404208436195,
 53.90925068991481,
 42.09314420320541,
 38.93413012029169,
 44.34910885547187,
 51.21606395428008,
 47.93173970997465,
 43.56194492410869,
 46.282569

In [57]:
#mean of mean squared errors.
print("mean off mse = ",np.mean(mse))

mean off mse =  50.32920035200576


In [58]:
#standard deviation of the mean squared errors.
print("standard deviation of the mse = ",np.std(mse))

standard deviation of the mse =  16.401320812546707


Hence the mean of mse in part C is less than that of part B

#Part D: Increase the number of hidden layers 

In [59]:
#define regression model
def regression_model():
  model = Sequential()

  model.add(Dense(10,activation="relu",input_shape=(n_cols,)))
  model.add(Dense(10,activation="relu"))
  model.add(Dense(10,activation="relu"))
  model.add(Dense(1))

  #compile model
  model.compile(optimizer="adam",loss="mean_squared_error")
  return model

>We have to normalize data after splitting inorder to avoid data leak

In [60]:
from sklearn.metrics import mean_squared_error as MSE

model4 = regression_model()
mse = []  #empty mean squared error

for i in range(0,50):
  xtr,xte,ytr,yte = tts(x,y,test_size=0.3)

  #normalising train data and test data
  xtr_norm = (xtr-xtr.mean())/xtr.std()
  xte_norm = (xte-xte.mean())/xte.std()

  model4.fit(xtr_norm,ytr,epochs=50,verbose=0)
  yhat = model4.predict(xte_norm)
  error = MSE(yte,yhat)
  mse.append(error)

mse

[167.13197699572015,
 96.2181010256609,
 68.9870875378906,
 42.25445851966935,
 43.125303642492945,
 38.411045990772784,
 43.05971860450492,
 38.25554518064909,
 41.131142892565244,
 37.03294072599632,
 36.32436007675568,
 35.05650862971943,
 32.80361489017227,
 34.80491866617692,
 32.394063226787836,
 38.160598284182896,
 35.21455992636211,
 31.58021913726483,
 31.401842463556147,
 37.76445877871854,
 32.85106630840039,
 36.36219789288243,
 50.485952043655494,
 28.418990079617483,
 32.543285587148425,
 38.2472257795662,
 34.67497844712749,
 30.52514468383634,
 65.3176453713059,
 26.329527555873863,
 26.83905808883635,
 26.328902546426644,
 28.20501476400095,
 30.21827387605979,
 27.850911193939886,
 31.407096335029994,
 28.94632415062459,
 28.226737672174693,
 30.480730782856906,
 27.580197703325997,
 36.23556069028866,
 23.631209582008577,
 27.007136601673878,
 27.04049110129364,
 27.326691068751657,
 25.69224623322013,
 23.726137453164625,
 26.83396615744322,
 29.003817274573755,
 2

In [61]:
#mean of mean squared errors.
print("mean off mse = ",np.mean(mse))

mean off mse =  37.82155998122717


In [62]:
#standard deviation of the mean squared errors.
print("standard deviation of the mse = ",np.std(mse))

standard deviation of the mse =  22.285606184757278


Hence the mean of mse in part D is less than that of part B