In [1]:
import keras
import statistics
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## importing data

In [2]:
# Read the concrete dataset from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='concrete_data.csv')
df.head(5)

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

## Splitting data into predictors and target

In [4]:
# Target: the 'Strength' column; predictors: every other column.
# `drop` returns a new frame, so `df` itself is left untouched.
y = df['Strength']
X = df.drop(columns='Strength')

## A. Building a baseline model

In [5]:
def regression_model_1(n_cols):
    """Build and compile the baseline regression network.

    The network has a single hidden Dense layer of 10 ReLU units followed by
    a Dense output layer with one unit (no activation argument is passed).

    Parameters
    ----------
    n_cols : int
        Number of input features; used as the input shape of the hidden layer.

    Returns
    -------
    The compiled Sequential model (Adam optimizer, mean squared error loss).
    """
    hidden_layer = Dense(10, activation='relu', input_shape=(n_cols,))
    output_layer = Dense(1)

    # Stack the layers in order: hidden -> output.
    network = Sequential()
    for layer in (hidden_layer, output_layer):
        network.add(layer)

    network.compile(loss='mean_squared_error', optimizer='adam')
    return network

## Loop for split, model fit, and calculating mse

## part A, 50 epochs with raw data

In [6]:
# Part A: 50 independent train/evaluate rounds of the baseline model on the raw features.
mse_list_A = []
for i in range(50):
    # Building a fresh model every round accumulates Keras global state;
    # clear it so memory use stays flat across the 50 rounds.
    keras.backend.clear_session()

    # 1. Hold out 30% of the rows for testing. Seeding with the round index keeps
    #    the 50 splits distinct from each other but repeatable between runs.
    #    (Weight initialisation is still random, so MSEs are not bit-for-bit fixed.)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=i
    )

    # 2. Train a new baseline model on the training data for 50 epochs.
    model = regression_model_1(X_train.shape[1])
    model.fit(X_train, y_train, epochs=50, verbose=0)

    # 3. Evaluate on the held-out data; scikit-learn's signature is (y_true, y_pred).
    preds = model.predict(X_test, verbose=0)
    mse = mean_squared_error(y_test, preds)
    mse_list_A.append(mse)

## Normalizing the data

In [7]:
# Standardize every predictor column: subtract its mean, then divide by its standard deviation.
feature_means = X.mean()
feature_stds = X.std()
X_normalized = X.sub(feature_means).div(feature_stds)
X_normalized.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079,-0.279597
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079,-0.279597
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,3.55134
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,5.055221
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569,4.976069


## part B, 50 epochs with normalized data

In [8]:
# Part B: 50 independent train/evaluate rounds of the baseline model on normalized features.
mse_list_B = []
for i in range(50):
    # Building a fresh model every round accumulates Keras global state;
    # clear it so memory use stays flat across the 50 rounds.
    keras.backend.clear_session()

    # 1. Hold out 30% of the rows for testing. Seeding with the round index keeps
    #    the 50 splits distinct from each other but repeatable between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=i
    )

    # Standardize with statistics from the training rows only. Normalizing the
    # whole dataset before splitting would leak test-set information into training.
    train_mean, train_std = X_train.mean(), X_train.std()
    X_train_norm = (X_train - train_mean) / train_std
    X_test_norm = (X_test - train_mean) / train_std

    # 2. Train a new baseline model on the training data for 50 epochs.
    model = regression_model_1(X_train_norm.shape[1])
    model.fit(X_train_norm, y_train, epochs=50, verbose=0)

    # 3. Evaluate on the held-out data; scikit-learn's signature is (y_true, y_pred).
    preds = model.predict(X_test_norm, verbose=0)
    mse = mean_squared_error(y_test, preds)
    mse_list_B.append(mse)

## part C, 100 epochs with normalized data

In [9]:
# Part C: same as part B (baseline model, normalized features) but trained for 100 epochs.
mse_list_C = []
for i in range(50):
    # Building a fresh model every round accumulates Keras global state;
    # clear it so memory use stays flat across the 50 rounds.
    keras.backend.clear_session()

    # 1. Hold out 30% of the rows for testing. Seeding with the round index keeps
    #    the 50 splits distinct from each other but repeatable between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=i
    )

    # Standardize with statistics from the training rows only. Normalizing the
    # whole dataset before splitting would leak test-set information into training.
    train_mean, train_std = X_train.mean(), X_train.std()
    X_train_norm = (X_train - train_mean) / train_std
    X_test_norm = (X_test - train_mean) / train_std

    # 2. Train a new baseline model on the training data for 100 epochs.
    model = regression_model_1(X_train_norm.shape[1])
    model.fit(X_train_norm, y_train, epochs=100, verbose=0)

    # 3. Evaluate on the held-out data; scikit-learn's signature is (y_true, y_pred).
    preds = model.predict(X_test_norm, verbose=0)
    mse = mean_squared_error(y_test, preds)
    mse_list_C.append(mse)

In [10]:
## part D, 50 epochs with normalized data and new model

## Building new model with 3 hidden layers

In [11]:
def regression_model_2(n_cols):
    """Build and compile the deeper regression network.

    The network has three hidden Dense layers of 10 ReLU units each, followed
    by a Dense output layer with one unit (no activation argument is passed).

    Parameters
    ----------
    n_cols : int
        Number of input features; used as the input shape of the first hidden layer.

    Returns
    -------
    The compiled Sequential model (Adam optimizer, mean squared error loss).
    """
    network = Sequential()

    # 1st hidden layer: the only one that declares the input shape.
    network.add(Dense(10, activation='relu', input_shape=(n_cols,)))

    # 2nd and 3rd hidden layers share the same configuration.
    for _ in range(2):
        network.add(Dense(10, activation='relu'))

    # Single-unit output layer for the regression target.
    network.add(Dense(1))

    network.compile(loss='mean_squared_error', optimizer='adam')
    return network

In [12]:
# Part D: 50 independent train/evaluate rounds of the three-hidden-layer model on normalized features.
mse_list_D = []
for i in range(50):
    # Building a fresh model every round accumulates Keras global state;
    # clear it so memory use stays flat across the 50 rounds.
    keras.backend.clear_session()

    # 1. Hold out 30% of the rows for testing. Seeding with the round index keeps
    #    the 50 splits distinct from each other but repeatable between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=i
    )

    # Standardize with statistics from the training rows only. Normalizing the
    # whole dataset before splitting would leak test-set information into training.
    train_mean, train_std = X_train.mean(), X_train.std()
    X_train_norm = (X_train - train_mean) / train_std
    X_test_norm = (X_test - train_mean) / train_std

    # 2. Train a new three-hidden-layer model on the training data for 50 epochs.
    model = regression_model_2(X_train_norm.shape[1])
    model.fit(X_train_norm, y_train, epochs=50, verbose=0)

    # 3. Evaluate on the held-out data; scikit-learn's signature is (y_true, y_pred).
    preds = model.predict(X_test_norm, verbose=0)
    mse = mean_squared_error(y_test, preds)
    mse_list_D.append(mse)

# comments

### A

In [13]:
# Summarize part A: mean and population standard deviation of the 50 test MSEs.
mean_mse_A = statistics.mean(mse_list_A)
std_mse_A = statistics.pstdev(mse_list_A)
print('Mean for MSEs: ', mean_mse_A)
print('STDeviation for MSEs: ', std_mse_A)

Mean for MSEs:  353.56578391459885
STDeviation for MSEs:  320.7488599478216


Both the mean and the standard deviation of the MSE are very high, meaning our model is not a good estimator for this data. We need to add more layers to make the network deeper in order to get better predictions and lower errors.

### B 

In [14]:
# Summarize part B: mean and population standard deviation of the 50 test MSEs.
mean_mse_B = statistics.mean(mse_list_B)
std_mse_B = statistics.pstdev(mse_list_B)
print('Mean for MSEs: ', mean_mse_B)
print('STDeviation for MSEs: ', std_mse_B)

Mean for MSEs:  400.71673111767444
STDeviation for MSEs:  104.9077620732844


Compared to Step A, the mean squared error did not change much; we can say it is about the same. The standard deviation, however, decreased because of the normalization. Normalizing the data makes it more consistent for the model.

### C

In [15]:
# Summarize part C: mean and population standard deviation of the 50 test MSEs.
mean_mse_C = statistics.mean(mse_list_C)
std_mse_C = statistics.pstdev(mse_list_C)
print('Mean for MSEs: ', mean_mse_C)
print('STDeviation for MSEs: ', std_mse_C)

Mean for MSEs:  168.37416310495112
STDeviation for MSEs:  21.60671322574165


Both the mean squared error and its standard deviation decreased compared to Steps A and B. This is because we used normalized data, which is more consistent, together with a larger number of epochs, which makes the model more accurate. Training for more epochs lets the model fit the training data better, so the MSE decreased.

### D

In [16]:
# Summarize part D: mean and population standard deviation of the 50 test MSEs.
mean_mse_D = statistics.mean(mse_list_D)
std_mse_D = statistics.pstdev(mse_list_D)
print('Mean for MSEs: ', mean_mse_D)
print('STDeviation for MSEs: ', std_mse_D)

Mean for MSEs:  127.0261665271318
STDeviation for MSEs:  16.149392417316424


This step gives the best MSE and standard deviation obtained so far, because we made the model deeper and used normalized data. Compared to Step B, both the MSE and its standard deviation decreased thanks to the more complex model, which has better predictive ability. If we also increased the number of epochs to 100 as in Step C, the MSE and standard deviation might decrease further.