## Download and Clean Dataset

In [1]:
import pandas as pd
import numpy as np

concrete_data = pd.read_csv('https://cocl.us/concrete_data')
concrete_data.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [2]:
concrete_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
Cement                1030 non-null float64
Blast Furnace Slag    1030 non-null float64
Fly Ash               1030 non-null float64
Water                 1030 non-null float64
Superplasticizer      1030 non-null float64
Coarse Aggregate      1030 non-null float64
Fine Aggregate        1030 non-null float64
Age                   1030 non-null int64
Strength              1030 non-null float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [3]:
concrete_data.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [4]:
concrete_data.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

## Operation functions

#### Set of Preprocessing Functions

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer

def convert_to_float(df, columns):
    df[columns] = df[columns].astype(np.float)
    return df

def standard_scaler(df, columns):
    scaler = StandardScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df

def split(df, class_column, test_size=0.3):
    y = df[class_column].values
    X = df[df.columns.difference([class_column])].values
    return train_test_split(X, y, test_size=test_size)

#### Set of Model Building Functions

In [6]:
import keras
from keras.models import Sequential
from keras.layers import Dense

def add_dense_layer(model, layer):
    if hasattr(layer, 'activation'):
        model.add(Dense(
            layer['n_nodes'], 
            activation=layer['activation']
        ))
    elif hasattr(layer, 'activation') and hasattr(layer, 'input_shappe'):
        model.add(Dense(
            layer['n_nodes'], 
            activation=layer['activation'],
            input_shape=layer['input_shape']
        ))
    else:
        model.add(Dense(
            layer['n_nodes']
        ))
    return model
    
def regression_model(layers:list):
    # create model
    model = Sequential()
    for layer in layers:
        model = add_dense_layer(model, layer)
    
    # compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model


Using TensorFlow backend.


## Part B

In [7]:
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
tqdm().pandas()

scores = []

for step in tqdm(range(50)):
    model = regression_model([{
        'n_nodes': 10,
        'activation': 'relu',
        'input_shape': (len(concrete_data.columns)-1,)
    }, {
        'n_nodes': 1
    }])

    df = concrete_data.copy(deep=True)

    X_train, X_test, y_train, y_test = (
        df.pipe(convert_to_float, df.columns.difference(['Strength']))
        .pipe(standard_scaler, df.columns.difference(['Strength']))
        .pipe(split, 'Strength')
    )

    model.fit(X_train, y_train, epochs=100, verbose=0)
    pred = model.predict(X_test)
    scores.append(mean_squared_error(y_test, pred))

scores = np.array(scores)
print('Mean of mse {} and Std of mse {}'.format(scores.mean(), scores.std()))

0it [00:00, ?it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


100%|██████████| 50/50 [04:27<00:00,  7.21s/it]

Mean of mse 113.66935390281513 and Std of mse 7.836818983697575



