In [53]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from scipy import stats

from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer

from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from pprint import pprint
from time import time


df = pd.read_csv ("concrete.csv")

df.head()


Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
0,141.3,212.0,0.0,203.5,0.0,971.8,748.5,28,29.89
1,168.9,42.2,124.3,158.3,10.8,1080.8,796.2,14,23.51
2,250.0,0.0,95.7,187.4,5.5,956.9,861.2,28,29.22
3,266.0,114.0,0.0,228.0,0.0,932.0,670.0,28,45.85
4,154.8,183.4,0.0,193.3,9.1,1047.4,696.7,28,18.29


In [64]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cement,1030.0,281.167864,104.506364,102.0,192.375,272.9,350.0,540.0
slag,1030.0,73.895825,86.279342,0.0,0.0,22.0,142.95,359.4
ash,1030.0,54.18835,63.997004,0.0,0.0,0.0,118.3,200.1
water,1030.0,181.567282,21.354219,121.8,164.9,185.0,192.0,247.0
superplastic,1030.0,6.20466,5.973841,0.0,0.0,6.4,10.2,32.2
coarseagg,1030.0,972.918932,77.753954,801.0,932.0,968.0,1029.4,1145.0
fineagg,1030.0,773.580485,80.17598,594.0,730.95,779.5,824.0,992.6
age,1030.0,45.662136,63.169912,1.0,7.0,28.0,56.0,365.0
strength,1030.0,35.817961,16.705742,2.33,23.71,34.445,46.135,82.6


In [65]:
df.apply(lambda x: sum(x.isnull()))

cement          0
slag            0
ash             0
water           0
superplastic    0
coarseagg       0
fineagg         0
age             0
strength        0
dtype: int64

## Data Cleaning

### Identify the outlier
#### By using inter quarter range (IQR) method to detect and remove the outlier
Using IQR method is better than z-score approach, because most of the concrete data is skewed to left as show in the plots.


In [55]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

df_out = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
df_out.shape

(941, 9)

## Spliting Data

### Split By ratio of 70:30

In [56]:
X_train, X_test, y_train, y_test = train_test_split( df_out.loc[:, df.columns != 'strength'], df_out["strength"], random_state = 0) 

## Modelling
### Modelling with Pipeline and GridSearchCV

In [57]:

# Declare the simple pipeline
pipe = Pipeline([
        ('scaler',  StandardScaler()),
        ('reduce_dim', PCA()),
        ('regressor', Lasso())
        ])

#prepareing for the hyperparemeter to be tuned
n_features_to_test = np.arange(1, 9)
alpha_to_test = 2.0**np.arange(-6, +6)
scalers_to_test =  [StandardScaler(), RobustScaler(), QuantileTransformer()]
regressor_to_test = [Ridge(), Lasso()]

params = [
        {'scaler': scalers_to_test,
         'reduce_dim': [PCA()],
         'reduce_dim__n_components': n_features_to_test,
         'regressor': regressor_to_test,
         'regressor__alpha': alpha_to_test},

        {'scaler': scalers_to_test,
         'reduce_dim': [SelectKBest(f_regression)],
         'reduce_dim__k': n_features_to_test,
         'regressor': regressor_to_test,
         'regressor__alpha': alpha_to_test},
    
        {'scaler': scalers_to_test,
         'reduce_dim': [PCA()],
         'reduce_dim__n_components': n_features_to_test,
         'regressor': [LinearRegression()]},
    
        {'scaler': scalers_to_test,
         'reduce_dim': [SelectKBest(f_regression)],
         'reduce_dim__k': n_features_to_test,
         'regressor': [LinearRegression()]}    
        ]


gridsearch = GridSearchCV(pipe, params, scoring='r2', cv=5, n_jobs=-1, verbose=2)

print("pipeline:", [name for name, _ in pipe.steps])
print("parameters:")
pprint(params)
t0 = time()
gridsearch.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print()
print('Final score is: ', gridsearch.score(X_test, y_test))

gridsearch.fit(X_train, y_train)

pipeline: ['scaler', 'reduce_dim', 'regressor']
parameters:
[{'reduce_dim': [PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)],
  'reduce_dim__n_components': array([1, 2, 3, 4, 5, 6, 7, 8]),
  'regressor': [Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
                Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)],
  'regressor__alpha': array([1.5625e-02, 3.1250e-02, 6.2500e-02, 1.2500e-01, 2.5000e-01,
       5.0000e-01, 1.0000e+00, 2.0000e+00, 4.0000e+00, 8.0000e+00,
       1.6000e+01, 3.2000e+01]),
  'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True),
             RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True),
        

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   24.1s
[Parallel(n_jobs=-1)]: Done 366 tasks      | elapsed:   27.6s
[Parallel(n_jobs=-1)]: Done 1178 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 2310 tasks      | elapsed:   42.2s
[Parallel(n_jobs=-1)]: Done 3770 tasks      | elapsed:   54.6s
[Parallel(n_jobs=-1)]: Done 5550 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 6000 out of 6000 | elapsed:  1.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


done in 70.712s

Final score is:  0.8278429794334037
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits


[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 2014 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done 3712 tasks      | elapsed:   30.1s
[Parallel(n_jobs=-1)]: Done 5902 tasks      | elapsed:   47.9s
[Parallel(n_jobs=-1)]: Done 6000 out of 6000 | elapsed:   48.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('regressor', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True), RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True), QuantileTransformer(copy=True, ignore_implicit_zeros=False, n_quantiles=1000,
          output_distribution='uniform'...essor': [LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)]}],
       pre_dispatch='2

In [58]:
print(" Test score: {:.2f}". format( gridsearch.score( X_test, y_test)))

 Test score: 0.83


In [62]:
gridsearch.best_estimator_

Pipeline(memory=None,
     steps=[('scaler', QuantileTransformer(copy=True, ignore_implicit_zeros=False, n_quantiles=1000,
          output_distribution='uniform', random_state=None,
          subsample=100000)), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=8, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('regressor', Ridge(alpha=0.125, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])

In [63]:
gridsearch.best_params_

{'reduce_dim': PCA(copy=True, iterated_power='auto', n_components=8, random_state=None,
   svd_solver='auto', tol=0.0, whiten=False),
 'reduce_dim__n_components': 8,
 'regressor': Ridge(alpha=0.125, copy_X=True, fit_intercept=True, max_iter=None,
    normalize=False, random_state=None, solver='auto', tol=0.001),
 'regressor__alpha': 0.125,
 'scaler': QuantileTransformer(copy=True, ignore_implicit_zeros=False, n_quantiles=1000,
           output_distribution='uniform', random_state=None,
           subsample=100000)}