# Jupyter Notebook: Revisiting a Concrete Strength Regression
##### Jorge Gimenez & Juan Carlos Soriano

In [9]:
from sklearn.datasets import make_regression
import numpy as np
import pandas as pd
%matplotlib notebook
from matplotlib import pyplot as plt
import scipy.stats

# Render every float in pandas output with exactly three decimal places.
pd.set_option('display.float_format', '{:.3f}'.format)

def load_dataset(path):
    """Read a comma-separated file into a DataFrame.

    Args:
        path: Location of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        pandas.DataFrame whose column names are taken from the file's first row.
    """
    return pd.read_csv(path, sep=',', header=0)

# Load the dataset and keep a plain NumPy view of its values.
dataset = load_dataset('./Database/Concrete_Data_Yeh.csv')
data = dataset.values

# Every column except the last is an input attribute; the last column is the target.
x = data[:, :-1]
y = data[:, -1]

print("Dimensio Database:", dataset.shape)
print("Dimensio entrades X:", x.shape)
print("Dimensio atribut Y:", y.shape)


Dimensio Database: (1030, 9)
Dimensio entrades X: (1030, 8)
Dimensio atribut Y: (1030,)


In [10]:
# Count the missing values in each column.
print(dataset.isna().sum())

cement              0
slag                0
flyash              0
water               0
superplasticizer    0
coarseaggregate     0
fineaggregate       0
age                 0
csMPa               0
dtype: int64


In [11]:
# Preview the first five rows of the dataset.
dataset.head(n=5)

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
dataset.describe()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.168,73.896,54.188,181.567,6.205,972.919,773.58,45.662,35.818
std,104.506,86.279,63.997,21.354,5.974,77.754,80.176,63.17,16.706
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [13]:
# Scatter the first input column (cement) against the target (compressive strength).
plt.figure()
ax = plt.scatter(x[:, 0], y)
plt.ylabel("Concrete Compressive Strength (MPa)")
plt.xlabel("Cement (kg/m3)")

<IPython.core.display.Javascript object>

# A figure for every attribute in the dataset, plotted against the Concrete Compressive Strength in MPa
    
    

In [14]:
# Draw one scatter plot per input attribute against the target.
# The feature count comes from x itself (x holds every column but the last),
# so the target column is never visited and no hard-coded column count is needed.
for index, col in enumerate(dataset.columns[:x.shape[1]]):
    plt.figure()
    plt.title(f"Comparació MPa-{col}")
    plt.xlabel(str(col))
    plt.ylabel("Concrete Compressive Strength (MPa)")
    ax = plt.scatter(x[:, index], y)
        

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Correlation Study 

In [51]:
import seaborn as sns

# Pairwise correlation between all columns (pandas default method: Pearson).
corr = dataset.corr()

# Annotated heatmap of the correlation matrix on a fresh figure.
plt.figure()
ax = sns.heatmap(corr, annot=True, linewidths=0.75, cmap='coolwarm')

<IPython.core.display.Javascript object>

In [16]:
# Pairwise plots for all columns with KDE curves on the diagonal,
# then 3-level KDE contours overlaid on the lower-triangle panels.
pp = sns.pairplot(data=dataset, diag_kind="kde", palette="husl")
pp.map_lower(sns.kdeplot, levels=3)

<IPython.core.display.Javascript object>

<seaborn.axisgrid.PairGrid at 0x2e392da8>

## Dirty Data
Let's see what we can do to clean our dataset so that we work with the correct inputs.

In [40]:
# Box plot of the cement column with the individual observations drawn on top.

plt.figure()
sns.boxplot(data=dataset, x="cement")
sns.stripplot(data=dataset, x="cement", size=3, edgecolor="gray")
plt.show()

<IPython.core.display.Javascript object>

Looking at the figure above, we can't distinguish the noise from the useful range: there is a lot of dispersion.

In [44]:
# Side-by-side box plots of the target (csMPa) and three selected attributes.
plt.figure()
ax = sns.boxplot(order=["csMPa", "cement", "water", "age"], data=dataset)
ax.set_title("Most important items Boxplot")

# Optional overlay: draws each individual observation on top of the boxes,
# which shows where the data points concentrate.
#sns.stripplot(data=dataset, order=["csMPa", "cement", "water", "age"], size = 3, edgecolor="gray")

plt.show()

<IPython.core.display.Javascript object>

Could we throw away the dirty data that fall outside the frequent range and so get rid of the possible **noise** in our dataset? 
The next question is what to do with those dataset entries: **delete them** and use only the frequent ones, or **lower their weight** in the final dataset correlation?

What about looking at the highest and lowest values in the Pearson Correlation Table?

In [50]:
# Flatten the correlation matrix into a pair-indexed Series, highest value first.
sortedCorr = corr.unstack().sort_values(ascending=False, kind="quicksort")

# Filter out values equal to 1 (each variable's correlation with itself).
pair_corr = sortedCorr[sortedCorr != 1]

print(pair_corr.head(10), "\n")

print(pair_corr.tail(10))

cement            csMPa              0.498
csMPa             cement             0.498
flyash            superplasticizer   0.378
superplasticizer  flyash             0.378
                  csMPa              0.366
csMPa             superplasticizer   0.366
age               csMPa              0.329
csMPa             age                0.329
water             age                0.278
age               water              0.278
dtype: float64 

csMPa             water              -0.290
water             csMPa              -0.290
slag              flyash             -0.324
flyash            slag               -0.324
                  cement             -0.397
cement            flyash             -0.397
fineaggregate     water              -0.451
water             fineaggregate      -0.451
superplasticizer  water              -0.658
water             superplasticizer   -0.658
dtype: float64


Highest correlation with csMPa

|Item|MPa Corr|
|----|---------------|
|Cement|0.50|
|SuperPlasticizer|0.37|
|Age|0.33|

Lowest correlation with csMPa

|Item|MPa Corr|
|----|---------------|
|Water|-0.29|
|FineAggregate|-0.17|
|CoarseAggregate|-0.16|