In [28]:
# pandas display options used when DataFrames are rendered in the notebook
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 36)
pd.set_option("display.max_colwidth", 72)

# single notebook-wide seed; also used to seed NumPy's global random state
seed = 42
import numpy as np
np.random.seed(seed)

# graphics
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 24})
plt.rcParams["figure.figsize"] = (20,8)
#plt.style.use('fivethirtyeight')
# keyword arguments for requesting a monospace font in matplotlib text calls
mono_font = {'fontname':'monospace'}

import seaborn as sns
# NOTE(review): sns.set() applies seaborn's plotting context, which rewrites
# matplotlib's font-size rcParams, so the 'font.size': 24 set above is
# probably overridden here -- confirm which setting is intended.
sns.set(font_scale=1)
#sns.set_style("whitegrid")

import plotly.io as pio
# for use in JupyterLab 4
#pio.renderers.default = 'iframe'
# for use in Google Colab
# NOTE(review): the renderer is hard-coded for Colab; switch to the 'iframe'
# line above when running in JupyterLab.
pio.renderers.default = 'colab'
import plotly as py
import plotly.express as px

#!pip install -q --no-deps scikit_learn==1.7.1
#import sklearn
#sklearn.set_config(transform_output="pandas")
#sklearn.set_config(transform_output="polars")

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV

# for regression problems
from sklearn.linear_model import LinearRegression
# metrics
from sklearn.metrics import mean_squared_error, root_mean_squared_error

# Introduction
> Concrete is the most important material in civil engineering. The
concrete compressive strength is a highly nonlinear function of age and
ingredients. These ingredients include cement, blast furnace slag, fly ash,
water, superplasticizer, coarse aggregate, and fine aggregate.

Our task is to predict the concrete compressive strength.<br>
I.e. our `target` is the `Strength` column.


In [29]:
# Concrete compressive-strength dataset, read from a public CSV on GitHub.
DATA_URL = (
    "https://raw.githubusercontent.com/Carl-McBride-Ellis/datasets_public/"
    "refs/heads/main/concrete_data.csv"
)
df = pd.read_csv(DATA_URL)
df

Unnamed: 0,Cement,Blast_Furnace_Slag,Fly_Ash,Water,Superplasticizer,Coarse_Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.052780
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.284354
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.178794
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.696601
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.768036


## 1. EDA
Perform an EDA of the dataset, for example looking at summary statistics, univariate histograms and bivariate scatter plots.

In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

Unnamed: 0,Cement,Blast_Furnace_Slag,Fly_Ash,Water,Superplasticizer,Coarse_Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.165631,73.895485,54.187136,181.566359,6.203112,972.918592,773.578883,45.662136,35.817836
std,104.507142,86.279104,63.996469,21.355567,5.973492,77.753818,80.175427,63.169912,16.705679
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0,2.331808
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.707115
50%,272.9,22.0,0.0,185.0,6.35,968.0,779.51,28.0,34.442774
75%,350.0,142.95,118.27,192.0,10.16,1029.4,824.0,56.0,46.136287
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.599225


## 2. Data splitting
Split the data into disjoint training and test datasets

In [31]:
# Build the feature matrix and the target WITHOUT mutating `df`.
# (`X = df` followed by `df.pop('Strength')` altered the loaded frame in
# place, so re-running this cell raised a KeyError and `df` no longer
# matched what the earlier cells displayed.)
X = df.drop(columns='Strength')
y = df['Strength']

# Hold out 20% of the rows as a test set; reuse the notebook-wide `seed`
# so the split stays reproducible and consistent with the rest of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

## 3. Re-scale the data
For linear regression we should re-scale the data, so that the fitted coefficients are on a comparable scale and can be compared between features.

In [32]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted transformation to both the training and the test features.
scaler = RobustScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## 4. Our estimator

In [33]:
# Linear regression estimator with scikit-learn's default settings.
regressor = LinearRegression()

## 5. Fit the model and look at the feature importance

In [34]:
# Fit the linear model on the re-scaled training features.
regressor.fit(X_train, y_train)

# Feature importance: the features were re-scaled before fitting, so the
# magnitude of each coefficient gives a rough measure of that feature's
# influence. X_train is a plain array after scaling, so the labels come from
# the columns of `X`, whose order the split and the scaler preserve.
feature_importance = pd.Series(
    regressor.coef_, index=X.columns, name="coefficient"
).sort_values(key=abs, ascending=False)
feature_importance

## 6. Calculate the baseline score

In [35]:
# Naive baseline: predict the mean of the training target for every test row
# and score that constant prediction with the RMSE.
from sklearn.metrics import root_mean_squared_error as metric

y_train_mean = np.full(shape=len(y_test), fill_value=np.mean(y_train))
metric(y_test, y_train_mean)

16.05368658359623

## 7. Cross-validation score

In [36]:
from sklearn.metrics import make_scorer, root_mean_squared_error
from sklearn.model_selection import KFold, cross_val_score

# Regression using the L2 loss: score each fold with the RMSE.
RMSE = make_scorer(root_mean_squared_error)

# 5-fold, shuffled cross-validation on the training data only.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
CV_scores = cross_val_score(
    estimator=regressor,
    X=X_train,
    y=y_train,
    scoring=RMSE,
    cv=kf,
    n_jobs=-1,
)

print("CV scores: ", CV_scores)
# Quadratic mean of the per-fold RMSE values.
print("RMS of CV RMSE: ", np.sqrt(np.square(CV_scores).mean()))
print("Std. dev of CV score: ", CV_scores.std())

CV scores:  [10.47070075 10.73576305 11.46043754 10.46015661 10.01173939]
RMS of CV RMSE:  10.638454896732606
Std. dev of CV score:  0.476918534127761


## 8. Results and conclusions

In [37]:
# Predict the strength of the held-out test rows with the fitted model
# (X_test was already transformed by the scaler fitted on the training data).
y_pred = regressor.predict(X_test)