# 03 Error Estimation for Linear Regression and 3NN
 

In [1]:
# Load the autoreload extension and set it to mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Imports: data handling, preprocessing, models, and model-selection / evaluation utilities.

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_absolute_error


# Introductory Remarks

- We're going to predict the strength of concrete!
- We have a labeled dataset, originally described in I-Cheng Yeh, "Modeling of strength of high performance concrete using artificial neural networks," *Cement and Concrete Research*, Vol. 28, No. 12, pp. 1797-1808 (1998).
- Nowadays, it is available from the UC Irvine Machine Learning Repository. I have taken a copy and made it available to you as a CSV file called `dataset_concrete.csv`.
- We will use error estimation to compare linear regression and 3NN.




In [3]:
# Read the concrete-strength dataset from the local CSV copy and display it.
df = pd.read_csv("datasets/dataset_concrete.csv")
df

Unnamed: 0,cement,slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.30
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.18
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.70
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.77


In [4]:
# Shuffle every row (frac=1) with a fixed seed so the order is reproducible,
# then renumber the index from 0, discarding the pre-shuffle index.
df = df.sample(frac=1, random_state=2).reset_index(drop=True)

In [5]:
# Names of the input columns used to predict strength.
features = [
    "cement",
    "slag",
    "fly_ash",
    "water",
    "superplasticizer",
    "coarse_aggregate",
    "fine_aggregate",
    "age",
]

# Feature matrix stays a DataFrame; the target becomes a NumPy array.
X = df[features]
y = df["strength"].values

In [6]:
df.dtypes

cement              float64
slag                float64
fly_ash             float64
water               float64
superplasticizer    float64
coarse_aggregate    float64
fine_aggregate      float64
age                   int64
strength            float64
dtype: object

In [7]:
# X and y were already extracted from df in the earlier feature-selection
# cell, so re-assigning them here was redundant. Instead, sanity-check them
# before modelling; X and y themselves are left untouched.
assert list(X.columns) == features, "X must contain exactly the feature columns"
assert len(X) == len(y), "X and y must have the same number of examples"
# LinearRegression cannot handle missing values, so fail early if any exist.
assert not X.isna().any().any(), "features contain missing values"
assert not np.isnan(y).any(), "target contains missing values"

### Predictor

In [8]:
# create an object that shuffles and splits the data:
# one split (n_splits=1) with 80% of the examples used for training,
# seeded with random_state=2 so the split is reproducible
ss = ShuffleSplit(n_splits=1, train_size=0.8, random_state=2)

In [9]:
# Preprocessor: standardise every feature column; any column not listed
# would be passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[("scaler", StandardScaler(), features)],
    remainder="passthrough",
)

In [10]:
# Build a pipeline that chains the preprocessor with linear regression.
l_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("predictor", LinearRegression()),
    ]
)

# Fit on the whole dataset ...
l_model.fit(X, y)

# ... and report the mean absolute error on that same data (training error).
mean_absolute_error(y, l_model.predict(X))

8.214343706221815

In [11]:
# Predictions of the fitted linear model on the same data it was trained on.
y_predictions = l_model.predict(X)

In [12]:
# Training-set MAE, reusing the predictions computed in the previous cell
# instead of calling predict() a second time.
mean_absolute_error(y, y_predictions)

8.214343706221815

### Error Estimation

In [13]:
# Choose between holdout and k-fold cross-validation: about 1000 examples is smallish, so use k-fold.

In [14]:
# 10-fold cross-validation of the linear model. The scorer is the *negated*
# MAE, so the displayed value is negative; average the per-fold scores.
cross_val_score(
    l_model,
    X,
    y,
    scoring="neg_mean_absolute_error",
    cv=10,
).mean()

# Note: with holdout there is no mean to take.

-8.291989935333197

# KNN MODEL

In [15]:
# Split off the test set: 80% of the rows go to the development set and the
# remaining 20% to the test set, with a fixed seed for reproducibility.
dev_df, test_df = train_test_split(
    df,
    train_size=0.8,
    random_state=2,
)

In [16]:
# Feature matrices: kept as DataFrames (columns remain addressable by name).
dev_X, test_X = dev_df[features], test_df[features]

# Targets: the "strength" column of each split as a 1-D NumPy array.
dev_y, test_y = dev_df["strength"].values, test_df["strength"].values