In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv("../Data/Advertising.csv", index_col = 0)
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [8]:
number_of_samples, number_of_features = df.shape[0], df.shape[1]-1 # sales is label not feature
number_of_samples, number_of_features

(200, 3)

In [11]:
X, y = df.drop("Sales", axis="columns"), df["Sales"]
X.head()

Unnamed: 0,TV,Radio,Newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4


In [12]:
y.head()

1    22.1
2    10.4
3     9.3
4    18.5
5    12.9
Name: Sales, dtype: float64

## Sklearn  - typical steps

1. train|test split (sometimes train|val|test split)
2. scaling sometimes required
    - min-max scaling
    - standardization
    - ...
    - scale the training data
    - scale the test data to the training data (to avoid data leakage)
3. Fit the algorithm to training data
4. Predict the test data
5. Evaluate

## Train|test split

In [15]:
from sklearn.model_selection import train_test_split

# help(train_test_split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((140, 3), (60, 3), (140,), (60,))

## Feature scaling

Normalization (min-max feature scaling) for each feature

- $X' = \frac{X-X_{min}}{X_{max} - X_{min}}$

Feature standardization (scaling to Z-scores)

- $X' = \frac{X-\mu}{\sigma}$

In [30]:
from sklearn.preprocessing import MinMaxScaler

# instatiate a scaler instance
scaler = MinMaxScaler()
scaler.fit(X_train) # set min max (use training data)

scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

# note scaled_X_test min != 0, scaled_X_test max != 1
scaled_X_train.min(), scaled_X_train.max(), scaled_X_test.min(), scaled_X_test.max()


(0.0, 1.0, 0.005964214711729622, 1.1302186878727631)

## Linear regression

### Ordinary least squares

In [44]:
from sklearn.linear_model import LinearRegression

model_OLS = LinearRegression()
model_OLS.fit(scaled_X_train, y_train)
print(model_OLS.coef_) # beta1, beta2, beta3
print(model_OLS.intercept_) # beta0

[13.02832938  9.88465985  0.69237469]
2.741855324852814


### Stochastic gradient descent

In [56]:
from sklearn.linear_model import SGDRegressor

model_SGD = SGDRegressor(loss = "squared_error", max_iter = 1000)
model_SGD.fit(scaled_X_train, y_train)
print(model_SGD.coef_) # beta1, beta2, beta3
print(model_SGD.intercept_) # beta0

[11.93734016  8.99478506  1.34128065]
[3.59504339]


## Manual prediction

In [89]:
test_sample_features = scaled_X_test[0].reshape(1,-1) # -1 gives remaining so 3, -> (1,3)
test_sample_label = y_test.values[0]
test_sample_features, test_sample_label


(array([[0.54988164, 0.63709677, 0.52286282]]), 16.9)

In [92]:
model_OLS.predict(test_sample_features)[0]

16.565396297434837

In [93]:
model_SGD.predict(test_sample_features)[0]

16.59102186761312

## Evaluation

In [97]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# 1. predict on test data
y_pred_OLS = model_OLS.predict(scaled_X_test)
y_pred_SGD = model_SGD.predict(scaled_X_test)
y_pred_OLS[:5]

array([16.5653963 , 21.18822792, 21.55107058, 10.88923816, 22.20231988])

In [98]:
y_pred_SGD[:5]

array([16.59102187, 20.80656961, 21.09517208, 11.32943884, 21.38488659])

In [99]:
y_test[:5].values

array([16.9, 22.4, 21.4,  7.3, 24.7])

In [105]:
mae_OLS = mean_absolute_error(y_test, y_pred_OLS)
mae_SGD = mean_absolute_error(y_test, y_pred_SGD)

mse_OLS = mean_squared_error(y_test, y_pred_OLS)
mse_SGD = mean_squared_error(y_test, y_pred_SGD)

rmse_OLS = np.sqrt(mae_OLS)
rmse_SGD = np.sqrt(mae_SGD)

print(f"{mae_OLS=:.4f} \t\t {mse_OLS=:.4f} \t {rmse_OLS=:.4f}")
print(f"{mae_SGD=:.4f} \t\t {mse_SGD=:.4f} \t {rmse_SGD=:.4f}")


mae_OLS=1.5117 		 mse_OLS=3.7968 	 rmse_OLS=1.2295
mae_SGD=1.5230 		 mse_SGD=4.0974 	 rmse_SGD=1.2341
