# Linear Regression Code alongs

- we have labels -> supervised learning
- try to predict real number -> regression
- predict discrete values -> classification


In [77]:
import pandas as pd

df = pd.read_csv("../../data/Advertising.csv", index_col=0)
df.head()

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [78]:
df.shape

(200, 4)

In [79]:
print(f"{df.shape[0]} samples")
print(f"{df.shape[1] - 1} features")
print("sales column is our label/target")

200 samples
3 features
sales column is our label/target


## EDA left for the reader ...

## Divide data into X and y

In [80]:
# tuple unpacking 
# X - design matrix / feature matrix / features / independent variables
# y - target variable / label / dependent variable
X, y = df.drop("sales", axis = "columns"), df["sales"]
X.head()

Unnamed: 0,TV,radio,newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4


In [81]:
y.head()

1    22.1
2    10.4
3     9.3
4    18.5
5    12.9
Name: sales, dtype: float64

In [82]:
type(y), type(X)

(pandas.core.series.Series, pandas.core.frame.DataFrame)

## Scikit-learn steps

1. train|test split or train|val|test split
2. scale dataset 
    - many algorithms require scaling, some don't
    - there exists different types of scaling (e.g. feature standardization, min-max scaling)
    - scale training data and test data to the training datas parameters to avoid data leakage
3. Fit algorithm to training data 
4. Predict on test data 
5. Evaluation metrics

## 1. train|test split 

In [83]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, 
)

print(f"{X_train.shape = }")
print(f"{y_train.shape = }")
print(f"{X_test.shape = }")
print(f"{y_test.shape = }")

X_train.shape = (134, 3)
y_train.shape = (134,)
X_test.shape = (66, 3)
y_test.shape = (66,)


In [84]:
X_train.head()

Unnamed: 0,TV,radio,newspaper
72,109.8,14.3,31.7
15,204.1,32.9,46.0
10,199.8,2.6,21.2
51,199.8,3.1,34.6
178,170.2,7.8,35.2


In [85]:
y_train.head()

72     12.4
15     19.0
10     10.6
51     11.4
178    11.7
Name: sales, dtype: float64

## 2. feature scaling

- min-max scaling 
- values transformed into 0 to 1

In [None]:
from sklearn.preprocessing import MinMaxScaler

# instantiate an instance from the MinMaxScaler class
scaler = MinMaxScaler()
type(scaler)

sklearn.preprocessing._data.MinMaxScaler

In [87]:
scaler

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [93]:
scaler.fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

print(f"{scaled_X_train.min() = }")
print(f"{scaled_X_train.max() = }")
print(f"{scaled_X_test.min() = }")
print(f"{scaled_X_test.max() = }")


scaled_X_train.min() = np.float64(0.0)
scaled_X_train.max() = np.float64(1.0)
scaled_X_test.min() = np.float64(-0.016151202749140895)
scaled_X_test.max() = np.float64(1.0040485829959516)


In [95]:
scaled_X_train.shape

(134, 3)

In [97]:
scaled_X_train[:5]

array([[0.35876289, 0.28947368, 0.27616535],
       [0.68281787, 0.6659919 , 0.40193492],
       [0.66804124, 0.05263158, 0.18381706],
       [0.66804124, 0.06275304, 0.30167106],
       [0.56632302, 0.15789474, 0.30694811]])

In [98]:
type(scaled_X_train)

numpy.ndarray

## 3. Linear regression

$y = w_0 + w_1x_1 + w_2x_2 + w_3x_3$

In [99]:
from sklearn.linear_model import LinearRegression

# instantiate an instance from LinearRegression class
model = LinearRegression()
model


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [102]:
model.fit(scaled_X_train, y_train)
print(f"Parameters or weights: {model.coef_}")
print(f"Intercept: {model.intercept_}")

Parameters or weights: [12.70257155  9.17599532 -0.70452187]
Intercept: 3.7403672496951543


## 4. Prediction

In [112]:
X_test.iloc[0]

TV           210.8
radio         49.6
newspaper     37.7
Name: 59, dtype: float64

In [107]:
sample_features = scaled_X_test[0].reshape(1, -1)
sample_features

array([[0.70584192, 1.00404858, 0.3289358 ]])

In [108]:
model.predict(sample_features)

array([21.68777744])

In [110]:
y_test.iloc[0]

np.float64(23.8)

### predict on whole test data

In [114]:
y_pred = model.predict(scaled_X_test)
y_pred[:5]

array([21.68777744, 10.77384283, 15.48852117,  8.07005988, 12.48362187])

In [115]:
y_test.iloc[:5]

59     23.8
181    10.5
88     16.0
130     9.7
180    12.6
Name: sales, dtype: float64

## 5. evaluate

common metrics for regression case 
- mae - mean absolute error
- mse - mean squared error
- rmse - root mean squared error


In [116]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np 

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"{mae = }")
print(f"{mse = }")
print(f"{rmse = }")


mae = 1.3101700821097393
mse = 3.7971989225713703
rmse = np.float64(1.9486402753128578)
