In [None]:
# reading the CSV

import pandas as pd
import duckdb as db

# Load the advertising data; the first CSV column becomes the row index
df = pd.read_csv("../../data/Advertising.csv", index_col=0)

# The SQL refers to the DataFrame above by its Python variable name `df`
advertising_sql = """
    SELECT
        TV, 
        radio,
        newspaper,
        sales
    FROM
        df
    """

# Run the query with DuckDB and convert the result back to a pandas DataFrame
query = db.query(advertising_sql).df()

query

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,4.9,8.1,9.7
197,177.0,9.3,6.4,12.8
198,283.6,42.0,66.2,25.5


In [16]:
# Continue working with the DuckDB query result under the name `df`
df = query

n_samples, n_columns = df.shape

print(f"We have {n_samples} samples")
# Every column except the target counts as a feature
print(f"We have {n_columns - 1} features")
print("Sales is our target")

We have 200 samples
We have 3 features
Sales is our target


# Divide data into X and y

In [30]:
# Target: the sales column; features: every remaining column
y = df["sales"]
X = df.drop(columns="sales")

X.head()

Unnamed: 0,TV,radio,newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4


In [33]:
# Preview the first rows of the target (sales) Series
y.head()

0    22.1
1    10.4
2     9.3
3    18.5
4    12.9
Name: sales, dtype: float64

## Scikit-learn steps

1. train/test split or train/val/test split
2. scale dataset
    - many algorithms require scaling, some don't
    - different types of scaling exist (e.g. feature standardization, min-max scaling)
    - scale both the training data and the test data using parameters fitted on the training data only, to avoid data leakage
3. Fit algorithm to training data
4. Predict on test data
5. Evaluation metrics

- many algorithms require scaling, some don't
- different types of scaling
- predicting a real number (Swedish: decimaltal) is regression
- predicting discrete values (Swedish: heltal, i.e. integers) is classification
- we have targets, which means we are using 'Supervised learning'
- X = design matrix / feature matrix / features / independent variables
- y = target / dependent variable


# train test split

In [38]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"{X_train.shape = }")

X_train.shape = (134, 3)


In [41]:
from sklearn.model_selection import train_test_split

# Same split as before (33% test, seed 42), now reporting all four shapes
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

print(f"{X_train.shape = }", f"{y_train.shape = }", sep="\n")
print(f"{X_test.shape = }", f"{y_test.shape = }", sep="\n")


X_train.shape = (134, 3)
y_train.shape = (134,)
X_test.shape = (66, 3)
y_test.shape = (66,)


In [42]:
# Preview the training features; the row labels are no longer in the original order
X_train.head()

Unnamed: 0,TV,radio,newspaper
42,293.6,27.7,1.8
189,18.7,12.1,23.4
90,134.3,4.9,9.3
136,25.6,39.0,9.3
51,100.4,9.6,3.6


In [43]:
# Preview the training targets; they carry the same row labels as X_train
y_train.head()

42     20.7
189     6.7
90     11.2
136     9.5
51     10.7
Name: sales, dtype: float64

# 2. feature scaling

In [45]:
from sklearn.preprocessing import MinMaxScaler

# Instantiate a MinMaxScaler; it is not fitted to any data yet
scaler = MinMaxScaler()

# A cell only displays its last expression, so show the scaler (and its
# parameters) here rather than a bare expression earlier in the cell
scaler

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [48]:
# Fit the scaler on the training data only and scale it in the same step
scaled_X_train = scaler.fit_transform(X_train)
# Reuse the training min/max for the test data (no refit, avoids data leakage)
scaled_X_test = scaler.transform(X_test)

# Training values span exactly [0, 1]; test values may fall outside that range
print(f"{scaled_X_train.min() = }", f"{scaled_X_train.max() = }", sep="\n")
print(f"{scaled_X_test.min() = }", f"{scaled_X_test.max() = }", sep="\n")


scaled_X_train.min() = np.float64(0.0)
scaled_X_train.max() = np.float64(1.0)
scaled_X_test.min() = np.float64(0.005964214711729622)
scaled_X_test.max() = np.float64(1.1302186878727631)


In [50]:
# Scaling changes the values, not the dimensions: same shape as X_train
scaled_X_train.shape

(134, 3)

In [None]:
# scaled_X_train is now a NumPy array, so slice it instead of calling .head()
scaled_X_train[:5]

array([[0.99053094, 0.55846774, 0.01491054],
       [0.06087251, 0.24395161, 0.22962227],
       [0.45180927, 0.09879032, 0.08946322],
       [0.08420697, 0.78629032, 0.08946322],
       [0.33716605, 0.19354839, 0.03280318]])

# 3. Linear regression

In [54]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the scaled training features
model = LinearRegression().fit(scaled_X_train, y_train)

# One coefficient per feature column, plus the intercept
print(f"Parameters: {model.coef_}")
print(f"Intercept: {model.intercept_}")

Parameters: [13.20747617  9.75285112  0.61108329]
Intercept: 2.7911595196243653


# 4. Prediction

In [55]:
# Take the first scaled test row as a single sample (a 1-D array of feature values)
sample_features = scaled_X_test[0]
sample_features

array([0.54988164, 0.63709677, 0.52286282])

In [59]:
# reshape(1, -1) turns the 1-D sample into a 2-D array with one row before predicting
model.predict(sample_features.reshape(1, -1))


array([16.58673085])

In [60]:
# True sales value of the first test row, for comparison with the prediction
y_test.iloc[0]

np.float64(16.9)

## Predict on whole test data

In [62]:
# Predict sales for every row of the scaled test set and preview the first five
y_pred = model.predict(scaled_X_test)
y_pred[:5]

array([16.58673085, 21.18622524, 21.66752973, 10.81086512, 22.25210881])

In [63]:
# First five true test targets, to compare with the first five predictions
y_test.head()

95     16.9
15     22.4
30     21.4
158     7.3
128    24.7
Name: sales, dtype: float64

# 5. Evaluation 
- MAE – mean absolute error
- MSE – mean squared error
- RMSE – root mean squared error

In [64]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Error metrics on the held-out test set
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
# RMSE is the square root of the MSE
rmse = np.sqrt(mse)

print(f"{mae = }", f"{mse = }", f"{rmse = }", sep="\n")

mae = 1.4937750024728977
mse = 3.72792833068152
rmse = np.float64(1.9307843822347228)
