## 2. Machine Learning for Regression


In [None]:
import pandas as pd
import numpy as np

## 2.2 Data preparation

In [None]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'

In [None]:
!wget $data 

In [None]:
# Read the CSV that the `!wget $data` cell above saved into the working directory.
# A relative path keeps the notebook runnable on any machine.
df = pd.read_csv('data.csv')

In [None]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [None]:
df['make'].str.lower().str.replace(' ', '_')

In [None]:
strings = list(df.dtypes[df.dtypes == 'object'].index)
strings

In [None]:
# Give the values of every string column the same treatment as the column
# names: lowercase first, then spaces replaced with underscores.
for col in strings:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(' ', '_')

In [None]:
df.dtypes

## 2.3 Exploratory data analysis

In [None]:
# For each column: its name, the first five distinct values, and the number
# of distinct values, followed by a blank separator line.
for col in df.columns:
    column = df[col]
    print(col)
    print(column.unique()[:5])
    print(column.nunique())
    print()

In [None]:
df

Distribution of price

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
sns.histplot(df.msrp, bins=50)

In [None]:
sns.histplot(df.msrp[df.msrp < 100000], bins=50)

In [None]:
np.log1p([0, 1, 10, 1000, 100000])

In [None]:
# log1p(x) is log(x + 1): adding 1 to every input by hand reproduces the
# np.log1p result from the previous cell.
np.log([0 + 1, 1 + 1, 10 + 1, 1000 + 1, 100000 + 1])

In [None]:
price_logs = np.log1p(df.msrp)

In [None]:

sns.histplot(price_logs, bins=50)

Missing values

In [None]:
df.isnull().sum()

## 2.4 Setting up the validation framework

Let's draw it

In [None]:
# Total number of rows in the dataset.
n = len(df)

# Validation and test each get 20% of the rows (rounded down); training gets
# whatever is left, so the three sizes always add up to n.
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

In [None]:
n

In [None]:
n_val, n_test, n_train

In [None]:
df.iloc[[10, 0, 3, 5]]

In [None]:
# Sequential split: rows are taken in their existing order.
# df_train / df_val / df_test are reassigned further down using shuffled positions.
df_train = df.iloc[:n_train]
df_val = df.iloc[n_train:n_train+n_val]
df_test = df.iloc[n_train+n_val:]

In [None]:
idx = np.arange(n)

In [None]:
# Fix the seed so the shuffle below gives the same order on every run.
np.random.seed(2)
# Shuffles idx in place.
np.random.shuffle(idx)

In [None]:
# Split by shuffled row positions: the first n_train positions go to training,
# the next n_val to validation, and the remainder to test.
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

In [None]:
df_train.head()

In [None]:
len(df_train), len(df_val), len(df_test)

In [None]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
# Targets: log(1 + msrp) for each split, as plain NumPy arrays.
y_train = np.log1p(df_train.msrp.values)
y_val = np.log1p(df_val.msrp.values)
y_test = np.log1p(df_test.msrp.values)

In [None]:
# Remove the target column from the feature frames so it cannot end up
# among the model inputs.
del df_train['msrp']
del df_val['msrp']
del df_test['msrp']

In [None]:
len(y_train)

## 2.5 Linear regression

draw

In [None]:
df_train.iloc[10]

In [None]:
xi = [453, 11, 86]
w0 = 7.17
w = [0.01, 0.04, 0.002]

In [None]:
def linear_regression(xi):
    """Predict for one observation: bias plus the weighted sum of its features.

    Reads the bias ``w0`` and the weight sequence ``w`` from the notebook's
    global namespace; ``w`` needs at least ``len(xi)`` entries.
    """
    prediction = w0
    for j, feature in enumerate(xi):
        prediction = prediction + w[j] * feature
    return prediction

In [None]:
xi = [453, 11, 86]
w0 = 7.17
w = [0.01, 0.04, 0.002]

In [None]:
linear_regression(xi)

In [None]:
np.expm1(12.312)

In [None]:
np.log1p(222347.2221101062)

## 2.6 Linear regression vector form

In [None]:
def dot(xi, w):
    """Return the dot product of ``xi`` and ``w`` as a float.

    Iterates over every element of ``xi`` and indexes ``w`` at the same
    position, so ``w`` needs at least as many elements as ``xi``.
    """
    total = 0.0
    for j, x in enumerate(xi):
        total = total + x * w[j]
    return total

In [None]:
def linear_regression(xi):
    """Predict for one observation as the global bias ``w0`` plus ``dot(xi, w)``."""
    weighted_sum = dot(xi, w)
    return w0 + weighted_sum

In [None]:
w_new = [w0] + w

In [None]:
w_new

In [None]:
def linear_regression(xi):
    """Predict for one observation with the bias folded into the weight vector.

    A constant 1 is prepended to ``xi`` so that it lines up with the first
    entry of the global ``w_new``. The caller's list is left unmodified.
    """
    return dot([1] + xi, w_new)

In [None]:
linear_regression(xi)

In [None]:
w0 = 7.17
w = [0.01, 0.04, 0.002]
w_new = [w0] + w

In [None]:
x1  = [1, 148, 24, 1385]
x2  = [1, 132, 25, 2031]
x10 = [1, 453, 11, 86]

X = [x1, x2, x10]
X = np.array(X)
X

In [None]:
def linear_regression(X):
    """Predict for every row of ``X`` at once via a matrix-vector product.

    ``X`` must expose ``.dot`` (e.g. a NumPy array). Its columns are matched
    against the entries of the global ``w_new``.
    """
    predictions = X.dot(w_new)
    return predictions

In [None]:
linear_regression(X)

## 2.7 Training a linear regression model

In [None]:
def train_linear_regression(X, y):
    """Placeholder: does nothing yet; the implementation is given further down."""
    return None

In [None]:
X = [
    [148, 24, 1385],
    [132, 25, 2031],
    [453, 11, 86],
    [158, 24, 185],
    [172, 25, 201],
    [413, 11, 86],
    [38,  54, 185],
    [142, 25, 431],
    [453, 31, 86],
]

X = np.array(X)
X

In [None]:
ones = np.ones(X.shape[0])
ones

In [None]:
X = np.column_stack([ones, X])

In [None]:
y = [10000, 20000, 15000, 20050, 10000, 20000, 15000, 25000, 12000]

In [None]:
# Normal equation: w = (X^T X)^-1 X^T y
XTX = X.T.dot(X)
XTX_inv = np.linalg.inv(XTX)
w_full = XTX_inv.dot(X.T).dot(y)

In [None]:
w0 = w_full[0]
w = w_full[1:]

In [None]:
w0, w

In [None]:
def train_linear_regression(X, y):
    """Fit a linear regression with the normal equation.

    A column of ones is prepended to ``X`` inside this function, so ``X``
    must be passed without a bias column.

    Returns a tuple ``(w0, w)``: the bias term and the array of feature
    weights, in the column order of ``X``.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    # w = (X^T X)^-1 X^T y
    gram = design.T.dot(design)
    weights = np.linalg.inv(gram).dot(design.T).dot(y)

    intercept, coefficients = weights[0], weights[1:]
    return intercept, coefficients

In [None]:
# X already had a column of ones prepended a few cells above, and
# train_linear_regression adds its own. Passing X unchanged would give two
# identical bias columns and a singular X^T X, so drop the first column here.
train_linear_regression(X[:, 1:], y)

## 2.8 Car price baseline model

In [None]:
df_train.columns

In [None]:
# Numeric columns used as the baseline feature set.
base = ['engine_hp', 'engine_cylinders', 'highway_mpg',
        'city_mpg', 'popularity']

# Missing values are replaced with 0 before converting to a NumPy array.
X_train = df_train[base].fillna(0).values

w0, w = train_linear_regression(X_train, y_train)

# Predictions on the training data itself.
y_pred = w0 + X_train.dot(w)

In [None]:
w0

In [None]:
w

In [None]:
sns.histplot(y_pred, color='red', alpha=0.5, bins=50)
sns.histplot(y_train, color='blue', alpha=0.5, bins=50)


## 2.9 RMSE

In [None]:
def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``.

    Both arguments must support element-wise subtraction and the result must
    expose ``.mean()`` (e.g. NumPy arrays of equal length).
    """
    residuals = y - y_pred
    return np.sqrt((residuals ** 2).mean())

In [None]:
rmse(y_train, y_pred)

## 2.10 Validating the model

In [None]:
def prepare_X(df):
    """Build the feature matrix from the columns listed in the global ``base``.

    Missing values are replaced with 0. Returns a NumPy array; ``df`` itself
    is not modified.
    """
    return df[base].fillna(0).values

In [None]:
# Fit on the training split.
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)

# Score on the validation split.
X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)

## 2.11 Simple feature engineering

In [None]:
def prepare_X(df):
    """Build the feature matrix from the ``base`` columns plus the car's age.

    ``age`` is computed as ``2017 - year``. Missing values are replaced with
    0. Returns a NumPy array; the input frame is not modified.
    """
    # assign() returns a new frame, so the caller's DataFrame is untouched.
    enriched = df.assign(age=2017 - df['year'])
    features = base + ['age']
    return enriched[features].fillna(0).values

In [None]:
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)

In [None]:
sns.histplot(y_pred, label='prediction', color='red', alpha=0.5, bins=50)
sns.histplot(y_val, label='target', color='blue',  alpha=0.5, bins=50)
plt.legend()

## 2.12 Categorical variables

In [None]:
# Categorical columns to encode.
categorical_columns = [
    'make',
    'model',
    'engine_fuel_type',
    'driven_wheels',
    'market_category',
    'vehicle_size',
    'vehicle_style',
]

# For each categorical column, keep its five most frequent values in the
# training split (value_counts() sorts by frequency, most frequent first).
categorical = {}
for c in categorical_columns:
    categorical[c] = list(df_train[c].value_counts().index[:5])

In [None]:
def prepare_X(df):
    """Build the feature matrix: base columns, age, door and category indicators.

    Feature columns, in order:
      * the columns listed in the global ``base``;
      * ``age`` = ``2017 - year``;
      * one 0/1 indicator for each of 2, 3 and 4 doors;
      * one 0/1 indicator for every (column, value) pair in the global
        ``categorical`` dict.

    Missing values are replaced with 0. Works on a copy, so the input frame
    is not modified. Returns a NumPy array.
    """
    df = df.copy()

    df['age'] = 2017 - df['year']
    features = base + ['age']

    for v in [2, 3, 4]:
        column = 'num_doors_%d' % v
        df[column] = (df['number_of_doors'] == v).astype(int)
        features.append(column)

    for name, values in categorical.items():
        for value in values:
            column = '%s_%s' % (name, value)
            df[column] = (df[name] == value).astype(int)
            features.append(column)

    return df[features].fillna(0).values

In [None]:
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)

In [None]:
w0, w

## 2.13 Regularization

In [None]:
X = [
    [4, 4, 4],
    [3, 5, 5],
    [5, 1, 1],
    [5, 4, 4],
    [7, 5, 5],
    [4, 5, 5.00000001],
]

X = np.array(X)
X

In [None]:
y= [1, 2, 3, 1, 2, 3]

In [None]:
XTX = X.T.dot(X)
XTX

In [None]:
XTX_inv = np.linalg.inv(XTX)

In [None]:
XTX_inv

In [None]:
XTX_inv.dot(X.T).dot(y)

In [None]:
XTX = [
    [1, 2, 2],
    [2, 1, 1.0000001],
    [2, 1.0000001, 1]
]

XTX = np.array(XTX)

In [None]:
np.linalg.inv(XTX)

In [None]:
XTX = XTX + 0.01 * np.eye(3)

In [None]:
np.linalg.inv(XTX)

In [None]:
def train_linear_regression_reg(X, y, r=0.001):
    """Fit a linear regression with the normal equation plus regularization.

    A column of ones is prepended to ``X`` inside this function. ``r`` is
    added to every diagonal entry of X^T X (the bias entry included) before
    the matrix is inverted.

    Returns a tuple ``(w0, w)``: the bias term and the array of feature
    weights, in the column order of ``X``.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    gram = design.T.dot(design)
    regularized = gram + r * np.eye(gram.shape[0])

    weights = np.linalg.inv(regularized).dot(design.T).dot(y)

    intercept, coefficients = weights[0], weights[1:]
    return intercept, coefficients

In [None]:
X_train = prepare_X(df_train)
w0, w = train_linear_regression_reg(X_train, y_train, r=0.01)

X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)

## 2.14 Tuning the model

In [None]:
# The feature matrices do not depend on r, so build them once rather than
# on every iteration.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

# Try several regularization strengths and report, for each one, the bias
# term and the validation RMSE.
for r in [0.0, 0.00001, 0.0001, 0.001, 0.1, 1, 10]:
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)

    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)

    print(r, w0, score)

In [None]:
r = 0.001
X_train = prepare_X(df_train)
w0, w = train_linear_regression_reg(X_train, y_train, r=r)

X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)
score = rmse(y_val, y_pred)
score

## 2.15 Using the model

In [None]:
df_full_train = pd.concat([df_train, df_val])

In [None]:
df_full_train = df_full_train.reset_index(drop=True)

In [None]:
X_full_train = prepare_X(df_full_train)

In [None]:
X_full_train

In [None]:
y_full_train = np.concatenate([y_train, y_val])

In [None]:
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)

In [None]:
# Score on the held-out test split, using whatever w0 and w currently hold
# (when cells are run in order, the model fitted in the cell above).
X_test = prepare_X(df_test)
y_pred = w0 + X_test.dot(w)
score = rmse(y_test, y_pred)
score

In [None]:
car = df_test.iloc[20].to_dict()
car

In [None]:
df_small = pd.DataFrame([car])
df_small

In [None]:
X_small = prepare_X(df_small)

In [None]:
y_pred = w0 + X_small.dot(w)
y_pred = y_pred[0]
y_pred

In [None]:
np.expm1(y_pred)

In [None]:
np.expm1(y_test[20])

## 2.16 Next steps

* We included only the 5 most frequent values of each categorical variable. What happens if we include 10?

Other projects

* Predict the price of a house - e.g. boston dataset
* https://archive.ics.uci.edu/ml/datasets.php?task=reg
* https://archive.ics.uci.edu/ml/datasets/Student+Performance

## 2.17 Summary

* EDA - looking at data, finding missing values
* Target variable distribution - long tail => bell shaped curve
* Validation framework: train/val/test split (helped us detect problems)
* Normal equation - not magic, but math
* Implemented it with numpy
* RMSE to validate our model
* Feature engineering: age, categorical features
* Regularization to fight numerical instability