In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt

In [None]:
# Load the advertising dataset; every column except 'Sales' is used as a feature
# and 'Sales' itself is the regression target.
df = pd.read_csv('advertising.csv')

X = df.drop(columns=['Sales'])
y = df['Sales']

# The preview goes last: a notebook cell only renders its final expression, so a
# bare `df.head(10)` in the middle of the cell was silently discarded.
df.head(10)

In [None]:
# Split features and target into train and test parts, half of the rows each;
# the fixed seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.5, random_state=71
)

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# that same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

In [None]:
# Candidate regressors, keyed by the name printed in the evaluation reports.
# The tree-based estimators draw random numbers while fitting, so they are
# seeded (with the same seed as the train/test split) to make reruns of the
# notebook produce the same scores.
models = {
    'tree': DecisionTreeRegressor(max_depth=5, random_state=71),
    'linear': LinearRegression(),
    'random_forest': RandomForestRegressor(n_estimators=20, max_depth=10,
                                           random_state=71),
    'knn': KNeighborsRegressor(n_neighbors=10),
}

`R squared` - the closer to 1 the better

`Mean squared error` - the closer to 0, the better

In [None]:
prec = 3  # number of decimals shown for every metric

# Fit each model on the raw (unscaled) features, then report R squared and
# MSE on both the training and the test split.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)

    # One print call with a newline separator emits the same lines as
    # printing each of them individually.
    print(
        f'Model: {model_name}',
        f'Train r squared: {np.round(r2_score(y_train, y_pred_train), prec)}',
        f'Test r squared: {np.round(r2_score(y_test, y_pred_test), prec)}',
        '-----',
        f'Train MSE: {np.round(mean_squared_error(y_train, y_pred_train), prec)}',
        f'Test MSE: {np.round(mean_squared_error(y_test, y_pred_test), prec)}',
        '=====\n\n',
        sep='\n',
    )

# Which models are affected by data scaling?

In [None]:
prec = 3  # number of decimals shown for every metric

# Refit each model on the standardised features, then report R squared and
# MSE on both the training and the test split.
for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred_train = model.predict(X_train_scaled)
    y_pred_test = model.predict(X_test_scaled)

    # Compute all four rounded metrics first, print afterwards.
    r2_train = np.round(r2_score(y_train, y_pred_train), prec)
    r2_test = np.round(r2_score(y_test, y_pred_test), prec)
    mse_train = np.round(mean_squared_error(y_train, y_pred_train), prec)
    mse_test = np.round(mean_squared_error(y_test, y_pred_test), prec)

    print(f'Model: {model_name}')
    print(f'Train r squared: {r2_train}')
    print(f'Test r squared: {r2_test}')
    print('-----')

    print(f'Train MSE: {mse_train}')
    print(f'Test MSE: {mse_test}')

    print('=====\n\n')

### How to interpret R squared?

In [None]:
from matplotlib.patches import Ellipse

In [None]:
# Predicted vs. true Sales on the training split for a 10-neighbour KNN model.
model = KNeighborsRegressor(n_neighbors=10)
model.fit(X_train, y_train)
y_pred = model.predict(X_train)

fig, ax = plt.subplots()

ax.scatter(y_train, y_pred, s=10)
# Identity line (prediction == true value). The 4-25 span is hard-coded,
# presumably chosen to cover the range of Sales - TODO confirm against the data.
ax.plot([4, 25], [4, 25], color='r')
ax.set_xlabel('True Sales')
ax.set_ylabel('Predicted Sales')
ax.set_title(f'R squared: {np.round(r2_score(y_train, y_pred), 2)}')

# Below the identity line the prediction is smaller than the true value
# (underestimation); above it the prediction is larger (overestimation).
ax.text(15, 10, 'Sales underestimation area')
ax.text(5, 20, 'Sales overestimation area');