# Entry 22 notebook - Continuous Targets - Implementation

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
import category_encoders as ce

from sklearn.pipeline import make_pipeline

### Custom functions

In [2]:
def split_data(df, target, train_size):
    y = df[[target]]
    X = df.drop(target, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_size, random_state=12)
    return X_train, X_test, y_train, y_test

In [3]:
df_raw = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data-original', header=None, sep='\t')
cols = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_yr', 'origin']
df = df_raw[0].str.split(expand=True)
df.columns = cols
df[df == 'NA'] = np.nan
df = df.astype('float32')
df.drop(df[df['mpg'].isna()].index, inplace=True)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_yr,origin
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0


## Standard pipeline with make_pipeline

In [4]:
X_train, X_test, y_train, y_test = split_data(df, 'mpg', 0.8)
pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), LinearRegression())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.8055297594035651

In [6]:
cross_validate(pipe, X_train, y_train, cv=10, scoring=['explained_variance', 'r2', 'max_error',
                                                      'neg_median_absolute_error', 'neg_mean_absolute_error',
                                                      'neg_mean_squared_error', 'neg_mean_squared_log_error',
                                                      'neg_root_mean_squared_error', 'neg_mean_poisson_deviance',
                                                      'neg_mean_gamma_deviance'])

{'fit_time': array([0.00582886, 0.00529385, 0.00452709, 0.00340986, 0.00476122,
        0.00394201, 0.003304  , 0.00356293, 0.00309896, 0.00314689]),
 'score_time': array([0.00745916, 0.0082891 , 0.01005173, 0.0075829 , 0.00711393,
        0.00576091, 0.00775385, 0.0056231 , 0.00575399, 0.00568509]),
 'test_explained_variance': array([0.67682898, 0.80524778, 0.62092972, 0.68306053, 0.94139659,
        0.77826154, 0.84657204, 0.88591731, 0.62975121, 0.7808243 ]),
 'test_r2': array([0.65232904, 0.75891477, 0.61082248, 0.56507696, 0.94096155,
        0.77456762, 0.76509546, 0.87235651, 0.55538031, 0.57407846]),
 'test_max_error': array([-11.5754776 ,  -5.6730423 , -11.1805687 , -10.35218048,
         -3.73846054,  -4.83219719,  -4.25672913,  -4.0054493 ,
         -6.67257881,  -7.6016655 ]),
 'test_neg_median_absolute_error': array([-3.02493382, -1.63546515, -2.33081245, -3.63427258, -1.48281193,
        -3.59795904, -2.82716656, -2.13691902, -2.70675659, -3.7892971 ]),
 'test_neg_mean_ab