# Autotrader_TransformedTargetRegressor
---

In [1]:
# import packages (pd, viz)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from joblib import dump, load

# import myfunctions.py (functions used in multiple notebooks)
import myfunctions

# import packages (preprocessing)
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# import packages (models)
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor

plt.style.use('ggplot')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Read in file

In [44]:
# Load the master dataset; the first CSV column is used as the row index
df = pd.read_csv('df_master_2.csv', index_col=0)
print(df.shape)

# Features are every column except the target column, 'price'
X = df.drop(columns='price')
y = df['price'].copy()

# Hold out 20% of the rows for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

(14996, 13)


((11996, 12), (3000, 12), (11996,), (3000,))

# Define Functions
1. pipe_construct: construct a pipeline according to input features and choice of model
2. plot_coef_pipe: function to plot resulting coefficients from fitted pipe
3. plot_feat_imp: function to plot resulting feature importances from fitted pipe

In [45]:
# Input feature groups: continuous columns get standardised, categorical columns get one-hot encoded
features_cont = [
    'year',
    'mileage',
    'engine_size',
    'horsepower',
    'bhp_per_litre',
]
features_cat = [
    'body',
    'transmission',
    'fuel',
    'ulez',
    'seller_type',
    'make',
    'model',
]

def pipe_construct(features_cont=features_cont, features_cat=features_cat, model=None):
    '''
    Build an unfitted preprocessing + model pipeline.

    Input: features_cont (columns to standardise), features_cat (columns to one-hot
           encode), model (estimator used as the final step; None creates a fresh
           TransformedTargetRegressor()).
    Returns: pipeline with the steps 'transformer' (ColumnTransformer) and 'model'.
    '''
    # An estimator built in the signature is created once and then shared by every
    # pipeline that omits `model`, so fitting one pipeline would silently refit the
    # others. Build the default per call instead.
    if model is None:
        model = TransformedTargetRegressor()

    # scikit-learn 1.2 renamed `sparse` to `sparse_output`; releases that predate the
    # new keyword reject it with TypeError, in which case the old spelling is used.
    try:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

    t = [('cont', StandardScaler(), features_cont),
         ('cat', encoder, features_cat)]

    # columns that belong to neither group are discarded
    transformer = ColumnTransformer(transformers=t, remainder='drop')

    pipe = Pipeline(steps=[('transformer', transformer), ('model', model)])
    return pipe


def plot_coef_pipe(pipe, X_train, X_test, y_train, y_test):
    '''
    Plot the ten most negative and ten most positive coefficients of a fitted pipe.

    Input: pipe (fitted pipeline with a 'transformer' step holding 'cont' and 'cat'
           transformers and a 'model' step exposing coef_, either directly or via a
           fitted TransformedTargetRegressor), X_train, X_test, y_train, y_test
           (not used in the body; same positional arguments as plot_feat_imp).
    Returns: None (prints a heading and shows the figure).
    '''
    transformer = pipe.named_steps.transformer

    # Take the column lists from the fitted transformer itself rather than from the
    # notebook globals, so the labels always match what this pipe was trained on.
    fitted_cols = {name: cols for name, _, cols in transformer.transformers_}
    encoder = transformer.named_transformers_.cat

    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases; use whichever this installation provides.
    if hasattr(encoder, 'get_feature_names_out'):
        cat_names = encoder.get_feature_names_out(fitted_cols['cat'])
    else:
        cat_names = encoder.get_feature_names(fitted_cols['cat'])
    col_names = list(fitted_cols['cont']) + list(cat_names)

    # A fitted TransformedTargetRegressor keeps the underlying estimator (and its
    # coef_) on `regressor_`; plain linear models expose coef_ directly.
    model = pipe.named_steps.model
    model = getattr(model, 'regressor_', model)

    # df for model coefficients
    df_model_coef = pd.DataFrame(model.coef_, index=col_names, columns=['coef'])
    df_model_coef['coef_abs'] = df_model_coef['coef'].abs()

    # plot the smallest and largest coefficients side by side
    fig, ax = plt.subplots(1, 2, figsize=(12, 6))
    coef_sorted = df_model_coef.coef.sort_values()
    coef_sorted[:10].plot(kind='barh', ax=ax[0])
    coef_sorted[-10:].plot(kind='barh', ax=ax[1])

    ax[0].set_title('Feature Importance (negative impact)')
    ax[1].set_title('Feature Importance (positive impact)')

    # lay out only after the bars and tick labels exist, so the padding accounts for them
    fig.tight_layout(w_pad=10)

    print('Coefficients:')
    plt.show()
    

def plot_feat_imp(pipe, X_train, X_test, y_train, y_test, n_features=10):
    '''
    Plot feature importances from model.

    Input: pipe (fitted pipeline with a 'transformer' step holding 'cont' and 'cat'
           transformers and a 'model' step exposing feature_importances_, either
           directly or via a fitted TransformedTargetRegressor), X_train, X_test,
           y_train, y_test (not used in the body), n_features (number of features
           to show).
    Returns: None (shows a horizontal bar chart of the largest importances).
    '''
    transformer = pipe.named_steps.transformer

    # Take the column lists from the fitted transformer itself rather than from the
    # notebook globals, so the labels always match what this pipe was trained on.
    fitted_cols = {name: cols for name, _, cols in transformer.transformers_}
    encoder = transformer.named_transformers_.cat

    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases; use whichever this installation provides.
    if hasattr(encoder, 'get_feature_names_out'):
        cat_names = encoder.get_feature_names_out(fitted_cols['cat'])
    else:
        cat_names = encoder.get_feature_names(fitted_cols['cat'])
    col_names = list(fitted_cols['cont']) + list(cat_names)

    # A fitted TransformedTargetRegressor keeps the underlying estimator (and its
    # feature_importances_) on `regressor_`; other models expose it directly.
    model = pipe.named_steps.model
    model = getattr(model, 'regressor_', model)

    # dataframe for feature importances
    df_feat_imp = pd.DataFrame(model.feature_importances_, index=col_names, columns=['feat_imp'])

    # plot of feature importances (ascending sort, so the largest bars end up on top)
    fig, ax = plt.subplots(1, 1, figsize=(12, 12))
    df_feat_imp.sort_values(by='feat_imp')[-n_features:].plot(kind='barh', ax=ax)
    ax.set_title('Feature Importances')
    plt.show()

# 1. Transformed Target Regressor (default)

In [46]:
# Baseline run: TransformedTargetRegressor with its default settings, trained on every feature
features_cont = ['year', 'mileage', 'engine_size', 'horsepower', 'bhp_per_litre']
features_cat = ['body', 'transmission', 'fuel', 'ulez', 'seller_type', 'make', 'model']
print('cont features: ', features_cont)
print('cat features:  ', features_cat)
print()

# build the pipeline and fit it on the training split
pipe = pipe_construct(
    features_cont=features_cont,
    features_cat=features_cat,
    model=TransformedTargetRegressor(),
).fit(X_train, y_train)
print(pipe.named_steps.model)
print()

# report train / test / CV scores
# NOTE(review): in the recorded run the train R2 looks fine but the test R2 and RMSE
# are astronomically bad - presumably unregularised least squares on the full one-hot
# design extrapolating wildly on a few test rows; confirm before trusting this model.
myfunctions.print_scores(pipe, X_train, X_test, y_train, y_test)
print()

# # plots
# myfunctions.plot_predictions(pipe, X_train, X_test, y_train, y_test)
# myfunctions.plot_residuals(pipe, X_train, X_test, y_train, y_test)
# plot_feat_imp(pipe, X_train, X_test, y_train, y_test)

cont features:  ['year', 'mileage', 'engine_size', 'horsepower', 'bhp_per_litre']
cat features:   ['body', 'transmission', 'fuel', 'ulez', 'seller_type', 'make', 'model']

TransformedTargetRegressor()

R2 Train, Test:		0.8621390243921754 	 -1.8010667305570317e+17
R2 Train (CV Mean):	-4.853569546396363e+17

RMSE Train, Test:	11061.525609722943 	 12230361789124.908
MAE  Train, Test:	3862.555570606869 	 288453415189.0343



In [47]:
# Log-target run: linear regression fitted on log(price), predictions mapped back with exp; all features
features_cont = ['year', 'mileage', 'engine_size', 'horsepower', 'bhp_per_litre']
features_cat = ['body', 'transmission', 'fuel', 'ulez', 'seller_type', 'make', 'model']
print('cont features: ', features_cont)
print('cat features:  ', features_cat)
print()

# build the pipeline and fit it on the training split
pipe = pipe_construct(
    features_cont=features_cont,
    features_cat=features_cat,
    model=TransformedTargetRegressor(regressor=LinearRegression(), func=np.log, inverse_func=np.exp),
).fit(X_train, y_train)
print(pipe.named_steps.model)
print()

# # print scores
# myfunctions.print_scores(pipe, X_train, X_test, y_train, y_test)
# print()

# # plots
# myfunctions.plot_predictions(pipe, X_train, X_test, y_train, y_test)
# myfunctions.plot_residuals(pipe, X_train, X_test, y_train, y_test)
# plot_feat_imp(pipe, X_train, X_test, y_train, y_test)

cont features:  ['year', 'mileage', 'engine_size', 'horsepower', 'bhp_per_litre']
cat features:   ['body', 'transmission', 'fuel', 'ulez', 'seller_type', 'make', 'model']

TransformedTargetRegressor(func=<ufunc 'log'>, inverse_func=<ufunc 'exp'>,
                           regressor=LinearRegression())



In [48]:
# print scores
# NOTE(review): the recorded run of this cell ended in
# "ValueError: Input contains NaN, infinity ..." - presumably np.exp overflowing to inf
# on a few extreme log-scale predictions; confirm by inspecting the test predictions.
myfunctions.print_scores(pipe, X_train, X_test, y_train, y_test)
print()


  return func(X, **(kw_args if kw_args else {}))
  return func(X, **(kw_args if kw_args else {}))


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# Score of the fitted pipe on the held-out split (this cell has no recorded execution)
pipe.score(X_test, y_test)

In [17]:
# Inspect the pipeline's steps ('transformer' and 'model') and their settings
pipe.named_steps

{'transformer': ColumnTransformer(transformers=[('cont', StandardScaler(),
                                  ['year', 'mileage', 'engine_size',
                                   'horsepower', 'bhp_per_litre']),
                                 ('cat',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  ['body', 'transmission', 'fuel', 'ulez',
                                   'seller_type', 'make', 'model'])]),
 'model': TransformedTargetRegressor(func=<ufunc 'log'>, inverse_func=<ufunc 'exp'>)}

In [56]:
# Run the pipe's ColumnTransformer on its own: refit it on the training rows,
# then apply the fitted transformer to the test rows
X_train_trans = pipe.named_steps['transformer'].fit_transform(X_train)
X_test_trans = pipe.named_steps['transformer'].transform(X_test)

In [57]:
# Stand-alone log-target model (no pipeline); `regressor` is left at its default
ttr = TransformedTargetRegressor(func=np.log, inverse_func=np.exp)

In [59]:
# TransformedTargetRegressor applies func (np.log) to y inside fit(), so it must be
# given the raw prices. Passing np.log(y_train) would train on log(log(price)) and
# make predict()/score() work on the log scale instead of the price scale.
ttr.fit(X_train_trans, y_train)

TransformedTargetRegressor(func=<ufunc 'log'>, inverse_func=<ufunc 'exp'>)

In [28]:
# R2 on the training rows, scored against the raw (untransformed) prices
print('R2 train: ', ttr.score(X_train_trans, y_train))
# print('R2 test: ', ttr.score(X_test_trans, y_test))

R2 train:  0.9104066329672078


In [34]:
# Range check on the transformed test features (looking for extreme inputs)
X_test_trans.min(), X_test_trans.max()

(-3.7611781964167434, 5.517871906272311)

In [36]:
# Range of the test-set target; np.log needs strictly positive values
y_test.min(), y_test.max()

(490, 464995)

In [37]:
# Range of the training-set target; np.log needs strictly positive values
y_train.min(), y_train.max()

(250, 623940)

In [38]:
# R2 on the held-out rows.
# NOTE(review): the recorded run raised "ValueError: Input contains NaN, infinity ..."
# here although the inputs and targets checked above are finite - presumably the
# predictions overflow in np.exp; confirm by inspecting ttr.regressor_.predict(X_test_trans).
print('R2 test: ', ttr.score(X_test_trans, y_test))

  return func(X, **(kw_args if kw_args else {}))


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [16]:
# # dump pipe and model to disk
# dump(pipe, filename='autotrader_etr_pipe.joblib')

['autotrader_etr_pipe.joblib']

# Manually split and dummify...

In [64]:
# Display the raw frame before rebuilding the features by hand
# NOTE(review): this renders the whole frame; df.head() would be enough for a sanity check
df

Unnamed: 0,year,body,mileage,engine_size,horsepower,transmission,fuel,ulez,seller_type,price,make,model,bhp_per_litre
0,2008.0,estate,115000.0,2.0,168.0,manual,diesel,0,trade_seller,3995,audi,a4,84.000000
1,2011.0,hatchback,112988.0,1.6,104.0,automatic,diesel,0,trade_seller,4799,audi,a3,65.000000
2,2011.0,suv,96000.0,2.0,168.0,manual,petrol,1,trade_seller,8740,audi,q3,84.000000
3,2015.0,saloon,97837.0,2.0,190.0,automatic,diesel,1,trade_seller,10900,audi,a6,95.000000
4,2009.0,coupe,67345.0,1.8,168.0,manual,petrol,1,trade_seller,6491,audi,a5,93.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16194,2017.0,suv,72750.0,2.0,150.0,automatic,diesel,1,trade_seller,15799,volkswagen,tiguan,75.000000
16195,2017.0,estate,18979.0,2.0,150.0,automatic,diesel,1,trade_seller,15300,volkswagen,passat,75.000000
16196,2016.0,hatchback,53494.0,1.6,108.0,manual,diesel,1,trade_seller,9490,volkswagen,golf,67.500000
16197,2017.0,hatchback,24960.0,1.2,89.0,manual,petrol,1,trade_seller,8800,volkswagen,polo,74.166667


In [65]:
# Set features and target
X = df.copy()
y = X.pop('price')

In [66]:
# Categorical columns that are about to be dummified
features_cat

['body', 'transmission', 'fuel', 'ulez', 'seller_type', 'make', 'model']

In [67]:
# dummify categorical variables
X = pd.get_dummies(data=X, columns=features_cat, drop_first=True)

print(X.shape)
X.head()

(14996, 630)


Unnamed: 0,year,mileage,engine_size,horsepower,bhp_per_litre,body_combi van,body_convertible,body_coupe,body_estate,body_hatchback,...,model_xsara,model_xv,model_yaris,model_yeti,model_ypsilon,model_z4,model_zafira,model_zr,model_zs,model_zt
0,2008.0,115000.0,2.0,168.0,84.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2011.0,112988.0,1.6,104.0,65.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2011.0,96000.0,2.0,168.0,84.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2015.0,97837.0,2.0,190.0,95.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2009.0,67345.0,1.8,168.0,93.333333,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
# Same 80/20 split and seed as the first split, now on the dummified feature matrix
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((11996, 630), (3000, 630), (11996,), (3000,))

In [72]:
# Every column that is not a continuous feature is one of the dummy columns
features_dum = list(X_train.columns.drop(features_cont))
print(len(features_dum))
print(features_dum[:5])

625
['body_combi van', 'body_convertible', 'body_coupe', 'body_estate', 'body_hatchback']


In [73]:
# separate features_cont and features_cat for train and test (to standardise features_cont only)
X_train_cont = X_train[features_cont]
X_train_cat = X_train[features_dum]
X_test_cont = X_test[features_cont]
X_test_cat = X_test[features_dum]

# StandardScaler()
scaler = StandardScaler()
X_train_cont = pd.DataFrame(scaler.fit_transform(X_train_cont), columns=X_train_cont.columns, index=X_train_cont.index)
X_test_cont = pd.DataFrame(scaler.transform(X_test_cont), columns=X_test_cont.columns, index=X_test_cont.index)

# concatenate cont (now standardised) and cat (left as is)
X_train_master = pd.concat([X_train_cont, X_train_cat], axis=1)
X_test_master = pd.concat([X_test_cont, X_test_cat], axis=1)

In [76]:
# Rebind X_train / X_test to the scaled frames.
# NOTE: this overwrites the unscaled split, so the scaling cell must not be re-run after this one.
X_train = X_train_master.copy()
X_test = X_test_master.copy()

In [77]:
# Fresh log-target model for the manually prepared features; `regressor` is left at its default
ttr = TransformedTargetRegressor(func=np.log, inverse_func=np.exp)

In [78]:
# Fit on the raw prices; the log transform is applied to y by the regressor itself
ttr.fit(X_train, y_train)

TransformedTargetRegressor(func=<ufunc 'log'>, inverse_func=<ufunc 'exp'>)

In [79]:
# R2 on the training rows, scored against the raw prices
ttr.score(X_train, y_train)

0.910407148505971

In [80]:
# R2 on the held-out rows.
# NOTE(review): the recorded run raised the same NaN/infinity ValueError as the pipeline
# version, so the failure does not come from the pipeline's encoder settings alone -
# presumably extreme log-scale predictions overflowing in np.exp; confirm.
ttr.score(X_test, y_test)

  return func(X, **(kw_args if kw_args else {}))


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').