## Import of libraries

In [1]:
!pip install sklearn-pandas



In [2]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn_pandas import DataFrameMapper
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

## Dataset

In [3]:
# Read cars.csv (expected in the working directory) into a DataFrame.
data = pd.read_csv('cars.csv')

In [4]:
# Keep only rows that have a year and whose year is not 2060
# (presumably a data-entry error -- confirm against the data source).
has_usable_year = data.year.notna() & (data.year != 2060)
data = data[has_usable_year]

## Custom imputer for tax

In [5]:
class Merger(BaseEstimator, TransformerMixin):
    """Fill gaps in one column with the values of another column.

    Stateless transformer: ``fit`` learns nothing, and ``transform`` returns
    a copy of the input in which missing entries of ``col_main`` are replaced
    by the entries of ``col_add`` from the same rows. Both columns stay in
    the returned frame.

    Parameters
    ----------
    col_main : str
        Name of the column whose missing values are filled.
    col_add : str
        Name of the column that supplies the replacement values.
    """

    def __init__(self, col_main, col_add):
        self.col_main = col_main
        self.col_add = col_add

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``col_main`` filled from ``col_add``.

        Raises ``KeyError`` if either column is absent from ``X``.
        """
        Xt = X.copy()

        # Assign the result back instead of calling fillna(inplace=True) on
        # Xt[col]: that form operates on a temporary Series and does not
        # update Xt when pandas Copy-on-Write is active.
        Xt[self.col_main] = Xt[self.col_main].fillna(Xt[self.col_add])

        return Xt

## Train-test split

In [6]:
# Features are every column except the target; the target is the price column.
X = data.drop(columns='price')
y = data['price']

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

## Defining mapper

In [8]:
numeric_features = ['year', 'mileage', 'tax', 'mpg', 'engineSize']
nominal_features = ['transmission', 'brand', 'model', 'fuelType']

# Numeric columns: each one is median-imputed on its own. They are listed first,
# so they are the leading entries of the mapper's feature list.
numeric_steps = [([feature], SimpleImputer(strategy='median')) for feature in numeric_features]

# Nominal columns: one-hot encoded. fuelType and transmission drop their first
# category; brand and model ignore categories unseen during fit.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` --
# confirm against the installed version before upgrading.
nominal_steps = [
    (['fuelType'], OneHotEncoder(sparse=False, drop='first')),
    (['transmission'], OneHotEncoder(sparse=False, drop='first')),
    (['brand'], OneHotEncoder(sparse=False, handle_unknown='ignore')),
    (['model'], OneHotEncoder(sparse=False, handle_unknown='ignore')),
]

mapper = DataFrameMapper(numeric_steps + nominal_steps, df_out=True)

# Searching for the best parameters with ParameterGrid

## Elastic Net

In [9]:
# Five evenly spaced values per hyperparameter: alpha in [0.1, 1], l1_ratio in [0, 1].
param_grid = {
    'alpha': np.linspace(0.1, 1, 5).tolist(),
    'l1_ratio': np.linspace(0, 1, 5).tolist(),
}
param_grid

{'alpha': [0.1, 0.325, 0.55, 0.775, 1.0],
 'l1_ratio': [0.0, 0.25, 0.5, 0.75, 1.0]}

In [10]:
results = []

# Fit one pipeline per (alpha, l1_ratio) combination and record the error
# on the training and the held-out split.
for params in ParameterGrid(param_grid):
    print(params)

    alpha, l1_ratio = params['alpha'], params['l1_ratio']

    pipe_reg = Pipeline(steps=[
        ('tax_merger', Merger('tax', 'tax(£)')),
        ('mapper', mapper),
        ('elasticnet', ElasticNet(alpha=alpha, l1_ratio=l1_ratio)),
    ])
    pipe_reg.fit(X_train, y_train)

    # squared=False asks mean_squared_error for the root of the MSE (RMSE).
    train_rmse = mean_squared_error(y_train, pipe_reg.predict(X_train), squared=False)
    test_rmse = mean_squared_error(y_test, pipe_reg.predict(X_test), squared=False)

    results.append({
        'alpha': alpha,
        'l1_ratio': l1_ratio,
        'train_score': train_rmse,
        'test_score': test_rmse,
    })

{'alpha': 0.1, 'l1_ratio': 0.0}
{'alpha': 0.1, 'l1_ratio': 0.25}
{'alpha': 0.1, 'l1_ratio': 0.5}
{'alpha': 0.1, 'l1_ratio': 0.75}
{'alpha': 0.1, 'l1_ratio': 1.0}
{'alpha': 0.325, 'l1_ratio': 0.0}
{'alpha': 0.325, 'l1_ratio': 0.25}
{'alpha': 0.325, 'l1_ratio': 0.5}
{'alpha': 0.325, 'l1_ratio': 0.75}
{'alpha': 0.325, 'l1_ratio': 1.0}
{'alpha': 0.55, 'l1_ratio': 0.0}
{'alpha': 0.55, 'l1_ratio': 0.25}
{'alpha': 0.55, 'l1_ratio': 0.5}
{'alpha': 0.55, 'l1_ratio': 0.75}
{'alpha': 0.55, 'l1_ratio': 1.0}
{'alpha': 0.775, 'l1_ratio': 0.0}
{'alpha': 0.775, 'l1_ratio': 0.25}
{'alpha': 0.775, 'l1_ratio': 0.5}
{'alpha': 0.775, 'l1_ratio': 0.75}
{'alpha': 0.775, 'l1_ratio': 1.0}
{'alpha': 1.0, 'l1_ratio': 0.0}
{'alpha': 1.0, 'l1_ratio': 0.25}
{'alpha': 1.0, 'l1_ratio': 0.5}
{'alpha': 1.0, 'l1_ratio': 0.75}
{'alpha': 1.0, 'l1_ratio': 1.0}


In [11]:
# One row per parameter combination, shown best (lowest test error) first
# with in-cell bars for both score columns.
res = pd.DataFrame(results)
res.sort_values(by='test_score').style.bar(subset=['train_score', 'test_score'], vmin=0)

Unnamed: 0,alpha,l1_ratio,train_score,test_score
4,0.1,1.0,3703.38724,3541.713179
9,0.325,1.0,3709.243804,3544.705675
14,0.55,1.0,3716.100199,3550.375602
19,0.775,1.0,3724.310682,3557.823544
24,1.0,1.0,3732.341945,3566.91189
3,0.1,0.75,4610.532016,4516.434222
2,0.1,0.5,4753.196112,4669.366973
1,0.1,0.25,4859.590887,4781.981161
8,0.325,0.75,4884.433227,4808.135764
0,0.1,0.0,4951.723395,4878.6912


<b>The five best models all have l1_ratio equal to 1 and lead by a large margin, so it seems reasonable to continue with a Lasso regressor.</b>

## Adding Polynomial Features to Lasso Regressor

In [12]:
# Polynomial degrees to try, combined with the same five alpha values as before.
param_grid = {
    'max_degree': [3, 4, 5],
    'alpha': np.linspace(0.1, 1, 5).tolist(),
}
param_grid

{'max_degree': [3, 4, 5], 'alpha': [0.1, 0.325, 0.55, 0.775, 1.0]}

In [13]:
results = []

# Fit one Lasso pipeline per (max_degree, alpha) combination and record the
# error on the training and the held-out split.
for params in ParameterGrid(param_grid):
    print(params)

    degree, alpha = params['max_degree'], params['alpha']

    # slice(0, 5) restricts the polynomial expansion to the first five mapper
    # outputs; everything else is passed through unchanged.
    poly_expander = ColumnTransformer(
        [("poly", PolynomialFeatures(degree=degree), slice(0, 5))],
        remainder='passthrough',
    )

    pipe_reg = Pipeline(steps=[
        ('tax_merger', Merger('tax', 'tax(£)')),
        ('mapper', mapper),
        ('poly', poly_expander),
        ('lassoreg', Lasso(alpha=alpha)),
    ])
    pipe_reg.fit(X_train, y_train)

    # squared=False asks mean_squared_error for the root of the MSE (RMSE).
    train_rmse = mean_squared_error(y_train, pipe_reg.predict(X_train), squared=False)
    test_rmse = mean_squared_error(y_test, pipe_reg.predict(X_test), squared=False)

    results.append({
        'max_degree': degree,
        'alpha': alpha,
        'train_score': train_rmse,
        'test_score': test_rmse,
    })

{'alpha': 0.1, 'max_degree': 3}
{'alpha': 0.1, 'max_degree': 4}
{'alpha': 0.1, 'max_degree': 5}
{'alpha': 0.325, 'max_degree': 3}
{'alpha': 0.325, 'max_degree': 4}
{'alpha': 0.325, 'max_degree': 5}
{'alpha': 0.55, 'max_degree': 3}
{'alpha': 0.55, 'max_degree': 4}
{'alpha': 0.55, 'max_degree': 5}
{'alpha': 0.775, 'max_degree': 3}
{'alpha': 0.775, 'max_degree': 4}
{'alpha': 0.775, 'max_degree': 5}
{'alpha': 1.0, 'max_degree': 3}
{'alpha': 1.0, 'max_degree': 4}
{'alpha': 1.0, 'max_degree': 5}


In [14]:
# One row per (max_degree, alpha) combination, shown best (lowest test error)
# first with in-cell bars for both score columns.
res = pd.DataFrame(results)
res.sort_values(by='test_score').style.bar(subset=['train_score', 'test_score'], vmin=0)

Unnamed: 0,max_degree,alpha,train_score,test_score
1,4,0.1,2884.33462,2750.862942
4,4,0.325,2889.859512,2757.235921
7,4,0.55,2896.57874,2766.753516
10,4,0.775,2905.043346,2777.785109
13,4,1.0,2914.506759,2789.236053
0,3,0.1,2953.91276,2813.973318
3,3,0.325,2960.262185,2819.883404
2,5,0.1,2837.136998,2825.789549
6,3,0.55,2968.126298,2830.552205
5,5,0.325,2842.009045,2835.63947


<b>As it turns out, the model works best with a polynomial degree of 4. We'll save it for later use.</b>

In [15]:
# Degree of the row with the lowest test error. idxmin returns the first such
# row, so a tie between several rows no longer raises (a boolean-mask
# selection followed by .item() fails when more than one row matches).
# int() keeps the value a plain Python int.
best_max_degree = int(res.loc[res.test_score.idxmin(), 'max_degree'])

Alternatively, uncomment and run this cell to skip the parameter grid search:

In [None]:
#best_max_degree = 4

### Best alpha for Lasso Regressor

<b>Here we'll try to find an even better alpha for the Lasso regressor in the range around 0.1, where the model showed its best result.</b>

In [16]:
# Finer alpha grid at and below the previous best value of 0.1, plus two values above it.
param_grid = {'alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.15, 0.2]}
param_grid

{'alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.15, 0.2]}

Plugging the best max_degree from the previous search into PolynomialFeatures:

In [17]:
# Polynomial expansion of the first five input columns with the selected
# degree; all remaining columns are passed through unchanged.
poly = ColumnTransformer(
    [("poly", PolynomialFeatures(degree=best_max_degree), slice(0, 5))],
    remainder='passthrough',
)

In [18]:
results = []

# Fit one Lasso pipeline per alpha (polynomial step fixed) and record the
# error on the training and the held-out split.
for params in ParameterGrid(param_grid):
    print(params)

    alpha = params['alpha']

    pipe_reg = Pipeline(steps=[
        ('tax_merger', Merger('tax', 'tax(£)')),
        ('mapper', mapper),
        ('poly', poly),
        ('lassoreg', Lasso(alpha=alpha)),
    ])
    pipe_reg.fit(X_train, y_train)

    # squared=False asks mean_squared_error for the root of the MSE (RMSE).
    train_rmse = mean_squared_error(y_train, pipe_reg.predict(X_train), squared=False)
    test_rmse = mean_squared_error(y_test, pipe_reg.predict(X_test), squared=False)

    results.append({
        'alpha': alpha,
        'train_score': train_rmse,
        'test_score': test_rmse,
    })

{'alpha': 0.001}
{'alpha': 0.005}
{'alpha': 0.01}
{'alpha': 0.05}
{'alpha': 0.1}
{'alpha': 0.15}
{'alpha': 0.2}


In [19]:
# One row per alpha, shown best (lowest test error) first with in-cell bars
# for both score columns.
res = pd.DataFrame(results)
res.sort_values(by='test_score').style.bar(subset=['train_score', 'test_score'], vmin=0)

Unnamed: 0,alpha,train_score,test_score
0,0.001,2878.032851,2740.530574
1,0.005,2878.201978,2740.942626
2,0.01,2878.435383,2741.496524
3,0.05,2881.022132,2746.033899
4,0.1,2884.33462,2750.862942
5,0.15,2885.608959,2751.755609
6,0.2,2886.925133,2753.111384


<b>As it turns out, the model works best with the Lasso regressor's alpha equal to 0.001, the smallest value tried. We'll save it for later use.</b>

In [20]:
# Alpha of the row with the lowest test error. idxmin returns the first such
# row, so a tie between several rows no longer raises (a boolean-mask
# selection followed by .item() fails when more than one row matches).
# float() keeps the value a plain Python float.
best_alpha = float(res.loc[res.test_score.idxmin(), 'alpha'])

Alternatively, uncomment and run this cell to skip the parameter grid search:

In [None]:
#best_alpha = 0.001

## Fitting the Model Pipeline

In [21]:
# Final model: same steps as in the search, with the selected alpha,
# fitted on the full dataset (X, y) rather than on the training split only.
final_steps = [
    ('tax_merger', Merger('tax', 'tax(£)')),
    ('mapper', mapper),
    ('poly', poly),
    ('lassoreg', Lasso(alpha=best_alpha)),
]
pipeline_regularized = Pipeline(steps=final_steps)

pipeline_regularized.fit(X, y)

Pipeline(steps=[('tax_merger', Merger(col_add='tax(£)', col_main='tax')),
                ('mapper',
                 DataFrameMapper(df_out=True, drop_cols=[],
                                 features=[(['year'],
                                            SimpleImputer(strategy='median')),
                                           (['mileage'],
                                            SimpleImputer(strategy='median')),
                                           (['tax'],
                                            SimpleImputer(strategy='median')),
                                           (['mpg'],
                                            SimpleImputer(strategy='median')),
                                           (['engineSize'],
                                            SimpleImputer(strategy='m...
                                                          sparse=False)),
                                           (['transmission'],
                                     

## Predicting for Test Sample and Converting Predictions to CSV

In [22]:
# Read the rows to predict on; the 'id' column is used to index the predictions.
test = pd.read_csv('cars_test.csv')

In [23]:
# Pair each id with its predicted price and use the id as the index.
pred = (
    test[['id']]
    .assign(price=pipeline_regularized.predict(test))
    .set_index('id')
)

In [24]:
# Write the predictions to a CSV file in the working directory.
pred.to_csv('try_poly4_lasso.csv')