# 4. MODEL TRAINING AND TUNING
---

In [1]:
import pandas as pd
import numpy as np
# Notebook display settings: show up to 99 columns / 999 rows and 3 decimals.
pd.set_option("display.max_columns", 99)
pd.set_option("display.max_rows", 999)
# Use the fully qualified key: the bare 'precision' pattern also matches
# 'styler.format.precision' on newer pandas and raises OptionError there.
pd.set_option("display.precision", 3)

# Load the cars dataset; the 'price' column is used as the target further down.
cars = pd.read_csv('data/imports-85.cars3')
print(cars.shape)
cars.head()

(204, 31)


Unnamed: 0,engine_size,engine_type_dohcv,make_bmw,curb_weight,make_mercedes-benz,width,wheel_base,engine_location_rear,engine_location_front,make_jaguar,make_peugot,engine_type_rotor,horsepower,make_subaru,body_style_convertible,make_porsche,make_mitsubishi,make_plymouth,num_cylinders,make_toyota,make_renault,make_dodge,make_nissan,engine_type_ohcf,symboling_-2,peak_rpm,make_volkswagen,compression_ratio,make_isuzu,bore,price
0,0.26,0,0,0.411,0,0.317,0.058,0,1,0,0,0,0.263,0,1,0,0,0,0.2,0,0,0,0,0,0,0.347,0,0.125,0,0.664,16500.0
1,0.343,0,0,0.518,0,0.433,0.23,0,1,0,0,0,0.442,0,0,0,0,0,0.4,0,0,0,0,0,0,0.347,0,0.125,0,0.1,16500.0
2,0.181,0,0,0.329,0,0.492,0.385,0,1,0,0,0,0.225,0,0,0,0,0,0.2,0,0,0,0,0,0,0.551,0,0.188,0,0.464,13950.0
3,0.283,0,0,0.518,0,0.508,0.373,0,1,0,0,0,0.279,0,0,0,0,0,0.3,0,0,0,0,0,0,0.551,0,0.062,0,0.464,17450.0
4,0.283,0,0,0.395,0,0.5,0.385,0,1,0,0,0,0.258,0,0,0,0,0,0.3,0,0,0,0,0,0,0.551,0,0.094,0,0.464,15250.0


In [4]:
# Shuffle the rows once, with a fixed seed, so the row order is reproducible.
np.random.seed(42)
row_order = np.random.permutation(len(cars))
cars = cars.loc[row_order]

# Target is 'price'; the features are every remaining column.
# NOTE(review): the loaded frame carries an 'Unnamed: 0' column (0, 1, 2, ... in
# the preview) that looks like a saved row index. It stays in X_cars here as an
# unscaled feature -- confirm whether that is intended.
y_cars = cars['price'].copy()
X_cars = cars.drop(columns=['price'])
X_cars.shape, y_cars.shape

((204, 30), (204,))

## 1. K-Nearest Neighbor Regression

In [7]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for k-NN: odd k from 1 to 19, both weighting schemes,
# three neighbour-search back ends, and Minkowski power p=1 / p=2.
kpar = {'n_neighbors': np.arange(1, 21, 2),
        'weights': ['uniform', 'distance'],
        'algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'p': [1, 2]
       }
knn = KNeighborsRegressor()
# Pin the fold count instead of relying on GridSearchCV's default, which differs
# between scikit-learn releases; 5 matches the cross_val_score evaluation of the
# tuned model in the next cell.
kgrid = GridSearchCV(knn, kpar, cv=5, n_jobs=-1)
kgrid.fit(X_cars, y_cars)

# No `scoring` is passed, so best_score_ uses the estimator's default scorer.
print('Best Parameters:', kgrid.best_params_)
print('Best Score:', kgrid.best_score_)

Best Parameters: {'algorithm': 'ball_tree', 'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
Best Score: 0.9026686890801114


In [10]:
from sklearn.model_selection import cross_val_score

# Build the tuned model straight from the grid-search result instead of
# re-typing the winning values, so this cell cannot drift out of sync with
# whatever the search actually selected.
knn = KNeighborsRegressor(**kgrid.best_params_)

# 5-fold CV scored with negated MSE (scikit-learn scorers are "higher is
# better"); taking the absolute value and the root gives one RMSE per fold.
knn_mse = cross_val_score(knn, X_cars, y_cars, cv=5, n_jobs=-1,
                          scoring="neg_mean_squared_error")
knn_rmse = np.sqrt(np.absolute(knn_mse))
print('RMSE:', knn_rmse)
print('Avg_RMSE:', knn_rmse.mean())
print('Std_RMSE:', knn_rmse.std())

RMSE: [3704.30544571 2161.29986198 1889.57343682 2159.55492586 2122.96711104]
Avg_RMSE: 2407.540156281768
Std_RMSE: 656.2015716948404


The errors above are an improvement over what we got with the OLS linear model and the ElasticNet model. Hopefully we can keep improving on them.

## 2. Decision Tree Regression

In [12]:
from sklearn.tree import DecisionTreeRegressor

# Candidate settings for the tree: depth limit, minimum samples required to
# split a node, and how many features are considered at each split.
n_features = X_cars.shape[1]
tpar = {'max_depth': [2, 3, 4, 5, 6, 8, 10, 12],
        'min_samples_split': [2, 3, 4],
        # max_features may not exceed the number of columns in X_cars.
        'max_features': [m for m in (10, 15, 20, 25, 30) if m <= n_features]
       }
# With max_features below the column count the tree draws features at random,
# so fix random_state to make the search repeatable; the global NumPy seed is
# not a dependable substitute once fits run in parallel workers (n_jobs=-1).
tree = DecisionTreeRegressor(random_state=42)
# Pin the fold count rather than relying on the version-dependent default;
# 5 matches the cross_val_score evaluation of the tuned tree in the next cell.
tgrid = GridSearchCV(tree, tpar, cv=5, n_jobs=-1)
tgrid.fit(X_cars, y_cars)

# No `scoring` is passed, so best_score_ uses the estimator's default scorer.
print('Best Parameters:', tgrid.best_params_)
print('Best Score:', tgrid.best_score_)

Best Parameters: {'max_depth': 8, 'max_features': 15, 'min_samples_split': 3}
Best Score: 0.8672069439843376


In [13]:
# Build the tuned tree from the grid-search result instead of re-typing the
# winning values, and fix random_state: with max_features below the column
# count the tree is stochastic, so unseeded runs give different RMSEs.
tree = DecisionTreeRegressor(random_state=42, **tgrid.best_params_)

# 5-fold CV scored with negated MSE (scikit-learn scorers are "higher is
# better"); taking the absolute value and the root gives one RMSE per fold.
tree_mse = cross_val_score(tree, X_cars, y_cars, cv=5, n_jobs=-1,
                           scoring="neg_mean_squared_error")
tree_rmse = np.sqrt(np.absolute(tree_mse))
print('RMSE:', tree_rmse)
print('Avg_RMSE:', tree_rmse.mean())
print('Std_RMSE:', tree_rmse.std())

RMSE: [5712.26423039 1967.08775981 3361.62179902 2970.40856479 3393.38033651]
Avg_RMSE: 3480.952538104143
Std_RMSE: 1229.0730083712267
