In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import metrics

In [10]:
# Unencoded dataset, loaded next to the encoded one.
original_data = pd.read_csv('./vw.csv')
# Encoded dataset; the 'Unnamed: 0' column is discarded right after loading
# (presumably an index written out when the CSV was saved - TODO confirm).
data_encoded = pd.read_csv('./vw_encoded.csv').drop(columns=['Unnamed: 0'])
data_encoded

Unnamed: 0,mileage,tax,mpg,engineSize,age,model_ Amarok,model_ Arteon,model_ Golf,model_ Golf SV,model_ Passat,...,model_ Tiguan,model_ Touareg,model_ Touran,model_ Up,transmission_Automatic,transmission_Manual,transmission_Semi-Auto,fuelType_Diesel,fuelType_Petrol,price
0,-0.378448,0.549548,-0.391100,0.920693,-0.92354,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,25000.0
1,-0.856243,0.549548,-0.391100,0.920693,-0.92354,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,26883.0
2,-0.710378,0.549548,-0.308725,0.920693,-0.92354,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,20000.0
3,-0.842792,0.549548,-2.151869,0.920693,-0.92354,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,33492.0
4,-0.757125,0.630186,-1.400195,-0.177545,-0.92354,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,22900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14065,0.837725,2.404206,-1.770884,3.117169,0.19530,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,23495.0
14066,1.057392,2.404206,-1.863556,0.920693,0.75472,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,16500.0
14067,3.914138,2.404206,-1.863556,0.920693,1.31414,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,12995.0
14068,0.853938,2.404206,-1.863556,0.920693,1.31414,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,15995.0


### Splitting training and testing data

In [3]:
from sklearn.model_selection import train_test_split

In [20]:
# Feature matrix: every column of the encoded frame except the target.
X = data_encoded.drop('price', axis=1)
# Target vector: the 'price' column flattened to a 1-D NumPy array.
y = np.ravel(data_encoded['price'])

In [21]:
# No test_size is given, so train_test_split falls back to its default
# hold-out fraction; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=10,
)

In [22]:
# Sanity check: shapes of the four pieces produced by the split.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((10552, 24), (3518, 24), (10552,), (3518,))

### Helper functions for reporting regression, grid-search and cross-validation results

In [25]:
def regression_report(y_pred, y_test=y_test):
    """Print a table of regression metrics for a set of predictions.

    Parameters
    ----------
    y_pred : array-like
        Values predicted by the model.
    y_test : array-like, optional
        True target values that ``y_pred`` is scored against. The default is
        the notebook-level ``y_test`` as it existed when this cell was run
        (default arguments are evaluated once, at definition time), so re-run
        this cell after re-splitting the data.

    Returns
    -------
    None
        The metrics are printed, one per line.
    """
    row = '{:>15.4f}'
    # MSE feeds both the MSE and the RMSE rows, so compute it only once.
    mse = metrics.mean_squared_error(y_test, y_pred)

    # This row reports explained variance, which is not an accuracy measure.
    print(('Explained variance score: \t\t' + row).format(
        metrics.explained_variance_score(y_test, y_pred)))
    print(('R-squared score: \t\t\t' + row).format(
        metrics.r2_score(y_test, y_pred)))
    print(('Mean Absolute Error (MAE): \t\t' + row).format(
        metrics.mean_absolute_error(y_test, y_pred)))
    print(('Mean Squared Error (MSE): \t\t' + row).format(mse))
    print(('Root Mean Squared Error (RMSE): \t' + row).format(np.sqrt(mse)))

    # mean_squared_log_error rejects negative targets/predictions with a
    # ValueError; report the metric as unavailable instead of aborting the
    # whole report on its last line.
    try:
        rmsle = row.format(
            np.sqrt(metrics.mean_squared_log_error(y_test, y_pred)))
    except ValueError:
        rmsle = '{:>15}'.format('n/a')
    print('Root Mean Squared Log Error (RMSLE): \t' + rmsle)


In [None]:
def regression_report_compare(y_before_tuned, y_after_tuned, y_test=y_test):
    """Print two regression reports, before and after tuning, one after the other.

    Parameters
    ----------
    y_before_tuned : array-like
        Predictions from the model before hyperparameter tuning.
    y_after_tuned : array-like
        Predictions from the model after hyperparameter tuning.
    y_test : array-like, optional
        True target values handed to ``regression_report`` for both reports.
        Defaults to the notebook-level ``y_test`` bound when this cell ran.
    """
    sections = (
        ('Before tuning hyperparameter: \n', y_before_tuned),
        ('After tuning hyper parameter: \n', y_after_tuned),
    )
    for position, (heading, predictions) in enumerate(sections):
        # A rule separates the two reports; nothing precedes the first one.
        if position:
            print('-------------------------------------------------------')
        print(heading)
        regression_report(predictions, y_test)

In [None]:
def regression_report_compare_model(model_before_tuned, model_after_tuned, X=X_test, Y=y_test):
    """Predict with both models on the same inputs and print the comparison.

    Parameters
    ----------
    model_before_tuned, model_after_tuned : objects with a ``predict`` method
        The untuned and the tuned model, in that order.
    X : optional
        Inputs passed to both ``predict`` calls. Defaults to the
        notebook-level ``X_test`` bound when this cell ran.
    Y : optional
        True target values the predictions are scored against. Defaults to
        the notebook-level ``y_test`` bound when this cell ran.
    """
    baseline_predictions = model_before_tuned.predict(X)
    tuned_predictions = model_after_tuned.predict(X)
    regression_report_compare(baseline_predictions, tuned_predictions, Y)

In [26]:
def gs_report(gs):
    """Print the best score, parameters and estimator found by a search.

    Parameters
    ----------
    gs : object
        Anything exposing ``best_score_``, ``best_params_`` and
        ``best_estimator_`` (presumably a fitted scikit-learn search object
        such as GridSearchCV - confirm against the callers).
    """
    layout = (
        ('Best score: \t\t\t{:>15.4f}', 'best_score_'),
        ('Best parameters: \t\t{}', 'best_params_'),
        ('Best estimator: \t\t{}', 'best_estimator_'),
    )
    for template, attribute in layout:
        # Attributes are read one at a time, right before their line is printed.
        print(template.format(getattr(gs, attribute)))

In [None]:
def cv_score_report(cv_score):
    """Print cross-validation scores with their mean and standard deviation.

    Parameters
    ----------
    cv_score : array-like of numbers
        The per-fold scores; printed as given, then summarised with
        ``np.mean`` and ``np.std``.
    """
    print(f'Cross validation score: {cv_score}')
    average = np.mean(cv_score)
    print(f'Mean cross validation score: {average}')
    spread = np.std(cv_score)
    print(f'Standard deviation cross validation score: {spread}')

In [None]:
def regression_results(y_true, y_pred):
    """Print regression metrics for predictions, rounded to 4 decimals.

    Parameters
    ----------
    y_true : array-like
        True target values.
    y_pred : array-like
        Values predicted by the model.

    Returns
    -------
    None
        The metrics are printed, one per line.
    """
    # Regression metrics
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    # mean_squared_log_error rejects negative targets/predictions with a
    # ValueError; report it as unavailable so the other metrics still print.
    try:
        msle = round(metrics.mean_squared_log_error(y_true, y_pred), 4)
    except ValueError:
        msle = 'n/a'

    print('explained_variance: ', round(explained_variance, 4))
    print('mean_squared_log_error: ', msle)
    print('r2: ', round(r2, 4))
    print('MAE: ', round(mae, 4))
    print('MSE: ', round(mse, 4))
    print('RMSE: ', round(np.sqrt(mse), 4))