In [8]:
# NOTE: the original import order is kept on purpose. `from plotnine import *` is a wildcard
# import, so moving other imports across it could change which names it shadows.
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import f1_score, recall_score, precision_score, precision_recall_curve
import matplotlib.pyplot as plt
from plotnine import *
import os
import datetime
import pickle
import json
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# HTML is imported to pretty print pandas frames so they can be copied elsewhere (e.g. ppt slides).
from IPython.core.display import HTML
# Render floats with four decimals in every pandas display.
pd.set_option('display.float_format', '{:.4f}'.format)

# Load the cleaned data; index_col=False stops pandas from using the first column as the index.
df = pd.read_csv('car_prices_clean.csv', index_col=False)
# 1-based row identifier; the train/validation/test splitter uses it to tell the splits apart.
df['id'] = range(1, len(df) + 1)



## 2. Feature engineering

In [9]:
# constants
# Flags for the SVC, sklearn-SVC and neural-network models. Nothing in the visible part of the
# notebook reads them; presumably True means "load the stored hypertuned models instead of
# re-running the search" — TODO confirm against the cells that consume them.
USE_STORED_SVC_HYPERTUNED_MODELS = True
USE_STORED_SVC_SKLEARNED_HYPERTUNED_MODELS = True
USE_STORED_NN_HYPERTUNED_MODELS = True

In [10]:
# Columns removed before modelling.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell safe to
# re-run: the original in-place drop raised KeyError once the columns were already gone.
COLUMNS_TO_DROP = ['Unnamed: 0', 'model', 'trim', 'body', 'vin', 'seller', 'saledate']
df = df.drop(columns=COLUMNS_TO_DROP, errors='ignore')
df.head()

Unnamed: 0,year,make,transmission,state,condition,odometer,color,interior,mmr,sellingprice,id
0,2015,Kia,automatic,ca,5.0,16639.0,white,black,20500.0,21500.0,1
1,2015,Kia,automatic,ca,5.0,9393.0,white,beige,20800.0,21500.0,2
2,2014,BMW,automatic,ca,4.5,1331.0,gray,black,31900.0,30000.0,3
3,2015,Volvo,automatic,ca,4.1,14282.0,white,black,27500.0,27750.0,4
4,2014,BMW,automatic,ca,4.3,2641.0,gray,black,66000.0,67000.0,5


Split the data into training, validation, and test sets

In [11]:
# Binary transmission code read by TrainTestSplitter: 0 = automatic, 1 = manual.
# An explicit mapping is used instead of pd.factorize, whose codes follow the order in which the
# values first appear and would silently flip if the row order changed. Any other or missing
# value becomes -1 (the code pd.factorize uses for missing values).
df['transmission_bin'] = (
    df['transmission']
    .map({'automatic': 0, 'manual': 1})
    .fillna(-1)
    .astype('int64')
)

class TrainTestSplitter(object):
    '''Split a DataFrame into train, validation, and test sets and summarise them.

    Parameters
    ----------
    train_frac : float
        Fraction of all rows sampled for the combined train + validation pool;
        the remaining rows form the test set.
    validation_frac : float
        Fraction of that pool which is carved out as the validation set.
    seed : int
        Random state used for BOTH sampling steps, so the full split is reproducible.

    After ``split_train_test`` the instance exposes ``train_set``, ``validation_set``,
    ``test_set`` (all with a fresh 0..n-1 index) and ``split_statistics``.
    '''
    def __init__(self, train_frac=0.8, validation_frac=0.2, seed=1234):
        self.train_frac = train_frac
        self.validation_frac = validation_frac
        self.seed = seed

    @staticmethod
    def _percentage(count, total):
        '''Return count as a percentage of total, or 0.0 when total is zero.'''
        return count / total * 100 if total else 0.0

    def calculate_statistics(self):
        '''Count manual/automatic rows per split and their share of the whole data.

        Stores a dict keyed by split name in ``self.split_statistics``. Each entry holds
        ``N_manual`` / ``N_automatic`` (rows with ``transmission_bin`` 1 / 0) and
        ``percentage_total_manual`` / ``percentage_total_automatic`` (that count as a
        percentage of all such rows in the data passed to ``split_train_test``).
        '''
        statistics = {}
        for split_name in ['train_set', 'test_set', 'validation_set']:
            counts = getattr(self, split_name)['transmission_bin'].value_counts()
            # .get(..., 0) keeps a split that happens to contain no rows of a class from
            # raising; the previous positional lookup failed with IndexError in that case.
            n_manual = int(counts.get(1, 0))
            n_automatic = int(counts.get(0, 0))
            statistics[split_name] = {
                'N_manual': n_manual,
                'percentage_total_manual': self._percentage(n_manual, self.total_n_defaults),
                'N_automatic': n_automatic,
                'percentage_total_automatic': self._percentage(n_automatic, self.total_n_not_defaults),
            }
        self.split_statistics = statistics

    def split_train_test(self, df):
        '''Create the train, validation, and test sets from ``df`` and compute statistics.

        ``df`` must contain an ``id`` column that uniquely identifies each row (it is used
        to exclude already-sampled rows) and a ``transmission_bin`` column.
        '''
        print("Generating the train/validation/test splits...")
        # The attribute names say "defaults", but they hold the number of rows with
        # transmission_bin == 1 (manual) and == 0 (automatic) respectively.
        self.total_n_defaults = df.loc[lambda x: x.transmission_bin == 1].shape[0]
        self.total_n_not_defaults = df.loc[lambda x: x.transmission_bin == 0].shape[0]
        self.train_set = df.sample(frac=self.train_frac, random_state=self.seed)
        self.test_set = df.loc[lambda x: ~x.id.isin(self.train_set.id)].reset_index(drop=True)
        # random_state was missing here, which made the train/validation split differ on
        # every run even though a seed was configured.
        self.validation_set = (
            self.train_set
            .sample(frac=self.validation_frac, random_state=self.seed)
            .reset_index(drop=True)
        )
        self.train_set = self.train_set.loc[lambda x: ~x.id.isin(self.validation_set.id)].reset_index(drop=True)
        print("calculating the statistics...")
        self.calculate_statistics()
        print("split completed")

# create a fitting_splits object that will hold the train, validation, and test data
fitting_splits = TrainTestSplitter()  # default arguments: train_frac=0.8, validation_frac=0.2, seed=1234
fitting_splits.split_train_test(df)

# Both bare expressions are displayed because ast_node_interactivity is set to "all" above.
fitting_splits.test_set.shape
fitting_splits.split_statistics

Generating the train/validation/test splits...
calculating the statistics...
split completed


(106737, 12)

{'train_set': {'N_manual': 10483,
  'percentage_total_manual': 63.81566932489194,
  'N_automatic': 331076,
  'percentage_total_automatic': 64.00584620083943},
 'test_set': {'N_manual': 3350,
  'percentage_total_manual': 20.39325500700067,
  'N_automatic': 103387,
  'percentage_total_automatic': 19.987472426772662},
 'validation_set': {'N_manual': 2594,
  'percentage_total_manual': 15.791075668107386,
  'N_automatic': 82796,
  'percentage_total_automatic': 16.006681372387916}}

In [12]:
def dummify(df, one_hot_encoder):
    """One-hot encode the categorical columns 'transmission', 'color' and 'interior'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three columns to encode.
    one_hot_encoder : fitted encoder or None
        When None, a new OneHotEncoder is created and fitted on ``df``; otherwise the
        given encoder is only applied (use this for validation/test data).

    Returns
    -------
    (DataFrame, encoder)
        ``df`` without the three original columns and with the encoded columns appended,
        plus the encoder that produced them.
    """
    vars_to_encode = ['transmission', 'color', 'interior']
    df_to_encode = df[vars_to_encode]
    if one_hot_encoder is None:
        # The encoder is fitted on one split and reused on others; 'ignore' encodes a
        # category unseen during fitting as all zeros instead of raising at transform time.
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
        encoded = one_hot_encoder.fit_transform(df_to_encode)
    else:
        encoded = one_hot_encoder.transform(df_to_encode)
    # Reuse df's index: with a default RangeIndex the column-wise concat below would align on
    # labels and produce extra rows filled with NaN whenever df is not indexed 0..n-1.
    df_encoded = pd.DataFrame(
        encoded.toarray(),
        columns=one_hot_encoder.get_feature_names_out(),
        index=df.index,
    )
    # add the encoded columns and drop the original columns
    df = pd.concat([df.drop(columns=vars_to_encode), df_encoded], axis=1)
    return df, one_hot_encoder

def scale(df, standard_scaler, cols_to_scale):
    """Standardise ``cols_to_scale`` of ``df`` and return the frame with the scaler used.

    A falsy ``standard_scaler`` means "fit a fresh StandardScaler on these columns";
    otherwise the supplied scaler is applied without refitting. The scaled values are
    written back into ``df`` itself, and that same frame is returned.
    """
    if standard_scaler:
        scaled_values = standard_scaler.transform(df[cols_to_scale])
    else:
        standard_scaler = StandardScaler()
        scaled_values = standard_scaler.fit_transform(df[cols_to_scale])
    df[cols_to_scale] = scaled_values
    return df, standard_scaler

def prepare_data(df, one_hot_encoder=None, standard_scaler=None, cols_to_scale=None):
    """One-hot encode the categorical columns and standardise the numeric ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw split; it is not modified (a re-indexed copy is processed).
    one_hot_encoder, standard_scaler : fitted transformers or None
        Pass None to fit new transformers on ``df`` (training data); pass the fitted
        ones to apply the same transformation to validation/test data.
    cols_to_scale : list of str or None
        Columns to standardise. When None, the columns the given scaler was fitted on
        are used if available, otherwise every numeric column. One-hot columns are
        never scaled.

    Returns
    -------
    (DataFrame, encoder, scaler)
    """
    df = df.reset_index(drop=True)
    df, one_hot_encoder = dummify(df, one_hot_encoder)

    if cols_to_scale is None:
        # A scaler fitted on a DataFrame records its columns; reusing them guarantees the
        # same columns in the same order at transform time.
        fitted_columns = getattr(standard_scaler, "feature_names_in_", None)
        if fitted_columns is not None:
            cols_to_scale = list(fitted_columns)
        else:
            cols_to_scale = df.select_dtypes(include=[np.number]).columns.tolist()

    # Exclude the one-hot columns while PRESERVING order (and dropping duplicates). The earlier
    # set difference produced an order that can change between interpreter sessions, which a
    # fitted StandardScaler rejects because it validates feature names in order.
    one_hot_columns = set(one_hot_encoder.get_feature_names_out())
    cols_to_scale = [
        col for col in dict.fromkeys(cols_to_scale) if col not in one_hot_columns
    ]

    df, standard_scaler = scale(df, standard_scaler, cols_to_scale)
    return df, one_hot_encoder, standard_scaler


In [13]:
# now we prepare all the data we use below
# Columns that are removed from every feature matrix.
NON_FEATURE_COLUMNS = ["id", 'transmission_bin', 'sellingprice', 'make', 'state']

# Training split: called without transformers, so the encoder and the scaler are fitted here.
X_train, one_hot_encoder, standard_scaler = prepare_data(fitting_splits.train_set)
X_train = X_train.drop(columns=NON_FEATURE_COLUMNS)
# Targets are read from the original (unprepared) split frames.
y_train = fitting_splits.train_set["sellingprice"]

# Validation and test splits reuse the transformers fitted on the training split.
X_validation = prepare_data(
    fitting_splits.validation_set, one_hot_encoder, standard_scaler
)[0].drop(columns=NON_FEATURE_COLUMNS)
y_validation = fitting_splits.validation_set["sellingprice"]

X_test = prepare_data(
    fitting_splits.test_set, one_hot_encoder, standard_scaler
)[0].drop(columns=NON_FEATURE_COLUMNS)
y_test = fitting_splits.test_set["sellingprice"]

# Train and validation stacked row-wise, features and targets in the same order.
X_train_validation = pd.concat((X_train, X_validation), axis=0)
y_train_validation = pd.concat((y_train, y_validation), axis=0)

In [14]:
# hyperparameter tuning with cross-validation of a random forest model
# The target built above is sellingprice, a continuous price, so a regressor and a regression
# metric are required: RandomForestClassifier would treat every distinct price as its own class,
# and the binary 'f1' scorer cannot score such a target.
# NOTE(review): if the intended target is transmission_bin instead, the target (and the
# transmission dummies left in X) must be changed in the data-preparation cell, not here.
param_grid = {
    "n_estimators": list(range(10, 200, 10)),
    "max_depth": list(range(5, 21, 5)),
}
sklearn_grid_search_rf = GridSearchCV(
    # Seed the forest so repeated runs of the search give the same scores.
    RandomForestRegressor(random_state=fitting_splits.seed),
    param_grid=param_grid,
    n_jobs=2,
    # Negated RMSE: GridSearchCV maximises the score, so larger (closer to 0) is better.
    scoring='neg_root_mean_squared_error',
)
_ = sklearn_grid_search_rf.fit(X_train_validation, y_train_validation)

