# Simple classifier on Carvana dataset

* The goal of this project is to predict whether a car purchased at auction is a good or a bad buy.
* All the variables in the data set are defined in the file Carvana_Data_Dictionary.txt 
* The data contains missing values 
* The dependent variable (IsBadBuy) is binary (C2)
* There are 32 Independent variables (C3-C34)

Used dataset: https://www.kaggle.com/c/DontGetKicked/data

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, plot_roc_curve, plot_confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
%matplotlib inline

In [None]:
%%time

# Load the training data from the local CSV file and convert the PurchDate
# column to pandas datetime values.
dataset = pd.read_csv('training.csv', sep=',')
dataset.PurchDate = pd.to_datetime(dataset.PurchDate)

In [None]:
# Preview the first rows of the loaded data.
dataset.head()

In [None]:
# Summary of the columns (dtypes and non-null counts) of the raw data.
dataset.info()

In [None]:
# Number of missing values in each column.
dataset.isnull().sum()

## Data cleaning

In [None]:
def detect_outliers(df, column):
    """Return the index labels of the rows whose `column` value is an outlier.

    A value is treated as an outlier when its absolute z-score is greater
    than 3. Missing values are left out of the mean / standard deviation
    computation and are never reported as outliers themselves.

    Args:
        df: DataFrame to inspect.
        column: Name of a numeric column of `df`.

    Returns:
        A pandas Index holding the labels of the outlier rows.
    """
    values = df[column].to_numpy(dtype=float)
    # nan_policy='omit' is essential: with the default ('propagate') a single
    # NaN in the column turns every z-score into NaN, every comparison below
    # becomes False and no outlier is ever detected.
    zscores = np.abs(zscore(values, nan_policy='omit'))
    return df.index[zscores > 3]

def clean_data(df, columns):
    """Return a copy of `df` with the outlier rows of each listed column removed.

    The columns are processed one after another: outliers are detected on the
    rows that survived the previous columns, dropped, and the index is reset
    to a fresh 0..n-1 range before the next column is examined. The input
    frame is not modified.

    Args:
        df: DataFrame to clean.
        columns: Iterable of column names to screen for outliers.

    Returns:
        A new DataFrame without the detected outlier rows.
    """
    cleaned = df.copy(deep=True)
    for name in columns:
        outlier_rows = detect_outliers(cleaned, name)
        cleaned = cleaned.drop(index=outlier_rows)
        cleaned = cleaned.reset_index(drop=True)
    return cleaned

In [None]:
# Numeric columns that are screened for outliers, in the order they are processed.
columns = ['VehicleAge', 'VehYear', 'VehOdo', 'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice', 'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice', 'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice', 'VehBCost', 'WarrantyCost']

# Replace the dataset with its outlier-free copy.
dataset = clean_data(dataset, columns)

In [None]:
# Columns that are removed from the dataset and not used in the rest of the notebook.
to_drop = ['RefId', 'PRIMEUNIT', 'AUCGUART', 'BYRNO', 'WheelTypeID', 'TopThreeAmericanName', 'Model', 'Trim', 'SubModel', 'VNZIP1']

dataset = dataset.drop(to_drop, axis='columns')

In [None]:
# Column summary again, now that outlier rows and unused columns are gone.
dataset.info()

In [None]:
def handle_nans(df):
    """Return a copy of `df` in which the known missing values are filled in.

    * Color, WheelType, Transmission, Nationality and Size: missing entries
      become the string 'UNKNOWN'; Transmission is additionally upper-cased.
    * The eight MMR price columns: missing entries become the mean of the
      respective column.

    The input frame is left untouched.
    """
    filled = df.copy(deep=True)

    # fill non-numerical values with 'UNKNOWN'
    for name in ('Color', 'WheelType', 'Transmission', 'Nationality', 'Size'):
        filled[name] = filled[name].fillna('UNKNOWN')

    # Upper-casing makes values that differ only in letter case identical.
    filled['Transmission'] = filled['Transmission'].apply(str.upper)

    price_columns = [
        'MMRAcquisitionAuctionAveragePrice',
        'MMRAcquisitionAuctionCleanPrice',
        'MMRAcquisitionRetailAveragePrice',
        'MMRAcquisitonRetailCleanPrice',
        'MMRCurrentAuctionAveragePrice',
        'MMRCurrentAuctionCleanPrice',
        'MMRCurrentRetailAveragePrice',
        'MMRCurrentRetailCleanPrice',
    ]
    # fill numerical price values with the column mean
    for name in price_columns:
        column_mean = filled[name].mean()
        filled[name] = filled[name].fillna(column_mean)

    return filled

# Replace the dataset with the copy in which missing values have been filled.
dataset = handle_nans(dataset)

## Visualizations

In [None]:
# Correlation heatmap of the numeric (and boolean) columns.
# The non-numeric columns are filtered out explicitly: pandas >= 2.0 no longer
# drops them silently inside DataFrame.corr() and raises on string columns.
# NOTE(review): `fmt` only influences cell annotations, which are not enabled
# here - confirm whether annot=True was intended.
g = sns.heatmap(dataset.select_dtypes(include=['number', 'bool']).corr(), fmt = ".2f", cmap = "coolwarm")


In [None]:
# Work on a copy so the temporary 'PurchDateAgg' column does not end up in `dataset`.
df = dataset.copy(deep=True)
# Label every purchase with its '<year>-<month>' (month is not zero-padded).
df['PurchDateAgg'] = df.PurchDate.apply(lambda n: f'{n.year}-{n.month}') 

# Bar plot of IsBadBuy per purchase month (seaborn's bar kind aggregates with
# the mean by default); tick labels are rotated to stay readable.
g = sns.catplot(x='PurchDateAgg', y='IsBadBuy', data=df, aspect=3, kind="bar")
g.set_xticklabels(rotation=-45)

In [None]:
# Bar plot of IsBadBuy per Auction value, with rotated tick labels.
g = sns.catplot(x='Auction', y='IsBadBuy', data=dataset, aspect=3, kind="bar")
g.set_xticklabels(rotation=-45)

In [None]:
# Bar plot of IsBadBuy per Make value, with rotated tick labels.
g = sns.catplot(x='Make', y='IsBadBuy', data=dataset, aspect=3, kind="bar")
g.set_xticklabels(rotation=-45)

In [None]:
# Bar plot of IsBadBuy per Color value (missing colors appear as 'UNKNOWN').
g = sns.catplot(x='Color', y='IsBadBuy', data=dataset, aspect=3, kind="bar")
g.set_xticklabels(rotation=-45)

In [None]:
# Bar plot of IsBadBuy per Transmission value (upper-cased, 'UNKNOWN' when missing).
g = sns.catplot(x='Transmission', y='IsBadBuy', data=dataset, aspect=3, kind="bar")
g.set_xticklabels(rotation=-45)

In [None]:
# Bar plot of IsBadBuy per WheelType value (missing types appear as 'UNKNOWN').
g = sns.catplot(x='WheelType', y='IsBadBuy', data=dataset, aspect=3, kind="bar")
g.set_xticklabels(rotation=-45)

In [None]:
# Bar plot of IsBadBuy per Nationality value (missing values appear as 'UNKNOWN').
g = sns.catplot(x='Nationality', y='IsBadBuy', data=dataset, aspect=3, kind="bar")
g.set_xticklabels(rotation=-45)

In [None]:
# Bar plot of IsBadBuy per Size value (missing sizes appear as 'UNKNOWN').
g = sns.catplot(x='Size', y='IsBadBuy', data=dataset, aspect=3, kind="bar")
g.set_xticklabels(rotation=-45)

In [None]:
# Bar plot of IsBadBuy per VNST value, with rotated tick labels.
g = sns.catplot(x='VNST', y='IsBadBuy', data=dataset, aspect=3, kind="bar")
g.set_xticklabels(rotation=-45)

In [None]:
# Bar plot of IsBadBuy for each IsOnlineSale value.
g = sns.catplot(x='IsOnlineSale', y='IsBadBuy', data=dataset, kind="bar")

## Data conversion

In [None]:
def convert_date(date):
    """Return the '<year>-<month>' label of `date`.

    The month is not zero-padded, e.g. January 2009 becomes '2009-1'.
    `date` may be any object exposing `year` and `month` attributes.
    """
    label = f'{date.year}-{date.month}'
    return label

# Replace the exact purchase date with its '<year>-<month>' label; the
# original PurchDate column is dropped afterwards.
dataset['PurchDateAgg'] = dataset.PurchDate.apply(convert_date)
dataset = dataset.drop('PurchDate', axis='columns')

In [None]:
def convert_column_to_one_hot(df, column):
    """Return a copy of `df` where `column` is replaced by one-hot indicator columns.

    The indicator columns are named '<column>_<value>' and are appended after
    the remaining columns; the input frame is not modified.

    Args:
        df: Source DataFrame.
        column: Name of the column to encode.

    Returns:
        A new DataFrame without `column` and with its indicator columns added.
    """
    indicator_columns = pd.get_dummies(df[column], prefix=column)
    return df.copy(deep=True).join(indicator_columns).drop(columns=column)

def convert_columns_to_one_hot(df, columns):
    """Return a copy of `df` with every column in `columns` one-hot encoded.

    The columns are encoded one at a time, in the given order, via
    `convert_column_to_one_hot`; the input frame is not modified.

    Args:
        df: Source DataFrame.
        columns: Iterable of column names to encode.

    Returns:
        A new DataFrame in which each listed column is replaced by its
        indicator columns.
    """
    encoded = df.copy(deep=True)
    for name in columns:
        encoded = convert_column_to_one_hot(encoded, name)
    return encoded

# Categorical columns that are replaced by one-hot indicator columns.
columns_to_convert = ['Auction', 'Make', 'Color', 'Transmission', 'WheelType', 'Nationality', 'Size', 'VNST', 'PurchDateAgg']

dataset.IsOnlineSale = dataset.IsOnlineSale.astype(bool)
# Use the builtin int: np.int was merely an alias for it, deprecated in
# NumPy 1.20 and removed in NumPy 1.24, so the result is unchanged.
dataset.VehBCost = dataset.VehBCost.astype(int)
dataset = convert_columns_to_one_hot(dataset, columns_to_convert)

In [None]:
labels = ['IsBadBuy']
# Select the features by name instead of by position: slicing off "the first
# column" silently leaks the label into X as soon as the column order changes.
# With the current layout (label first) the resulting list is the same.
cols = [column for column in dataset.columns if column not in labels]

X = dataset[cols]
Y = dataset[labels]

# Hold out 30% of the rows for evaluation.
# NOTE(review): no random_state is passed, so the split (and every score
# derived from it) differs between runs - consider fixing a seed.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

## Models

In [None]:
# Estimator classes to tune; each one is instantiated with default arguments
# by the grid search.
models = [DecisionTreeClassifier, RandomForestClassifier, GradientBoostingClassifier, MLPClassifier]
# Hyper-parameter grid per estimator class, keyed by the class itself.
params = {
    # depths 1, 11, 21, ... up to (but excluding) the number of feature columns
    DecisionTreeClassifier: {'max_depth': list(range(1, len(cols), 10))}, 
    RandomForestClassifier: {'n_estimators': [100, 250, 500, 1000], 'max_depth': [11]}, # max depth was found empirically so training doesn't take too long
    GradientBoostingClassifier: {'n_estimators': [100, 250, 500, 1000]},
    # single-value lists (early_stopping, max_iter) pin a setting rather than search over it
    MLPClassifier: {'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)], 'activation': ['tanh', 'relu'], 'solver': ['sgd', 'adam'], 'alpha': [0.0001, 0.05], 'learning_rate': ['constant','adaptive'], 'early_stopping': [True], 'max_iter': [100]}
}

#### Beware: `find_best_models` takes about 25 minutes on a 6-core CPU.

In [None]:
%%time
def find_best_models(models, params):
    """Grid-search every estimator class and return the best fitted estimators.

    For each class in `models` a 5-fold cross-validated grid search over
    `params[cls]` is run on the notebook-level training split
    (`X_train`, `y_train`), scored by ROC AUC and using all CPU cores.

    Args:
        models: Iterable of estimator classes (instantiated without arguments).
        params: Mapping from estimator class to its parameter grid.

    Returns:
        List with the best estimator found for each class, in input order.
    """
    winners = []
    for estimator_cls in models:
        print(f'{estimator_cls}')
        search = GridSearchCV(
            estimator_cls(),
            params[estimator_cls],
            scoring='roc_auc',
            n_jobs=-1,
            cv=5,
            verbose=1,
        )
        # y_train is a one-column DataFrame; flatten it to a 1-D array of labels.
        search.fit(X_train, y_train.values.ravel())
        winners.append(search.best_estimator_)
    return winners

# Run the grid searches; keeps the best estimator of every model class.
best_models = find_best_models(models, params)

In [None]:
def evaluate_model(model):
    """Plot the confusion matrix and the ROC curve of `model` on the test split.

    Uses the notebook-level `X_test` / `y_test`. The confusion matrix is
    normalized over the true labels (normalize='true').

    Args:
        model: A fitted classifier.
    """
    # The former `model.predict(X_test)` call was removed: its result was never
    # used, the plotting helpers compute their own predictions.
    print(f'Evaluating model: {model.__class__.__name__}')
    # NOTE(review): plot_confusion_matrix / plot_roc_curve were deprecated in
    # scikit-learn 1.0 and are gone in 1.2; on newer versions switch to
    # ConfusionMatrixDisplay.from_estimator / RocCurveDisplay.from_estimator
    # (and update the file-level import accordingly).
    plot_confusion_matrix(model, X_test, y_test, cmap=plt.cm.Blues, normalize='true')
    plot_roc_curve(model, X_test, y_test)
    plt.show()
    print('-----------')

# Evaluate each tuned estimator on the held-out test split.
for model in best_models:
    evaluate_model(model)