# Data and Preprocessing

In [163]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('carvana.csv')
df.head()

Unnamed: 0,RefId,PurchDate,Auction,VehYear,VehicleAge,Make,Model,Trim,SubModel,Color,...,MMRCurrentRetailCleanPrice,PRIMEUNIT,AUCGUART,BYRNO,VNZIP1,VNST,VehBCost,IsOnlineSale,WarrantyCost,IsBadBuy
0,1,1/7/2009 0:12,ADESA,2005,4,DODGE,STRATUS V6,SXT,4D SEDAN SXT FFV,MAROON,...,8702.0,,,19638,33619,FL,4900.0,0,1389,0
1,2,1/7/2009 0:12,ADESA,2004,5,DODGE,NEON,SXT,4D SEDAN,SILVER,...,5518.0,,,19638,33619,FL,4100.0,0,630,0
2,3,1/7/2009 0:12,ADESA,2005,4,FORD,FOCUS,ZX3,2D COUPE ZX3,SILVER,...,7911.0,,,19638,33619,FL,4000.0,0,1020,0
3,4,1/7/2009 0:12,ADESA,2004,5,MITSUBISHI,GALANT 4C,ES,4D SEDAN ES,WHITE,...,9451.0,,,19638,33619,FL,5600.0,0,594,0
4,5,1/21/2009 0:12,ADESA,2004,5,DODGE,1500 RAM PICKUP 2WD,ST,QUAD CAB 4.7L SLT,WHITE,...,12560.0,,,19638,33619,FL,8800.0,0,920,0


In [164]:
df.shape

(6798, 34)

In [165]:
df.dtypes

RefId                                  int64
PurchDate                             object
Auction                               object
VehYear                                int64
VehicleAge                             int64
Make                                  object
Model                                 object
Trim                                  object
SubModel                              object
Color                                 object
Transmission                          object
WheelTypeID                          float64
WheelType                             object
VehOdo                                 int64
Nationality                           object
Size                                  object
TopThreeAmericanName                  object
MMRAcquisitionAuctionAveragePrice    float64
MMRAcquisitionAuctionCleanPrice      float64
MMRAcquisitionRetailAveragePrice     float64
MMRAcquisitonRetailCleanPrice        float64
MMRCurrentAuctionAveragePrice        float64
MMRCurrent

In [166]:
df['BYRNO'] = df['BYRNO'].astype('object')
df['VNZIP1'] = df['VNZIP1'].astype('object')


Since features 'AUCGUART' and 'PRIMEUNIT' have huge number of NA values, we will drop those features

In [167]:
df.drop(['AUCGUART', 'PRIMEUNIT'], axis = 1, inplace = True)

In [168]:
df.isna().sum()

RefId                                   0
PurchDate                               0
Auction                              3815
VehYear                                 0
VehicleAge                              0
Make                                    0
Model                                   0
Trim                                  232
SubModel                                0
Color                                   0
Transmission                            0
WheelTypeID                           283
WheelType                             339
VehOdo                                  0
Nationality                             1
Size                                    1
TopThreeAmericanName                    1
MMRAcquisitionAuctionAveragePrice       1
MMRAcquisitionAuctionCleanPrice         1
MMRAcquisitionRetailAveragePrice        1
MMRAcquisitonRetailCleanPrice           1
MMRCurrentAuctionAveragePrice          21
MMRCurrentAuctionCleanPrice            21
MMRCurrentRetailAveragePrice      

In [169]:
# Since the same observation have NA values for multiple features, we will drop that observation
df = df.loc[~df['Nationality'].isna(), :]
df = df.loc[~df['MMRAcquisitionAuctionAveragePrice'].isna(), :]
df = df.loc[~df['MMRCurrentAuctionAveragePrice'].isna(), :]
# Since features 'Auction' and 'VNST' have huge number of NA values, we will drop those features
df.drop(['Auction', 'VNST'], axis = 1, inplace = True)
# WheelTypeID is numerical representation of WheelType, so it will be droped
df.drop('WheelTypeID', axis = 1, inplace = True)

In [170]:
df.isna().sum()

RefId                                  0
PurchDate                              0
VehYear                                0
VehicleAge                             0
Make                                   0
Model                                  0
Trim                                 232
SubModel                               0
Color                                  0
Transmission                           0
WheelType                            339
VehOdo                                 0
Nationality                            0
Size                                   0
TopThreeAmericanName                   0
MMRAcquisitionAuctionAveragePrice      0
MMRAcquisitionAuctionCleanPrice        0
MMRAcquisitionRetailAveragePrice       0
MMRAcquisitonRetailCleanPrice          0
MMRCurrentAuctionAveragePrice          0
MMRCurrentAuctionCleanPrice            0
MMRCurrentRetailAveragePrice           0
MMRCurrentRetailCleanPrice             0
BYRNO                                  0
VNZIP1          

In [171]:
df = df.set_index('RefId')
# Purchase Date is irrelevant since we have Vehicle Year and Vehicle Age
df = df.drop('PurchDate', axis = 1)

In [172]:
X = df.drop('IsBadBuy', axis = 1)
y = df['IsBadBuy']

### Exploring the distribution of outcome variable

In [173]:
y.value_counts()

0    5911
1     865
Name: IsBadBuy, dtype: int64

In [174]:
y.value_counts(normalize = True)

0    0.872344
1    0.127656
Name: IsBadBuy, dtype: float64

### We can see that data is highly inbalanced, so we will use various techniques in further analyisis to solve that problem

# Splitting data into train and test set

In [175]:
from sklearn.model_selection import train_test_split

RAND_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = RAND_STATE, stratify=y)

In [176]:
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_validate

# Building predictive models with default parameters through pipeline

### KNN

In [177]:
categorical_pipeline = Pipeline([('imputer', SimpleImputer(strategy = 'most_frequent')),
                                 ('encoder', OneHotEncoder(handle_unknown = 'ignore', min_frequency=5))])

numerical_pipeline = Pipeline([('imputer', SimpleImputer()), ('standardization', StandardScaler())])

preprocessor = ColumnTransformer([('num', numerical_pipeline, X.select_dtypes('number').columns),
                                  ('cat', categorical_pipeline, X.select_dtypes('object').columns)])

pipe = Pipeline([('preprocessing', preprocessor), ('knn', KNeighborsClassifier(n_neighbors=5))])

knn_dict = cross_validate(pipe, X_train, y_train, scoring = ['accuracy', 'roc_auc'], cv = 10)
print('KNN')
print('Accuracy:', knn_dict['test_accuracy'].mean())
print('ROC AUC:', knn_dict['test_roc_auc'].mean())

KNN
Accuracy: 0.8579335793357933
ROC AUC: 0.5632689238513369


Testing KNN model on out of sample data

In [178]:
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
print('KNN on Out of Sample data')
print('Accuracy:', accuracy_score(y_test, y_hat))
print('ROC AUC:', roc_auc_score(y_test, y_hat))

KNN on Out of Sample data
Accuracy: 0.8591445427728613
ROC AUC: 0.5121323762942259


### Logistic Regression

In [179]:
pipe.steps[1] = ('lr', LogisticRegression(random_state=42, solver = 'liblinear'))

lr_dict = cross_validate(pipe, X_train, y_train, scoring = ['accuracy', 'roc_auc'], cv = 10)
print('Logistic Regression')
print('Accuracy:', lr_dict['test_accuracy'].mean())
print('ROC AUC:', lr_dict['test_roc_auc'].mean())

Logistic Regression
Accuracy: 0.8619926199261994
ROC AUC: 0.6460073877625632


Testing LR model on out of sample data

In [180]:
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
print('LR on Out of Sample data')
print('Accuracy:', accuracy_score(y_test, y_hat))
print('ROC AUC:', roc_auc_score(y_test, y_hat))

LR on Out of Sample data
Accuracy: 0.8657817109144543
ROC AUC: 0.5134687455719025


### Decision Tree

In [181]:
pipe.steps[1] = ('dt', DecisionTreeClassifier(random_state=42))

dt_dict = cross_validate(pipe, X_train, y_train, scoring = ['accuracy', 'roc_auc'], cv = 10)
print('Decision Tree')
print('Accuracy:', dt_dict['test_accuracy'].mean())
print('ROC AUC:', dt_dict['test_roc_auc'].mean())

Decision Tree
Accuracy: 0.7894833948339484
ROC AUC: 0.5241135053087077


Testing DT model on out of sample data

In [182]:
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
print('DT on Out of Sample data')
print('Accuracy:', accuracy_score(y_test, y_hat))
print('ROC AUC:', roc_auc_score(y_test, y_hat))

DT on Out of Sample data
Accuracy: 0.7912979351032449
ROC AUC: 0.5127284898294235


### With each model we get low roc_auc_score, so obviously we need to balance the distribution of the class variable

## Undersampling

In [183]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline

pipe = make_pipeline(preprocessor, RandomUnderSampler(random_state=RAND_STATE), KNeighborsClassifier(n_neighbors=5))

knn_rus_dict = cross_validate(pipe, X_train, y_train, scoring = ['accuracy', 'roc_auc'], cv = 10)
print('KNN RUS')
print('Accuracy:', knn_rus_dict['test_accuracy'].mean())
print('ROC AUC:', knn_rus_dict['test_roc_auc'].mean())

KNN RUS
Accuracy: 0.5612546125461254
ROC AUC: 0.5853433368806821


In [184]:
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
print('KNN on out of sample data')
print('Accuracy:', accuracy_score(y_test, y_hat))
print('ROC AUC:', roc_auc_score(y_test, y_hat))

KNN on out of sample data
Accuracy: 0.5811209439528023
ROC AUC: 0.5970761119716211


In [185]:
pipe.steps[2] = ('lr', LogisticRegression(random_state=RAND_STATE, solver = 'liblinear'))

lr_rus_dict = cross_validate(pipe, X_train, y_train, scoring = ['accuracy', 'roc_auc'], cv = 10)
print('LR RUS')
print('Accuracy:', lr_rus_dict['test_accuracy'].mean())
print('ROC AUC:', lr_rus_dict['test_roc_auc'].mean())

LR RUS
Accuracy: 0.5977859778597786
ROC AUC: 0.6375068960561926


In [186]:
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
print('LogisticRegression on out of sample data')
print('Accuracy:', accuracy_score(y_test, y_hat))
print('ROC AUC:', roc_auc_score(y_test, y_hat))

LogisticRegression on out of sample data
Accuracy: 0.6061946902654868
ROC AUC: 0.6114463571110971


In [187]:
pipe.steps[2] = ('dt', DecisionTreeClassifier(random_state=RAND_STATE))

dt_rus_dict = cross_validate(pipe, X_train, y_train, scoring = ['accuracy', 'roc_auc'], cv = 10)
print('dt RUS')
print('Accuracy:', dt_rus_dict['test_accuracy'].mean())
print('ROC AUC:', dt_rus_dict['test_roc_auc'].mean())

dt RUS
Accuracy: 0.5607011070110701
ROC AUC: 0.5570033575545692


In [188]:
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
print('Decision Tree on out of sample data')
print('Accuracy:', accuracy_score(y_test, y_hat))
print('ROC AUC:', roc_auc_score(y_test, y_hat))

Decision Tree on out of sample data
Accuracy: 0.5833333333333334
ROC AUC: 0.5958765556364489


## Oversampling

In [189]:
from imblearn.over_sampling import RandomOverSampler

pipe = make_pipeline(preprocessor, RandomOverSampler(random_state=RAND_STATE), KNeighborsClassifier(n_neighbors=5))

knn_ros_dict = cross_validate(pipe, X_train, y_train, scoring = ['accuracy', 'roc_auc'], cv = 10)
print('KNN ROS')
print('Accuracy:', knn_ros_dict['test_accuracy'].mean())
print('ROC AUC:', knn_ros_dict['test_roc_auc'].mean())

KNN ROS
Accuracy: 0.6601476014760148
ROC AUC: 0.5581495296767636


In [190]:
pipe.steps[2] = ('lr', LogisticRegression(random_state=RAND_STATE, solver = 'liblinear'))

lr_ros_dict = cross_validate(pipe, X_train, y_train, scoring = ['accuracy', 'roc_auc'], cv = 10)
print('LR ros')
print('Accuracy:', lr_ros_dict['test_accuracy'].mean())
print('ROC AUC:', lr_ros_dict['test_roc_auc'].mean())

LR ros
Accuracy: 0.6785977859778598
ROC AUC: 0.612675037804877


In [191]:
pipe.steps[2] = ('dt', DecisionTreeClassifier(random_state=RAND_STATE))

dt_ros_dict = cross_validate(pipe, X_train, y_train, scoring = ['accuracy', 'roc_auc'], cv = 10)
print('dt ros')
print('Accuracy:', dt_ros_dict['test_accuracy'].mean())
print('ROC AUC:', dt_ros_dict['test_roc_auc'].mean())

dt ros
Accuracy: 0.7756457564575645
ROC AUC: 0.5395712995937929


# Feature Selection

In [192]:
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

pipe = make_pipeline(preprocessor, RandomOverSampler(random_state=RAND_STATE), SelectKBest(k = 20, score_func = f_classif), KNeighborsClassifier(n_neighbors=5))

knn_ros_dict = cross_validate(pipe, X_train, y_train, scoring = ['accuracy', 'roc_auc'], cv = 10)
print('KNN ROS with Feature Selection')
print('Accuracy:', knn_ros_dict['test_accuracy'].mean())
print('ROC AUC:', knn_ros_dict['test_roc_auc'].mean())

KNN ROS with Feature Selection
Accuracy: 0.6564575645756456
ROC AUC: 0.5813712262727704


In [193]:
pipe.steps[-1] = ('lr', LogisticRegression(random_state=RAND_STATE, solver = 'liblinear'))

lr_ros_dict = cross_validate(pipe, X_train, y_train, scoring = ['accuracy', 'roc_auc'], cv = 10)
print('LR ros with Feature Selection')
print('Accuracy:', lr_ros_dict['test_accuracy'].mean())
print('ROC AUC:', lr_ros_dict['test_roc_auc'].mean())

LR ros with Feature Selection
Accuracy: 0.6252767527675277
ROC AUC: 0.6767654697675606


In [194]:
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
print('LogisticRegression on out of sample data')
print('Accuracy:', accuracy_score(y_test, y_hat))
print('ROC AUC:', roc_auc_score(y_test, y_hat))

LogisticRegression on out of sample data
Accuracy: 0.6393805309734514
ROC AUC: 0.6156606843578831


In [195]:
pipe.steps[-1] = ('dt', DecisionTreeClassifier(random_state=RAND_STATE))

dt_ros_dict = cross_validate(pipe, X_train, y_train, scoring = ['accuracy', 'roc_auc'], cv = 10)
print('dt ros with Feature Selection')
print('Accuracy:', dt_ros_dict['test_accuracy'].mean())
print('ROC AUC:', dt_ros_dict['test_roc_auc'].mean())

dt ros with Feature Selection
Accuracy: 0.7904059040590405
ROC AUC: 0.5375943495331766


# Hyperparameter tuning

In [196]:
# from sklearn.model_selection import GridSearchCV

# grid_params = {'logisticregression__C': list(np.linspace(0.1, 2, 50)) + [1],
#                'logisticregression__penalty': ['l1', 'l2'],
#                'selectkbest__k': [10, 15, 20, 25, 30]}

# pipe = make_pipeline(preprocessor, 
#                      RandomOverSampler(random_state=RAND_STATE), 
#                      SelectKBest(k = 20, score_func = f_classif), 
#                      LogisticRegression(random_state=RAND_STATE, solver='liblinear'))

# lr_grid = GridSearchCV(pipe, grid_params, scoring = 'roc_auc', cv = 5, n_jobs = -1)

# lr_grid.fit(X_train, y_train)
# print('Best Params:', lr_grid.best_params_)
# print('Best ROC_AUC:', lr_grid.best_score_)


In [197]:
pipe = make_pipeline(preprocessor, 
                     RandomOverSampler(random_state=RAND_STATE), 
                     SelectKBest(k = 25, score_func = f_classif), 
                     LogisticRegression(random_state=RAND_STATE, solver='liblinear', penalty = 'l1', C = 0.1))

pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
print('LogisticRegression on out of sample data')
print('Accuracy:', accuracy_score(y_test, y_hat))
print('ROC AUC:', roc_auc_score(y_test, y_hat))

LogisticRegression on out of sample data
Accuracy: 0.6511799410029498
ROC AUC: 0.6396957866499886


In [198]:
from imblearn.over_sampling import SMOTE

pipe = make_pipeline(preprocessor, 
                     SMOTE(random_state=RAND_STATE), 
                     SelectKBest(k = 25, score_func = f_classif), 
                     LogisticRegression(random_state=RAND_STATE, solver='liblinear', penalty = 'l1', C = 0.1))

pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
print('LogisticRegression on out of sample data')
print('Accuracy:', accuracy_score(y_test, y_hat))
print('ROC AUC:', roc_auc_score(y_test, y_hat))

LogisticRegression on out of sample data
Accuracy: 0.6548672566371682
ROC AUC: 0.641809057994029


# Ansambli

### Random Forest

In [199]:
from sklearn.ensemble import RandomForestClassifier

pipe = make_pipeline(preprocessor, 
                     SMOTE(random_state=RAND_STATE), 
                     SelectKBest(k = 25, score_func = f_classif), 
                     RandomForestClassifier(n_estimators=100, random_state = RAND_STATE))

rf_ros_dict = cross_validate(pipe, X_train, y_train, scoring = ['accuracy', 'roc_auc'], cv = 5, n_jobs = -1)
print('rf')
print('Accuracy:', rf_ros_dict['test_accuracy'].mean())
print('ROC AUC:', rf_ros_dict['test_roc_auc'].mean())


rf
Accuracy: 0.8230627306273062
ROC AUC: 0.6400649485008004


In [200]:
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
print('Random Forest on out of sample data')
print('Accuracy:', accuracy_score(y_test, y_hat))
print('ROC AUC:', roc_auc_score(y_test, y_hat))

Random Forest on out of sample data
Accuracy: 0.8325958702064897
ROC AUC: 0.5437996863074676


### Bagging

In [201]:
from sklearn.ensemble import BaggingClassifier

model_bagging = BaggingClassifier(base_estimator = LogisticRegression(random_state=RAND_STATE, solver='liblinear', penalty='l1', C = 0.1), 
                                  n_estimators=100,
                                  random_state = RAND_STATE)
                                  

pipe = make_pipeline(preprocessor, 
                     RandomOverSampler(random_state=RAND_STATE), 
                     SelectKBest(k = 25, score_func = f_classif), 
                     model_bagging)

cv_dict = cross_validate(pipe, X_train, y_train, cv=5, scoring=['accuracy', 'roc_auc'], n_jobs=-1)
print(f"Accuracy: {cv_dict['test_accuracy'].mean()}")
print(f"Auc: {cv_dict['test_roc_auc'].mean()}")

Accuracy: 0.6263837638376384
Auc: 0.6785151286916751


In [202]:
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
print('Random Forest on out of sample data')
print('Accuracy:', accuracy_score(y_test, y_hat))
print('ROC AUC:', roc_auc_score(y_test, y_hat))

Random Forest on out of sample data
Accuracy: 0.6511799410029498
ROC AUC: 0.642163305791585


### AdaBoost

In [203]:
from sklearn.ensemble import AdaBoostClassifier

model_boost = AdaBoostClassifier(n_estimators = 100, 
                                 base_estimator=LogisticRegression(random_state=RAND_STATE, solver = 'liblinear'), 
                                 learning_rate=0.6, 
                                 random_state=RAND_STATE)

pipe = make_pipeline(preprocessor, 
                     RandomOverSampler(random_state=RAND_STATE), 
                     SelectKBest(k = 25, score_func = f_classif), 
                     model_boost)

cv_dict = cross_validate(pipe, X_train, y_train, cv=5, scoring=['accuracy', 'roc_auc'], n_jobs=-1)
print(f"Accuracy: {cv_dict['test_accuracy'].mean()}")
print(f"Auc: {cv_dict['test_roc_auc'].mean()}")

Accuracy: 0.6287822878228783
Auc: 0.6766298151320447


In [204]:
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
print('Ada Boost on out of sample data')
print('Accuracy:', accuracy_score(y_test, y_hat))
print('ROC AUC:', roc_auc_score(y_test, y_hat))

Ada Boost on out of sample data
Accuracy: 0.6497050147492626
ROC AUC: 0.6289804015459863


In [205]:
# param_grid = {
#     'adaboostclassifier__n_estimators': [50, 100],
#     'adaboostclassifier__base_estimator': [LogisticRegression(random_state=RAND_STATE), 
#                                            DecisionTreeClassifier(random_state=RAND_STATE),
#                                            KNeighborsClassifier()],
#     'adaboostclassifier__learning_rate': np.linspace(0.1, 2, 25)
# }


# pipe = make_pipeline(preprocessor, 
#                      RandomOverSampler(random_state=RAND_STATE), 
#                      SelectKBest(k = 25, score_func = f_classif), 
#                      AdaBoostClassifier(random_state=RAND_STATE))

# grid = GridSearchCV(pipe, param_grid, scoring = 'roc_auc', cv = 5, n_jobs = -1)
# grid.fit(X_train, y_train)

# print('Best Params:', grid.best_params_)
# print('Best ROC_AUC:', grid.best_score_)

In [156]:
pipe = make_pipeline(preprocessor, 
                     RandomOverSampler(random_state=RAND_STATE), 
                     SelectKBest(k = 25, score_func = f_classif), 
                     AdaBoostClassifier(random_state=RAND_STATE, 
                                        n_estimators = 100, 
                                        base_estimator = LogisticRegression(random_state=42), 
                                        learning_rate=1.2875))

pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
print('Ada Boost on out of sample data')
print('Accuracy:', accuracy_score(y_test, y_hat))
print('ROC AUC:', roc_auc_score(y_test, y_hat))

Ada Boost on out of sample data
Accuracy: 0.6460176991150443
ROC AUC: 0.6342696876267352


# Conclusion

The best ROC_AUC on out of sample data (0.6421) was achieved by oversampling in combination with BaggingClassifier (LogisticRegression as base estimator with tuned hyperparemters of C = 0.1, penalty = 'l1')