In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from tools import data_prep

# 1. Get Data from data_prep function

In [59]:
df = data_prep()

# separating the y_label -- prediction set
y = df['IsBadBuy']

#X -features are all column except y
X = df.drop(['IsBadBuy'], axis = 1)

#X.drop(['Color'], 1, inplace=True)
#X.drop(['Size'],axis=1, inplace=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41432 entries, 0 to 41475
Data columns (total 31 columns):
PurchaseID                           41432 non-null int64
PurchaseTimestamp                    41432 non-null int64
PurchaseDate                         41432 non-null object
Auction                              41432 non-null object
VehYear                              41432 non-null float64
Make                                 41432 non-null object
Color                                41432 non-null object
Transmission                         41432 non-null object
WheelTypeID                          41432 non-null object
WheelType                            41380 non-null object
VehOdo                               41432 non-null float64
Nationality                          41432 non-null object
Size                                 41432 non-null object
TopThreeAmericanName                 41432 non-null object
MMRAcquisitionAuctionAveragePrice    41416 non-null object
MMRAcq

In [60]:
df.columns

Index(['Transmission', 'VehOdo', 'MMRAcquisitionAuctionAveragePrice',
       'MMRAcquisitionAuctionCleanPrice', 'MMRAcquisitionRetailAveragePrice',
       'MMRAcquisitonRetailCleanPrice', 'MMRCurrentAuctionAveragePrice',
       'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailAveragePrice',
       'MMRCurrentRetailCleanPrice',
       ...
       'VNST_OK', 'VNST_OR', 'VNST_PA', 'VNST_SC', 'VNST_TN', 'VNST_TX',
       'VNST_UT', 'VNST_VA', 'VNST_WA', 'VNST_WV'],
      dtype='object', length=127)

In [18]:
pd.set_option('display.max_columns', 500)

# 2. Split Data

In [19]:
#from sklearn.preprocessing import LabelEncoder
#labelEncoder = LabelEncoder()

#categorical_feature_mask = df_final.dtypes==object
#categorical_cols = df_final.columns[categorical_feature_mask].tolist()
#df_final[categorical_cols] = df_final[categorical_cols].apply(lambda col: labelEncoder.fit_transform(col))
#df_final.values[:,7] = labelEncoder.fit_transform(df_final.values[:,7])

#df_final

# set the random seed - consistent
rs = 10

# train test split
X_mat = X.as_matrix()
X_train, X_test, y_train, y_test = train_test_split(X_mat, y, test_size=0.3, stratify=y, random_state=rs)

  from ipykernel import kernelapp as app


# 3. Scalering value 

In [20]:
# initialise a standard scaler object
scaler = StandardScaler()


print(X_train)

print("\n---After Scaling----\n")

# learn the mean and std.dev of variables from training data
# then use the learned values to transform training data
X_train = scaler.fit_transform(X_train, y_train)

# use the statistic that you learned from training to transform test data
# NEVER learn from test data, this is supposed to be a set of dataset
# that the model has never seen before
X_test = scaler.transform(X_test)

print(X_train)

[[    0. 49954.  5480. ...     0.     0.     0.]
 [    0. 72271.  3085. ...     0.     0.     0.]
 [    0. 79902.  8904. ...     0.     0.     0.]
 ...
 [    0. 59196.  5713. ...     0.     0.     0.]
 [    0. 70977.  5543. ...     0.     0.     0.]
 [    0. 95711.  3684. ...     0.     0.     0.]]

---After Scaling----

[[-0.19494723 -1.49555545 -0.18171103 ... -0.24306512 -0.02827485
  -0.0365124 ]
 [-0.19494723 -0.00756551 -1.11425493 ... -0.24306512 -0.02827485
  -0.0365124 ]
 [-0.19494723  0.50123278  1.15149579 ... -0.24306512 -0.02827485
  -0.0365124 ]
 ...
 [-0.19494723 -0.87934346 -0.09098755 ... -0.24306512 -0.02827485
  -0.0365124 ]
 [-0.19494723 -0.09384319 -0.15718065 ... -0.24306512 -0.02827485
  -0.0365124 ]
 [-0.19494723  1.55530066 -0.88102161 ... -0.24306512 -0.02827485
  -0.0365124 ]]


# 4. Training a Logistic Regression Model

In [21]:
model = LogisticRegression(random_state=rs)

# fit it to training data
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=10, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

# 5. Evaluate first Logistic Regression Model

In [22]:
# training and test accuracy
print("Train accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))

# classification report on test data
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


Train accuracy: 0.825455997869791
Test accuracy: 0.8232370301335818
              precision    recall  f1-score   support

           0       0.77      0.92      0.84      1609
           1       0.90      0.73      0.80      1610

   micro avg       0.82      0.82      0.82      3219
   macro avg       0.84      0.82      0.82      3219
weighted avg       0.84      0.82      0.82      3219



# 6. Get the top 20 most important variables

In [23]:
# grab feature importances from the model and feature name from the original X
coef = model.coef_[0]
feature_names = X.columns

# sort them out in descending order
indices = np.argsort(np.absolute(coef))
indices = np.flip(indices, axis=0)

# limit to 20 features, you can leave this out to print out everything
indices = indices[:20]

for i in indices:
    print(feature_names[i], ':', coef[i])

Auction_ADESA : 2.065851504575421
VNST_AZ : 1.805182869101257
MMRAcquisitionAuctionAveragePrice : -1.37992404991772
VNST_OK : 1.270017379068819
MMRAcquisitionRetailAveragePrice : 1.2257218833454477
Auction_OTHER : -0.9038357728845241
Auction_MANHEIM : -0.8392271867435732
VNST_CO : -0.8096564618373743
MMRCurrentAuctionAveragePrice : 0.679323283592787
VNST_PA : 0.6694695437702735
VehYear_2008.0 : -0.6623593399693878
MMRAcquisitionAuctionCleanPrice : 0.5918379921767993
VNST_MO : 0.5705180952109038
VNST_TX : -0.5348740698509492
VNST_FL : -0.4850539207657993
VNST_GA : -0.4633858608024842
VNST_VA : -0.45474847890695513
VNST_MS : 0.42848951730205165
MMRCurrentRetailCleanPrice : -0.4200591154538043
VNST_NJ : 0.37945108997382265


# 7. Use GridSearchCV

In [24]:
# grid search CV
params = {'C': [pow(10, x) for x in range(-6, 4)]}
#params = {'C' : [1.4111, 1.4112, 1.4113, 1.4114], 'penalty': ['l1','l2'],
          #'dual':[False], 'multi_class':['ovr'], 'solver':[ 'liblinear', 'sag', 'saga']},
#params = {'penalty':['l1','l2'],'C':[1.4111],'solver':['liblinear'],'multi_class':['ovr']}
#,{'penalty':['l2'],'C':[1.4111],'solver':['lbfgs'],'multi_class':['ovr','multinomial']}

# use all cores to tune logistic regression with C parameter
cv = GridSearchCV(param_grid=params, estimator=LogisticRegression(random_state=rs, verbose=True), cv=10, n_jobs=-1)
cv.fit(X_train, y_train)

# test the best model
print("Train accuracy:", cv.score(X_train, y_train))
print("Test accuracy:", cv.score(X_test, y_test))

y_pred = cv.predict(X_test)
print(classification_report(y_test, y_pred))

# print parameters of the best model
print(cv.best_params_)



[LibLinear]Train accuracy: 0.825455997869791
Test accuracy: 0.8232370301335818
              precision    recall  f1-score   support

           0       0.77      0.92      0.84      1609
           1       0.90      0.73      0.80      1610

   micro avg       0.82      0.82      0.82      3219
   macro avg       0.84      0.82      0.82      3219
weighted avg       0.84      0.82      0.82      3219

{'C': 1}


In [25]:
# grid search CV
params = {'C' : [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9], 'penalty': ['l1','l2']}

# use all cores to tune logistic regression with C parameter
cv = GridSearchCV(param_grid=params, estimator=LogisticRegression(random_state=rs, verbose=True), cv=10, n_jobs=-1)
cv.fit(X_train, y_train)

# test the best model
print("Train accuracy:", cv.score(X_train, y_train))
print("Test accuracy:", cv.score(X_test, y_test))

y_pred = cv.predict(X_test)
print(classification_report(y_test, y_pred))

# print parameters of the best model
print(cv.best_params_)



[LibLinear]Train accuracy: 0.8253228598056185
Test accuracy: 0.8235476856166511
              precision    recall  f1-score   support

           0       0.77      0.92      0.84      1609
           1       0.90      0.73      0.81      1610

   micro avg       0.82      0.82      0.82      3219
   macro avg       0.84      0.82      0.82      3219
weighted avg       0.84      0.82      0.82      3219

{'C': 1.4, 'penalty': 'l2'}


In [90]:
# grid search CV
params = {'C' : [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9], 'penalty': ['l2'], 'solver':['newton-cg', 'lbfgs', 'sag']}

# use all cores to tune logistic regression with C parameter
cv = GridSearchCV(param_grid=params, estimator=LogisticRegression(random_state=rs, verbose=True), cv=10, n_jobs=-1)
cv.fit(X_train, y_train)

# test the best model
print("Train accuracy:", cv.score(X_train, y_train))
print("Test accuracy:", cv.score(X_test, y_test))

y_pred = cv.predict(X_test)
print(classification_report(y_test, y_pred))

# print parameters of the best model
print(cv.best_params_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Train accuracy: 0.825455997869791
Test accuracy: 0.8229263746505125
              precision    recall  f1-score   support

           0       0.77      0.92      0.84      1609
           1       0.90      0.73      0.80      1610

   micro avg       0.82      0.82      0.82      3219
   macro avg       0.83      0.82      0.82      3219
weighted avg       0.83      0.82      0.82      3219

{'C': 1.3, 'penalty': 'l2', 'solver': 'newton-cg'}


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished


# USE RFE on Best model

In [68]:
rfe = RFECV(estimator = LogisticRegression(random_state=rs, solver = 'lbfgs', max_iter = 200), cv=10)
rfe.fit(X_train, y_train) # run the RFECV

# comparing how many variables before and after
print("Original feature set", X_train.shape[1])
print("Number of features after elimination", rfe.n_features_)

Original feature set 126
Number of features after elimination 44


In [69]:
# taking only the important features
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

In [76]:
# The selected features from RFE
feature_importance = list(zip(X.columns, rfe.support_))
new_features = []
for key,value in enumerate(feature_importance):
    if(value[1]) == True:
        new_features.append(value[0])
        
print(new_features)

['VehOdo', 'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice', 'MMRAcquisitionRetailAveragePrice', 'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailCleanPrice', 'VehBCost', 'Auction_ADESA', 'Auction_MANHEIM', 'Auction_OTHER', 'VehYear_2001.0', 'VehYear_2002.0', 'VehYear_2006.0', 'VehYear_2007.0', 'VehYear_2008.0', 'Make_ACURA', 'Make_SUZUKI', 'Color_NOT AVAIL', 'WheelType_Alloy', 'TopThreeAmericanName_FORD', 'VNST_AL', 'VNST_AZ', 'VNST_CA', 'VNST_CO', 'VNST_FL', 'VNST_GA', 'VNST_KY', 'VNST_LA', 'VNST_MO', 'VNST_MS', 'VNST_NC', 'VNST_NH', 'VNST_NJ', 'VNST_NM', 'VNST_NV', 'VNST_OK', 'VNST_OR', 'VNST_PA', 'VNST_TX', 'VNST_UT', 'VNST_VA', 'VNST_WA', 'VNST_WV']


In [92]:
# grid search CV
params = {'C' : [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9], 'penalty': ['l1','l2']}

# use all cores to tune logistic regression with C parameter
cv = GridSearchCV(param_grid=params, estimator=LogisticRegression(random_state=rs, verbose=True), cv=10, n_jobs=-1)
cv.fit(X_train_rfe, y_train)

# test the best model
print("Train accuracy:", cv.score(X_train_rfe, y_train))
print("Test accuracy:", cv.score(X_test_rfe, y_test))

y_pred = cv.predict(X_test_rfe)
print(classification_report(y_test, y_pred))

# print parameters of the best model
print(cv.best_params_)



[LibLinear]Train accuracy: 0.825722273998136
Test accuracy: 0.8219944082013048
              precision    recall  f1-score   support

           0       0.77      0.92      0.84      1609
           1       0.90      0.72      0.80      1610

   micro avg       0.82      0.82      0.82      3219
   macro avg       0.84      0.82      0.82      3219
weighted avg       0.84      0.82      0.82      3219

{'C': 1.0, 'penalty': 'l1'}


In [81]:
# grab feature importances from the model and feature name from the original X
importances = cv.best_params_

# sort them out in descending order
indices = np.argsort(np.absolute(coef))
indices = np.flip(indices, axis=0)

# limit to 20 features, you can leave this out to print out everything
indices = indices[:20]

for i in indices:
    print(feature_names[i], ':', coef[i])

MMRCurrentRetailAveragePrice : 2.0672111164595557
VehYear_2007.0 : 1.8234128504891336
Make_INFINITI : 1.3823767871813302
VehOdo : -1.3458998141220209
MMRAcquisitionAuctionCleanPrice : 1.1287376120086121
MMRCurrentRetailRatio : -0.9054728026388369
MMRCurrentRetailCleanPrice : -0.8389101323172613
Auction_OTHER : -0.7859982531009929
Make_JEEP : 0.7506545015937736
Make_CHEVROLET : 0.6429702499552219
MMRAcquisitionAuctionAveragePrice : 0.5923142485237657
MMRCurrentAuctionAveragePrice : -0.5790049081542791
MMRAcquisitionRetailAveragePrice : 0.531617299538697
Make_DODGE : 0.5304218713789429
VehYear_2009.0 : -0.5074318693354968
VehYear_2008.0 : 0.48389871704137
Make_CHRYSLER : 0.47008441291627184
Make_GMC : 0.428219962377493
Make_HYUNDAI : 0.34817368574598895
MMRCurrentAuctionCleanPrice : -0.33076641236558174


# 8. Get Log of each variables - not necessary

In [14]:
df = data_prep()

# separating the y_label -- prediction set
y_log = df['IsBadBuy']

#X -features are all column except y
X_log = df.drop(['IsBadBuy'], axis = 1)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41432 entries, 0 to 41475
Data columns (total 31 columns):
PurchaseID                           41432 non-null int64
PurchaseTimestamp                    41432 non-null int64
PurchaseDate                         41432 non-null object
Auction                              41432 non-null object
VehYear                              41432 non-null float64
Make                                 41432 non-null object
Color                                41432 non-null object
Transmission                         41432 non-null object
WheelTypeID                          41432 non-null object
WheelType                            41380 non-null object
VehOdo                               41432 non-null float64
Nationality                          41432 non-null object
Size                                 41432 non-null object
TopThreeAmericanName                 41432 non-null object
MMRAcquisitionAuctionAveragePrice    41416 non-null object
MMRAcq

In [15]:
# list columns to be transformed
columns_to_transform = ['MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice', 'MMRAcquisitionRetailAveragePrice',
                        'MMRAcquisitonRetailCleanPrice', 'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailAveragePrice','MMRCurrentRetailCleanPrice','VehBCost']

# transform the columns with np.log
for col in columns_to_transform:
    X_log[col] = X_log[col].apply(lambda x: x+1)
    X_log[col] = X_log[col].apply(np.log)

In [13]:
# create X, y and train test data partitions
X_mat_log = X_log.as_matrix()
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X_mat_log, y_log, test_size=0.3, stratify=y_log, 
                                                                    random_state=rs)

# standardise them again
scaler_log = StandardScaler()
X_train_log = scaler_log.fit_transform(X_train_log, y_train_log)
X_test_log = scaler_log.transform(X_test_log)

  


In [14]:
# grid search CV
params = {'C' : [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9], 'penalty': ['l1','l2']}
cv = GridSearchCV(param_grid=params, estimator=LogisticRegression(random_state=rs), cv=10, n_jobs=-1)
cv.fit(X_train_log, y_train_log)

# test the best model
print("Train accuracy:", cv.score(X_train_log, y_train_log))
print("Test accuracy:", cv.score(X_test_log, y_test_log))

y_pred = cv.predict(X_test_log)
print(classification_report(y_test_log, y_pred))

# print parameters of the best model
print(cv.best_params_)

Train accuracy: 0.8242577552922381
Test accuracy: 0.8207517862690277
              precision    recall  f1-score   support

           0       0.77      0.91      0.84      1609
           1       0.90      0.73      0.80      1610

   micro avg       0.82      0.82      0.82      3219
   macro avg       0.83      0.82      0.82      3219
weighted avg       0.83      0.82      0.82      3219

{'C': 1.4111, 'multi_class': 'ovr', 'penalty': 'l1', 'solver': 'liblinear'}
