In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from tools import data_prep

# 1. Get Data from data_prep function

In [3]:
# Load the prepared dataset through the project helper, then separate the
# 'IsBadBuy' target column from the remaining feature columns.
df = data_prep()
X = df.drop(columns=['IsBadBuy'])
y = df.loc[:, 'IsBadBuy']

  if self.run_code(code, result):


<class 'pandas.core.frame.DataFrame'>
Int64Index: 41432 entries, 0 to 41475
Data columns (total 31 columns):
PurchaseID                           41432 non-null int64
PurchaseTimestamp                    41432 non-null int64
PurchaseDate                         41432 non-null object
Auction                              41432 non-null object
VehYear                              41432 non-null float64
Make                                 41432 non-null object
Color                                41432 non-null object
Transmission                         41432 non-null object
WheelTypeID                          41432 non-null object
WheelType                            41380 non-null object
VehOdo                               41432 non-null float64
Nationality                          41432 non-null object
Size                                 41432 non-null object
TopThreeAmericanName                 41432 non-null object
MMRAcquisitionAuctionAveragePrice    41416 non-null object
MMRAcq

# 2. Split Data

In [4]:
# Random seed shared by the split and by every estimator in this notebook,
# so that results are reproducible between runs.
rs = 10

# Stratified 70/30 train/test split on the feature matrix.
# `.values` replaces `DataFrame.as_matrix()`, which is deprecated and removed
# in pandas 1.0 (likely the source of the warning shown under this cell);
# both give the frame's contents as a NumPy array.
X_mat = X.values
X_train, X_test, y_train, y_test = train_test_split(
    X_mat, y, test_size=0.3, stratify=y, random_state=rs
)

  from ipykernel import kernelapp as app


# 3. Scaling values

In [5]:
# Standardise the features.
# The scaler's mean and standard deviation are estimated from the training
# partition only; the test partition is then transformed with those same
# statistics, so nothing is ever learned from the held-out data.
scaler = StandardScaler()
scaler.fit(X_train, y_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# 4. Training a Logistic Regression Model

In [6]:
# Baseline logistic regression with default hyper-parameters, fitted on the
# standardised training partition (`fit` returns the estimator itself).
model = LogisticRegression(random_state=rs).fit(X_train, y_train)

# echo the fitted estimator so the cell still displays its settings
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=10, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# 5. Evaluate first Logistic Regression Model

In [7]:
# Accuracy on both partitions, followed by the per-class report for the
# held-out test partition.
y_pred = model.predict(X_test)

print("Train accuracy:", accuracy_score(y_train, model.predict(X_train)))
print("Test accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Train accuracy: 0.825455997869791
Test accuracy: 0.8232370301335818
             precision    recall  f1-score   support

          0       0.77      0.92      0.84      1609
          1       0.90      0.73      0.80      1610

avg / total       0.84      0.82      0.82      3219



# 6. Get the top 20 most important variables

In [10]:
# Rank the fitted model's coefficients by absolute magnitude (largest first)
# and show the 20 strongest, labelled with the column names of X.
coef = model.coef_[0]
feature_names = X.columns

# ascending argsort reversed -> descending order; drop the [:20] slice to
# list every feature instead
indices = np.argsort(np.abs(coef))[::-1][:20]

for name, weight in zip(feature_names[indices], coef[indices]):
    print(f"{name:<35}: {weight}")

Auction_ADESA                      : 2.06569972872485
VNST_AZ                            : 1.8048255117114467
MMRAcquisitionAuctionAveragePrice  : -1.3802951674635233
VNST_OK                            : 1.270164390498947
MMRAcquisitionRetailAveragePrice   : 1.2258851395157986
Auction_OTHER                      : -0.9037717245362028
Auction_MANHEIM                    : -0.8391635647189372
VNST_CO                            : -0.8096213013663035
MMRCurrentAuctionAveragePrice      : 0.68008763880614
VNST_PA                            : 0.669406061904637
VehYear_2008.0                     : -0.6623714131764916
MMRAcquisitionAuctionCleanPrice    : 0.5924495249311702
VNST_MO                            : 0.570462431964441
VNST_TX                            : -0.5348395061384371
VNST_FL                            : -0.48501297235417323
VNST_GA                            : -0.4633650440807044
VNST_VA                            : -0.4547188987170395
VNST_MS                            : 0.428465

# 7. Use GridSearchCV

In [11]:
# Hyper-parameter search over two solver families.
# `param_grid` must be a LIST of grids for both to be explored. The previous
# form ended the first dict with a trailing comma and left the second dict on
# its own line, so `params` was a 1-tuple holding only the liblinear grid and
# the lbfgs grid was evaluated as a bare expression and thrown away.
params = [
    {'penalty': ['l1', 'l2'], 'C': [1.4111], 'solver': ['liblinear'], 'multi_class': ['ovr']},
    {'penalty': ['l2'], 'C': [1.4111], 'solver': ['lbfgs'], 'multi_class': ['ovr', 'multinomial']},
]

# 10-fold cross-validated search; n_jobs=-1 uses every available core
cv = GridSearchCV(param_grid=params, estimator=LogisticRegression(random_state=rs), cv=10, n_jobs=-1)
cv.fit(X_train, y_train)

# evaluate the best configuration found by the search
print("Train accuracy:", cv.score(X_train, y_train))
print("Test accuracy:", cv.score(X_test, y_test))

y_pred = cv.predict(X_test)
print(classification_report(y_test, y_pred))

# print parameters of the best model
print(cv.best_params_)

Train accuracy: 0.8253228598056185
Test accuracy: 0.8235476856166511
             precision    recall  f1-score   support

          0       0.77      0.92      0.84      1609
          1       0.90      0.73      0.81      1610

avg / total       0.84      0.82      0.82      3219

{'C': 1.4111, 'multi_class': 'ovr', 'penalty': 'l2', 'solver': 'liblinear'}


# 8. Get the log of each variable

In [12]:
# Work on an independent copy of the prepared frame, then separate the
# 'IsBadBuy' target from the features exactly as was done for the raw data.
df_log = df.copy()
X_log = df_log.drop(columns=['IsBadBuy'])
y_log = df_log.loc[:, 'IsBadBuy']

In [13]:
# Columns to put on a log scale.
# NOTE(review): 'MMRAcquisitonRetailCleanPrice' is spelled without the second
# 'i'; presumably that matches the column name in the data - confirm before
# "correcting" it.
columns_to_transform = ['MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice', 'MMRAcquisitionRetailAveragePrice',
                        'MMRAcquisitonRetailCleanPrice', 'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailAveragePrice','MMRCurrentRetailCleanPrice','VehBCost']

# Apply log(1 + x) to each listed column with a single vectorised call per
# column instead of a per-element lambda followed by np.log.
# The input is read from the untouched `df_log` rather than from `X_log`
# itself, so re-running this cell does not apply the transform twice.
for col in columns_to_transform:
    X_log[col] = np.log1p(df_log[col])

In [14]:
# Stratified 70/30 train/test split of the log-transformed features, using
# the same seed as the earlier split.
# `.values` replaces `DataFrame.as_matrix()`, which is deprecated and removed
# in pandas 1.0; both give the frame's contents as a NumPy array.
X_mat_log = X_log.values
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(
    X_mat_log, y_log, test_size=0.3, stratify=y_log, random_state=rs
)

# Standardise again: statistics are learned from the training partition only
# and then reused to transform the test partition.
scaler_log = StandardScaler()
X_train_log = scaler_log.fit_transform(X_train_log, y_train_log)
X_test_log = scaler_log.transform(X_test_log)

  


In [15]:
# Hyper-parameter search on the log-transformed data over two solver families.
# `param_grid` must be a LIST of grids for both to be explored. The previous
# form ended the first dict with a trailing comma and left the second dict on
# its own line, so `params` was a 1-tuple holding only the liblinear grid and
# the lbfgs grid was evaluated as a bare expression and thrown away.
params = [
    {'penalty': ['l1', 'l2'], 'C': [1.4111], 'solver': ['liblinear'], 'multi_class': ['ovr']},
    {'penalty': ['l2'], 'C': [1.4111], 'solver': ['lbfgs'], 'multi_class': ['ovr', 'multinomial']},
]

# 10-fold cross-validated search; n_jobs=-1 uses every available core
cv = GridSearchCV(param_grid=params, estimator=LogisticRegression(random_state=rs), cv=10, n_jobs=-1)
cv.fit(X_train_log, y_train_log)

# evaluate the best configuration found by the search
print("Train accuracy:", cv.score(X_train_log, y_train_log))
print("Test accuracy:", cv.score(X_test_log, y_test_log))

y_pred = cv.predict(X_test_log)
print(classification_report(y_test_log, y_pred))

# print parameters of the best model
print(cv.best_params_)

Train accuracy: 0.8250565836772733
Test accuracy: 0.8213730972351662
             precision    recall  f1-score   support

          0       0.77      0.91      0.84      1609
          1       0.89      0.73      0.80      1610

avg / total       0.83      0.82      0.82      3219

{'C': 1.4111, 'multi_class': 'ovr', 'penalty': 'l1', 'solver': 'liblinear'}


# 9. Use RFE

In [16]:
from sklearn.feature_selection import RFECV

# Recursive feature elimination with 10-fold cross-validation, driven by a
# default logistic regression and run on the standardised training partition.
rfe = RFECV(estimator=LogisticRegression(random_state=rs), cv=10).fit(X_train, y_train)

# feature count before vs. after elimination
print("Original feature set", X_train.shape[1])
print("Number of features after elimination", rfe.n_features_)

Original feature set 126
Number of features after elimination 50


In [None]:
# Keep only the columns retained by the fitted RFECV selector, applying the
# same selection to both partitions.
X_train_sel, X_test_sel = (rfe.transform(part) for part in (X_train, X_test))

In [None]:
# Re-tune C on the RFECV-selected features to check whether the reduced
# input set improves model performance.
# candidate C values: 10**-6, 10**-5, ..., 10**3
params = {'C': [10 ** exponent for exponent in range(-6, 4)]}

cv = GridSearchCV(estimator=LogisticRegression(random_state=rs), param_grid=params, cv=10, n_jobs=-1)
cv.fit(X_train_sel, y_train)

# accuracy on both partitions, then the per-class report on the test partition
y_pred = cv.predict(X_test_sel)
print("Train accuracy:", cv.score(X_train_sel, y_train))
print("Test accuracy:", cv.score(X_test_sel, y_test))
print(classification_report(y_test, y_pred))

# winning hyper-parameters
print(cv.best_params_)

In [None]:
# RFECV combined with the log-transformed dataset.
# Note that this cell rebinds both `rfe` and `cv`, replacing the selector and
# search fitted in the earlier cells.
rfe = RFECV(estimator=LogisticRegression(random_state=rs), cv=10).fit(X_train_log, y_train_log)

# feature count before vs. after elimination
print("Original feature set", X_train_log.shape[1])
print("Number of features after elimination", rfe.n_features_)

# keep only the retained columns in both log-transformed partitions
X_train_sel_log, X_test_sel_log = (rfe.transform(part) for part in (X_train_log, X_test_log))

# tune C (10**-6 ... 10**3) on the reduced, log-transformed training data
params = {'C': [10 ** exponent for exponent in range(-6, 4)]}
cv = GridSearchCV(estimator=LogisticRegression(random_state=rs), param_grid=params, cv=10, n_jobs=-1)
cv.fit(X_train_sel_log, y_train_log)

# accuracy on both partitions, then the per-class report on the test partition
y_pred_log = cv.predict(X_test_sel_log)
print("Train accuracy:", cv.score(X_train_sel_log, y_train_log))
print("Test accuracy:", cv.score(X_test_sel_log, y_test_log))
print(classification_report(y_test_log, y_pred_log))

# winning hyper-parameters
print(cv.best_params_)

In [None]:
# Rank coefficients by absolute magnitude (largest first) and show the top 20,
# labelled with the column names of X.
# NOTE(review): this reads `model`, i.e. the baseline estimator fitted near the
# top of the notebook, not the tuned `cv` search or the `rfe` selection from
# the cells just above - confirm whether the final model was meant here.
coef = model.coef_[0]
feature_names = X.columns

# ascending argsort reversed -> descending order; drop the [:20] slice to
# list every feature instead
indices = np.argsort(np.abs(coef))[::-1][:20]

for name, weight in zip(feature_names[indices], coef[indices]):
    print(name, ':', weight)