In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from tools import data_prep

# 1. Get Data from data_prep function

In [4]:
X, y = data_prep()

  if (yield from self.run_code(code, result)):


<class 'pandas.core.frame.DataFrame'>
Int64Index: 41432 entries, 0 to 41475
Data columns (total 31 columns):
PurchaseID                           41432 non-null int64
PurchaseTimestamp                    41432 non-null int64
PurchaseDate                         41432 non-null object
Auction                              41432 non-null object
VehYear                              41432 non-null float64
Make                                 41432 non-null object
Color                                41432 non-null object
Transmission                         41432 non-null object
WheelTypeID                          41432 non-null object
WheelType                            41380 non-null object
VehOdo                               41432 non-null float64
Nationality                          41432 non-null object
Size                                 41432 non-null object
TopThreeAmericanName                 41432 non-null object
MMRAcquisitionAuctionAveragePrice    41416 non-null object
MMRAcq

# 2. Split Data

In [5]:
#from sklearn.preprocessing import LabelEncoder
#labelEncoder = LabelEncoder()

#categorical_feature_mask = df_final.dtypes==object
#categorical_cols = df_final.columns[categorical_feature_mask].tolist()
#df_final[categorical_cols] = df_final[categorical_cols].apply(lambda col: labelEncoder.fit_transform(col))
#df_final.values[:,7] = labelEncoder.fit_transform(df_final.values[:,7])

#df_final

# set the random seed - consistent
rs = 10

# train test split
X_mat = X.as_matrix()
X_train, X_test, y_train, y_test = train_test_split(X_mat, y, test_size=0.3, stratify=y, random_state=rs)

  from ipykernel import kernelapp as app


# 3. Scalering value 

In [6]:
# initialise a standard scaler object
scaler = StandardScaler()

# visualise min, max, mean and standard dev of data before scaling
print("Before scaling\n-------------")
for i in range(5):
    col = X_train[:,i]
    print("Variable #{}: min {}, max {}, mean {:.2f} and std dev {:.2f}".
          format(i, min(col), max(col), np.mean(col), np.std(col)))

# learn the mean and std.dev of variables from training data
# then use the learned values to transform training data
X_train = scaler.fit_transform(X_train, y_train)

print("After scaling\n-------------")
for i in range(5):
    col = X_train[:,i]
    print("Variable #{}: min {}, max {}, mean {:.2f} and std dev {:.2f}".
          format(i, min(col), max(col), np.mean(col), np.std(col)))

# use the statistic that you learned from training to transform test data
# NEVER learn from test data, this is supposed to be a set of dataset
# that the model has never seen before
X_test = scaler.transform(X_test)

Before scaling
-------------
Variable #0: min 0.0, max 1.0, mean 0.04 and std dev 0.19
Variable #1: min 577.0, max 480444.0, mean 72384.47 and std dev 14998.09
Variable #2: min 0.0, max 33543.0, mean 5946.68 and std dev 2568.24
Variable #3: min 0.0, max 36701.0, mean 7178.38 and std dev 2844.14
Variable #4: min 0.0, max 36726.0, mean 8228.66 and std dev 3252.30
After scaling
-------------
Variable #0: min -0.1949472296104162, max 5.129593285312624, mean 0.00 and std dev 1.00
Variable #1: min -4.7877757075676, max 27.207441862755417, mean 0.00 and std dev 1.00
Variable #2: min -2.315464923295065, max 10.745211522747928, mean -0.00 and std dev 1.00
Variable #3: min -2.52391795848795, max 10.38015377969556, mean 0.00 and std dev 1.00
Variable #4: min -2.530107339282775, max 8.762218203273306, mean -0.00 and std dev 1.00


# 4. Training a Logistic Regression Model

In [7]:
model = LogisticRegression(random_state=rs)

# fit it to training data
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=10, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

# 5. Evaluate first Logistic Regression Model

In [8]:
# training and test accuracy
print("Train accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))

# classification report on test data
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


Train accuracy: 0.825455997869791
Test accuracy: 0.8232370301335818
              precision    recall  f1-score   support

           0       0.77      0.92      0.84      1609
           1       0.90      0.73      0.80      1610

   micro avg       0.82      0.82      0.82      3219
   macro avg       0.84      0.82      0.82      3219
weighted avg       0.84      0.82      0.82      3219



# 6. Get the top 20 most important variables

In [9]:
# grab feature importances from the model and feature name from the original X
coef = model.coef_[0]
feature_names = X.columns

# sort them out in descending order
indices = np.argsort(np.absolute(coef))
indices = np.flip(indices, axis=0)

# limit to 20 features, you can leave this out to print out everything
indices = indices[:20]

for i in indices:
    print(feature_names[i], ':', coef[i])

Auction_ADESA : 2.065851504575421
VNST_AZ : 1.805182869101257
MMRAcquisitionAuctionAveragePrice : -1.37992404991772
VNST_OK : 1.270017379068819
MMRAcquisitionRetailAveragePrice : 1.2257218833454477
Auction_OTHER : -0.9038357728845241
Auction_MANHEIM : -0.8392271867435732
VNST_CO : -0.8096564618373743
MMRCurrentAuctionAveragePrice : 0.679323283592787
VNST_PA : 0.6694695437702735
VehYear_2008.0 : -0.6623593399693878
MMRAcquisitionAuctionCleanPrice : 0.5918379921767993
VNST_MO : 0.5705180952109038
VNST_TX : -0.5348740698509492
VNST_FL : -0.4850539207657993
VNST_GA : -0.4633858608024842
VNST_VA : -0.45474847890695513
VNST_MS : 0.42848951730205165
MMRCurrentRetailCleanPrice : -0.4200591154538043
VNST_NJ : 0.37945108997382265


# 7. Use GridSearchCV

In [56]:
# grid search CV
#params = {'C': [pow(10, x) for x in range(-6, 4)]}
#params = {'C' : [1.4111, 1.4112, 1.4113, 1.4114], 'penalty': ['l1','l2'],
          #'dual':[False], 'multi_class':['ovr'], 'solver':[ 'liblinear', 'sag', 'saga']},
params = {'penalty':['l1','l2'],'C':[1.4111],'solver':['liblinear'],'multi_class':['ovr']},
{'penalty':['l2'],'C':[1.4111],'solver':['lbfgs'],'multi_class':['ovr','multinomial']}

# use all cores to tune logistic regression with C parameter
cv = GridSearchCV(param_grid=params, estimator=LogisticRegression(random_state=rs), cv=10, n_jobs=-1)
cv.fit(X_train, y_train)

# test the best model
print("Train accuracy:", cv.score(X_train, y_train))
print("Test accuracy:", cv.score(X_test, y_test))

y_pred = cv.predict(X_test)
print(classification_report(y_test, y_pred))

# print parameters of the best model
print(cv.best_params_)

Train accuracy: 0.8253228598056185
Test accuracy: 0.8235476856166511
              precision    recall  f1-score   support

           0       0.77      0.92      0.84      1609
           1       0.90      0.73      0.81      1610

   micro avg       0.82      0.82      0.82      3219
   macro avg       0.84      0.82      0.82      3219
weighted avg       0.84      0.82      0.82      3219

{'C': 1.4111, 'multi_class': 'ovr', 'penalty': 'l2', 'solver': 'liblinear'}


# 8. Get Log of each variables

In [9]:
X_log, y_log = data_prep()

  if (yield from self.run_code(code, result)):


<class 'pandas.core.frame.DataFrame'>
Int64Index: 41432 entries, 0 to 41475
Data columns (total 31 columns):
PurchaseID                           41432 non-null int64
PurchaseTimestamp                    41432 non-null int64
PurchaseDate                         41432 non-null object
Auction                              41432 non-null object
VehYear                              41432 non-null float64
Make                                 41432 non-null object
Color                                41432 non-null object
Transmission                         41432 non-null object
WheelTypeID                          41432 non-null object
WheelType                            41380 non-null object
VehOdo                               41432 non-null float64
Nationality                          41432 non-null object
Size                                 41432 non-null object
TopThreeAmericanName                 41432 non-null object
MMRAcquisitionAuctionAveragePrice    41416 non-null object
MMRAcq

In [13]:
# transform the columns with np.log
X_log.applymap(np.log)

  return lib.map_infer(x.astype(object).values, func)


Unnamed: 0,Transmission,VehOdo,MMRAcquisitionAuctionAveragePrice,MMRAcquisitionAuctionCleanPrice,MMRAcquisitionRetailAveragePrice,MMRAcquisitonRetailCleanPrice,MMRCurrentAuctionAveragePrice,MMRCurrentAuctionCleanPrice,MMRCurrentRetailAveragePrice,MMRCurrentRetailCleanPrice,...,VNST_OK,VNST_OR,VNST_PA,VNST_SC,VNST_TN,VNST_TX,VNST_UT,VNST_VA,VNST_WA,VNST_WV
0,-inf,10.841520,9.055556,9.140454,9.185125,9.265870,8.959440,9.053102,9.373904,9.433884,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
1,-inf,10.790185,9.055556,9.140454,9.185125,9.265870,9.055790,9.140454,9.185330,9.265870,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
2,-inf,10.743286,9.086476,9.151439,9.214532,9.276315,9.004177,9.074864,9.136479,9.203517,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
3,-inf,10.828004,8.876963,8.958025,9.016513,9.092907,8.864181,8.939712,9.004545,9.075551,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
4,-inf,10.823750,9.055556,9.140454,9.185125,9.265870,8.969160,9.072342,9.103423,9.201098,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
5,-inf,13.082466,9.086476,9.151439,9.214532,9.276315,8.988321,9.051345,9.121509,9.181118,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
6,-inf,10.787937,9.086476,9.151439,9.214532,9.276315,8.988321,9.051345,9.121509,9.181118,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
7,-inf,10.840796,9.101306,9.201905,9.228573,9.324472,8.959697,9.037652,9.387482,9.478381,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
8,-inf,10.999179,9.189423,9.284427,9.312536,9.403519,9.132919,9.264544,9.461644,9.556409,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
9,-inf,10.810899,9.086476,9.151439,9.214532,9.276315,8.988321,9.051345,9.121509,9.181118,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf


In [14]:
# create X, y and train test data partitions
X_mat_log = X_log.as_matrix()
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X_mat_log, y_log, test_size=0.3, stratify=y_log, 
                                                                    random_state=rs)

# standardise them again
scaler_log = StandardScaler()
X_train_log = scaler_log.fit_transform(X_train_log, y_train_log)
X_test_log = scaler_log.transform(X_test_log)

  


In [20]:
# grid search CV
params = {'C': [pow(10, x) for x in range(-6, 4)]}
#params = {'C': [1]}

cv = GridSearchCV(param_grid=params, estimator=LogisticRegression(random_state=rs), cv=10, n_jobs=-1)
cv.fit(X_train_log, y_train_log)

# test the best model
print("Train accuracy:", cv.score(X_train_log, y_train_log))
print("Test accuracy:", cv.score(X_test_log, y_test_log))

y_pred = cv.predict(X_test_log)
print(classification_report(y_test_log, y_pred))

# print parameters of the best model
print(cv.best_params_)



Train accuracy: 0.825455997869791
Test accuracy: 0.8232370301335818
              precision    recall  f1-score   support

           0       0.77      0.92      0.84      1609
           1       0.90      0.73      0.80      1610

   micro avg       0.82      0.82      0.82      3219
   macro avg       0.84      0.82      0.82      3219
weighted avg       0.84      0.82      0.82      3219

{'C': 1}


# 9. Use RFE

In [21]:
from sklearn.feature_selection import RFECV

rfe = RFECV(estimator = LogisticRegression(random_state=rs), cv=10)
rfe.fit(X_train, y_train) # run the RFECV

# comparing how many variables before and after
print("Original feature set", X_train.shape[1])
print("Number of features after elimination", rfe.n_features_)





































































Original feature set 126
Number of features after elimination 50


In [22]:
X_train_sel = rfe.transform(X_train)
X_test_sel = rfe.transform(X_test)

In [23]:
# grid search CV
params = {'C': [pow(10, x) for x in range(-6, 4)]}

cv = GridSearchCV(param_grid=params, estimator=LogisticRegression(random_state=rs), cv=10, n_jobs=-1)
cv.fit(X_train_sel, y_train)

# test the best model
print("Train accuracy:", cv.score(X_train_sel, y_train))
print("Test accuracy:", cv.score(X_test_sel, y_test))

y_pred = cv.predict(X_test_sel)
print(classification_report(y_test, y_pred))

# print parameters of the best model
print(cv.best_params_)



Train accuracy: 0.8241246172280655
Test accuracy: 0.8235476856166511
              precision    recall  f1-score   support

           0       0.77      0.92      0.84      1609
           1       0.90      0.73      0.81      1610

   micro avg       0.82      0.82      0.82      3219
   macro avg       0.84      0.82      0.82      3219
weighted avg       0.84      0.82      0.82      3219

{'C': 1000}


In [24]:
# running RFE + log transformation
rfe = RFECV(estimator = LogisticRegression(random_state=rs), cv=10)
rfe.fit(X_train_log, y_train_log) # run the RFECV on log transformed dataset

# comparing how many variables before and after
print("Original feature set", X_train_log.shape[1])
print("Number of features after elimination", rfe.n_features_)

# select features from log transformed dataset
X_train_sel_log = rfe.transform(X_train_log)
X_test_sel_log = rfe.transform(X_test_log)

# init grid search CV on transformed dataset
params = {'C': [pow(10, x) for x in range(-6, 4)]}
cv = GridSearchCV(param_grid=params, estimator=LogisticRegression(random_state=rs), cv=10, n_jobs=-1)
cv.fit(X_train_sel_log, y_train_log)

# test the best model
print("Train accuracy:", cv.score(X_train_sel_log, y_train_log))
print("Test accuracy:", cv.score(X_test_sel_log, y_test_log))

y_pred_log = cv.predict(X_test_sel_log)
print(classification_report(y_test_log, y_pred_log))

# print parameters of the best model
print(cv.best_params_)





































































Original feature set 126
Number of features after elimination 50




Train accuracy: 0.8241246172280655
Test accuracy: 0.8235476856166511
              precision    recall  f1-score   support

           0       0.77      0.92      0.84      1609
           1       0.90      0.73      0.81      1610

   micro avg       0.82      0.82      0.82      3219
   macro avg       0.84      0.82      0.82      3219
weighted avg       0.84      0.82      0.82      3219

{'C': 1000}


In [25]:
# grab feature importances from the model and feature name from the original X
coef = model.coef_[0]
feature_names = X.columns

# sort them out in descending order
indices = np.argsort(np.absolute(coef))
indices = np.flip(indices, axis=0)

# limit to 20 features, you can leave this out to print out everything
indices = indices[:20]

for i in indices:
    print(feature_names[i], ':', coef[i])

Auction_ADESA : 2.065851504575421
VNST_AZ : 1.805182869101257
MMRAcquisitionAuctionAveragePrice : -1.37992404991772
VNST_OK : 1.270017379068819
MMRAcquisitionRetailAveragePrice : 1.2257218833454477
Auction_OTHER : -0.9038357728845241
Auction_MANHEIM : -0.8392271867435732
VNST_CO : -0.8096564618373743
MMRCurrentAuctionAveragePrice : 0.679323283592787
VNST_PA : 0.6694695437702735
VehYear_2008.0 : -0.6623593399693878
MMRAcquisitionAuctionCleanPrice : 0.5918379921767993
VNST_MO : 0.5705180952109038
VNST_TX : -0.5348740698509492
VNST_FL : -0.4850539207657993
VNST_GA : -0.4633858608024842
VNST_VA : -0.45474847890695513
VNST_MS : 0.42848951730205165
MMRCurrentRetailCleanPrice : -0.4200591154538043
VNST_NJ : 0.37945108997382265
