In [32]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import feature_selection as fs
from sklearn import linear_model
import sklearn.metrics as sklm
import scipy.stats as ss
import pickle

In [33]:
df = pd.read_csv('Class_BikeBuyer.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16404 entries, 0 to 16403
Data columns (total 11 columns):
CustomerID           16404 non-null int64
CountryRegionName    16404 non-null object
Education            16404 non-null object
Occupation           16404 non-null object
Gender               16404 non-null object
MaritalStatus        16404 non-null object
NumberCarsOwned      16404 non-null int64
TotalChildren        16404 non-null int64
BikeBuyer            16404 non-null int64
AgeRange             16404 non-null object
log_YearlyIncome     16404 non-null float64
dtypes: float64(1), int64(4), object(6)
memory usage: 1.4+ MB


In [34]:
labels = np.array(df['BikeBuyer'])

In [35]:
def encode_string(cat_features):
    ## First encode the strings to numeric categories
    enc = preprocessing.LabelEncoder()
    enc.fit(cat_features)
    enc_cat_features = enc.transform(cat_features)
    ## Now, apply one hot encoding
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_cat_features.reshape(-1,1))
    return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()

In [36]:
categorical_columns = ['Education', 'Occupation', 'Gender', 'MaritalStatus', 'AgeRange']
Features = encode_string(df['CountryRegionName'])
for col in categorical_columns:
    temp = encode_string(df[col])
    Features = np.concatenate([Features, temp], axis = 1)

print(Features.shape)
print(Features[:2, :])

(16404, 24)
[[1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0.]]


In [37]:
Features = np.concatenate([Features, np.array(df[['NumberCarsOwned', 
                            'TotalChildren', 'log_YearlyIncome']])], axis = 1)
print(Features.shape)
print(Features[:2, :]) 

(16404, 27)
[[ 1.          0.          0.          0.          0.          0.
   1.          0.          0.          0.          0.          0.
   0.          0.          1.          0.          0.          1.
   1.          0.          1.          0.          0.          0.
   0.          2.         11.83462483]
 [ 1.          0.          0.          0.          0.          0.
   1.          0.          0.          0.          0.          0.
   0.          0.          1.          0.          0.          1.
   0.          1.          1.          0.          0.          0.
   1.          3.         11.52427086]]


In [38]:
indx = range(Features.shape[0])
indx = ms.train_test_split(indx, test_size = int(0.2*df.shape[0]))
print(int(0.2*df.shape[0]))

3280


In [39]:
X_train = Features[indx[0],:]
y_train = np.ravel(labels[indx[0]])
X_test = Features[indx[1],:]
y_test = np.ravel(labels[indx[1]])

In [40]:
scaler = preprocessing.StandardScaler().fit(X_train[:,25:])
X_train[:,25:] = scaler.transform(X_train[:,25:])
X_test[:,25:] = scaler.transform(X_test[:,25:])
X_train[:2,]

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        -0.59156479, -0.34454427],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        -1.18630175, -0.36126677]])

In [41]:
logistic_mod = linear_model.LogisticRegression() 
logistic_mod.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [42]:
probabilities = logistic_mod.predict_proba(X_test)
print(probabilities[:15,:])

[[0.82614232 0.17385768]
 [0.69827008 0.30172992]
 [0.50150094 0.49849906]
 [0.21193585 0.78806415]
 [0.82081826 0.17918174]
 [0.97602285 0.02397715]
 [0.78776119 0.21223881]
 [0.26786569 0.73213431]
 [0.93869988 0.06130012]
 [0.97239176 0.02760824]
 [0.89678132 0.10321868]
 [0.79707396 0.20292604]
 [0.88067048 0.11932952]
 [0.54902493 0.45097507]
 [0.63847287 0.36152713]]


In [43]:
def score_model(probs, threshold):
    return np.array([1 if x > threshold else 0 for x in probs[:,1]])
scores = score_model(probabilities, 0.5)
print(np.array(scores[:15]))
print(y_test[:15])

[0 0 0 1 0 0 0 1 0 0 0 0 0 0 0]
[0 0 0 1 1 0 1 1 0 0 0 0 1 1 0]


In [44]:
def print_metrics(labels, scores):
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion matrix')
    print('                 Score positive    Score negative')
    print('Actual positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print('Accuracy  %0.2f' % sklm.accuracy_score(labels, scores))
    print(' ')
    print('           Positive      Negative')
    print('Num case   %6d'   % metrics[3][0] + '        %6d'   % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])
print_metrics(y_test, scores) 

                 Confusion matrix
                 Score positive    Score negative
Actual positive      1890               278
Actual negative       524               588

Accuracy  0.76
 
           Positive      Negative
Num case     2168          1112
Precision    0.78          0.68
Recall       0.87          0.53
F1           0.82          0.59


In [45]:
pickle.dump(logistic_mod, open('classification_model.sav', 'wb'))