In [1]:
import pandas as pd
import glob
import os 

# Discover the data

In [2]:
# Collect every CSV file in the current working directory.
# glob returns matches in arbitrary order, so sort them to make path[0] deterministic.
path = sorted(glob.glob(os.path.join(os.getcwd(), "*.csv")))
if not path:
    # Fail with an explicit message instead of an IndexError on path[0].
    raise FileNotFoundError("No CSV file found in {}".format(os.getcwd()))
# The first CSV column is used as the row index.
data = pd.read_csv(path[0], index_col=0)
data.shape

(1000, 10)

In [3]:
# Summarise each column's non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 85.9+ KB


In [4]:
data.head(2)

# In this dataset, each row is an individual who took out credit at a bank.
# Each person is classified as a good or bad credit risk.

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad


In [5]:
# Column name -> dtype mapping of the loaded frame.
data.dtypes.to_dict()

{'Age': dtype('int64'),
 'Sex': dtype('O'),
 'Job': dtype('int64'),
 'Housing': dtype('O'),
 'Saving accounts': dtype('O'),
 'Checking account': dtype('O'),
 'Credit amount': dtype('int64'),
 'Duration': dtype('int64'),
 'Purpose': dtype('O'),
 'Risk': dtype('O')}

In [6]:
# Print the distinct values found in every column, in column order.
for column_name, column_values in data.items():
    print(column_values.unique())

[67 22 49 45 53 35 61 28 25 24 60 32 44 31 48 26 36 39 42 34 63 27 30 57
 33 37 58 23 29 52 50 46 51 41 40 66 47 56 54 20 21 38 70 65 74 68 43 55
 64 75 19 62 59]
['male' 'female']
[2 1 3 0]
['own' 'free' 'rent']
[nan 'little' 'quite rich' 'rich' 'moderate']
['little' 'moderate' nan 'rich']
[ 1169  5951  2096  7882  4870  9055  2835  6948  3059  5234  1295  4308
  1567  1199  1403  1282  2424  8072 12579  3430  2134  2647  2241  1804
  2069  1374   426   409  2415  6836  1913  4020  5866  1264  1474  4746
  6110  2100  1225   458  2333  1158  6204  6187  6143  1393  2299  1352
  7228  2073  5965  1262  3378  2225   783  6468  9566  1961  6229  1391
  1537  1953 14421  3181  5190  2171  1007  1819  2394  8133   730  1164
  5954  1977  1526  3965  4771  9436  3832  5943  1213  1568  1755  2315
  1412 12612  2249  1108   618  1409   797  3617  1318 15945  2012  2622
  2337  7057  1469  2323   932  1919  2445 11938  6458  6078  7721  1410
  1449   392  6260  7855  1680  3578  7174  2132  4

# Define a feature engineering pipeline

In [7]:
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import Pipeline
import numpy as np

# Features: every column except the target.
X = data.drop(columns=["Risk"])
# Target: encode the "Risk" labels as integers.
# map() returns a new Series, so `data` keeps its original labels and this cell
# gives the same result every time it is re-run.
y = data["Risk"].map({"good": 1, "bad": 0})
if y.isna().any():
    # map() turns any label missing from the dictionary into NaN; surface that early.
    raise ValueError("Unexpected value in 'Risk'; expected only 'good' or 'bad'")
# 1 = good and 0 = bad

In [8]:
def categorize_age(X):
    """Return a copy of ``X`` with an extra categorical ``Age_cat`` column.

    ``Age`` is binned into the right-closed intervals (18, 25], (25, 35],
    (35, 60] and (60, 120], labelled 'Baby', 'Young', 'Adult' and 'Senior'.
    Ages outside (18, 120] fall in no bin and get a missing ``Age_cat``.

    The input frame is left untouched: the new column is added with
    ``assign``, which builds a new DataFrame, so the caller's data is not
    modified as a side effect of running the pipeline.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a numeric ``Age`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with all original columns plus ``Age_cat``.
    """
    age_bins = (18, 25, 35, 60, 120)
    age_labels = ['Baby', 'Young', 'Adult', 'Senior']
    return X.assign(Age_cat=pd.cut(X.Age, age_bins, labels=age_labels))

def fill_na(X):
    """Return a new frame in which every NaN cell of ``X`` holds the string "unknown"."""
    placeholder = "unknown"
    filled = X.replace(to_replace=np.nan, value=placeholder)
    return filled

# Step 1: replace missing values with the "unknown" placeholder.
transfna = FunctionTransformer(fill_na)
# Step 2: add the categorical `Age_cat` column derived from `Age`.
age_categorizer = FunctionTransformer(categorize_age)
# One-hot encode the categorical variables; the first level of each variable is
# dropped because it is implied by the remaining indicator columns.
ohe = OneHotEncoder(drop='first')
# Rescale the numerical variables to 0-1 data.
mms = MinMaxScaler()
# Route columns by dtype. Selecting on np.number rather than int keeps any float
# column on the scaling branch instead of sending it to the one-hot encoder;
# for all-integer numeric columns the routing is the same as before.
ct = make_column_transformer(
    (ohe, make_column_selector(dtype_exclude=np.number)),
    (mms, make_column_selector(dtype_include=np.number)),
    remainder='passthrough',
)

# Order matters: missing values are filled and `Age_cat` is added before the
# column transformer decides, by dtype, how each column is processed.
steps = [
    ('manage_na', transfna),
    ('add_age_cat', age_categorizer),
    ('preprocessing', ct),
]

preprocessing_pipeline = Pipeline(steps=steps)

# Create a first model: random forest for binary classification (+ singular value decomposition)

In [9]:
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Fixed seed so the split, the SVD and the forest give the same result after a kernel restart.
RANDOM_STATE = 42

# Hold out 20% of the rows for testing; stratifying on y keeps the same
# proportion of each class in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE)

# Preprocessing -> dimensionality reduction -> random forest classifier.
pipe_rf = Pipeline([
    ("final_preprocessing", preprocessing_pipeline),
    ('svd', TruncatedSVD(n_components=2, random_state=RANDOM_STATE)),
    ('clf', RandomForestClassifier(n_estimators=10, random_state=RANDOM_STATE)),
])

In [10]:
# Fit the whole pipeline on the training split.
# to_numpy() yields the 1-D label array; Series.ravel() is deprecated in recent pandas.
pipe_rf.fit(X_train, y_train.to_numpy())

Pipeline(steps=[('final_preprocessing',
                 Pipeline(steps=[('manage_na',
                                  FunctionTransformer(func=<function fill_na at 0x7feb4def7ef0>)),
                                 ('add_age_cat',
                                  FunctionTransformer(func=<function categorize_age at 0x7feb4def7830>)),
                                 ('preprocessing',
                                  ColumnTransformer(remainder='passthrough',
                                                    transformers=[('onehotencoder',
                                                                   OneHotEncoder(drop='first'),
                                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7feb5341bc10>),
                                                                  ('minmaxscaler',
                                                                   MinMaxScaler(),
                                

In [11]:
from sklearn.model_selection import GridSearchCV

# Create a list of choices for each hyperparameter
param_range_svd = [2, 5, 10, 15, 20, 23]  # dimension of the data after dimensionality reduction
param_range_clf = [5, 10, 20]  # number of trees in RandomForest

# Create the grid of possible combinations.
# Keys follow the "<pipeline step name>__<parameter>" convention.
param_grid = [{'svd__n_components': param_range_svd,
               'clf__n_estimators': param_range_clf}]

# Prepare the grid search used for fine-tuning
gs = GridSearchCV(estimator=pipe_rf,  # the model
                  param_grid=param_grid,  # its hyperparameter possibilities
                  scoring='precision',  # precision is the evaluation metric
                  cv=5,  # we use 5 folds in cross-validation
                  n_jobs=-1,  # use every available core
                  return_train_score=True,
                  error_score='raise',  # stop on the first failing fit instead of scoring it as NaN
                  verbose=1)

# Test all the possible hyperparameters.
# to_numpy() yields the 1-D label array; Series.ravel() is deprecated in recent pandas.
gs = gs.fit(X_train, y_train.to_numpy())

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    3.8s finished


In [12]:
# Report the best mean cross-validated score and the parameters that achieved it.
print("The optimal precision is {}".format(gs.best_score_),
      "The optimal hyperparameters are {}".format(gs.best_params_),
      sep="\n")

The optimal precision is 0.7634004974232772
The optimal hyperparameters are {'clf__n_estimators': 10, 'svd__n_components': 23}


In [13]:
from sklearn.model_selection import cross_validate
# Re-evaluate the tuned random-forest pipeline with 5-fold cross-validation.
# NOTE(review): X and y contain the rows of X_train that were used to pick the
# hyperparameters, so these scores are not a clean hold-out estimate; consider
# scoring on X_test / y_test as well.
# to_numpy() yields the 1-D label array; Series.ravel() is deprecated in recent pandas.
cross_validate(estimator=gs.best_estimator_,
               X=X, y=y.to_numpy(),
               scoring="precision",
               cv=5,
               error_score='raise',
               return_train_score=True)

{'fit_time': array([0.07673407, 0.07135296, 0.07201171, 0.06937718, 0.06759501]),
 'score_time': array([0.01524806, 0.01568532, 0.01432824, 0.01445079, 0.01453805]),
 'test_score': array([0.77083333, 0.7755102 , 0.79591837, 0.75333333, 0.75675676]),
 'train_score': array([0.99107143, 0.99279279, 0.99284436, 0.99459459, 0.99641577])}

# Create a second model: PCA + logistic regression

In [14]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Preprocessing -> PCA -> logistic regression.
# The liblinear solver is used because it supports both the l1 and l2 penalties searched below.
pipe_lr = Pipeline(steps=[
    ('final_preprocessing', preprocessing_pipeline),
    ('PCA', PCA(n_components=2)),
    ('lr', LogisticRegression(penalty='l1', C=0.1, solver="liblinear"))
])

# Create a list of choices for each hyperparameter
# (15 replaces a duplicated 5, which made the search fit the same setting twice).
param_range_pca = [2, 5, 10, 15, 20]  # dimension of the data after dimensionality reduction
param_lr_penalty = ['l1', 'l2']  # norm used for regularisation
param_lr_C = [1, 0.5, 0.1]  # inverse of the regularisation strength

# Create the grid of possible combinations.
# Keys follow the "<pipeline step name>__<parameter>" convention.
param_grid = [{'PCA__n_components': param_range_pca,
               'lr__penalty': param_lr_penalty,
               'lr__C': param_lr_C}]

gs2 = GridSearchCV(estimator=pipe_lr,  # the model
                   param_grid=param_grid,  # its hyperparameter possibilities
                   scoring='precision',  # precision is the evaluation metric
                   cv=5,  # we use 5 folds in cross-validation
                   n_jobs=-1,  # use every available core
                   error_score='raise',  # stop on the first failing fit instead of scoring it as NaN
                   verbose=1)

# to_numpy() yields the 1-D label array; Series.ravel() is deprecated in recent pandas.
gs2.fit(X_train, y_train.to_numpy())

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    1.3s finished


GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('final_preprocessing',
                                        Pipeline(steps=[('manage_na',
                                                         FunctionTransformer(func=<function fill_na at 0x7feb4def7ef0>)),
                                                        ('add_age_cat',
                                                         FunctionTransformer(func=<function categorize_age at 0x7feb4def7830>)),
                                                        ('preprocessing',
                                                         ColumnTransformer(remainder='passthrough',
                                                                           transformers=[('onehotencoder',...
                                                                                          <sklearn.compose._column_transformer.make_column_selector object at 0x7feb5341bc10>),
                                            

In [15]:
# Report the best mean cross-validated score and the parameters that achieved it.
summary_lines = (
    "The optimal precision is {}".format(gs2.best_score_),
    "The optimal hyperparameters are {}".format(gs2.best_params_),
)
print("\n".join(summary_lines))

The optimal precision is 0.7526084601252071
The optimal hyperparameters are {'PCA__n_components': 20, 'lr__C': 1, 'lr__penalty': 'l1'}


In [16]:
# Re-evaluate the tuned logistic-regression pipeline with 5-fold cross-validation.
# to_numpy() yields the 1-D label array; Series.ravel() is deprecated in recent pandas.
cross_validate(estimator=gs2.best_estimator_,
               X=X, y=y.to_numpy(),
               scoring="precision",
               cv=5,
               error_score='raise',
               return_train_score=True)

{'fit_time': array([0.02446222, 0.02842307, 0.02781701, 0.02658105, 0.02709985]),
 'score_time': array([0.01032662, 0.01136494, 0.01302695, 0.01259184, 0.01306009]),
 'test_score': array([0.77777778, 0.74251497, 0.76608187, 0.77160494, 0.75925926]),
 'train_score': array([0.77897991, 0.7814871 , 0.76796407, 0.76911315, 0.76666667])}

# Save the model with joblib

In [17]:
from joblib import dump
# Persist the best logistic-regression pipeline found by the grid search.
# NOTE(review): the pipeline wraps the notebook-defined functions fill_na and
# categorize_age; presumably they must be defined again wherever the file is
# loaded - confirm before reusing it outside this notebook.
dump(value=gs2.best_estimator_, filename="pipeline_lr.joblib")

['pipeline_lr.joblib']

In [18]:
from joblib import load
# Read the persisted pipeline back from disk.
model = load(filename="pipeline_lr.joblib")

In [19]:
cols = ['Age', 'Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account',
       'Credit amount', 'Duration', 'Purpose']
# Build a one-row DataFrame from the first observation of X.
# Slicing with a list ([[0]]) returns a DataFrame directly and keeps every
# column's dtype, so no round-trip through a Python list is needed.
# NOTE: `data` is rebound here (it previously held the full dataset) because the
# next cell reads the sample under this name.
data = X.loc[:, cols].iloc[[0]].reset_index(drop=True)

In [20]:
# Show the reloaded model's prediction next to the true label.
(
    model.predict(data),  # predicted class for the sample row
    y.loc[0],             # label stored under index 0
)

(array([1]), 1)