# Pipeline to automate data preprocessing and model training

## Load data

In [13]:
import pandas as pd

path = '../../../data/default_credit_card/output/simplified_features_cat.csv'
df = pd.read_csv(path)
df

Unnamed: 0,Industry,Ethnicity,Gender,Age,CivilStatus,YearsEmployed,Income,Approved
0,Industrials,White,Male,30,Married,1.25,0.000000,1
1,Materials,Black,Female,58,Married,3.04,632.793678,1
...,...,...,...,...,...,...,...,...
687,ConsumerStaples,White,Male,17,Married,0.04,662.007321,0
688,Energy,Black,Male,35,Married,8.29,0.000000,0


## Feature selection

In [14]:
target = 'Approved'

y = df[target]
X = df.drop(columns=target)

## Train test split

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

## Pipeline

### Data preprocessing

In [16]:
features = X.dtypes

features_categorical = features[features == 'object'].index
features_categorical

Index(['Industry', 'Ethnicity', 'Gender', 'CivilStatus'], dtype='object')

In [17]:
features_numerical = features[features != 'object'].index
features_numerical

Index(['Age', 'YearsEmployed', 'Income'], dtype='object')

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), features_categorical),
        ('scaler', MinMaxScaler(), features_numerical)
    ])

### Modelling with grid search

In [19]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_leaf': [50, 100, 200, 500],
    'criterion': ['gini', 'entropy']
}

In [20]:
from sklearn.model_selection import GridSearchCV
model_grid = GridSearchCV(model, param_grid, cv=3, verbose=1)

### Pipeline to combine steps

In [21]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('modelling', model_grid)
])

pipeline.fit(X_train, y_train)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


In [22]:
pipeline.score(X_test, y_test)

0.6908212560386473

In [23]:
pipeline.score(X_train, y_train)

0.7593360995850622

In [25]:
pipeline.named_steps['modelling'].best_params_

{'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 50}

In [26]:
pipeline.score(X_test, y_test)

0.6908212560386473

In [27]:
pipeline.score(X_train, y_train)

0.7593360995850622

## Pipeline with another model

In [28]:
from sklearn.svm import SVC

model = SVC(probability=True)
model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [29]:
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

model_grid = GridSearchCV(model, param_grid, cv=3)
model_grid

In [30]:
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('modelling', model_grid)
])

pipeline.fit(X_train, y_train)

In [31]:
pipeline.score(X_test, y_test)

0.6956521739130435

In [32]:
pipeline.score(X_train, y_train)

0.7904564315352697

In [33]:
pipeline.steps[1][1].best_params_

{'C': 10, 'gamma': 0.1, 'kernel': 'linear'}