# Dummy variables

## Load data

In [1]:
import pandas as pd

# Relative location of the CSV file that the rest of the notebook works on.
path = '../../../data/default_credit_card/output/simplified_features_cat.csv'

df = pd.read_csv(filepath_or_buffer=path)
df

Unnamed: 0,Industry,Ethnicity,Gender,Age,CivilStatus,YearsEmployed,Income,Approved
0,Industrials,White,Male,30,Married,1.25,0,1
1,Materials,Black,Female,58,Married,3.04,560,1
...,...,...,...,...,...,...,...,...
688,ConsumerStaples,White,Male,17,Married,0.04,750,0
689,Energy,Black,Male,35,Married,8.29,0,0


## Feature selection

In [2]:
# Name of the column the model has to predict.
target = 'Approved'

# Every column except the target is a feature; the target column alone is the label.
X = df.drop(columns=target)
y = df[target]

## Train test split

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Pipeline

### One hot encoding (dummy variables)

In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [5]:
# dtype of every column, indexed by column name.
features = df.dtypes

# Columns stored with the 'object' dtype are the ones treated as categorical.
object_mask = features == 'object'
features_categorical = features.loc[object_mask].index
features_categorical

Index(['Industry', 'Ethnicity', 'Gender', 'CivilStatus'], dtype='object')

In [6]:
# One-hot encode the categorical columns and pass every other column through
# unchanged.
# handle_unknown='ignore': a category that never appeared in the data the
# encoder was fitted on is encoded as all zeros instead of raising at
# transform time, so scoring on held-out rows cannot fail because of a rare
# category.
transformer = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), features_categorical)
], remainder='passthrough')

### Model

In [7]:
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fitted tree (and therefore the grid-search
# result) repeatable across runs: scikit-learn permutes the candidate features
# randomly at each split, so equally good splits can otherwise be resolved
# differently from one run to the next.
model = DecisionTreeClassifier(random_state=42)

# Hyper-parameter values explored for the tree.
# NOTE(review): the displayed data has 690 rows, so a 70% training split is
# roughly 483 rows. min_samples_leaf values of 200 and 500 may leave the tree
# unable to split at all on that much data -- consider smaller values.
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_leaf': [50, 100, 200, 500],
    'criterion': ['gini', 'entropy']
}

In [8]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over param_grid, scoring each candidate with 3-fold
# cross-validation; verbose=1 prints the number of fits being run.
model_grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    verbose=1,
)

### All together

In [9]:
from sklearn.pipeline import Pipeline

# Chain the column transformer and the grid-searched model so that a single
# fit/score call runs both stages in order.
pipeline = Pipeline([
    ('preprocessing', transformer),
    ('modelling', model_grid),
])

In [10]:
# Fit the preprocessing step and run the grid search on the training split.
pipeline.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


In [11]:
# Score of the fitted pipeline on the held-out test split.
pipeline.score(X=X_test, y=y_test)

0.7053140096618358

In [12]:
# Score on the training split, for comparison with the test score above.
pipeline.score(X=X_train, y=y_train)

0.7494824016563147