In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

data = pd.read_csv(r"C:\Users\Marvin\Documents\WBS\Data-Science-Bootcamp\7_Supervised ML\Data\housing_iteration_3_classification\housing_iteration_3_classification.csv")

In [3]:
data

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,MSZoning,Condition1,Heating,Street,CentralAir,Foundation
0,8450,65.0,856,3,0,0,2,0,0,0,RL,Norm,GasA,Pave,Y,PConc
1,9600,80.0,1262,3,1,0,2,298,0,0,RL,Feedr,GasA,Pave,Y,CBlock
2,11250,68.0,920,3,1,0,2,0,0,0,RL,Norm,GasA,Pave,Y,PConc
3,9550,60.0,756,3,1,0,3,0,0,0,RL,Norm,GasA,Pave,Y,BrkTil
4,14260,84.0,1145,4,1,0,3,192,0,0,RL,Norm,GasA,Pave,Y,PConc
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0,0,RL,Norm,GasA,Pave,Y,PConc
1456,13175,85.0,1542,3,2,0,2,349,0,0,RL,Norm,GasA,Pave,Y,CBlock
1457,9042,66.0,1152,4,2,0,1,0,0,1,RL,Norm,GasA,Pave,Y,Stone
1458,9717,68.0,1078,2,0,0,1,366,0,0,RL,Norm,GasA,Pave,Y,CBlock


In [4]:
# X and y creation
X = data
y = X.pop("Expensive")

# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

### Creating the "numeric pipe" and the "categoric pipe"

In [49]:
# select categorical and numerical column names
X_cat_columns = X.select_dtypes(exclude="number").columns
X_num_columns = X.select_dtypes(include="number").columns

# create numerical pipeline, only with the SimpleImputer(strategy="mean")
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"))

 # create categorical pipeline, with the SimpleImputer(fill_value="N_A") and the OneHotEncoder
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(sparse_output=False,handle_unknown='infrequent_if_exist',min_frequency=0.03))

In [28]:
from sklearn.compose import make_column_transformer

preprocessor = make_column_transformer(
    (numeric_pipe, X_num_columns),
    (categoric_pipe, X_cat_columns),
)

In [51]:
full_pipeline = make_pipeline(preprocessor,
                              DecisionTreeClassifier(max_depth=10,
                                                     min_samples_leaf=4,
                                                     min_samples_split=38,
                                                     criterion="entropy",
                                                     ))

{'columntransformer__pipeline-1__simpleimputer__strategy': 'mean',
 'decisiontreeclassifier__criterion': 'entropy',
 'decisiontreeclassifier__max_depth': 10,
 'decisiontreeclassifier__min_samples_leaf': 4,
 'decisiontreeclassifier__min_samples_split': 38}

In [45]:
full_pipeline.fit(X_train, y_train)

In [46]:
full_pipeline.predict(X_train)

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [31]:
from sklearn.model_selection import GridSearchCV

# define parameter grid
param_grid = {
    "decisiontreeclassifier__max_depth": range(2, 5, 2),
    "decisiontreeclassifier__min_samples_leaf": range(3, 10, 2)
}

# define GridSearchCV
search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)

scores = {"dtree" : search.best_score_}

scores

Fitting 5 folds for each of 30 candidates, totalling 150 fits


{'dtree': 0.9186786985070248}

In [48]:
# define parameter grid
param_grid = {
    "columntransformer__pipeline-1__simpleimputer__strategy":["mean", "median"],
    "decisiontreeclassifier__max_depth": range(2, 12, 1),
    "decisiontreeclassifier__min_samples_leaf": range(2, 10, 1),
    'decisiontreeclassifier__min_samples_split': range(3, 40, 5),
    'decisiontreeclassifier__criterion':['gini', 'entropy']
}

# define GridSearchCV
search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      scoring='accuracy',
                      verbose=1)

search.fit(X_train, y_train)

best_param = search.best_params_

best_param

Fitting 5 folds for each of 2560 candidates, totalling 12800 fits


{'columntransformer__pipeline-1__simpleimputer__strategy': 'mean',
 'decisiontreeclassifier__criterion': 'entropy',
 'decisiontreeclassifier__max_depth': 10,
 'decisiontreeclassifier__min_samples_leaf': 4,
 'decisiontreeclassifier__min_samples_split': 38}

In [52]:
search.fit(X_train, y_train)

scores = {"dtree" : search.best_score_}

scores

Fitting 5 folds for each of 2560 candidates, totalling 12800 fits


{'dtree': 0.9212354645831041}

In [53]:
# training accuracy
y_train_pred = search.predict(X_train)

accuracy_score(y_train, y_train_pred)

0.9409246575342466

In [55]:
# testing accuracy
y_test_pred = search.predict(X_test)

accuracy_score(y_test, y_test_pred)

0.9075342465753424

### Grid Search Cross Validation

In [56]:
from sklearn.preprocessing import StandardScaler

In [None]:
# initialize transformers & model

scaler = StandardScaler()


In [58]:
full_pipeline = make_pipeline(preprocessor,
                              StandardScaler(),
                              DecisionTreeClassifier(max_depth=10,
                                                     min_samples_leaf=4,
                                                     min_samples_split=38,
                                                     criterion="entropy",
                                                     )).set_output(transform='pandas')

In [63]:
full_pipeline

In [60]:
# define parameter grid
param_grid = {
    "columntransformer__pipeline-1__simpleimputer__strategy":["mean", "median"],
    "standardscaler__with_mean":[True, False],
    "standardscaler__with_std":[True, False],
    "decisiontreeclassifier__max_depth": range(2, 12, 1),
    "decisiontreeclassifier__min_samples_leaf": range(2, 10, 1),
    'decisiontreeclassifier__min_samples_split': range(3, 40, 5),
    'decisiontreeclassifier__criterion':['gini', 'entropy']
}

In [61]:
# define cross validation
search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=10,
                      verbose=1)

In [62]:
# fit
search.fit(X_train, y_train)

Fitting 10 folds for each of 10240 candidates, totalling 102400 fits


In [64]:
# cross validation average accuracy
search.best_score_

0.9238063660477455

In [65]:
# best parameters
search.best_params_

{'columntransformer__pipeline-1__simpleimputer__strategy': 'mean',
 'decisiontreeclassifier__criterion': 'entropy',
 'decisiontreeclassifier__max_depth': 7,
 'decisiontreeclassifier__min_samples_leaf': 3,
 'decisiontreeclassifier__min_samples_split': 18,
 'standardscaler__with_mean': False,
 'standardscaler__with_std': False}

In [66]:
full_pipeline = make_pipeline(preprocessor,
                              StandardScaler(with_mean=False,
                                             with_std=False),
                              DecisionTreeClassifier(max_depth=7,
                                                     min_samples_leaf=3,
                                                     min_samples_split=18,
                                                     criterion="entropy",
                                                     )).set_output(transform='pandas')

In [67]:
# training accuracy
y_train_pred = search.predict(X_train)

accuracy_score(y_train, y_train_pred)

0.9477739726027398

In [68]:
# testing accuracy
y_test_pred = search.predict(X_test)

accuracy_score(y_test, y_test_pred)

0.9143835616438356