In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer

from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

house = pd.read_csv("https://raw.githubusercontent.com/icaromisquita/archives/main/housing-classification-iter5.csv")
house.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,KitchenAbvGr,TotRmsAbvGrd,GarageYrBlt,GarageArea,OpenPorchSF,EnclosedPorch,3SsnPorch,MiscVal,MoSold,YrSold
0,8450,65.0,856,3,0,0,2,0,0,0,...,1,8,2003.0,548,61,0,0,0,2,2008
1,9600,80.0,1262,3,1,0,2,298,0,0,...,1,6,1976.0,460,0,0,0,0,5,2007
2,11250,68.0,920,3,1,0,2,0,0,0,...,1,6,2001.0,608,42,0,0,0,9,2008
3,9550,60.0,756,3,1,0,3,0,0,0,...,1,7,1998.0,642,35,272,0,0,2,2006
4,14260,84.0,1145,4,1,0,3,192,0,0,...,1,9,2000.0,836,84,0,0,0,12,2008


###Removing the "Expensive" column from the Dataframe as it is the variable we want to predict (y value) 

In [2]:
y= house["Expensive"]
X = house
X = X.drop(columns="Expensive")

# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [3]:
X.isna().sum()

LotArea            0
LotFrontage      259
TotalBsmtSF        0
BedroomAbvGr       0
Fireplaces         0
PoolArea           0
GarageCars         0
WoodDeckSF         0
ScreenPorch        0
MSZoning           0
Condition1         0
Heating            0
Street             0
CentralAir         0
Foundation         0
ExterQual          0
ExterCond          0
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
KitchenQual        0
FireplaceQu      690
MSSubClass         0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         8
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
KitchenAbvGr       0
TotRmsAbvGrd       0
GarageYrBlt       81
GarageArea         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
MiscVal      

###Creating the "numeric pipe" and the "categoric pipe"

In [4]:

# select categorical and numerical column names
X_cat_columns = X.select_dtypes(exclude="number").copy().columns
X_num_columns = X.select_dtypes(include="number").copy().columns

#Setting the imputers, Scaler 
imputer =  [ SimpleImputer(), KNNImputer()]
 
scaler =  [StandardScaler(),  MinMaxScaler(), 
           Normalizer(), OneHotEncoder()]
#model = [DecisionTreeClassifier(), KNeighborsClassifier()]

# create numerical pipeline, only with the SimpleImputer(strategy="mean")
numeric_pipe = make_pipeline(imputer,
                            scaler)

 # create categorical pipeline, with the SimpleImputer(fill_value="N_A") and the OneHotEncoder
categoric_pipe = make_pipeline(
                 imputer
                 )

TypeError: ignored

####Using a pipeline with 2 branches 

In [None]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num_columns),
        ("cat_pipe", categoric_pipe, X_cat_columns),
    ]
)

###Creating the full_pipeline (preprocessor + Decision Tree)

In [None]:
#full_pipeline = make_pipeline(preprocessor, 
#                              DecisionTreeClassifier())
model = [DecisionTreeClassifier(), KNeighborsClassifier()]

full_pipeline = make_pipeline(preprocessor, 
                              model)

###We can then fit this full_pipeline to the data:

Note: notice that we did not fit the preprocessor before —we only fit the pipeline once it has been full assembled.

In [None]:
full_pipeline.fit(X_train, y_train)

###Doing them a prediction

In [None]:
full_pipeline.predict(X_train)

####A code to ignore the errors, when using the GridSearch validation

In [None]:
import warnings
warnings.filterwarnings('ignore')

##Using a pipeline with GridSearch to define the preprocessor and the models

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KDTree


full_pipeline = make_pipeline(preprocessor, 
                              model)

param_grid = {
    "columntransformer__cat_pipe__simpleimputer__strategy":["constant"],
    "columntransformer__cat_pipe__simpleimputer__fill_value": "N_A",
    
    "columntransformer__cat_pipe__onehotencoder__drop": "first",
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    "columntransformer__num_pipe__standardscaler__with_mean":[True, False],
    "columntransformer__num_pipe__standardscaler__with_std":[True, False], 
    "decisiontreeclassifier__max_depth": range(2, 14),
    "decisiontreeclassifier__min_samples_leaf": range(2,20),
    "decisiontreeclassifier__criterion":["gini", "entropy"],
    "kneighborstransformer__algorithm":["ball_tree", "kd_tree", "brute"],
    "kneighborsclassifier__n_neighbors": range(3,15),
    "kneighborsclassifier__leaf_size": range(2,30,2),
    "kdtree__balanced_tree": :[True, False],
    "kdtree__leafsize": range(2,20),     
}


search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)

In [None]:
full_pipeline["decisiontreeclassifier"].get_params().keys()

###Checking the Accuracy
Make predictions and check accuracy on the train set

In [None]:
from sklearn.metrics import accuracy_score

y_pred_tree_train = search.predict(X_train)

accuracy_score(y_true = y_train,
               y_pred = y_pred_tree_train
              )

Make predictions on the test set

In [None]:
y_pred_tree_test = search.predict(X_test)

accuracy_score(y_true = y_test,
               y_pred = y_pred_tree_test
              )

##Printing the tree

###Retrieving the column names for the "one-hot" columns

In [None]:
# selecting non-numerical columns
X_train_cat = X_train.select_dtypes(exclude="number")

# defining the imputer to use "N_A" as replacement value
cat_imputer = SimpleImputer(strategy="constant", 
                            fill_value="N_A")

# fitting the imputer
cat_imputer.fit(X_train_cat)

# transforming the data & keeping it as a DataFrame
X_cat_imputed = pd.DataFrame(cat_imputer.transform(X_train_cat), 
                             columns=X_train_cat.columns)
X_cat_imputed.head()

# initialize
my_onehot = OneHotEncoder(drop="first")

# fit
my_onehot.fit(X_cat_imputed)

# transform
X_cat_imputed_onehot = my_onehot.transform(X_cat_imputed)
df = pd.DataFrame.sparse.from_spmatrix(X_cat_imputed_onehot)

colnames = my_onehot.get_feature_names_out(X_cat_imputed.columns)
df.columns = colnames
df.head(3)

Plotting the tree

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

plt.figure(figsize=(64, 32))
plot_tree(
    search.best_estimator_['decisiontreeclassifier'],
    filled=True, 
    rounded=True, 
    feature_names=df.columns,
    class_names=[ 'Not Expensive', 'Expensive']
    );