<a href="https://colab.research.google.com/github/jballauff-ds/ML-housing-price-prediction/blob/main/Housing_with_ordered_categorical.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Housing Price Classification
## iteration 2: Ordered Categorical Data

In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.tree import plot_tree
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [10]:
url = "https://raw.githubusercontent.com/jballauff-ds/ML-housing-price-prediction/main/data/data_housing_02.csv"
df  = pd.read_csv(url)

# Data preparation

In [11]:
df.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,CentralAir,Foundation,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,KitchenQual,FireplaceQu
0,8450,65.0,856,3,0,0,2,0,0,0,...,Y,PConc,Gd,TA,Gd,TA,No,GLQ,Gd,
1,9600,80.0,1262,3,1,0,2,298,0,0,...,Y,CBlock,TA,TA,Gd,TA,Gd,ALQ,TA,TA
2,11250,68.0,920,3,1,0,2,0,0,0,...,Y,PConc,Gd,TA,Gd,TA,Mn,GLQ,Gd,TA
3,9550,60.0,756,3,1,0,3,0,0,0,...,Y,BrkTil,TA,TA,TA,Gd,No,ALQ,Gd,Gd
4,14260,84.0,1145,4,1,0,3,192,0,0,...,Y,PConc,Gd,TA,Gd,TA,Av,GLQ,Gd,TA


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LotArea       1460 non-null   int64  
 1   LotFrontage   1201 non-null   float64
 2   TotalBsmtSF   1460 non-null   int64  
 3   BedroomAbvGr  1460 non-null   int64  
 4   Fireplaces    1460 non-null   int64  
 5   PoolArea      1460 non-null   int64  
 6   GarageCars    1460 non-null   int64  
 7   WoodDeckSF    1460 non-null   int64  
 8   ScreenPorch   1460 non-null   int64  
 9   Expensive     1460 non-null   int64  
 10  MSZoning      1460 non-null   object 
 11  Condition1    1460 non-null   object 
 12  Heating       1460 non-null   object 
 13  Street        1460 non-null   object 
 14  CentralAir    1460 non-null   object 
 15  Foundation    1460 non-null   object 
 16  ExterQual     1460 non-null   object 
 17  ExterCond     1460 non-null   object 
 18  BsmtQual      1423 non-null 

In [13]:
df.isna().sum()

LotArea           0
LotFrontage     259
TotalBsmtSF       0
BedroomAbvGr      0
Fireplaces        0
PoolArea          0
GarageCars        0
WoodDeckSF        0
ScreenPorch       0
Expensive         0
MSZoning          0
Condition1        0
Heating           0
Street            0
CentralAir        0
Foundation        0
ExterQual         0
ExterCond         0
BsmtQual         37
BsmtCond         37
BsmtExposure     38
BsmtFinType1     37
KitchenQual       0
FireplaceQu     690
dtype: int64

In [15]:
numerical = list(df.select_dtypes(exclude = "object"))
numerical = [i for i in numerical if i != "Expensive"]
categorical = list(df.select_dtypes(include = "object"))

In [16]:
for i in categorical:
  print(df[i].value_counts())

MSZoning
RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: count, dtype: int64
Condition1
Norm      1260
Feedr       81
Artery      48
RRAn        26
PosN        19
RRAe        11
PosA         8
RRNn         5
RRNe         2
Name: count, dtype: int64
Heating
GasA     1428
GasW       18
Grav        7
Wall        4
OthW        2
Floor       1
Name: count, dtype: int64
Street
Pave    1454
Grvl       6
Name: count, dtype: int64
CentralAir
Y    1365
N      95
Name: count, dtype: int64
Foundation
PConc     647
CBlock    634
BrkTil    146
Slab       24
Stone       6
Wood        3
Name: count, dtype: int64
ExterQual
TA    906
Gd    488
Ex     52
Fa     14
Name: count, dtype: int64
ExterCond
TA    1282
Gd     146
Fa      28
Ex       3
Po       1
Name: count, dtype: int64
BsmtQual
TA    649
Gd    618
Ex    121
Fa     35
Name: count, dtype: int64
BsmtCond
TA    1311
Gd      65
Fa      45
Po       2
Name: count, dtype: int64
BsmtExposure
No    953
Av    221
Gd  

## Feature Engeneering: Omitting of very rare features
 + Drop Street as it has very low variation
 + Simplify Heating -> Grav|Wall|OthW|Floor to Other
 + Simplify Foundation -> Slab/Stone/Wood to Other
 + Simplify Conddition1 -> proximity to any railroad (RR), any positive feature (Pos)

In [17]:
df_modified = df.drop(columns = "Street").copy()
df_modified.loc[df_modified["Condition1"].str.contains("Pos"), "Condition1"] = "Pos"
df_modified.loc[df_modified["Condition1"].str.contains("RR"), "Condition1"] = "RR"
df_modified.loc[df_modified["Heating"].str.contains("Grav|Wall|OthW|Floor"), "Heating"]  = "Othr"
df_modified.loc[df_modified["Foundation"].str.contains("Slab|Wood|Stone"), "Foundation"]  = "Othr"

categorical=[i for i in categorical if i != "Street"]


## Feature Engeneering: Order categories

In [18]:
categorical_ordered = ["ExterQual","ExterCond","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","KitchenQual", "FireplaceQu"]
categorical_unordered = [i for i in categorical if i not in categorical_ordered ]

In [19]:
for i in categorical_ordered:
  print(df_modified[i].value_counts())

ExterQual
TA    906
Gd    488
Ex     52
Fa     14
Name: count, dtype: int64
ExterCond
TA    1282
Gd     146
Fa      28
Ex       3
Po       1
Name: count, dtype: int64
BsmtQual
TA    649
Gd    618
Ex    121
Fa     35
Name: count, dtype: int64
BsmtCond
TA    1311
Gd      65
Fa      45
Po       2
Name: count, dtype: int64
BsmtExposure
No    953
Av    221
Gd    134
Mn    114
Name: count, dtype: int64
BsmtFinType1
Unf    430
GLQ    418
ALQ    220
BLQ    148
Rec    133
LwQ     74
Name: count, dtype: int64
KitchenQual
TA    735
Gd    586
Ex    100
Fa     39
Name: count, dtype: int64
FireplaceQu
Gd    380
TA    313
Fa     33
Ex     24
Po     20
Name: count, dtype: int64


In [29]:
order = {
    'ExterQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex', np.nan],
    'ExterCond': ['Po', 'Fa', 'TA', 'Gd', 'Ex', np.nan],
    'BsmtQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex', np.nan],
    'BsmtCond': ['Po', 'Fa', 'TA', 'Gd', 'Ex', np.nan],
    'BsmtExposure': ['No', 'Mn', 'Av', 'Gd', np.nan],
    'BsmtFinType1': ['Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ', np.nan],
    'KitchenQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex', np.nan],
    'FireplaceQu': ['Po', 'Fa', 'TA', 'Gd', 'Ex', np.nan],
}
custom_order = [order[feature] for feature in order]
ord_encoder = OrdinalEncoder(categories=custom_order, encoded_missing_value=-1)

# Decision Tree Classifier

In [36]:
X = df_modified.copy()
y = X.pop("Expensive")

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
import numpy as np
num_imputer = SimpleImputer()
unoredered_encoder = OneHotEncoder(drop="first", sparse_output=False)
ordered_encoder = OrdinalEncoder(categories=custom_order, encoded_missing_value=-1)
scaler = StandardScaler()
dtree = DecisionTreeClassifier()

num_branch = make_pipeline(num_imputer)
cat_unord_branch = make_pipeline(unoredered_encoder)
cat_ord_branch = make_pipeline(ordered_encoder)

preprocessor = ColumnTransformer(
    transformers=[
        ("num_branch", num_branch, numerical),
        ("cat_branch_unord", cat_unord_branch, categorical_unordered),
        ("cat_branch_ord", cat_ord_branch, categorical_ordered)
    ]
)

pipeDT = make_pipeline(preprocessor,scaler,dtree).set_output(transform='pandas')

In [39]:
pipeDT.fit(X_train, y_train)

In [41]:
params = {
    "columntransformer__num_branch__simpleimputer__strategy": ["mean", "median"],
    "standardscaler__with_mean":[True, False],
    "standardscaler__with_std":[True, False],
    'decisiontreeclassifier__max_depth': range(4, 8),
    'decisiontreeclassifier__min_samples_leaf': range(1, 4, 1),
    'decisiontreeclassifier__min_samples_split': range(8, 16, 2),
    'decisiontreeclassifier__criterion':['gini', 'entropy']
}

In [42]:
search = GridSearchCV(pipeDT, params, cv=5, scoring='accuracy', verbose=1)

In [43]:
search.fit(X_train, y_train);

Fitting 5 folds for each of 768 candidates, totalling 3840 fits


In [44]:
search.best_score_

0.9118264186933714

In [45]:
search.best_params_

{'columntransformer__num_branch__simpleimputer__strategy': 'mean',
 'decisiontreeclassifier__criterion': 'entropy',
 'decisiontreeclassifier__max_depth': 6,
 'decisiontreeclassifier__min_samples_leaf': 2,
 'decisiontreeclassifier__min_samples_split': 8,
 'standardscaler__with_mean': True,
 'standardscaler__with_std': True}

In [46]:
y_train_pred = search.predict(X_train)
accuracy_score(y_train, y_train_pred)

0.9503424657534246

In [47]:
y_test_pred = search.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.934931506849315