<a href="https://colab.research.google.com/github/jballauff-ds/ML-housing-price-prediction/blob/main/Housing_with_categorical.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Housing Price Classification
## iteration 1: Encoding categorical data

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.tree import plot_tree
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [5]:
# Raw housing data, read straight from the project's GitHub repository.
url = "https://raw.githubusercontent.com/jballauff-ds/ML-housing-price-prediction/main/data/data_housing_01.csv"
df = pd.read_csv(filepath_or_buffer=url)

## Variable description

LotFrontage: Linear feet of street connected to property

LotArea: Lot size in square feet

TotalBsmtSF: Total square feet of basement area

BedroomAbvGr: Bedrooms above grade (does NOT include basement bedrooms)

Fireplaces: Number of fireplaces

PoolArea: Pool area in square feet

GarageCars: Size of garage in car capacity

WoodDeckSF: Wood deck area in square feet

ScreenPorch: Screen porch area in square feet

MSZoning: Identifies the general zoning classification of the sale.

       A	Agriculture
       C	Commercial
       FV	Floating Village Residential
       I	Industrial
       RH	Residential High Density
       RL	Residential Low Density
       RP	Residential Low Density Park
       RM	Residential Medium Density

Condition1: Proximity to various conditions

       Artery	Adjacent to arterial street
       Feedr	Adjacent to feeder street
       Norm	Normal
       RRNn	Within 200' of North-South Railroad
       RRAn	Adjacent to North-South Railroad
       PosN	Near positive off-site feature--park, greenbelt, etc.
       PosA	Adjacent to positive off-site feature
       RRNe	Within 200' of East-West Railroad
       RRAe	Adjacent to East-West Railroad

Heating: Type of heating

       Floor	Floor Furnace
       GasA	Gas forced warm air furnace
       GasW	Gas hot water or steam heat
       Grav	Gravity furnace
       OthW	Hot water or steam heat other than gas
       Wall	Wall furnace

Street: Type of road access to property

       Grvl	Gravel
       Pave	Paved

CentralAir: Central air conditioning

       N	No
       Y	Yes

Foundation: Type of foundation

       BrkTil	Brick & Tile
       CBlock	Cinder Block
       PConc	Poured Concrete
       Slab	Slab
       Stone	Stone
       Wood	Wood

# Data preparation

In [6]:
# Preview the first rows to sanity-check the loaded columns and value formats.
df.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,MSZoning,Condition1,Heating,Street,CentralAir,Foundation
0,8450,65.0,856,3,0,0,2,0,0,0,RL,Norm,GasA,Pave,Y,PConc
1,9600,80.0,1262,3,1,0,2,298,0,0,RL,Feedr,GasA,Pave,Y,CBlock
2,11250,68.0,920,3,1,0,2,0,0,0,RL,Norm,GasA,Pave,Y,PConc
3,9550,60.0,756,3,1,0,3,0,0,0,RL,Norm,GasA,Pave,Y,BrkTil
4,14260,84.0,1145,4,1,0,3,192,0,0,RL,Norm,GasA,Pave,Y,PConc


In [7]:
# Column dtypes and non-null counts: shows which features are numeric vs. object
# and which ones contain missing values that need imputing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LotArea       1460 non-null   int64  
 1   LotFrontage   1201 non-null   float64
 2   TotalBsmtSF   1460 non-null   int64  
 3   BedroomAbvGr  1460 non-null   int64  
 4   Fireplaces    1460 non-null   int64  
 5   PoolArea      1460 non-null   int64  
 6   GarageCars    1460 non-null   int64  
 7   WoodDeckSF    1460 non-null   int64  
 8   ScreenPorch   1460 non-null   int64  
 9   Expensive     1460 non-null   int64  
 10  MSZoning      1460 non-null   object 
 11  Condition1    1460 non-null   object 
 12  Heating       1460 non-null   object 
 13  Street        1460 non-null   object 
 14  CentralAir    1460 non-null   object 
 15  Foundation    1460 non-null   object 
dtypes: float64(1), int64(9), object(6)
memory usage: 182.6+ KB


In [9]:
# Split the column names by dtype; the target "Expensive" is numeric but is not a feature.
numerical = [
    name
    for name in df.select_dtypes(exclude="object").columns
    if name != "Expensive"
]
categorical = df.select_dtypes(include="object").columns.tolist()

In [10]:
# Frequency of every level, one categorical column at a time, to spot rare levels.
for column in categorical:
    level_counts = df[column].value_counts()
    print(level_counts)

MSZoning
RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: count, dtype: int64
Condition1
Norm      1260
Feedr       81
Artery      48
RRAn        26
PosN        19
RRAe        11
PosA         8
RRNn         5
RRNe         2
Name: count, dtype: int64
Heating
GasA     1428
GasW       18
Grav        7
Wall        4
OthW        2
Floor       1
Name: count, dtype: int64
Street
Pave    1454
Grvl       6
Name: count, dtype: int64
CentralAir
Y    1365
N      95
Name: count, dtype: int64
Foundation
PConc     647
CBlock    634
BrkTil    146
Slab       24
Stone       6
Wood        3
Name: count, dtype: int64


## Feature Engineering: Dropping and merging very rare categories
 + Drop Street as it has very low variation
 + Simplify Heating -> Grav|Wall|OthW|Floor to Other (`Othr`)
 + Simplify Foundation -> Slab/Stone/Wood to Other (`Othr`)
 + Simplify Condition1 -> proximity to any railroad (RR), any positive feature (Pos)

In [13]:
# Work on a copy of the data without the Street column.
df_modified = df.drop(columns="Street").copy()

# Each entry: (column, regex matched against the current level, label the matches are merged into).
# The merges are applied in the order listed.
level_merges = [
    ("Condition1", "Pos", "Pos"),
    ("Condition1", "RR", "RR"),
    ("Heating", "Grav|Wall|OthW|Floor", "Othr"),
    ("Foundation", "Slab|Wood|Stone", "Othr"),
]
for column, pattern, merged_label in level_merges:
    matches = df_modified[column].str.contains(pattern)
    df_modified.loc[matches, column] = merged_label

# Street no longer exists in df_modified, so remove it from the categorical feature list too.
categorical = [name for name in categorical if name != "Street"]

In [14]:
# Re-check the level frequencies after the rare levels have been merged.
for column in categorical:
    merged_counts = df_modified[column].value_counts()
    print(merged_counts)

MSZoning
RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: count, dtype: int64
Condition1
Norm      1260
Feedr       81
Artery      48
RR          44
Pos         27
Name: count, dtype: int64
Heating
GasA    1428
GasW      18
Othr      14
Name: count, dtype: int64
CentralAir
Y    1365
N      95
Name: count, dtype: int64
Foundation
PConc     647
CBlock    634
BrkTil    146
Othr       33
Name: count, dtype: int64


In [15]:
# Separate the target column from the feature matrix; df_modified itself is left untouched.
y = df_modified["Expensive"].copy()
X = df_modified.drop(columns="Expensive")

In [16]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

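Check that every level of each categorical feature appears in both the training and the test set, so that the encoder does not meet unseen levels at prediction time.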

In [17]:
# List the levels present on each side of the split, column by column.
for column in categorical:
    train_levels = X_train[column].unique()
    test_levels = X_test[column].unique()
    print("Train:", train_levels)
    print("Test:", test_levels)

Train: ['RL' 'RM' 'FV' 'RH' 'C (all)']
Test: ['RL' 'RM' 'C (all)' 'FV' 'RH']
Train: ['Norm' 'Feedr' 'RR' 'Artery' 'Pos']
Test: ['Norm' 'Artery' 'Feedr' 'RR' 'Pos']
Train: ['GasA' 'Othr' 'GasW']
Test: ['GasA' 'GasW' 'Othr']
Train: ['Y' 'N']
Test: ['Y' 'N']
Train: ['CBlock' 'PConc' 'BrkTil' 'Othr']
Test: ['CBlock' 'PConc' 'BrkTil' 'Othr']


# Train Decision Tree

In [18]:
# Building blocks of the decision-tree pipeline.
num_imputer = SimpleImputer()  # imputation strategy is tuned in the grid search
# handle_unknown="ignore": some levels are very rare, so a cross-validation training fold
# may not contain all of them. With the default ("error") scoring such a fold would raise;
# with "ignore" an unseen level is encoded as all zeros for that feature instead.
encoder = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")
scaler = StandardScaler()
# Fixed seed: the tree permutes features when searching for splits, so without it
# repeated runs can yield different trees and different scores.
dtree = DecisionTreeClassifier(random_state=42)

# One preprocessing branch per feature type.
numeric_branch = make_pipeline(num_imputer)
categorical_branch = make_pipeline(encoder)

# Route the numeric and categorical columns to their respective branch.
# The transformer names ("numbranch", "catbranch") are part of the grid-search parameter keys.
preprocessor = ColumnTransformer(
    transformers=[
        ("numbranch", numeric_branch, numerical),
        ("catbranch", categorical_branch, categorical),
    ]
)

# Full pipeline: preprocessing -> scaling -> classifier, with DataFrame outputs between steps.
pipeDT = make_pipeline(preprocessor, scaler, dtree).set_output(transform='pandas')

In [19]:
# Baseline fit with default hyper-parameters, mainly to confirm the pipeline runs end to end.
pipeDT.fit(X=X_train, y=y_train)

In [21]:
# Hyper-parameter grid, grouped by the pipeline step each entry configures.
imputer_grid = {
    "columntransformer__numbranch__simpleimputer__strategy": ["mean", "median"],
}
scaler_grid = {
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
}
tree_grid = {
    "decisiontreeclassifier__max_depth": [4, 5, 6, 7],
    "decisiontreeclassifier__min_samples_leaf": [1, 2, 3],
    "decisiontreeclassifier__min_samples_split": [8, 10, 12, 14],
    "decisiontreeclassifier__criterion": ["gini", "entropy"],
}

# Merge the groups into the single dict handed to the search (2*2*2*4*3*4*2 = 768 candidates).
params = {**imputer_grid, **scaler_grid, **tree_grid}

In [22]:
# Exhaustive 5-fold cross-validated search over the grid, scored by accuracy.
# n_jobs=-1 spreads the several thousand independent fits over all available CPU cores;
# it only affects run time, not which candidate is selected.
search = GridSearchCV(
    pipeDT,
    params,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)

In [23]:
# Run the search on the training data only; the trailing semicolon hides the estimator repr.
search.fit(X=X_train, y=y_train);

Fitting 5 folds for each of 768 candidates, totalling 3840 fits


In [24]:
# Mean cross-validated accuracy of the best parameter combination found by the search.
search.best_score_

0.9058361762224424

In [25]:
# Parameter combination that achieved the best mean cross-validated accuracy.
search.best_params_

{'columntransformer__numbranch__simpleimputer__strategy': 'mean',
 'decisiontreeclassifier__criterion': 'entropy',
 'decisiontreeclassifier__max_depth': 6,
 'decisiontreeclassifier__min_samples_leaf': 3,
 'decisiontreeclassifier__min_samples_split': 10,
 'standardscaler__with_mean': False,
 'standardscaler__with_std': False}

In [26]:
# Accuracy on the training data, predicted with the best pipeline found by the search.
y_train_pred = search.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.9392123287671232

In [27]:
# Accuracy on the held-out test data, which took no part in the search.
y_test_pred = search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.928082191780822