<a href="https://colab.research.google.com/github/jballauff-ds/ML-housing-price-prediction/blob/main/Housing_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Housing Price Classification
## Iteration 0: Data Exploration, Naive Classifier, Decision Tree Classifier

# Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.tree import plot_tree
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Data preparation

## Variable description

LotFrontage: Linear feet of street connected to property

LotArea: Lot size in square feet

TotalBsmtSF: Total square feet of basement area

BedroomAbvGr: Bedrooms above grade (does NOT include basement bedrooms)

Fireplaces: Number of fireplaces

PoolArea: Pool area in square feet

GarageCars: Size of garage in car capacity

WoodDeckSF: Wood deck area in square feet

ScreenPorch: Screen porch area in square feet

__Expensive: Whether the house is classified expensive or not (used to train predictions)__

In [None]:
# Load the housing data set from the project's GitHub repository.
# The URL intentionally carries no `?token=...` query string: access tokens
# must not be committed to a notebook, and a stale token would make this cell
# fail on a fresh "Restart & Run All".
# NOTE(review): assumes the repository is public (the Colab badge links to it) — confirm.
url = "https://raw.githubusercontent.com/jballauff-ds/ML-housing-price-prediction/main/data/data_housing_00.csv"
df  = pd.read_csv(url)

In [None]:
# Preview the first rows to sanity-check that the CSV was parsed as expected.
df.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive
0,8450,65.0,856,3,0,0,2,0,0,0
1,9600,80.0,1262,3,1,0,2,298,0,0
2,11250,68.0,920,3,1,0,2,0,0,0
3,9550,60.0,756,3,1,0,3,0,0,0
4,14260,84.0,1145,4,1,0,3,192,0,0


In [None]:
# Column dtypes and non-null counts; used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LotArea       1460 non-null   int64  
 1   LotFrontage   1201 non-null   float64
 2   TotalBsmtSF   1460 non-null   int64  
 3   BedroomAbvGr  1460 non-null   int64  
 4   Fireplaces    1460 non-null   int64  
 5   PoolArea      1460 non-null   int64  
 6   GarageCars    1460 non-null   int64  
 7   WoodDeckSF    1460 non-null   int64  
 8   ScreenPorch   1460 non-null   int64  
 9   Expensive     1460 non-null   int64  
dtypes: float64(1), int64(9)
memory usage: 114.2 KB


In [None]:
# Separate the target column from the features; `df` itself is left untouched.
y = df["Expensive"].copy()
X = df.drop(columns="Expensive")

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=31416,
)

In [None]:
# Per-class feature means on the training split (0 = not expensive, 1 = expensive).
X_train.groupby(y_train).agg("mean")

Unnamed: 0_level_0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch
Expensive,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,9608.648129,67.133829,976.344793,2.846309,0.527806,2.110212,1.613751,77.334681,14.490394
1,16017.72067,85.176101,1534.50838,3.061453,1.156425,7.223464,2.581006,166.502793,21.497207


In [None]:
# Per-class feature medians on the training split; less sensitive to outliers than the means.
X_train.groupby(y_train).agg("median")

Unnamed: 0_level_0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch
Expensive,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,9069.0,65.0,936.0,3.0,0.0,0.0,2.0,0.0,0.0
1,12090.0,85.0,1530.0,3.0,1.0,0.0,3.0,171.0,0.0


## Setting up a naive / intuitive model
__Observations:__
+ Expensive houses tend to have a larger LotArea
+ Expensive houses typically have at least one fireplace
+ Expensive houses typically have a wood deck
+ Expensive houses typically have garage space for more than two cars

We now turn these observations into simple prediction rules for flagging a house as expensive.

In [None]:
# One entry per rule: (feature column, predicate that votes "expensive", vote weight).
# The entries are unzipped into the three parallel lists used by the naive model.
variables, rules, weights = map(list, zip(*[
    ("LotArea",    lambda area: area > 12000, 1),
    ("Fireplaces", lambda count: count > 0,   1),
    ("GarageCars", lambda cars: cars > 2,     1),
    ("WoodDeckSF", lambda deck: deck > 0,     1),
]))

In [None]:
def naivePrediction(df, vars, rules, weights):
  """Classify rows by a weighted vote over simple per-column rules.

  Every rule is applied to its column; a satisfied rule adds its weight,
  normalised by the total weight, to the row's score. The score is then
  rounded to the nearest integer to give the prediction.

  Args:
    df: DataFrame containing the columns named in ``vars``.
    vars: Column names, one per rule. (The name shadows the ``vars`` builtin;
      it is kept so existing keyword callers keep working.)
    rules: Callables, one per column, that take the column and return a
      boolean Series.
    weights: Numeric vote weight of each rule.

  Returns:
    Series aligned to ``df.index`` with the rounded score of each row
    (0 or 1 for non-negative weights). A score of exactly 0.5 rounds to 0,
    because the rounding is half-to-even.

  Raises:
    ValueError: If ``vars``, ``rules`` and ``weights`` differ in length, or
      if the weights sum to zero.
  """
  if not (len(vars) == len(rules) == len(weights)):
    raise ValueError(
        "vars, rules and weights must have the same length, got "
        f"{len(vars)}, {len(rules)} and {len(weights)}"
    )
  # Normalisation constant is loop-invariant, so compute it once.
  total_weight = sum(weights)
  if len(vars) > 0 and total_weight == 0:
    raise ValueError("weights must not sum to zero")

  pred_naive = pd.Series(0, index = df.index)
  for column, rule, weight in zip(vars, rules, weights):
    pred_naive = pred_naive + (rule(df[column]) * weight) / total_weight
  return round(pred_naive)

# Score the hand-written rule model on both splits.
pred_naive_train, pred_naive_test = (
    naivePrediction(features, variables, rules, weights)
    for features in (X_train, X_test)
)
train_accuracy = accuracy_score(y_train, pred_naive_train)
test_accuracy = accuracy_score(y_test, pred_naive_test)

print(f"Model accuracy: \n training data - {round(train_accuracy, 2)} \n test data - {round(test_accuracy, 2)}")

Model accuracy: 
 training data - 0.88 
 test data - 0.88


# Decision Tree Classifier

In [None]:
# Preprocessing + model pipeline: impute missing values (LotFrontage has gaps),
# standardise the features, then fit a decision tree.
imputer = SimpleImputer()
scaler = StandardScaler()
# Fixed seed so the tree, and therefore the whole grid search, is reproducible
# on "Restart & Run All". Same seed value as the train/test split.
dtree = DecisionTreeClassifier(random_state=31416)
pipeDT = make_pipeline(imputer, scaler, dtree).set_output(transform='pandas')

In [None]:
# Hyper-parameter grid for the pipeline. Keys follow the
# "<pipeline step>__<parameter>" convention.
params = dict(
    # Imputation of missing values
    simpleimputer__strategy=["mean", "median"],
    # Scaling toggles
    # NOTE(review): tree splits are generally insensitive to feature scaling,
    # so these two options may only multiply the search cost — confirm.
    standardscaler__with_mean=[True, False],
    standardscaler__with_std=[True, False],
    # Decision tree
    decisiontreeclassifier__criterion=["gini", "entropy"],
    decisiontreeclassifier__max_depth=range(2, 7),
    decisiontreeclassifier__min_samples_leaf=range(3, 8, 2),
    decisiontreeclassifier__min_samples_split=range(30, 50, 2),
)

In [None]:
# Exhaustive 5-fold cross-validated search over `params`, optimising accuracy.
# n_jobs=-1 spreads the candidate fits across all available CPU cores; the
# selected parameters and scores are unaffected, only the wall-clock time.
search = GridSearchCV(pipeDT, params, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

In [None]:
# Run the grid search on the training split only; the trailing `;` hides the estimator repr.
search.fit(X_train, y_train);

Fitting 5 folds for each of 2400 candidates, totalling 12000 fits


In [None]:
# Mean cross-validated accuracy of the best parameter combination.
search.best_score_

0.9280950808847805

In [None]:
# Parameter combination that achieved the best cross-validated accuracy.
search.best_params_

{'decisiontreeclassifier__criterion': 'gini',
 'decisiontreeclassifier__max_depth': 5,
 'decisiontreeclassifier__min_samples_leaf': 5,
 'decisiontreeclassifier__min_samples_split': 38,
 'simpleimputer__strategy': 'mean',
 'standardscaler__with_mean': True,
 'standardscaler__with_std': True}

In [None]:
# Accuracy of the best pipeline found by the search on the training split.
y_train_pred = search.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.9332191780821918

In [None]:
# Accuracy of the same pipeline on the held-out test split.
y_test_pred = search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.934931506849315