In [1]:
# Load the housing classification data from CSV and preview the first rows
import pandas as pd

housing = pd.read_csv('housing-classification-iter6.csv')
housing.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,9600,80.0,1262,3,1,0,2,298,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,11250,68.0,920,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,9550,60.0,756,3,1,0,3,0,0,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [2]:
# Check the dtype pandas assigned to each column
housing.dtypes

LotArea            int64
LotFrontage      float64
TotalBsmtSF        int64
BedroomAbvGr       int64
Fireplaces         int64
                  ...   
PoolQC            object
Fence             object
MiscFeature       object
SaleType          object
SaleCondition     object
Length: 81, dtype: object

In [3]:
# Count the missing values in each column
housing.isna().sum()

LotArea             0
LotFrontage       259
TotalBsmtSF         0
BedroomAbvGr        0
Fireplaces          0
                 ... 
PoolQC           1453
Fence            1179
MiscFeature      1406
SaleType            0
SaleCondition       0
Length: 81, dtype: int64

In [11]:
# Summarise the frame: index range, non-null count and dtype per column
housing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   TotalBsmtSF    1460 non-null   int64  
 3   BedroomAbvGr   1460 non-null   int64  
 4   Fireplaces     1460 non-null   int64  
 5   PoolArea       1460 non-null   int64  
 6   GarageCars     1460 non-null   int64  
 7   WoodDeckSF     1460 non-null   int64  
 8   ScreenPorch    1460 non-null   int64  
 9   Expensive      1460 non-null   int64  
 10  MSZoning       1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Heating        1460 non-null   object 
 13  Street         1460 non-null   object 
 14  CentralAir     1460 non-null   object 
 15  Foundation     1460 non-null   object 
 16  ExterQual      1460 non-null   object 
 17  ExterCond      1460 non-null   object 
 18  BsmtQual

## Create train and test sets



In [10]:
# Separate the features from the target and hold out a test set
from sklearn.model_selection import train_test_split

# Remove exact duplicate rows before splitting
housing.drop_duplicates(inplace=True)

# 'Expensive' is the label; 'Id' is removed from the features along with it
X = housing.drop(['Expensive', 'Id'], axis=1)
y = housing[['Expensive']]

# 75% of the rows go to training; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=.75,
    random_state=42,
)

In [9]:
# Check for duplicated rows
# NOTE: read top to bottom, the split cell above has already called
# drop_duplicates on `housing`, so this count is 0 by construction there.
housing.duplicated().sum()

0

In [31]:
# Create different column categories; each group is routed to its own
# preprocessing pipeline by the ColumnTransformer built further down.
# num_col: columns sent to the numeric (impute + scale) pipeline
num_col = ["LotArea", "LotFrontage", "TotalBsmtSF", "BedroomAbvGr", "Fireplaces", "PoolArea", "GarageCars", "WoodDeckSF", "ScreenPorch", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
           "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "KitchenAbvGr", "TotRmsAbvGrd", "GarageArea", "OpenPorchSF",
            "EnclosedPorch", "3SsnPorch", "MiscVal", "MoSold", "YearBuilt", "YearRemodAdd", "GarageYrBlt", "YrSold"] 
# cat_col: columns sent to the one-hot encoding pipeline
cat_col = ["Foundation", "CentralAir", "Street", "Heating", "Condition1", "MSZoning", "MSSubClass","Alley", "LotShape", "LandContour",
            "Utilities", "LotConfig", "Neighborhood", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Electrical", "GarageType",
            "MiscFeature", "SaleType", "SaleCondition"]
# ord_col: columns sent to the ordinal encoding pipeline
ord_col = ["ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1","KitchenQual", "FireplaceQu", "OverallQual", "OverallCond", "LandSlope", "BsmtFinType2", "HeatingQC",
            "Functional", "GarageFinish", "GarageQual", "GarageCond", "PavedDrive", "PoolQC", "Fence"]

## Build the pipeline

In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [33]:
# Build three preprocessing pipelines, one per column group.

# Numeric columns: fill gaps with the column median, then scale with RobustScaler.
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", RobustScaler())]
)

# Nominal columns: missing values become their own "None" level, then one-hot
# encode. handle_unknown='ignore' keeps predict() from failing on levels that
# were not present in the training split.
# NOTE(review): `sparse=` was renamed to `sparse_output=` in newer scikit-learn
# releases; switch the keyword when the environment is upgraded.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="None")),
        ("ohe", OneHotEncoder(sparse=False, handle_unknown='ignore')),
    ]
)

# Ordinal columns: missing values become a "None" level, then integer-encode.
# Levels unseen during fit are mapped to -1 instead of raising a ValueError at
# predict time, mirroring the tolerance of the one-hot branch above.
# NOTE(review): without an explicit `categories=` argument the integer codes
# follow the sorted order of the labels, not their quality ranking; pass
# per-column category lists if the true order should be preserved.
ordinal_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="None")),
        ("encode", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ]
)

# Combine them: each transformer only sees its own list of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_col),
        ("cat", categorical_transformer, cat_col),
        ("ord", ordinal_transformer, ord_col),
    ]
)

# Now we have a full prediction pipeline: preprocessing followed by the tree.
pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", DecisionTreeClassifier(random_state=42))]
)

# Fit on the training split only, then score on the held-out test split.
pipe.fit(X_train, y_train)
preds = pipe.predict(X_test)
# accuracy_score expects (y_true, y_pred) in that order.
accuracy_score(y_test, preds)

0.9205479452054794

In [22]:
# Count the missing values in each ordinal column of the feature frame
X[ord_col].isna().sum()

ExterQual          0
ExterCond          0
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
KitchenQual        0
FireplaceQu      690
OverallQual        0
OverallCond        0
LandSlope          0
BsmtFinType2      38
HeatingQC          0
Functional         0
GarageFinish      81
GarageQual        81
GarageCond        81
PavedDrive         0
PoolQC          1453
Fence           1179
dtype: int64