## Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay
from sklearn import set_config
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Show estimators as diagrams and have transformers return pandas DataFrames.
set_config(display="diagram", transform_output="pandas")

In [None]:
# Load the training data for the housing competition (iteration 5).
# The file id is the second-to-last path segment of the Drive share link;
# it is reused to build a direct-download URL that pandas can read.
url = "https://drive.google.com/file/d/1YxeVDZHfDhqWb0VOn-lfxnDKoLOayJeD/view?usp=drive_link"
path = f"https://drive.google.com/uc?export=download&id={url.split('/')[-2]}"
data = pd.read_csv(filepath_or_buffer=path)
data

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,9600,80.0,1262,3,1,0,2,298,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,11250,68.0,920,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,9550,60.0,756,3,1,0,3,0,0,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1456,13175,85.0,1542,3,2,0,2,349,0,0,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1457,9042,66.0,1152,4,2,0,1,0,0,1,...,Attchd,RFn,TA,TA,Y,,GdPrv,Shed,WD,Normal
1458,9717,68.0,1078,2,0,0,1,366,0,0,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal


In [None]:
# List every column label to see which features are available
data.columns

Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'Expensive',
       'MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir',
       'Foundation', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'KitchenQual', 'FireplaceQu',
       'MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt',
       'GarageArea', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'MiscVal',
       'MoSold', 'YrSold', 'Id', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrTyp

In [None]:
# Per-column non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   TotalBsmtSF    1460 non-null   int64  
 3   BedroomAbvGr   1460 non-null   int64  
 4   Fireplaces     1460 non-null   int64  
 5   PoolArea       1460 non-null   int64  
 6   GarageCars     1460 non-null   int64  
 7   WoodDeckSF     1460 non-null   int64  
 8   ScreenPorch    1460 non-null   int64  
 9   Expensive      1460 non-null   int64  
 10  MSZoning       1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Heating        1460 non-null   object 
 13  Street         1460 non-null   object 
 14  CentralAir     1460 non-null   object 
 15  Foundation     1460 non-null   object 
 16  ExterQual      1460 non-null   object 
 17  ExterCond      1460 non-null   object 
 18  BsmtQual

In [None]:
# Checking duplicates
# NOTE(review): drop_duplicates() returns a new DataFrame and the result is not
# assigned, so this cell only displays the de-duplicated frame; `data` itself is
# left unchanged. `data.duplicated().sum()` would report the duplicate count directly.

data.drop_duplicates()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,9600,80.0,1262,3,1,0,2,298,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,11250,68.0,920,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,9550,60.0,756,3,1,0,3,0,0,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1456,13175,85.0,1542,3,2,0,2,349,0,0,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1457,9042,66.0,1152,4,2,0,1,0,0,1,...,Attchd,RFn,TA,TA,Y,,GdPrv,Shed,WD,Normal
1458,9717,68.0,1078,2,0,0,1,366,0,0,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal


## Preprocessor Goal

![image.png](https://i.imgur.com/61fitCB.png)

In [None]:
# Checking missing values: number of NaN entries per column

data.isna().sum()

LotArea             0
LotFrontage       259
TotalBsmtSF         0
BedroomAbvGr        0
Fireplaces          0
                 ... 
PoolQC           1453
Fence            1179
MiscFeature      1406
SaleType            0
SaleCondition       0
Length: 81, dtype: int64

### Splitting the data

In [None]:
# Features: every column except the row identifier and the target
X = data.drop(columns=["Id", "Expensive"])

# Target column
y = data["Expensive"].copy()

# Hold out 20% of the rows for the final evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Numeric and non-numeric views of the training features,
# used below to wire up the two preprocessing branches
X_num = X_train.select_dtypes(include="number").copy()
X_cat = X_train.select_dtypes(exclude="number").copy()

### Pipelines

### Fixing the indexing error

In [None]:
# Names of the categorical columns that have a natural order
ordinal_cols_names = ["LotShape", "Utilities"]
# Integer positions of those columns within X_cat's column index
ordinal_cols = X_cat.columns.get_indexer(ordinal_cols_names)

ordinal_cols

array([15, 17], dtype=int64)

### How to obtain the list of non-ordinal columns?

In [None]:
# Every categorical column that is not listed as ordinal, in X_cat's column order
non_ordinal_cols_names = [col for col in X_cat.columns if col not in ordinal_cols_names]
# Display only those columns as a sanity check
X_cat[non_ordinal_cols_names]

Unnamed: 0,MSZoning,Condition1,Heating,Street,CentralAir,Foundation,ExterQual,ExterCond,BsmtQual,BsmtCond,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
254,RL,Norm,GasA,Pave,Y,CBlock,TA,Gd,TA,TA,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1066,RL,Norm,GasA,Pave,Y,PConc,Gd,TA,Gd,TA,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
638,RL,Feedr,GasA,Pave,Y,CBlock,TA,TA,Fa,TA,...,,,,,P,,MnPrv,,WD,Normal
799,RL,Feedr,GasA,Pave,Y,BrkTil,TA,TA,Gd,TA,...,Detchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
380,RL,Norm,GasA,Pave,Y,BrkTil,TA,TA,TA,TA,...,Detchd,Unf,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,RL,Norm,GasA,Pave,Y,PConc,Gd,TA,Gd,TA,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1130,RL,Norm,GasA,Pave,Y,BrkTil,TA,TA,TA,TA,...,Detchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1294,RL,Norm,GasA,Pave,Y,CBlock,TA,TA,TA,TA,...,Detchd,Unf,TA,TA,N,,,,WD,Normal
860,RL,Norm,GasA,Pave,Y,BrkTil,Gd,TA,TA,TA,...,Detchd,Unf,TA,TA,Y,,GdPrv,,WD,Normal


In [None]:
# Integer positions of the non-ordinal (one-hot encoded) columns within X_cat's column index
onehot_cols = X_cat.columns.get_indexer(non_ordinal_cols_names)
onehot_cols

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 16, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42], dtype=int64)

## Choosing our ordinal columns after reading the dataset documentation

LotShape: General shape of property

       Reg	Regular
       IR1	Slightly irregular
       IR2	Moderately Irregular
       IR3	Irregular


Utilities: Type of utilities available

       AllPub	All public Utilities (E,G,W,& S)
       NoSewr	Electricity, Gas, and Water (Septic Tank)
       NoSeWa	Electricity and Gas Only
       ELO	Electricity only

## Creating the categorical Pipe

In [None]:
# List of ordinal columns (addressed by integer position to avoid errors in the categorical_preprocessor)
ordinal_cols_names = ["LotShape", "Utilities"]
ordinal_cols = X_cat.columns.get_indexer(ordinal_cols_names)

# List of non-ordinal columns: every remaining categorical column is one-hot encoded
non_ordinal_cols_names = [col for col in X_cat.columns if col not in ordinal_cols_names]
onehot_cols = X_cat.columns.get_indexer(non_ordinal_cols_names)

# Ranking for each ordinal column, listed in the same order as ordinal_cols_names
# (be careful with the order). "N_A" is the placeholder written by the imputer
# below, so it has to appear as a known category in every ranking.
ordinal_rankings = [["N_A", "IR3", "IR2", "IR1", "Reg"],
                    ["N_A", "ELO", "NoSeWa", "NoSewr", "AllPub"]]

# Define the categorical encoder
categorical_preprocessor = make_column_transformer(
    (OrdinalEncoder(categories=ordinal_rankings), ordinal_cols),
    # `sparse_output` replaces the `sparse` argument, which was deprecated in
    # scikit-learn 1.2 and removed in 1.4. Dense output is kept because the
    # notebook is configured with transform_output="pandas".
    (OneHotEncoder(sparse_output=False, handle_unknown="ignore"), onehot_cols),
)

# Create a categorical pipeline (imputing + encoding)
categorical_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),  # Play with combination of NA or N_A here
    categorical_preprocessor,
)


## Creating the numeric Pipe

In [None]:
# Create a numeric pipeline
# The imputer is built with its defaults; its `strategy` is tuned later by the
# grid searches through the "simpleimputer__strategy" key.
numeric_pipe = make_pipeline(SimpleImputer())


## Preprocessor = Numeric Pipe + Categorical Pipe

In [None]:
# Final preprocessor
# Routes the numeric columns through numeric_pipe and the categorical columns
# through categorical_pipe. The grid-search keys address the numeric branch by
# its auto-generated name "pipeline-1", so the order of the two entries matters.

preprocessor = make_column_transformer(
    (numeric_pipe, X_num.columns),
    (categorical_pipe, X_cat.columns)
)

In [None]:
# Display the assembled preprocessor
preprocessor

### Algorithm

### Decision Tree

In [None]:
# Create a full pipeline with the final preprocessor and a decision tree classifier.
# random_state is fixed (same seed as the train/test split) so that re-running the
# grid search reproduces the same scores instead of drifting between runs.
dt_pipe = make_pipeline(preprocessor,
                        DecisionTreeClassifier(random_state=42))
dt_pipe

In [None]:
# Hyperparameter candidates, keyed by the auto-generated step names of dt_pipe
dt_param_grid = {
    # imputation strategy of the numeric branch
    "columntransformer__pipeline-1__simpleimputer__strategy": ["mean", "median"],
    # maximum tree depth: 2, 4, ..., 12
    "decisiontreeclassifier__max_depth": range(2, 14, 2),
    # minimum number of samples required at a leaf: 3, 5, ..., 11
    "decisiontreeclassifier__min_samples_leaf": range(3, 12, 2),
}

# Exhaustive search over the grid: 5-fold cross-validation, progress messages
# enabled, all available CPU cores
dt_search = GridSearchCV(
    estimator=dt_pipe,
    param_grid=dt_param_grid,
    cv=5,
    verbose=1,
    n_jobs=-1,
)

# Run the search on the training split
dt_search.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [None]:
# Cross-validated score of the best decision tree candidate
dt_search.best_score_


0.9366347529437657

In [None]:
# Collect the cross-validation score of each model for side-by-side comparison
scores = {"dtree" : dt_search.best_score_}

scores

## Random Forest

In [None]:
# Random Forest: shared preprocessor, scaling step, then the classifier.
# random_state is fixed (same seed as the train/test split) so that the
# cross-validation scores and the submission predictions are reproducible.
rf_pipeline = make_pipeline(preprocessor,
                            StandardScaler(),
                            RandomForestClassifier(random_state=42))

# Hyperparameter candidates, keyed by the auto-generated step names of rf_pipeline
rf_param_grid = {
    "columntransformer__pipeline-1__simpleimputer__strategy": ["mean", "median"],
    "randomforestclassifier__n_estimators": [100, 200]
    # TODO: include 3 more hyperparameters
}

# 5-fold cross-validated grid search using all available CPU cores
rf_search = GridSearchCV(rf_pipeline, rf_param_grid, cv=5, verbose=1, n_jobs=-1)
rf_search.fit(X_train, y_train)

# Record the best cross-validation score next to the decision tree's
scores["rf"] = rf_search.best_score_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [None]:
scores

{'dtree': 0.9357800520890649, 'rf': 0.9529070833791863}

### Evaluation

In [None]:

# Accuracy of each tuned search on the held-out test split
accuracies = {
    label: accuracy_score(y_true=y_test, y_pred=search.predict(X_test))
    for label, search in (("dtree", dt_search), ("rf", rf_search))
}

accuracies

{'dtree': 0.934931506849315, 'rf': 0.9554794520547946}

# Importing competition data

In [None]:
# Load the competition data.
# As with the training file, the Drive file id (second-to-last path segment of
# the share link) is reused to build a direct-download URL.
url = "https://drive.google.com/file/d/15PfmTxmavQCT-f7iY9tgwWxm9t4GRees/view?usp=sharing"
path = f"https://drive.google.com/uc?export=download&id={url.split('/')[-2]}"
competition_data = pd.read_csv(filepath_or_buffer=path)
competition_data

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,11622,80.0,882.0,2,0,0,1.0,140,120,RH,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1,14267,81.0,1329.0,3,0,0,1.0,393,0,RL,...,Attchd,Unf,TA,TA,Y,,,Gar2,WD,Normal
2,13830,74.0,928.0,3,1,0,2.0,212,0,RL,...,Attchd,Fin,TA,TA,Y,,MnPrv,,WD,Normal
3,9978,78.0,926.0,3,1,0,2.0,360,0,RL,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
4,5005,43.0,1280.0,2,0,0,2.0,0,144,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1936,21.0,546.0,3,0,0,0.0,0,0,RM,...,,,,,Y,,,,WD,Normal
1455,1894,21.0,546.0,3,0,0,1.0,0,0,RM,...,CarPort,Unf,TA,TA,Y,,,,WD,Abnorml
1456,20000,160.0,1224.0,4,1,0,2.0,474,0,RL,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
1457,10441,62.0,912.0,3,0,0,0.0,80,0,RL,...,,,,,Y,,MnPrv,Shed,WD,Normal


# Predicting on the competition data and making a submission CSV

In [None]:
# Take the Id column out of competition_data and keep it for the submission file.
# NOTE: pop() removes the column from competition_data in place, so this cell
# cannot be re-run without first reloading the competition data.
competition_ids = competition_data.pop('Id')
competition_ids

0       1461
1       1462
2       1463
3       1464
4       1465
        ... 
1454    2915
1455    2916
1456    2917
1457    2918
1458    2919
Name: Id, Length: 1459, dtype: int64

In [None]:
# Start the submission frame with the Id column only
my_submission = pd.DataFrame({"Id": competition_ids})
my_submission

Unnamed: 0,Id
0,1461
1,1462
2,1463
3,1464
4,1465
...,...
1454,2915
1455,2916
1456,2917
1457,2918


In [None]:
# Predict the Expensive label for every competition row with the tuned random forest search
my_submission["Expensive"] = rf_search.predict(competition_data)



Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0
...,...,...
1454,2915,0
1455,2916,0
1456,2917,0
1457,2918,0


In [None]:
# Preview the first 20 rows of the submission
my_submission.head(20)

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0
5,1466,0
6,1467,0
7,1468,0
8,1469,0
9,1470,0


In [None]:
# Write the submission to disk without the DataFrame index
my_submission.to_csv("rf_1.csv", index=False)

In [None]:
# Extras needed on colab
# from google.colab import files
# files.download("rf_1.csv")