In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier

from sklearn.impute import KNNImputer

from sklearn.ensemble import RandomForestClassifier


# Location of the housing classification training data (CSV hosted on GitHub).
HOUSING_URL = "https://raw.githubusercontent.com/icaromisquita/archives/main/housing-classification-iter6.csv"

# Read the CSV into a DataFrame and preview the first rows as a sanity check.
house = pd.read_csv(HOUSING_URL)
house.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,9600,80.0,1262,3,1,0,2,298,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,11250,68.0,920,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,9550,60.0,756,3,1,0,3,0,0,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [None]:
# Print the full list of column names in the housing DataFrame.
print(house.columns)

Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'Expensive',
       'MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir',
       'Foundation', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'KitchenQual', 'FireplaceQu',
       'MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt',
       'GarageArea', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'MiscVal',
       'MoSold', 'YrSold', 'Id', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrTyp

### Removing the "Expensive" column from the DataFrame, as it is the variable we want to predict (the y value)

In [None]:
# Predictors are every column except the target; the target is "Expensive".
# NOTE(review): the "Id" column stays among the predictors here — confirm
# that using a row identifier as a feature is intended.
X = house.drop(columns=["Expensive"])
y = house["Expensive"]

# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=123,
)

In [None]:
# Count the missing values in each predictor column.
X.isnull().sum()

LotArea             0
LotFrontage       259
TotalBsmtSF         0
BedroomAbvGr        0
Fireplaces          0
                 ... 
PoolQC           1453
Fence            1179
MiscFeature      1406
SaleType            0
SaleCondition       0
Length: 80, dtype: int64

### Creating the numeric pipeline and the categorical pipeline

In [None]:
# import
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Split the column names by dtype: non-numeric columns feed the categorical
# branch, numeric columns feed the numeric branch.
X_cat_columns = X.select_dtypes(exclude="number").columns
X_num_columns = X.select_dtypes(include="number").columns

# Numeric branch: fill missing values with KNNImputer, then standardise the
# features with StandardScaler.
imputer = KNNImputer()
scaler = StandardScaler()
numeric_pipe = make_pipeline(imputer, scaler)

# Categorical branch: replace missing values with the constant "N_A", then
# one-hot encode. handle_unknown="ignore" keeps transform() from raising when
# it meets a category that was not present in the data used for fitting
# (the encoder's default is to raise an error in that case).
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(handle_unknown="ignore"),
)

#### Using a pipeline with two branches

In [None]:
from sklearn.compose import ColumnTransformer

# Each entry is (branch name, transformer, columns it applies to): numeric
# columns go through numeric_pipe, non-numeric columns through categoric_pipe.
branches = [
    ("num_pipe", numeric_pipe, X_num_columns),
    ("cat_pipe", categoric_pipe, X_cat_columns),
]
preprocessor = ColumnTransformer(transformers=branches)

### Creating the full_pipeline (preprocessor + KNeighborsClassifier)

In [None]:
# Chain the shared preprocessor with a default k-nearest-neighbours classifier.
knn_classifier = KNeighborsClassifier()
full_pipeline = make_pipeline(preprocessor, knn_classifier)

### We can then fit this full_pipeline to the data

Note: we did not fit the preprocessor on its own beforehand — we only fit the pipeline once it has been fully assembled.

In [None]:
# Fit the preprocessing steps and the classifier together on the training split.
full_pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('knnimputer',
                                                                   KNNImputer()),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'MSSubClass',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemod...
       'Neighborhood', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'BsmtFinType2',
       'HeatingQC', 'Electrical', 'Functional', 'GarageType', 'GarageFinish',
       'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fen

### Making predictions on the training set

In [None]:
# Predict labels for the same rows the pipeline was fitted on.
full_pipeline.predict(X=X_train)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

#### Suppressing the warnings raised while running the grid-search cross-validation

In [None]:
import warnings
warnings.filterwarnings('ignore')

## Using the new pipeline with branches to train a RandomForestClassifier with grid-search cross-validation

In [None]:
from sklearn.model_selection import GridSearchCV

# Random-forest pipeline built on the shared preprocessor. The fixed seed makes
# the forests, and therefore the cross-validation scores and the selected
# parameters, repeatable from one run to the next.
full_pipeline = make_pipeline(preprocessor,
                              RandomForestClassifier(random_state=123))

# Keys follow the "<step>__<sub-step>__<parameter>" naming that make_pipeline
# and the ColumnTransformer branch names produce.
# NOTE(review): centring the features should not change tree splits, so the
# with_mean axis probably doubles the number of candidates without affecting
# the scores — confirm before removing it.
# NOTE(review): the "log_loss" criterion is only accepted by recent
# scikit-learn releases — confirm the installed version supports it.
param_grid = {
    "columntransformer__num_pipe__standardscaler__with_mean":[True, False],
    "columntransformer__cat_pipe__onehotencoder__handle_unknown" : ["ignore"],
    "randomforestclassifier__n_estimators": range(100, 500, 50),
    "randomforestclassifier__criterion": ["gini", "entropy", "log_loss"],
    "randomforestclassifier__max_depth": range(4,10,2),
    "randomforestclassifier__min_samples_split": range(4,10,2),
    "randomforestclassifier__min_samples_leaf": range(2,10,2),
}

# 5-fold cross-validation over every combination in the grid. n_jobs=-1 spreads
# the thousands of independent fits over all available CPU cores instead of
# running them one after another.
search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      n_jobs=-1,
                      verbose=1)

search.fit(X_train, y_train)

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


### Checking the accuracy
Make predictions and check the accuracy on the training set

In [None]:
from sklearn.metrics import accuracy_score

# Predict on the training split with the fitted grid search, then report the
# share of labels predicted correctly.
y_pred_tree_train = search.predict(X_train)
accuracy_score(y_train, y_pred_tree_train)

Make predictions on the test set

In [None]:
# Predict on the held-out test split with the fitted grid search, then report
# the share of labels predicted correctly.
y_pred_tree_test = search.predict(X_test)
accuracy_score(y_test, y_pred_tree_test)

## Creating the submission file to upload to the competition

In [None]:
# Load the competition test set and drop the "Unnamed: 0" column that the CSV
# carries in addition to the real features.
test = pd.read_csv(
    "https://raw.githubusercontent.com/icaromisquita/archives/main/test.csv"
).drop(columns=["Unnamed: 0"])

# Attach the predictions and keep only the two columns written to the
# submission. Explicit column selection raises a KeyError if "Id" is missing
# instead of silently writing an incomplete file.
submission = test.assign(Expensive=search.predict(test)).loc[:, ["Id", "Expensive"]]

# Write to the current working directory. A plain relative file name works on
# every operating system and avoids the invalid "\s" escape sequence that the
# Windows-style ".\" prefix introduced.
submission.to_csv("submission8.csv", index=False)
