# 1 Data reading & splitting

In [1]:
import pandas as pd
from sklearn import set_config

set_config(transform_output="pandas")

# import the dataframe
# reading the data
url = '/home/almuth/Documents/_data_science_bootcamp/Bootcamp/week7_supervised_ml/housing_iter_6/data/iter-6/housing-classification-iter6.csv'
housing6 = pd.read_csv(url)


In [19]:
housing6


Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,9600,80.0,1262,3,1,0,2,298,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,11250,68.0,920,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,9550,60.0,756,3,1,0,3,0,0,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1456,13175,85.0,1542,3,2,0,2,349,0,0,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1457,9042,66.0,1152,4,2,0,1,0,0,1,...,Attchd,RFn,TA,TA,Y,,GdPrv,Shed,WD,Normal
1458,9717,68.0,1078,2,0,0,1,366,0,0,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal


In [2]:
from sklearn.model_selection import train_test_split

# dropping unnecessary columns and X and y creation
housing = housing6.copy()
y = housing.pop('Expensive')
X = housing.drop(columns=["Id"])

# split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123)


# 2 Pipeline

In [3]:
# select categorical and numerical column names
X_num_columns = X.select_dtypes(include='number').copy()
X_cat_columns = X.select_dtypes(exclude='number').copy()

# distinguish between ordered and unordered categorical features
ordered_categories_column_names = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                                   'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence']
X_cat_ordered_columns = X_cat_columns[ordered_categories_column_names]
X_cat_unordered_columns = X_cat_columns.drop(
    ordered_categories_column_names, axis=1)


##  2.1 Numeric pipe

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

# initialize
imputer = SimpleImputer()

# create numerical pipeline, only with the SimpleImputer(strategy="mean")
numeric_pipe = make_pipeline(SimpleImputer())
numeric_pipe


# 2.2 Catecorical pipes

## 2.2.1 Unorderd

In [5]:
from sklearn.preprocessing import OneHotEncoder

# initialize
my_hot = OneHotEncoder()

# create categorical pipeline, with the SimpleImputer(fill_value="N_A") and the OneHotEncoder
unordered_categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(sparse_output=False, handle_unknown='ignore')
)
unordered_categoric_pipe


## 2.2.2 Orderd

In [6]:
from sklearn.preprocessing import OrdinalEncoder

# Ordered categorical features ordered from bad to good
ExterQual_cat = ['Po', 'Fa', 'TA', 'Gd', "Ex"]
ExterCond_cat = ['Po', 'Fa', 'TA', 'Gd', "Ex"]
BsmtQual_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
BsmtCond_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
BsmtExposure_cat = ['NA', 'No', 'Mn', 'Av', 'Gd']
BsmtFinType1_cat = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
BsmtFinType2_cat = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
HeatingQC_cat = ['Po', 'Fa', 'TA', 'Gd', "Ex"]
KitchenQual_cat = ['Po', 'Fa', 'TA', 'Gd', "Ex"]
FireplaceQu_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
GarageFinish_cat = ['NA', 'Unf', 'RFn', 'Fin']
GarageQual_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
GarageCond_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
PoolQC_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
Fence_cat = ['Na', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']

ordinal_cats_list = [ExterQual_cat, ExterCond_cat, BsmtQual_cat, BsmtCond_cat, BsmtExposure_cat, BsmtFinType1_cat,
                     BsmtFinType2_cat, HeatingQC_cat, KitchenQual_cat, FireplaceQu_cat, GarageFinish_cat, GarageQual_cat, GarageCond_cat, PoolQC_cat, Fence_cat]


# initialize
enc = OrdinalEncoder()

# create categorical pipeline, with the SimpleImputer(fill_value="N_A") and the OneHotEncoder
ordered_categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OrdinalEncoder(categories=ordinal_cats_list,
                   handle_unknown='use_encoded_value', unknown_value=-1)
)
ordered_categoric_pipe


## 2.3 Gluing the pipes together

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num_columns.columns),
        ("ordered_cat_pipe", ordered_categoric_pipe, X_cat_ordered_columns.columns),
        ("unordered_cat_pipe", unordered_categoric_pipe,
         X_cat_unordered_columns.columns)
    ]
)

scaler = StandardScaler()


# 3 Decision tree pipe

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score


# initialize the pipe
full_pipeline_dt = make_pipeline(
    preprocessor, scaler, DecisionTreeClassifier()).set_output(transform='pandas')

# train a DecisionTree with GridSearch cross validation
# parameter grid
param_grid_dt = {
    'columntransformer__num_pipe__simpleimputer__strategy': ['mean', 'median'],
    'decisiontreeclassifier__max_depth': range(2, 12),
    'decisiontreeclassifier__min_samples_leaf': range(3, 10, 2),
    'decisiontreeclassifier__min_samples_split': range(3, 40, 5),
    'decisiontreeclassifier__criterion': ['gini', 'entropy']
}

# define cross validation
#search_dt = GridSearchCV(full_pipeline_dt,param_grid_dt,cv=5,verbose=0)
search_dt = RandomizedSearchCV(full_pipeline_dt, param_grid_dt,
                               n_iter=10, cv=5, scoring='accuracy', random_state=123, verbose=0)

# fit
search_dt.fit(X_train, y_train)
y_train_pred_dt = search_dt.predict(X_train)
accuracy_score_dt_train = accuracy_score(y_train, y_train_pred_dt)
y_test_pred_dt = search_dt.predict(X_test)
accuracy_score_dt_test = accuracy_score(y_test, y_test_pred_dt)

print(f"The best parameters are {search_dt.best_params_}")
print("")
print(f"The average accuracy is {search_dt.best_score_}")

# training accuracy
# below we use X_train and not X_train_imputed because imputing is
# built in the pipeline
print(f"The training accuracy is {accuracy_score_dt_train}")

# testing accuracy
print(f"The testing accuracy is {accuracy_score_dt_test}")


The best parameters are {'decisiontreeclassifier__min_samples_split': 13, 'decisiontreeclassifier__min_samples_leaf': 3, 'decisiontreeclassifier__max_depth': 7, 'decisiontreeclassifier__criterion': 'entropy', 'columntransformer__num_pipe__simpleimputer__strategy': 'median'}

The average accuracy is 0.9297824731301126
The training accuracy is 0.9768835616438356
The testing accuracy is 0.9246575342465754


In [9]:
full_pipeline_dt


In [10]:
# re-training the model
search_dt.fit(X, y)

# reading the test.csv
url = "https://drive.google.com/file/d/1yHqaGhmHCu4wCG5ew7dBhu-4kvoSvz-i/view?usp=share_link"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
data_test = pd.read_csv(path)

data_test.drop("Id.1", axis=1, inplace=True)
Id = data_test.pop("Id")
expensive = search_dt.best_estimator_.predict(data_test)
expensive = pd.Series(expensive, name="Expensive")
pd.concat([Id, expensive], axis=1).to_csv("decision_tree.csv", index=False)
# file_name="C:/Users/sarra/OneDrive/Bureau/bootcamp.csv"


# 3 Random Forest pipe

In [11]:
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score


# initialize the pipe
scaler = StandardScaler()
rf = RandomForestClassifier(random_state=123)
full_pipeline_rf = make_pipeline(
    preprocessor, scaler, rf).set_output(transform='pandas')

# train a RandomForest with GridSearch cross validation
# parameter grid

param_grid_rf = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    # Number of trees
    "randomforestclassifier__n_estimators": randint(100, 1000),
    # Maximum depth of trees
    "randomforestclassifier__max_depth": randint(2, 21),
    # Minimum number of samples required to split a node
    "randomforestclassifier__min_samples_split": randint(2, 11),
    "randomforestclassifier__min_samples_leaf": randint(1, 10)

}

# define cross validation
#search_rf = GridSearchCV(full_pipeline_rf,param_grid_rf,cv=5,verbose=0)
search_rf = RandomizedSearchCV(full_pipeline_rf, param_grid_rf,
                               n_iter=10, cv=5, scoring='accuracy', random_state=123, verbose=0)

# fit
search_rf.fit(X_train, y_train)
y_train_pred_rf = search_rf.predict(X_train)
accuracy_score_rf_train = accuracy_score(y_train, y_train_pred_rf)
y_test_pred_rf = search_rf.predict(X_test)
accuracy_score_rf_test = accuracy_score(y_test, y_test_pred_rf)

print(f"The best parameters are {search_rf.best_params_}")
print("")
print(f"The average accuracy is {search_rf.best_score_}")

# training accuracy
# below we use X_train and not X_train_imputed because imputing is
# built in the pipeline
print(f"The training accuracy is {accuracy_score_rf_train}")

# testig accuracy
print(f"The testing accuracy is {accuracy_score_rf_test}")


# re-training the model
search_rf.fit(X, y)

# reading the test.csv
url = "https://drive.google.com/file/d/1yHqaGhmHCu4wCG5ew7dBhu-4kvoSvz-i/view?usp=share_link"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
data_test = pd.read_csv(path)

data_test.drop("Id.1", axis=1, inplace=True)
Id = data_test.pop("Id")
expensive = search_rf.best_estimator_.predict(data_test)
expensive = pd.Series(expensive, name="Expensive")
pd.concat([Id, expensive], axis=1).to_csv("random_forest.csv", index=False)


The best parameters are {'columntransformer__num_pipe__simpleimputer__strategy': 'mean', 'randomforestclassifier__max_depth': 16, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__min_samples_split': 5, 'randomforestclassifier__n_estimators': 946, 'standardscaler__with_mean': False, 'standardscaler__with_std': True}

The average accuracy is 0.9460548035655332
The training accuracy is 0.9991438356164384
The testing accuracy is 0.9657534246575342


In [12]:
full_pipeline_rf


# 4 KNeighborsClassifier

In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score


# initialize the pipe
scaler = StandardScaler()
neigh = KNeighborsClassifier()
full_pipeline_knn = make_pipeline(
    preprocessor, scaler, neigh).set_output(transform='pandas')

# train a KNeighbors with GridSearch cross validation
# parameter grid

param_grid_knn = {
    'columntransformer__num_pipe__simpleimputer__strategy': ['mean', 'median'],
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    'kneighborsclassifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

# define cross validation
#search_knn = GridSearchCV(full_pipeline_knn,param_grid_knn,cv=5,verbose=0)
search_knn = RandomizedSearchCV(full_pipeline_knn, param_grid_knn,
                                n_iter=10, cv=5, scoring='accuracy', random_state=123, verbose=0)

# fit
search_knn.fit(X_train, y_train)
y_train_pred_knn = search_knn.predict(X_train)
accuracy_score_knn_train = accuracy_score(y_train, y_train_pred_knn)
y_test_pred_knn = search_knn.predict(X_test)
accuracy_score_knn_test = accuracy_score(y_test, y_test_pred_knn)

print(f"The best parameters are {search_knn.best_params_}")
print("")
print(f"The average accuracy is {search_knn.best_score_}")

# training accuracy
# below we use X_train and not X_train_imputed because imputing is
# built in the pipeline
print(f"The training accuracy is {accuracy_score_knn_train}")

# testing accuracy
print(f"The testing accuracy is {accuracy_score_knn_test}")


# re-training the model
search_knn.fit(X, y)

# reading the test.csv
url = "https://drive.google.com/file/d/1yHqaGhmHCu4wCG5ew7dBhu-4kvoSvz-i/view?usp=share_link"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
data_test = pd.read_csv(path)

data_test.drop("Id.1", axis=1, inplace=True)
Id = data_test.pop("Id")
expensive = search_knn.best_estimator_.predict(data_test)
expensive = pd.Series(expensive, name="Expensive")
pd.concat([Id, expensive], axis=1).to_csv("kneighbors.csv", index=False)


The best parameters are {'kneighborsclassifier__weights': 'distance', 'kneighborsclassifier__algorithm': 'brute', 'columntransformer__num_pipe__simpleimputer__strategy': 'mean'}

The average accuracy is 0.9229301933164594
The training accuracy is 1.0
The testing accuracy is 0.928082191780822


In [14]:
full_pipeline_knn


Best algorithm is Random Forest Pipe