# 1. Load the data

In [1]:
# import libraries
import seaborn as sns
import pandas as pd

# Read the housing-prices CSV from the local data folder into a DataFrame.
url = "./data/housing_prices.csv"
housing = pd.read_csv(filepath_or_buffer=url)


# 2. Prepare and Split the data

In [10]:
from sklearn.model_selection import train_test_split

# "SalePrice" is the target; every other column except the "Id" row
# identifier is a predictor. `housing` itself is left untouched.
y = housing["SalePrice"].copy()
X = housing.drop(columns=["SalePrice", "Id"])

# Hold out 20 % of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


# 3. Preparation for ordinal encoding

In [11]:
# Categorical columns whose levels have a natural order. The position of a
# column name in this list must match the position of its level list in
# `ordinal_cats_list` at the bottom of this cell.
ordered_categories_column_names = [
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu',
    'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
]
# 'OverallQual', 'OverallCond' are not listed here.

# Placeholder level for missing values; it has to equal the `fill_value`
# used by the SimpleImputer in the categorical pipelines ("N_A").
_MISSING = 'N_A'
# Explicit "not available" level, spelled identically in every list that has it.
# NOTE(review): presumably pandas already parses "NA" in the CSV as missing, in
# which case this level never occurs and only reserves a slot -- confirm.
_NOT_AVAILABLE = 'NA'

# Shared scales, ordered from worst to best.
_QUALITY_SCALE = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
_BSMT_FIN_SCALE = ['Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']

# Each assignment builds its own list object, so the lists stay independent.
ExterQual_cat = [_MISSING, *_QUALITY_SCALE]
ExterCond_cat = [_MISSING, *_QUALITY_SCALE]
BsmtQual_cat = [_MISSING, _NOT_AVAILABLE, *_QUALITY_SCALE]
BsmtCond_cat = [_MISSING, _NOT_AVAILABLE, *_QUALITY_SCALE]
BsmtExposure_cat = [_MISSING, _NOT_AVAILABLE, 'No', 'Mn', 'Av', 'Gd']
BsmtFinType1_cat = [_MISSING, _NOT_AVAILABLE, *_BSMT_FIN_SCALE]
BsmtFinType2_cat = [_MISSING, _NOT_AVAILABLE, *_BSMT_FIN_SCALE]
HeatingQC_cat = [_MISSING, *_QUALITY_SCALE]
KitchenQual_cat = [_MISSING, *_QUALITY_SCALE]
FireplaceQu_cat = [_MISSING, _NOT_AVAILABLE, *_QUALITY_SCALE]
GarageFinish_cat = [_MISSING, _NOT_AVAILABLE, 'Unf', 'RFn', 'Fin']
GarageQual_cat = [_MISSING, _NOT_AVAILABLE, *_QUALITY_SCALE]
GarageCond_cat = [_MISSING, _NOT_AVAILABLE, *_QUALITY_SCALE]
PoolQC_cat = [_MISSING, _NOT_AVAILABLE, *_QUALITY_SCALE]
Fence_cat = [_MISSING, _NOT_AVAILABLE, 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']

# Level lists in the same order as `ordered_categories_column_names`.
ordinal_cats_list = [
    ExterQual_cat, ExterCond_cat, BsmtQual_cat, BsmtCond_cat, BsmtExposure_cat,
    BsmtFinType1_cat, BsmtFinType2_cat, HeatingQC_cat, KitchenQual_cat,
    FireplaceQu_cat, GarageFinish_cat, GarageQual_cat, GarageCond_cat,
    PoolQC_cat, Fence_cat,
]

# One level list per ordered column, otherwise the encoder setup is inconsistent.
assert len(ordinal_cats_list) == len(ordered_categories_column_names)


# 4. Importing libraries & creating pipes (num, ordered_cat and unordered_cat)

In [12]:
# import matplotlib.pyplot as plt
# from sklearn.feature_selection import RFE
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
# from sklearn.feature_selection import RFECV
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import GradientBoostingRegressor
# Feature-selection steps that the model pipelines can plug in.
KBest = SelectKBest(score_func=f_regression)
selector = VarianceThreshold(threshold=0.01)

# Split the predictors into the three column groups, one per preprocessing pipe:
# numeric, ordered categorical and unordered categorical.
X_num_columns = X.select_dtypes(include='number').copy()
X_cat_columns = X.select_dtypes(exclude='number').copy()
X_cat_ordered_columns = X_cat_columns[ordered_categories_column_names]
X_cat_unordered_columns = X_cat_columns.drop(
    columns=ordered_categories_column_names)

# Numeric pipe: imputation only; SimpleImputer() uses its default strategy
# ("mean"), which the hyper-parameter search may override.
numeric_pipe = make_pipeline(SimpleImputer())

# Unordered categorical pipe: fill gaps with the "N_A" placeholder, then
# one-hot encode. Categories unseen during fit are ignored at predict time.
unordered_categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
)

# Ordered categorical pipe: fill gaps with the "N_A" placeholder, then map each
# level to its position in the matching list of `ordinal_cats_list`.
ordered_categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OrdinalEncoder(categories=ordinal_cats_list),
)

# Route every column group through its own pipe.
preprocessor = ColumnTransformer([
    ('num', numeric_pipe, X_num_columns.columns),
    ('unordered', unordered_categoric_pipe, X_cat_unordered_columns.columns),
    ('ordered', ordered_categoric_pipe, X_cat_ordered_columns.columns),
])


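RMSE should be small, and the difference between the train and test RMSE should be small as well (a large gap indicates overfitting).
R^2 ranges from -∞ to 1: 1 is a perfect fit, 0 is no better than always predicting the mean, and negative values are worse than that.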

## 5.14 Model 2: variance threshold, KBest (currently disabled), GradientBoostingRegressor

In [13]:
# Model pipeline: preprocessing -> scaling without centering -> low-variance
# filter -> gradient boosting. The `KBest` selector is not part of this pipeline.
full_pipeline_dtR = make_pipeline(
    preprocessor,
    StandardScaler(with_mean=False),
    selector,
    GradientBoostingRegressor(),
)
# Last expression of the cell: renders the pipeline diagram.
full_pipeline_dtR


In [14]:
from sklearn.feature_selection import VarianceThreshold

# Fresh low-variance filter for this cell's pipeline.
selector = VarianceThreshold(threshold=0.01)

# Model pipeline: preprocessing -> scaling without centering -> low-variance
# filter -> gradient boosting. The regressor is seeded so that repeated runs
# of the notebook give the same model, like the seeded split and search.
full_pipeline_dtR = make_pipeline(preprocessor,
                                  StandardScaler(with_mean=False),
                                  selector,
                                  #   KBest,
                                  GradientBoostingRegressor(random_state=123))

# Hyper-parameter space for the randomized search. Keys follow the step names
# that make_pipeline / ColumnTransformer generate automatically.
param_grid_dtR = {
    'columntransformer__num__simpleimputer__strategy': ['mean', 'median', 'constant'],
    'gradientboostingregressor__loss': ["squared_error", "absolute_error", "huber", "quantile"],
    'gradientboostingregressor__criterion': ["friedman_mse", "squared_error"],
    'gradientboostingregressor__n_estimators': range(50, 150, 20),
    # 'selectkbest__k': range(5, 85, 5)
}

# Randomized search with 7-fold cross validation. Only n_iter=2 of the
# 3 * 4 * 2 * 5 = 120 possible combinations are sampled; raise n_iter for a
# more thorough (and slower) search.
search_dtR = RandomizedSearchCV(full_pipeline_dtR, param_grid_dtR,
                                n_iter=2, cv=7, scoring='neg_mean_squared_log_error', random_state=123, verbose=1)


# Fit the search, then predict with the refitted best pipeline.
search_dtR.fit(X_train, y_train)
y_train_pred_dtR = search_dtR.predict(X_train)
y_test_pred_dtR = search_dtR.predict(X_test)

# RMSE is the square root of the MSE. It is computed explicitly because the
# `squared=False` shortcut of mean_squared_error is not available in every
# scikit-learn release.
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred_dtR))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_dtR))

print(f"GradientBoostingRegressor Train RMSE: {train_rmse:.2f}")
print(f"GradientBoostingRegressor Test RMSE: {test_rmse:.2f}")

print(f"The best parameters are {search_dtR.best_params_}")

# Evaluate the model using R^2 score
train_r2_m14 = r2_score(y_train, y_train_pred_dtR)
test_r2_m14 = r2_score(y_test, y_test_pred_dtR)

print(f"Train R^2 Score: {train_r2_m14:.2f}")
print(f"Model Test R^2 Score: {test_r2_m14:.2f}")


Fitting 7 folds for each of 2 candidates, totalling 14 fits
decisionTreeRegressor Train RMSE: 12627.55
decisionTreeRegressor Test RMSE: 21753.23
The best parameters are {'gradientboostingregressor__n_estimators': 130, 'gradientboostingregressor__loss': 'squared_error', 'gradientboostingregressor__criterion': 'friedman_mse', 'columntransformer__num__simpleimputer__strategy': 'mean'}
Train R^2 Score: 0.97
Model Test R^2 Score: 0.93


# Test for kaggle

In [15]:
# Load the Kaggle competition test set.
# When running on Kaggle itself, the file can be read directly instead:
#competition_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

# Here the CSV comes from a Google Drive share link, which has to be turned
# into a direct-download link first.
url = "https://drive.google.com/file/d/1jnn7sVeWjrKyWe2DDkpbtGpM-vCmWsnW/view?usp=share_link"
# The file id is the second-to-last "/"-separated segment of the share link.
file_id = url.split('/')[-2]
path = f"https://drive.google.com/uc?export=download&id={file_id}"
competition_data = pd.read_csv(path)


In [16]:
# Predictors for the competition rows: drop "Id", mirroring the training
# features, from which "Id" was removed as well.
my_test_X = competition_data.drop("Id", axis=1)


In [17]:
# Submission table: the competition "Id" column plus the "SalePrice"
# predicted by the best pipeline found by the search.
my_submission = competition_data[["Id"]].assign(
    SalePrice=search_dtR.predict(my_test_X)
)

# Write without the index so the file holds exactly the two columns.
my_submission.to_csv('my_submission_3.csv', index=False)


# Optional (Google Colab only): download the submission file written above.
#from google.colab import files
# files.download('my_submission_3.csv')
