# Loading Data

In [4]:
from data_utils import _read_data, _fill_missing_values, _feature_engineering, _one_hot_encoding

# Load the dataset through the project helper; `df` is used as a pandas-style
# DataFrame (column indexing, .isnull(), .select_dtypes()) by the cells below.
df = _read_data()

# Data Analysis

## Distributions

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Distribution of the target column, drawn with the explicit figure/axes interface.
sns.set(style="whitegrid")

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['SalePrice'], kde=True, color='skyblue', ax=ax)

ax.set_title('Distribution of Sale Prices', fontsize=20)
ax.set_xlabel('Sale Price', fontsize=15)
ax.set_ylabel('Frequency', fontsize=15)
# `ax` is the current axes, so plt.xticks styles its x tick labels.
plt.xticks(rotation=90, fontsize=12)
sns.despine()

plt.show()

ModuleNotFoundError: No module named 'matplotlib'

# Profiling

In [None]:
# Optional step, currently disabled: build an HTML profiling report for `df`
# with ydata_profiling and write it to reports/data_report.html.
# Uncomment the lines below to run it.
# from ydata_profiling import ProfileReport
# in case of error: pip install typing_extensions==4.7.1 --upgrade
# profile = ProfileReport(df, title="Profiling Report")

# profile.to_file("reports/data_report.html")

# Missing Values

In [None]:
# Per-column null counts, largest first, restricted to columns that have any nulls.
missing_values = (
    df.isnull()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)

sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=missing_values.index, y=missing_values.values, color='skyblue', ax=ax)

ax.set_title('Number of Missing Values in Each Column', fontsize=20)
ax.set_xlabel('Columns', fontsize=15)
ax.set_ylabel('Number of Missing Values', fontsize=15)
# Column names are long, so rotate the tick labels to keep them readable.
plt.xticks(rotation=90, fontsize=12)
sns.despine()
plt.show()

In [None]:
# Imputation plan for columns with missing values.
# Format: <column>  <number of missing rows>  – <strategy>
# (counts presumably come from the missing-value summary computed earlier in
#  this notebook; the plan is presumably what _fill_missing_values implements
#  — TODO confirm against data_utils)
#
# Group 1: a missing value means the feature is absent from the property.
# PoolQC            1453    – code as 'absence'
# MiscFeature       1406    – code as 'absence'
# Alley             1369    – code as 'absence'
# Fence             1179    – code as 'absence'
# MasVnrType        872     – code as 'absence'
# FireplaceQu       690     – code as 'absence'
# GarageType        81      – code as 'absence'
# GarageCond        81      – code as 'absence'
# GarageFinish      81      – code as 'absence'
# GarageQual        81      – code as 'absence'
# BsmtFinType2      38      – code as 'absence'
# BsmtExposure      38      – code as 'absence'
# BsmtQual          37      – code as 'absence'
# BsmtCond          37      – code as 'absence'
# BsmtFinType1      37      – code as 'absence'
# ----------------------------------------------
# Group 2: derive the value from another column.
# GarageYrBlt       81      – fill with YearBuilt
# ----------------------------------------------
# Group 3: fill with the column mode.
# NOTE(review): MasVnrArea and LotFrontage look like numeric measurements by
# name — confirm that mode (rather than median) imputation is intended.
# Electrical        1       – fill in with the most frequently occurring value in the column
# MasVnrArea        8       – fill in with the most frequently occurring value in the column
# LotFrontage       259     – fill in with the most frequently occurring value in the column

In [None]:
# Apply the project's missing-value handling and rebind `df` to the result.
# NOTE(review): because `df` is overwritten, re-running this cell feeds the
# already-processed frame back into the helper; use Restart & Run All for a
# reproducible state.
df = _fill_missing_values(df=df)

# Numerical Features Histograms

In [None]:
# One histogram per numeric column; the trailing ';' suppresses the array-of-axes repr.
numeric_features = df.select_dtypes(include=[np.number])
numeric_features.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8, color='skyblue');

## Engineered Features 

In [None]:
# Add the engineered columns via the project helper and rebind `df`.
# The next cell plots AgeOfProperty, TotalSqFt, GarageBuildYearsAfterHouse,
# TotalBathrooms and TotalPorchSF, so those columns are expected to exist
# after this call.
df = _feature_engineering(df=df)

In [None]:
# Histograms (with KDE overlay) of the five engineered features.
#
# The previous version created a 1x3 grid and then indexed it as axs[0, 0] /
# axs[0, 1]. plt.subplots(1, 3) returns a 1-D array of Axes, so that 2-D
# indexing raised IndexError, and there were only three panels for five
# features. A 2x3 grid is used instead, with the unused sixth panel hidden.

# Setting the style
sns.set(style="whitegrid")

# (column in df, x-axis label) for every engineered feature to plot
engineered_features = [
    ('AgeOfProperty', 'Age of Property'),
    ('TotalSqFt', 'Total Square Feet'),
    ('GarageBuildYearsAfterHouse', 'Years After House'),
    ('TotalBathrooms', 'Total Bathrooms'),
    ('TotalPorchSF', 'Total Porch Sq Ft'),
]

# Creating figure and axes; flatten the 2-D grid so panels can be filled in order
fig, axs = plt.subplots(2, 3, figsize=(15, 10))
panels = axs.ravel()

for ax, (column, x_label) in zip(panels, engineered_features):
    sns.histplot(df[column], ax=ax, color='skyblue', kde=True)
    ax.set_title(f'Histogram of {column}')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

# Hide any panel that did not receive a feature (the sixth one here)
for unused_ax in panels[len(engineered_features):]:
    unused_ax.set_visible(False)

# Displaying the plots
plt.tight_layout()
plt.show()

## One-Hot Encoded Features

In [None]:
# One-hot encode via the project helper and rebind `df` to the encoded frame.
# NOTE(review): presumably this expands the categorical columns into indicator
# columns — confirm the exact column set in data_utils.
df = _one_hot_encoding(df=df)

In [None]:
# Preview the first rows of `df` after the preceding transformations.
df.head()

## Loading Data

# Data split - Train & Test

In [None]:
from data_utils import load_raw_data, load_training_data
# Reload the data through the packaged loaders with feature engineering on;
# this replaces the `df` built step by step in the exploratory cells above.
df = load_raw_data(feature_engineering=True)
# NOTE(review): the unpack order (x_train, x_test, y_train, y_test) follows the
# sklearn train_test_split convention — confirm it matches what
# load_training_data actually returns.
x_train, x_test, y_train, y_test = load_training_data(feature_engineering=True)

## Regression Modeling

In [None]:
from sklearn.linear_model import Lasso, ElasticNet, LinearRegression
from sklearn.metrics import r2_score

# Fit three linear baselines on the training split:
# ordinary least squares, Lasso (L1) and Elastic Net (mixed L1/L2).

linear_regressor = LinearRegression()
linear_regressor.fit(x_train, y_train)

lasso_regressor = Lasso(random_state=0,
                        alpha=1)
lasso_regressor.fit(x_train, y_train)

elasticnet_regressor = ElasticNet(random_state=0,
                                  alpha=1,
                                  l1_ratio=0.18)
# Fit on the training split like every other model. This was previously fitted
# on (x_test, y_test), so the later R² on the test set was measured on the same
# data the model had been trained on (data leakage).
elasticnet_regressor = elasticnet_regressor.fit(x_train, y_train)

In [None]:
# Predict on the held-out split with each linear model, then report R² per model.
lry_pred = linear_regressor.predict(x_test)
lasso_pred = lasso_regressor.predict(x_test)
elasticnet_pred = elasticnet_regressor.predict(x_test)

linear_model_predictions = (
    ('Linear Regressor', lry_pred),
    ('Lasso Regressor', lasso_pred),
    ('Elastic Net Regressor', elasticnet_pred),
)
for model_label, model_pred in linear_model_predictions:
    # `r2` is left holding the score of the last model, as before.
    r2 = r2_score(y_test, model_pred)
    print(f'{model_label} - R²: {r2}')

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth- and leaf-size-limited regression tree, trained on the training split.
dt_params = {
    'random_state': 0,
    'max_depth': 10,
    'min_samples_leaf': 10,
    'min_samples_split': 50,
}
dt_regressor = DecisionTreeRegressor(**dt_params)
dt_regressor.fit(x_train, y_train)

In [None]:
# Held-out R² for the decision tree.
dty_pred = dt_regressor.predict(x_test)
r2 = r2_score(y_true=y_test, y_pred=dty_pred)
print(f'Decision Tree Regressor - R²: {r2}')

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Bagged ensemble: random forest.
rf_params = {
    'n_estimators': 1500,
    'random_state': 0,
    'max_depth': 17,
    'min_samples_split': 3,
    'min_samples_leaf': 10,
}
rf_regressor = RandomForestRegressor(**rf_params)
rf_regressor.fit(x_train, y_train)

# Boosted ensemble: shallow trees with Huber loss.
gb_params = {
    'n_estimators': 2000,
    'learning_rate': 0.05,
    'max_depth': 3,
    'random_state': 0,
    'max_features': 'sqrt',
    'min_samples_split': 10,
    'min_samples_leaf': 13,
    'loss': 'huber',
}
gb_regressor = GradientBoostingRegressor(**gb_params)
gb_regressor.fit(x_train, y_train)

In [None]:
# Held-out predictions for both ensembles, then R² for each.
rfy_pred = rf_regressor.predict(x_test)
gby_pred = gb_regressor.predict(x_test)

r2 = r2_score(y_true=y_test, y_pred=rfy_pred)
print(f'Random Forest Regressor - R²: {r2}')

r2 = r2_score(y_true=y_test, y_pred=gby_pred)
print(f'Gradient Boosting Regressor- R²: {r2}')

In [None]:
from sklearn.svm import SVR

# Linear-kernel support vector regressor.
# Per the scikit-learn SVR docs, `gamma` applies only to the 'rbf'/'poly'/'sigmoid'
# kernels and `degree` only to 'poly'; both are kept as originally written but
# have no effect with kernel='linear'.
svr_params = {
    'C': 40,
    'gamma': 0.0001,
    'degree': 1,
    'kernel': 'linear',
}
svr_regressor = SVR(**svr_params)

svr_regressor.fit(x_train, y_train)
svry_pred = svr_regressor.predict(x_test)

In [None]:
# Held-out R² for the SVR predictions computed in the previous cell.
r2 = r2_score(y_true=y_test, y_pred=svry_pred)
print(f'Support Vector Regression - R²: {r2}')

In [None]:
import xgboost as xgb
from lightgbm import LGBMRegressor


# LightGBM: many small trees with a low learning rate. Keyword arguments are
# passed through exactly as before.
lgb_params = {
    'random_state': 0,
    'num_leaves': 11,
    'n_estimators': 6000,
    'learning_rate': 0.01,
    'verbose': 10,
    'max_bin': 129,
    'bagging_fraction': 1,
    'bagging_freq': 2,
    'bagging_seed': 10,
    'feature_fraction': 0.1,
    'feature_fraction_seed': 1,
    'objective': 'regression',
}
lgb_regressor = LGBMRegressor(**lgb_params)
lgb_regressor.fit(x_train, y_train)

# XGBoost: depth-2 trees with squared-error objective.
xgb_params = {
    'objective': 'reg:squarederror',
    'max_depth': 2,
    'alpha': 0.8298,
    'random_state': 0,
    'n_estimators': 3407,
    'learning_rate': 0.0377,
    'min_child_weight': 0.4565,
    'subsample': 0.9964,
    'reg_lambda': 0.0316,
    'gamma': 0.0009,
    'colsample_bytree': 0.2308,
    'nthread': -1,
}
xgb_regressor = xgb.XGBRegressor(**xgb_params)

xgb_regressor.fit(x_train, y_train)

In [None]:
# Held-out predictions for both boosted models, then R² for each.
lgb_pred = lgb_regressor.predict(x_test)
xgby_pred = xgb_regressor.predict(x_test)

r2 = r2_score(y_true=y_test, y_pred=lgb_pred)
print(f'LightGB - R²: {r2}')

r2 = r2_score(y_true=y_test, y_pred=xgby_pred)
print(f'XGBoost - R²: {r2}')