In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

from sklearn.metrics import make_scorer, mean_squared_log_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import IsolationForest
from xgboost import XGBRegressor

In [None]:
# Load the training data and drop the Id column in one step.
df_train = (
    pd.read_csv('/kaggle/input/home-data-for-ml-course/train.csv')
    .drop(columns=['Id'])
)
df_train

In [None]:
# Load the test data. Unlike the training frame, Id is kept here:
# the final cell writes it to the submission file.
df_test = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')
df_test

In [None]:
# Print the data description file that ships with the dataset.
%cat /kaggle/input/home-data-for-ml-course/data_description.txt

# Preliminary Data Analysis

This part is optional; here, I take a preliminary look at the features that appear suspicious.

In [None]:
# Non-null counts of the numeric columns, grouped by how many columns share each count.

df_train.describe().loc['count'].value_counts()

In [None]:
# Non-null counts of the object (string) columns, grouped by how many columns share each count.

df_train.describe(include=object).loc['count'].value_counts()

In [None]:
# Object columns with at most 800 non-null entries, i.e. with a high number of missing values.
# Note: despite its name, `unique_counts` holds the non-null count of each column.

unique_counts = df_train.describe(include='object').loc['count']
missing_data_columns = [name for name, filled in unique_counts.items() if filled <= 800]
missing_data_columns

In [None]:
# Identifying features with suspiciously low variability for further inspection.

def _is_low_variability(series, top_share=0.7, second_share=0.2):
    """Return True when one value dominates the non-null entries of `series`.

    The most frequent value must cover at least `top_share` of the non-null
    entries and the second most frequent value less than `second_share`.
    A column with a single distinct value counts as dominated; a column with
    no non-null entries does not.
    """
    counts = series.value_counts()  # NaN excluded, sorted by descending frequency
    if counts.empty:
        return False
    n_filled = counts.sum()
    runner_up = counts.iloc[1] if len(counts) > 1 else 0
    return counts.iloc[0] >= n_filled * top_share and runner_up < n_filled * second_share


# Columns already flagged for missing data are excluded while building the list.
# (Removing items from a list while iterating over it skips elements.)
suspicious_low_variability_features = [
    col for col in df_train.columns
    if col not in missing_data_columns and _is_low_variability(df_train[col])
]

suspicious_low_variability_features

## Data Visualization #1

In [None]:
# Replacing NaN for visualization.

columns_to_fill = {
    'Alley': 'No alley access',
    'MasVnrType': 'No Masonry',
    'FireplaceQu': 'No Fireplace',
    'PoolQC': 'No Pool',
    'Fence': 'No Fence',
    'MiscFeature': 'No feature',
    'GarageQual': 'No Garage',
    'GarageCond': 'No Garage',
    'BsmtCond': 'No Basement',
    'BsmtFinType2': 'No Basement'
}

# fillna with a dict fills each listed column with its own placeholder and
# returns a new frame, so df_train itself is left untouched.
df_train_copy = df_train.fillna(value=columns_to_fill)

# Checking the number of missing values in the features I will be visualizing.
df_train_copy[suspicious_low_variability_features + missing_data_columns].isna().sum().sum()

In [None]:
# For every column with many missing values: category counts, mean SalePrice
# per category and SalePrice distribution per category, side by side.
for col in missing_data_columns:
    fig, (ax_count, ax_mean, ax_box) = plt.subplots(1, 3, figsize=(18, 8))

    # Number of samples in each category.
    sns.countplot(x=col, data=df_train_copy, ax=ax_count)
    ax_count.set_title(f'Countplot of {col}')

    # Average SalePrice of each category.
    sns.barplot(x=col, y='SalePrice', data=df_train_copy, ax=ax_mean)
    ax_mean.set_title(f'Barplot of {col} vs SalePrice')

    # SalePrice distribution of each category.
    sns.boxplot(x=col, y='SalePrice', data=df_train_copy, ax=ax_box)
    ax_box.set_title(f'Boxplot of {col} vs SalePrice')

    plt.tight_layout()
    plt.show()

In [None]:
# Splitting the low-variability columns into numerical and categorical sub-frames for visualization.

# int64/float64 columns of the low-variability set (a DataFrame, not a list of names).
suspicious_low_variability_features_numeric = df_train_copy[suspicious_low_variability_features].select_dtypes(include=['int64', 'float64'])

# object (string) columns of the low-variability set.
suspicious_low_variability_features_categ = df_train_copy[suspicious_low_variability_features].select_dtypes(include=['object'])

# Summary statistics of the numeric sub-frame.
suspicious_low_variability_features_numeric.describe()

In [None]:
# Categorical low-variability columns, plus two numeric columns that are plotted as categories.
columns_to_plot = list(suspicious_low_variability_features_categ.columns) + ['BsmtHalfBath', 'KitchenAbvGr']

for col in columns_to_plot:
    fig, (ax_count, ax_mean, ax_box) = plt.subplots(1, 3, figsize=(18, 8))

    # Number of samples in each category.
    sns.countplot(x=col, data=df_train_copy, ax=ax_count)
    ax_count.set_title(f'Countplot of {col}')

    # Average SalePrice of each category.
    sns.barplot(x=col, y='SalePrice', data=df_train_copy, ax=ax_mean)
    ax_mean.set_title(f'Barplot of {col} vs SalePrice')

    # SalePrice distribution of each category.
    sns.boxplot(x=col, y='SalePrice', data=df_train_copy, ax=ax_box)
    ax_box.set_title(f'Boxplot of {col} vs SalePrice')

    plt.tight_layout()
    plt.show()

In [None]:
# Numeric low-variability columns, minus the two that the previous loop plots as categories.
# The names are derived from df_train_copy rather than from the previous value
# of `suspicious_low_variability_features_numeric`, so this cell can be re-run
# after the name has been rebound to a list.
numeric_candidates = (
    df_train_copy[suspicious_low_variability_features]
    .select_dtypes(include=['int64', 'float64'])
    .columns
)
suspicious_low_variability_features_numeric = [
    col for col in numeric_candidates if col not in ['BsmtHalfBath', 'KitchenAbvGr']
]

for feature in suspicious_low_variability_features_numeric:
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Violinplot: distribution of the feature on its own.
    sns.violinplot(x=df_train_copy[feature], ax=axes[0])
    axes[0].set_title(f'Violinplot of {feature}')

    # Scatterplot: the feature against the target.
    sns.scatterplot(x=feature, y='SalePrice', data=df_train_copy, ax=axes[1])
    axes[1].set_title(f'{feature} vs SalePrice')

    plt.show()

# Building a data processing pipeline using existing visualization tools and adding additional visualization as needed.

In [None]:
class CustomFeatureEngineer(BaseEstimator, TransformerMixin):
    """Base class for the feature-engineering steps; on its own it is a no-op.

    ``fit`` learns nothing and ``transform`` returns its input unchanged.
    Subclasses override whichever of the two they need.
    """
    def fit(self, X, y=None):
        """Learn nothing; return ``self``."""
        return self
    
    def transform(self, X, y=None):
        """Return ``X`` unchanged (the same object, not a copy)."""
        return X
    
    
class CreateCopy(CustomFeatureEngineer):
    """Pipeline step that hands a copy of the input frame to the following steps."""
    def transform(self, X, y=None):
        """Return a copy of the DataFrame."""
        return X.copy()

    
class HandleMissingValues(CustomFeatureEngineer):
    def transform(self, X, y=None):
        """Fill NaN in selected categorical columns with fixed values.

        ``X`` is modified in place and also returned.
        """
        # Columns grouped by the placeholder they receive.
        basement_columns = ['BsmtCond', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
        garage_columns = ['GarageQual', 'GarageType', 'GarageFinish']

        replacements = {
            'Alley': 'No alley access',
            'MasVnrType': 'No Masonry',
            'FireplaceQu': 'No Fireplace',
            'Fence': 'No Fence',
            'Electrical': 'SBrkr',
            'MiscFeature': 'No feature',
        }
        replacements.update(dict.fromkeys(basement_columns, 'No Basement'))
        replacements.update(dict.fromkeys(garage_columns, 'No Garage'))

        # Each key names a column, each value is what its NaN cells become.
        X.fillna(value=replacements, inplace=True)
        return X

    
class ChangingEntries(CustomFeatureEngineer):
    def transform(self, X, y=None):
        """Merge rare or similar categories, based on visualization and logic.

        ``X`` is modified in place and also returned. Columns touched:
        MSZoning, LotConfig, BldgType, RoofStyle, ExterCond, Electrical and
        GarageQual.
        """
        # MSZoning: keep RM and FV, merge RL with RP, everything else (NaN included) becomes 'Other'.
        zoning_groups = {'RM': 'RM', 'FV': 'FV', 'RL': 'RL_and_RP', 'RP': 'RL_and_RP'}
        X['MSZoning'] = X['MSZoning'].map(zoning_groups).fillna('Other')

        X.loc[~(X.LotConfig == 'CulDSac'), 'LotConfig'] = 'Other'

        # NOTE(review): the data description spells these levels '2FmCon' and 'Duplx',
        # while the CSV files appear to use '2fmCon' and 'Duplex' - confirm against the data.
        # Both spellings are accepted so the merge applies either way.
        multi_family_levels = ['2FmCon', '2fmCon', 'Duplx', 'Duplex']
        X.loc[X.BldgType.isin(multi_family_levels), 'BldgType'] = 'MultiFamily'

        X.loc[~((X.RoofStyle == 'Hip') | (X.RoofStyle == 'Gable')), 'RoofStyle'] = 'Other'

        # ExterCond collapses to two levels: 'Fa' (from Fa and Po) and 'Gd' (everything else).
        X.loc[~((X.ExterCond == 'Fa') | (X.ExterCond == 'Po')), 'ExterCond'] = 'Gd'
        X.loc[X.ExterCond == 'Po', 'ExterCond'] = 'Fa'

        X.loc[X.Electrical == 'Mix', 'Electrical'] = 'FuseP'

        # GarageQual: 'Ex' is merged into 'Gd', 'Po' is treated like having no garage.
        X.loc[X.GarageQual == 'Ex', 'GarageQual'] = 'Gd'
        X.loc[X.GarageQual == 'Po', 'GarageQual'] = 'No Garage'

        return X
    
    
class DropUnnecessaryFeatures(CustomFeatureEngineer):
    def transform(self, X, y=None):
        """Return ``X`` without the discarded columns.

        They are removed due to weak visual differences and significant class
        imbalance, or because they were only used for transformations in prior steps.
        """
        discarded = [
            'PoolQC', 'Utilities', 'Condition2', 'RoofMatl',
            'Heating', 'GarageCond', 'BsmtHalfBath', 'KitchenAbvGr',
        ]
        return X.drop(columns=discarded)
        

In [None]:
# Creating and applying a pipeline to obtain the data frame with which I will continue to work.

preprocessing_pipeline = Pipeline([
    ('create_copy', CreateCopy()),  # Creating a copy
    ('custom_feature_engineer', CustomFeatureEngineer()),  # No-op base transformer: returns the frame unchanged.
    ('handle_missing_values', HandleMissingValues()),  # Handling missing values.
    ('changing_entries', ChangingEntries()),  # Modifying entries.
    ('drop_features', DropUnnecessaryFeatures())  # Removing unnecessary features.
])

# Applying the pipeline.
df_transformed = preprocessing_pipeline.fit_transform(df_train)

df_transformed

# Data Visualization #2

In [None]:
# Visualization of numerical features for making decisions on their processing.

numeric_features = df_transformed.select_dtypes(include=['int64', 'float64'])

# Keep only the columns whose non-null count differs from the number of rows,
# i.e. the numeric columns that contain missing values.
numeric_summary = numeric_features.describe()
filtered_features = numeric_summary.loc[:, numeric_summary.loc['count'] != len(numeric_features)]
filtered_features

In [None]:
# One boxplot per numeric column with missing values, drawn from df_transformed.
for col in filtered_features.columns:
    sns.boxplot(y=col, data=df_transformed)
    plt.show()

In [None]:
# The filled column is assigned back to the frame: `df[col].fillna(..., inplace=True)`
# acts on a temporary Series and does not update the frame under pandas copy-on-write.

# GarageYrBlt - Filling with the median.
garage_median = df_transformed['GarageYrBlt'].median()
df_transformed['GarageYrBlt'] = df_transformed['GarageYrBlt'].fillna(garage_median)

# MasVnrArea - Filling with the median.
masvn_median = df_transformed['MasVnrArea'].median()
df_transformed['MasVnrArea'] = df_transformed['MasVnrArea'].fillna(masvn_median)

# LotFrontage - Truncated mean: mean of the values between the 2.5th and 97.5th percentiles.
lot_values = df_transformed['LotFrontage'].dropna()
lower_bound, upper_bound = np.percentile(lot_values, [2.5, 97.5])
truncated_mean = lot_values[(lot_values >= lower_bound) & (lot_values <= upper_bound)].mean()
df_transformed['LotFrontage'] = df_transformed['LotFrontage'].fillna(truncated_mean)

In [None]:
# Let's build a correlation table.
# NOTE(review): numeric_features was selected before the NaN fills in the previous cell,
# so it presumably still holds the unfilled values - confirm if that matters here.

numeric_features.corr()

In [None]:
# Functions to find the most correlated features.

def get_redundant_pairs(df):
    """Return the set of (column, column) label pairs that carry no new information.

    That is every column paired with itself and, for each unordered pair, the
    ordering in which the later column comes first.
    """
    names = df.columns
    return {
        (names[later], names[earlier])
        for later in range(df.shape[1])
        for earlier in range(later + 1)
    }

def get_top_abs_correlations(df, n=5):
    """Return the `n` largest absolute pairwise correlations of `df`'s columns.

    The result is a Series indexed by (column, column) pairs, sorted in
    descending order, with self-pairs and mirrored pairs removed.
    """
    pairwise = df.corr().abs().unstack()
    distinct_pairs = pairwise.drop(labels=get_redundant_pairs(df))
    ranked = distinct_pairs.sort_values(ascending=False)
    return ranked.iloc[:n]

# Show the ten strongest absolute correlations between distinct numeric features.
print("Top Absolute Correlations")
print(get_top_abs_correlations(numeric_features, 10))

In [None]:
# Based on the correlation analysis, let's remove a few features.

correlated_features = ['GarageCars', 'GarageYrBlt', 'TotRmsAbvGrd',
                       'TotalBsmtSF', 'OverallQual']
df_transformed = df_transformed.drop(columns=correlated_features)
df_transformed

In [None]:
categorical_features = df_transformed.select_dtypes(include=['object'])

# Keep only the columns whose non-null count differs from the number of rows,
# i.e. the categorical columns that still contain missing values.
categorical_summary = categorical_features.describe()
filtered_categorical_features = categorical_summary.loc[:, categorical_summary.loc['count'] != len(categorical_features)]
filtered_categorical_features

In [None]:
# Placeholder (or replacement value) for NaN in each of the listed categorical columns.
fill_values = {
    'BsmtQual': 'No Basement',
    'BsmtExposure': 'No Basement',
    'BsmtFinType1': 'No Basement',
    'Electrical': 'SBrkr',
    'GarageType': 'No Garage',
    'GarageFinish': 'No Garage'
}

# Filling missing values in the respective columns.
df_transformed = df_transformed.fillna(value=fill_values)

In [None]:
categorical_features = df_transformed.select_dtypes(include=['object'])

for feature in categorical_features.columns:
    fig, axes = plt.subplots(1, 3, figsize=(18, 8))

    # Countplot: Display the number of samples for each category.
    sns.countplot(x=feature, data=df_transformed, ax=axes[0])
    axes[0].set_title(f'Countplot of {feature}')
    axes[0].set_ylabel('Count')

    # Barplot: Display the average SalePrice for each category.
    sns.barplot(x=feature, y="SalePrice", data=df_transformed, ax=axes[1])
    axes[1].set_title(f'Barplot of {feature} vs SalePrice')
    axes[1].set_ylabel('SalePrice')

    # Boxplot: Display the SalePrice distribution for each category.
    sns.boxplot(x=feature, y='SalePrice', data=df_transformed, ax=axes[2])
    axes[2].set_title(f'Boxplot of {feature} vs SalePrice')
    axes[2].set_ylabel('SalePrice')

    for ax in axes:
        ax.set_xlabel(feature)
        # Rotate the category labels through tick_params; calling set_xticklabels
        # without fixing the tick positions first makes matplotlib emit a warning.
        ax.tick_params(axis='x', labelrotation=45)

    plt.tight_layout()
    plt.show()

# Creating the remaining data processing classes and forming the final pipeline before model training.

In [None]:
class HandleNumericFeatures(CustomFeatureEngineer):
    """Fill missing values in GarageYrBlt, MasVnrArea and LotFrontage.

    The fill values are learned in ``fit`` and re-used by every later
    ``transform`` call, so data transformed after fitting (e.g. the test set)
    is filled with the statistics of the data the step was fitted on.
    """

    def fit(self, X, y=None):
        """Learn the fill values from ``X``; return ``self``."""
        # GarageYrBlt and MasVnrArea - median.
        self.garage_median_ = X['GarageYrBlt'].median()
        self.masvnr_median_ = X['MasVnrArea'].median()

        # LotFrontage - truncated mean: mean of the values between the 2.5th and 97.5th percentiles.
        lot_values = X['LotFrontage'].dropna()
        lower_bound, upper_bound = np.percentile(lot_values, [2.5, 97.5])
        self.lot_frontage_fill_ = lot_values[(lot_values >= lower_bound) & (lot_values <= upper_bound)].mean()
        return self

    def transform(self, X, y=None):
        """Fill the three columns of ``X`` (modified in place) and return it."""
        if not hasattr(self, 'garage_median_'):
            # Not fitted yet: fall back to statistics of X itself.
            self.fit(X)

        # Assign the filled column back: `X[col].fillna(..., inplace=True)` acts on a
        # temporary Series and does not update X under pandas copy-on-write.
        X['GarageYrBlt'] = X['GarageYrBlt'].fillna(self.garage_median_)
        X['MasVnrArea'] = X['MasVnrArea'].fillna(self.masvnr_median_)
        X['LotFrontage'] = X['LotFrontage'].fillna(self.lot_frontage_fill_)

        # GarageCars, TotRmsAbvGrd, TotalBsmtSF, OverallQual and GarageYrBlt were
        # previously dropped here based on the correlation assessment; after some
        # experiments it turned out to be better to keep them.
        return X

In [None]:
class HandleCategoricalFeatures(CustomFeatureEngineer):
    """Encode the categorical columns as numbers.

    Three schemes are used, each on its own list of columns:

    * label encoding - one integer per category, learned in ``fit``;
    * ordinal encoding - a fixed, hand-written rank per level;
    * target encoding - mean of the target per category, learned in ``fit``,
      with a small amount of Gaussian noise added in ``transform``.

    Parameters
    ----------
    random_state : int or None
        Seed of the generator that produces the target-encoding noise.
    """

    # Code given to labels that were not seen during fit.
    UNKNOWN_LABEL_CODE = 15

    def __init__(self, random_state=None):
        self.columns_to_label_encode = ['MSZoning', 'Alley', 'LotConfig', 'BldgType', 'HouseStyle', 'RoofStyle',
                                        'MasVnrType', 'Foundation', 'CentralAir', 'Electrical', 'GarageFinish',
                                        'PavedDrive', 'Fence', 'GarageType', 'SaleCondition',
                                        'MiscFeature', 'Street', 'Condition1', 'SaleType']
        # One encoder per column, so each column keeps its own category list.
        self.label_encoders = {col: LabelEncoder() for col in self.columns_to_label_encode}

        # Not used by fit/transform; kept so existing attribute access keeps working.
        self.ordinal_encoder = OrdinalEncoder()
        self.columns_to_ordinal_encode = ['LotShape', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                                          'BsmtExposure', 'BsmtFinType1', 'HeatingQC', 'KitchenQual',
                                          'FireplaceQu', 'GarageQual', 'LandContour', 'LandSlope',
                                          'BsmtFinType2', 'Functional']

        # Levels of every ordinal column, listed in rank order (rank 0 first).
        quality = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
        basement_finish = ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'No Basement']
        ordered_levels = {
            'LotShape': ['Reg', 'IR1', 'IR2', 'IR3'],
            'ExterQual': quality,
            'ExterCond': ['Gd', 'Fa'],
            'BsmtQual': quality + ['No Basement'],
            'BsmtCond': quality + ['No Basement'],
            'BsmtExposure': ['Gd', 'Av', 'Mn', 'No', 'No Basement'],
            'BsmtFinType1': basement_finish,
            'HeatingQC': quality,
            'KitchenQual': quality,
            'FireplaceQu': quality + ['No Fireplace'],
            'GarageQual': ['Gd', 'TA', 'Fa', 'No Garage'],
            'LandContour': ['Lvl', 'Bnk', 'HLS', 'Low'],
            'LandSlope': ['Gtl', 'Mod', 'Sev'],
            'BsmtFinType2': basement_finish,
            'Functional': ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal'],
        }
        # {column: {level: rank}}
        self.ordinal_mappings = {
            column: {level: rank for rank, level in enumerate(levels)}
            for column, levels in ordered_levels.items()
        }

        # Add Target Encoding.
        self.columns_to_target_encode = ['Neighborhood', 'Exterior1st', 'Exterior2nd']
        self.target_encodings = {}
        self.global_mean = 0

        # Fix the random_state.
        self.random_state = random_state
        self.random_generator = np.random.RandomState(random_state)

    def fit(self, X, y=None):
        """Learn the label encoders and the target encodings.

        ``y`` is required; it is matched to the rows of ``X`` by position and
        does not have to be present as a column of ``X``.

        Raises
        ------
        ValueError
            If ``y`` is None.
        """
        if y is None:
            raise ValueError("HandleCategoricalFeatures.fit needs the target y for target encoding.")

        # Perform label encoding at the fit stage to preserve category information.
        for col in self.columns_to_label_encode:
            self.label_encoders[col].fit(X[col])

        # Target aligned to X's index, so grouping by X's columns is well defined.
        target = pd.Series(np.asarray(y), index=X.index)

        # The average value of the target variable (fallback for unseen categories).
        self.global_mean = target.mean()

        # Mean target of each category.
        for column in self.columns_to_target_encode:
            self.target_encodings[column] = target.groupby(X[column]).mean().to_dict()

        return self

    def transform(self, X, y=None):
        """Apply the three encodings to ``X`` (modified in place) and return it.

        Labels unseen during fit get ``UNKNOWN_LABEL_CODE``; ordinal levels
        missing from the mapping become NaN; unseen target-encoded categories
        get the global target mean. The noise generator advances on every
        call, so repeated calls give slightly different target-encoded values.
        """
        # Label encoding: a label's code is its position in the encoder's classes_.
        for col in self.columns_to_label_encode:
            codes = {label: code for code, label in enumerate(self.label_encoders[col].classes_)}
            X[col] = X[col].map(codes).fillna(self.UNKNOWN_LABEL_CODE).astype('int64')

        # Ordinal encoding.
        for column in self.columns_to_ordinal_encode:
            X[column] = X[column].map(self.ordinal_mappings[column])

        # Target Encoding with regularization.
        for column in self.columns_to_target_encode:
            # Add random noise to encoded values using a generator with a fixed state.
            noise = self.random_generator.normal(0, 0.01, size=X[column].shape)
            X[column] = X[column].map(self.target_encodings[column]).fillna(self.global_mean) + noise

        return X

In [None]:
# Creating and applying the full preprocessing pipeline; its output is the frame used for feature creation and training.

preprocessing_pipeline = Pipeline([
    ('create_copy', CreateCopy()),  # Creating a copy.
    ('custom_feature_engineer', CustomFeatureEngineer()),  # No-op base transformer: returns the frame unchanged.
    ('handle_missing_values', HandleMissingValues()),  # Handling missing values.
    ('changing_entries', ChangingEntries()),  # Modifying entries.
    ('drop_features', DropUnnecessaryFeatures()),  # Removing unnecessary features.
    ('numeric_feature_handler', HandleNumericFeatures()),  # Processing numerical features.
    ('categorical_feature_handler', HandleCategoricalFeatures(random_state=100))  # Processing categorical features.
])

# Applying the pipeline. df_train still contains SalePrice, which is also passed as the target.
df_train_end = preprocessing_pipeline.fit_transform(df_train, 
                                                    df_train.SalePrice)

df_train_end

In [None]:
# Apply the pipeline fitted on the training data to the test data (transform only, no refit).
df_test_end = preprocessing_pipeline.transform(df_test)

In [None]:
# There were 6 NaN values found in the test data. For simplicity, I will fill them with the median values.
# The medians are computed per column from df_test_end itself.

medians = df_test_end.median()

df_test_end = df_test_end.fillna(medians)
# Total number of NaN cells left after the fill.
df_test_end.isna().sum().sum()

# Creating new features

Here, I am creating new features based on the existing data and adding new features from an additional dataset with macro indicators. I chose these particular features to add based on experiments that will not be shown in this notebook.

In [None]:
# Loading an additional dataset, which I created based on the year of property sale and geographical location.
# The next cell joins it to the housing data on its Year and Month columns.
df_new = pd.read_csv('/kaggle/input/enhancedhousingmarketdata/EnhancedHousingMarketData.csv')
df_new.head(10)

In [None]:
# Adding new features to the original dataset.
# The sale-date columns are renamed to the macro dataset's key names for the
# left join and renamed back afterwards.
macro_columns = ['Year', 'Month', 'AverageWeeklyWagePrivate', 'TotalRealGDP']
to_macro_keys = {'YrSold': 'Year', 'MoSold': 'Month'}
from_macro_keys = {'Year': 'YrSold', 'Month': 'MoSold'}

df_train_end = (
    df_train_end.rename(columns=to_macro_keys)
    .merge(df_new[macro_columns], on=['Year', 'Month'], how='left')
    .rename(columns=from_macro_keys)
)
df_test_end = (
    df_test_end.rename(columns=to_macro_keys)
    .merge(df_new[macro_columns], on=['Year', 'Month'], how='left')
    .rename(columns=from_macro_keys)
)
df_train_end

In [None]:
def map_month_to_season(month):
    """Convert a month number into a season code.

    Months 3-5 give 1, 6-8 give 2, 9-11 give 3 and every other value gives 4.
    """
    if month in [3, 4, 5]:
        return 1
    elif month in [6, 7, 8]:
        return 2
    elif month in [9, 10, 11]:
        return 3
    else:
        return 4


def add_engineered_features(df, bsmt_half_bath):
    """Return a copy of `df` with the engineered feature columns appended.

    Parameters
    ----------
    df : DataFrame
        Preprocessed frame (output of the pipeline plus the macro columns).
    bsmt_half_bath : Series
        Raw BsmtHalfBath values, aligned with `df` by index. The column is
        dropped by the pipeline, so it has to be passed in separately.
    """
    df = df.copy()

    # AgeAtSale: year of sale minus year of construction - age of the house when sold.
    df['AgeAtSale'] = df['YrSold'] - df['YearBuilt']

    # YearsSinceRemodel: year of sale minus year of the last renovation.
    df['YearsSinceRemodel'] = df['YrSold'] - df['YearRemodAdd']

    # TotalSqFt: area of both floors plus the total basement area.
    df['TotalSqFt'] = df['1stFlrSF'] + df['2ndFlrSF'] + df['TotalBsmtSF']

    # Bathrooms: full baths count as 1, half baths as 0.5, above ground and in the basement.
    df['Bathrooms'] = df['FullBath'] + (0.5 * df['HalfBath']) + df['BsmtFullBath'] + (0.5 * bsmt_half_bath)

    # TotalPorchSF: sum of all porch types.
    df['TotalPorchSF'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']

    # PropertyShape: LotShape and LandContour are already ordinal codes at this point,
    # so this is the sum of the two codes - a numeric score, not a combined category.
    df['PropertyShape'] = df['LotShape'] + df['LandContour']

    # OverallQuality&Condition: overall quality plus overall condition.
    df['OverallQuality&Condition'] = df['OverallQual'] + df['OverallCond']

    # TotalBsmtFinSF: total finished basement area.
    df['TotalBsmtFinSF'] = df['BsmtFinSF1'] + df['BsmtFinSF2']

    # NeighborhoodQuality: target-encoded Neighborhood times OverallQual, scaled by 0.1.
    df['NeighborhoodQuality'] = (df['Neighborhood'] * df['OverallQual']) * 0.1

    # FrontageToAreaRatio: street frontage (linear feet) relative to the lot size.
    df['FrontageToAreaRatio'] = df['LotFrontage'] / df['LotArea']

    # TotalLivArea: above-ground living area plus the total basement area.
    df['TotalLivArea'] = df['GrLivArea'] + df['TotalBsmtSF']

    # RoomAverageSize: above-ground living area per above-ground room.
    df['RoomAverageSize'] = df['GrLivArea'] / df['TotRmsAbvGrd']

    # AgeOfGarage: year of sale minus the year the garage was built.
    df['AgeOfGarage'] = df['YrSold'] - df['GarageYrBlt']

    # TotalOutdoorArea: deck, all porches and pool.
    df['TotalOutdoorArea'] = (df['WoodDeckSF'] + df['OpenPorchSF'] + df['EnclosedPorch']
                              + df['3SsnPorch'] + df['ScreenPorch'] + df['PoolArea'])

    # SeasonSold: month of sale converted into a season code, to capture seasonal trends.
    df['SeasonSold'] = df['MoSold'].apply(map_month_to_season)

    return df


# The same feature set is added to both frames, in the same column order.
df_train_end = add_engineered_features(df_train_end, df_train['BsmtHalfBath'])

# The raw test column bypassed the NaN fill applied to df_test_end earlier,
# so any gaps are filled here the same way (with the column median).
test_bsmt_half_bath = df_test['BsmtHalfBath'].fillna(df_test['BsmtHalfBath'].median())
df_test_end = add_engineered_features(df_test_end, test_bsmt_half_bath)


# Model and predictions

In [None]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between `y_true` and `y_pred`.

    Non-positive predictions are replaced by the smallest positive prediction
    (or by 0 when no prediction is positive) before the error is computed.
    The replacement is done on a copy; the caller's `y_pred` is not modified.
    A message is printed when either input contains negative values.
    """
    y_true = np.asarray(y_true)
    y_pred = np.array(y_pred)  # np.array copies, so the clipping below stays local

    if (y_true < 0).any() or (y_pred < 0).any():
        print(f"Negative values found: y_true min = {y_true.min()}, y_pred min = {y_pred.min()}")

    positive_predictions = y_pred[y_pred > 0]
    # Without any positive prediction there is no minimum to take; fall back to 0.
    min_positive = positive_predictions.min() if positive_predictions.size else 0
    y_pred[y_pred <= 0] = min_positive

    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Scorer for model selection; greater_is_better=False because a lower RMSLE is better.
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

## XGBRegressor

In [None]:
X = df_train_end.drop(columns=['SalePrice'])
y = df_train_end['SalePrice']

def anomalies(X):
    """Fit an Isolation Forest on `X` and return its label for every row.

    Returns an array with -1 for rows flagged as anomalies and 1 for normal rows.
    """
    # Creating an Isolation Forest instance
    iso_forest = IsolationForest(max_samples=0.7, contamination=0.015, 
                                 bootstrap=True, n_jobs=-1, random_state=100)
    
    iso_forest.fit(X)
    # Obtaining anomaly labels for the training dataset (-1 for anomalies and 1 for normal points)
    train_outliers = iso_forest.predict(X)
    
    return train_outliers

# Anomaly label of every training row (despite the name, these are labels, not indices).
index_anomalies = anomalies(X)

# Keeping only the rows labelled as normal, with the same mask for X and y.
inlier_mask = index_anomalies == 1
X = X[inlier_mask]
y = y[inlier_mask]

# It's important to ensure that the indices of X_train and y_train still match after filtering
assert X.index.equals(y.index), "Indices of X_train and y_train do not match after filtering anomalies."

model = XGBRegressor(n_estimators=1250,
                     learning_rate=0.03,
                     min_child_weight=2,
                     subsample=0.3,
                     colsample_bytree=0.2,
                     max_depth=4, 
                     random_state=100,
                     reg_alpha=2.5,
                     num_parallel_tree=6)

model.fit(X, y)

# Select the test columns by the training column names: this drops Id and
# guarantees the same feature order as during fitting.
test_predictions = model.predict(df_test_end[X.columns])

# Build the submission in its own frame instead of adding SalePrice to df_test,
# so re-running earlier cells still sees the original test columns.
submission = pd.DataFrame({'Id': df_test['Id'], 'SalePrice': test_predictions})
submission.to_csv('submission.csv', index=False)