In [20]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, FunctionTransformer, StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score, make_scorer

class Config:
    """Static configuration: which columns belong to which preprocessing group.

    Each feature should appear in at most one of the groups below, because
    every group is routed to its own transformer and a column listed twice
    would be encoded twice.
    """
    # Ordinal features mapped to their category levels, ordered from worst to best.
    # 'None' is the level used for houses that lack the feature altogether.
    # NOTE: OverallQual / OverallCond are intentionally NOT listed here. They are
    # already integers on a 1-10 scale and are handled through NUMERIC_FEATURES;
    # ordinal-encoding them as well only produced an exact duplicate of the
    # standardized numeric column.
    ORDINAL_FEATURE_MAP = {
        'ExterQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'ExterCond': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'BsmtQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'BsmtCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'BsmtExposure': ['None', 'No', 'Mn', 'Av', 'Gd'],
        'BsmtFinType1': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
        'BsmtFinType2': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
        'HeatingQC': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'KitchenQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'GarageFinish': ['None', 'Unf', 'RFn', 'Fin'],
        'GarageQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'GarageCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'PavedDrive': ['N', 'P', 'Y'],
        'Fence': ['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
        'Functional': ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
        'FireplaceQu': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'PoolQC': ['None', 'Fa', 'TA', 'Gd', 'Ex'],
        'LotShape': ['IR3', 'IR2', 'IR1', 'Reg'],
        'LandContour': ['Low', 'HLS', 'Bnk', 'Lvl'],
        'LandSlope': ['Sev', 'Mod', 'Gtl'],
    }

    # Unordered categorical features (one-hot encoded)
    NOMINAL_FEATURES = [
        'MSSubClass', 'MSZoning', 'Alley', 'LotConfig', 'Neighborhood',
        'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
        'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
        'Heating', 'Electrical', 'GarageType', 'MiscFeature', 'SaleType',
        'SaleCondition',
    ]

    # Non-negative numeric features that are log1p-transformed before scaling
    SKEWED_FEATURES = [
        'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF',
        'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea',
        'WoodDeckSF', 'OpenPorchSF', 'AgeHouse', 'AgeSinceRemod', 'TotalSF',
    ]

    # Numeric features that are only imputed and standardized
    NUMERIC_FEATURES = [
        'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFullBath',
        'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
        'GarageYrBlt', 'GarageCars', 'YrSold', 'IsRemodeled', 'TotalBaths', 'OverallScore',
    ]

class HousingPricePredictor:
    """Class used for training a model and predicting house prices."""
    def __init__(self, config):
        """Store the configuration and create the (still unfitted) estimator.

        Args:
            config: Object exposing the feature-group constants read by the
                preprocessing step (ORDINAL_FEATURE_MAP, NOMINAL_FEATURES,
                SKEWED_FEATURES, NUMERIC_FEATURES).
        """
        # Lasso with the regularisation strength picked by 10-fold CV
        lasso_cv = LassoCV(cv=10, max_iter=20000, random_state=1, n_jobs=-1)

        self.config = config
        # Populated once the preprocessing pipeline has been built
        self.preprocessor = None
        self.model = lasso_cv

    @staticmethod
    def _print_evaluation_metrics(y_true, y_pred, dataset_name):
        """Print R², RMSLE and MAE for one dataset.

        Both inputs are passed through ``np.expm1`` before scoring, i.e. they
        are expected on the log1p scale and the metrics are reported on the
        back-transformed scale.

        Args:
            y_true: True target values (log1p scale).
            y_pred: Predicted target values (log1p scale).
            dataset_name: Label interpolated into the printed header.
        """
        print(f"\n=== Metryki dla zbioru {dataset_name} ===")

        # Undo the log1p transform once instead of once per metric
        actual = np.expm1(y_true)
        predicted = np.expm1(y_pred)

        # Take the square root explicitly: the `squared=False` flag of
        # mean_squared_log_error was deprecated in scikit-learn 1.4 and
        # removed in 1.6, so relying on it breaks on current releases.
        rmsle = np.sqrt(mean_squared_log_error(actual, predicted))

        print(f"R²: {r2_score(actual, predicted):.4f}")
        print(f"RMSLE: {rmsle:.4f}")
        print(f"MAE: {mean_absolute_error(actual, predicted):.2f}")
        
    def _clean_data(self, data):
        """Handles missing values correctly based on their meaning."""
        df = data.copy()
        # Define strategies of handling with missing data
        cols_na_as_none = [
            'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
            'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish',
            'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'MasVnrType'
        ]
        cols_na_as_zero = [
            'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath',
            'BsmtHalfBath', 'GarageArea', 'GarageCars', 'MasVnrArea'
        ]
        # Columns to fill with the most frequent value per category
        cols_na_as_mode = [
            'MSZoning', 'Functional', 'Electrical',
            'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType'
        ]
        # Fill missed values in specified columns
        for col in cols_na_as_none:
            df[col] = df[col].fillna('None')
    
        for col in cols_na_as_zero:
            df[col] = df[col].fillna(0)

        for col in cols_na_as_mode:
            df[col] = df[col].fillna(df[col].mode()[0])

        # Fill with median from neighborhood, then with global median if there are empty values left
        df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(
            lambda x: x.fillna(x.median()))
        
        if df['LotFrontage'].isnull().any():
            df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].median())

        # If no info about GarageYrBlt then fill with house year built
        df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['GarageYrBlt'].median())
        
        # Switch formats of some variables for convinience
        df['MSSubClass'] = df['MSSubClass'].astype(str)
        df['CentralAir'] = df['CentralAir'].map({'Y': 1, 'N': 0})
        
        return df

    def _apply_feature_engineering(self, data):
        """Add custom engineering features to improve predictive power of the model."""
        df = data.copy()
        df['AgeHouse'] = (df['YrSold'] - df['YearBuilt']).clip(lower=0)
        df['AgeSinceRemod'] = (df['YrSold'] - df['YearRemodAdd']).clip(lower=0)
        df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
        df['IsRemodeled'] = (df['YearRemodAdd'] > df['YearBuilt']).astype(int)
        df['TotalBaths'] = df['FullBath'] + 0.5 * df['HalfBath'] + df['BsmtFullBath'] + 0.5 * df['BsmtHalfBath']
        df['OverallScore'] = df['OverallQual'] * df['OverallCond']
        df = df.drop(['Utilities', 'Street', 'MoSold'], axis = 1, errors='ignore')
        return df

    def _filter_outliers(self, X, y):
        """Drop the rows an IsolationForest flags as outliers.

        Args:
            X: Feature frame containing the detection columns listed below.
            y: Target series aligned positionally with ``X``.

        Returns:
            Tuple ``(X_kept, y_kept)`` holding only the rows labelled as inliers.
        """
        # Columns the detector looks at
        detection_columns = [
            'GrLivArea', 'OverallQual', 'TotalSF',
            'YearBuilt', 'TotalBaths', 'GarageArea',
        ]

        # The forest cannot handle NaN, so missing entries are scored as 0
        detection_matrix = X[detection_columns].fillna(0).values

        forest = IsolationForest(n_estimators=100, contamination=0.005, random_state=1)
        # fit_predict labels inliers with 1 and outliers with -1
        labels = forest.fit_predict(detection_matrix)

        # Positional boolean mask, so the result does not depend on the index labels
        is_inlier = labels == 1
        return X.iloc[is_inlier], y.iloc[is_inlier]

    def _build_preprocessor(self):
        """Assemble the ColumnTransformer that prepares every feature group.

        Groups and their treatment:
            num    - median imputation, standardisation
            skewed - median imputation, log1p, standardisation
            ord    - most-frequent imputation, ordinal encoding (unknown
                     categories become -1), standardisation
            nom    - most-frequent imputation, one-hot encoding (unknown
                     categories are ignored)
        Columns not named in any group are passed through unchanged.

        Returns:
            An unfitted ColumnTransformer.
        """
        cfg = self.config

        # Split the ordinal map into parallel lists: column names and their ordered levels
        ordinal_columns = list(cfg.ORDINAL_FEATURE_MAP.keys())
        ordinal_levels = list(cfg.ORDINAL_FEATURE_MAP.values())

        numeric_steps = [
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ]

        skewed_steps = [
            ('imputer', SimpleImputer(strategy='median')),
            ('log', FunctionTransformer(np.log1p, validate=False, feature_names_out='one-to-one')),
            ('scaler', StandardScaler()),
        ]

        ordinal_encoder = OrdinalEncoder(
            categories=ordinal_levels,
            handle_unknown='use_encoded_value',
            unknown_value=-1,
        )
        ordinal_steps = [
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', ordinal_encoder),
            ('scaler', StandardScaler()),
        ]

        nominal_steps = [
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ]

        # One entry per feature group: (name, pipeline, columns)
        return ColumnTransformer(
            transformers=[
                ('num', Pipeline(steps=numeric_steps), cfg.NUMERIC_FEATURES),
                ('skewed', Pipeline(steps=skewed_steps), cfg.SKEWED_FEATURES),
                ('ord', Pipeline(steps=ordinal_steps), ordinal_columns),
                ('nom', Pipeline(steps=nominal_steps), cfg.NOMINAL_FEATURES),
            ],
            remainder='passthrough',
        )

    def train(self, X, y):
        """Fit the preprocessing + regression pipeline and report hold-out metrics.

        The data is split 80/20; the pipeline is fitted on the training part,
        metrics for both parts are printed, and the pipeline is then refitted
        on all of ``X``/``y``. Afterwards ``self.model`` holds the fitted
        Pipeline (preprocessor + regressor).

        Args:
            X: Cleaned and feature-engineered feature frame.
            y: Target values (scored through ``np.expm1`` by the metric printer).
        """
        # Split data into train and validation set
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

        # A previous call replaced self.model with the fitted Pipeline. Unwrap
        # the bare regressor so that calling train() again does not nest one
        # Pipeline inside another (which would break the `.alpha_` lookup).
        regressor = self.model
        if isinstance(regressor, Pipeline):
            regressor = regressor.named_steps['regressor']

        self.preprocessor = self._build_preprocessor()

        pipeline = Pipeline(steps=[
            ('preprocessor', self.preprocessor),
            ('regressor', regressor)
        ])
        pipeline.fit(X_train, y_train)

        # Evaluation on train and validation sets
        y_train_pred = pipeline.predict(X_train)
        y_val_pred = pipeline.predict(X_val)

        # Show performance metrics
        best_alpha = pipeline.named_steps['regressor'].alpha_
        print(f"\nBest alpha found by LassoCV: {best_alpha:.4f}")
        self._print_evaluation_metrics(y_train, y_train_pred, "Training")
        self._print_evaluation_metrics(y_val, y_val_pred, "Validation")

        # Refit on the complete data set; this is the model used by predict()
        pipeline.fit(X, y)
        self.model = pipeline
        
    def predict(self, X_test):
        """Generate predictions on test data."""
        return self.model.predict(X_test)

    def run(self,
            train_path='/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
            test_path='/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
            submission_path='submission.csv'):
        """Run the whole workflow: load, clean, engineer, train, predict, export.

        Args:
            train_path: CSV with the training data, including 'SalePrice'.
            test_path: CSV with the rows to predict, including 'Id'.
            submission_path: Where the submission CSV ('Id', 'SalePrice') is written.

        The defaults reproduce the original hard-coded Kaggle locations, so
        calling ``run()`` without arguments behaves as before.
        """
        # Load both datasets
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        test_ids = test_df['Id']

        # Separate features and target; the model is trained on log1p(SalePrice)
        X = train_df.drop(['SalePrice', 'Id'], axis=1, errors='ignore')
        y = np.log1p(train_df['SalePrice'])
        X_test = test_df.drop(['Id'], axis=1, errors='ignore')

        # Clean the data
        X = self._clean_data(X)
        X_test = self._clean_data(X_test)

        # Add engineered features
        X = self._apply_feature_engineering(X)
        X_test = self._apply_feature_engineering(X_test)

        # Remove outlier rows from the training data only
        X, y = self._filter_outliers(X, y)

        # Reset indices after removing the outlier rows
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)
        X_test = X_test.reset_index(drop=True)

        # Train model
        self.train(X, y)

        # Predictions (still on the log1p scale)
        final_predictions = self.predict(X_test)

        # Generate submission file, converting predictions back with expm1
        submission = pd.DataFrame({'Id': test_ids, 'SalePrice': np.expm1(final_predictions)})
        submission.to_csv(submission_path, index=False)

# Script entry point: build the configuration and the predictor, then run
# the complete train-and-predict workflow.
if __name__ == '__main__':
    config = Config()
    predictor = HousingPricePredictor(config)
    predictor.run()



Best alpha found by LassoCV: 0.0043

=== Metryki dla zbioru Training ===
R²: 0.9096
RMSLE: 0.1153
MAE: 14996.82

=== Metryki dla zbioru Validation ===
R²: 0.9055
RMSLE: 0.1298
MAE: 15171.83
