In [None]:
import pandas as pd
import numpy as np
import sweetviz as sv
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency, mstats, stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Silence every warning for the remainder of the notebook session.
warnings.simplefilter('ignore')

# Read the raw test set and discard the 'Id' column in one expression.
test_data = pd.read_csv('../data/raw/test.csv').drop(columns=['Id'])

In [None]:
# Print a per-column summary (dtype and non-null count) of the loaded test data.
test_data.info()

In [None]:
# Display the (rows, columns) tuple of the loaded test data.
test_data.shape

## Build the necessary pipelines

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np

# Transformer to fill LotFrontage with neighborhood median
class FillLotFrontageByNeighborhood(BaseEstimator, TransformerMixin):
    """Impute missing ``LotFrontage`` values with the median of their ``Neighborhood``.

    Attributes set by ``fit``:
        neigh_medians_: Series indexed by neighborhood holding the median
            ``LotFrontage`` observed for that neighborhood.
    """

    def fit(self, X, y=None):
        """Learn the per-neighborhood median of ``LotFrontage`` from ``X``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self``.
        """
        self.neigh_medians_ = X.groupby('Neighborhood')['LotFrontage'].median()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing ``LotFrontage`` values filled.

        Rows whose neighborhood has no learned median (unseen or missing
        neighborhood) keep NaN, exactly as a failed lookup did before.
        """
        X = X.copy()
        missing = X['LotFrontage'].isna()
        # Vectorized lookup restricted to the rows that need it. Assigning a
        # plain array avoids index alignment, so duplicate index labels and
        # empty frames are handled without special cases.
        X.loc[missing, 'LotFrontage'] = (
            X.loc[missing, 'Neighborhood'].map(self.neigh_medians_).to_numpy()
        )
        return X

# Transformer to fill garage-related columns
class FillGarageCols(BaseEstimator, TransformerMixin):
    """Fill missing garage attributes: 'None' for the categorical ones, 0 for the build year."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the garage columns free of missing values."""
        filled = X.copy()
        categorical = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
        # Fill all four categorical garage columns in a single assignment.
        filled[categorical] = filled[categorical].fillna('None')
        filled['GarageYrBlt'] = filled['GarageYrBlt'].fillna(0)
        return filled

# Transformer to fill basement-related columns
class FillBsmtCols(BaseEstimator, TransformerMixin):
    """Replace missing values in the basement descriptor columns with 'None'."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose basement columns have NaN replaced by 'None'."""
        basement_columns = ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2')
        replacements = {name: X[name].fillna('None') for name in basement_columns}
        # ``assign`` works on a copy, so the caller's frame is left untouched.
        return X.assign(**replacements)

# Transformer to fill specified columns with 'None'
class FillNoneCols(BaseEstimator, TransformerMixin):
    """Replace missing values with the string 'None' in a configurable set of columns.

    Parameters:
        columns: iterable of column names to fill.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with NaN replaced by 'None' in every configured column."""
        result = X.copy()
        for name in self.columns:
            column = result[name]
            result[name] = column.fillna('None')
        return result

# Transformer to fill MasVnrType with most frequent value
class FillMasVnrType(BaseEstimator, TransformerMixin):
    """Impute missing ``MasVnrType`` values with the most frequent value seen in ``fit``."""

    def fit(self, X, y=None):
        """Store the first mode of ``MasVnrType`` as ``most_frequent_``; returns ``self``."""
        modes = X['MasVnrType'].mode()
        self.most_frequent_ = modes[0]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing ``MasVnrType`` replaced by ``most_frequent_``."""
        out = X.copy()
        veneer_type = out['MasVnrType']
        out['MasVnrType'] = veneer_type.fillna(self.most_frequent_)
        return out

# Transformer to fill MasVnrArea with 0
class FillMasVnrArea(BaseEstimator, TransformerMixin):
    """Replace missing ``MasVnrArea`` values with 0."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which missing ``MasVnrArea`` entries are 0."""
        out = X.copy()
        veneer_area = out['MasVnrArea']
        out['MasVnrArea'] = veneer_area.fillna(0)
        return out

# Transformer to fill Electrical with most frequent value
class FillElectrical(BaseEstimator, TransformerMixin):
    """Impute missing ``Electrical`` values with the most frequent value seen in ``fit``."""

    def fit(self, X, y=None):
        """Store the first mode of ``Electrical`` as ``most_frequent_``; returns ``self``."""
        modes = X['Electrical'].mode()
        self.most_frequent_ = modes[0]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing ``Electrical`` replaced by ``most_frequent_``."""
        out = X.copy()
        electrical = out['Electrical']
        out['Electrical'] = electrical.fillna(self.most_frequent_)
        return out

# Columns whose missing entries are replaced with the string 'None'.
none_fill_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']

# Imputation steps run in the order listed; each step receives the frame
# returned by the previous one.
feature_imputation_pipeline = Pipeline(
    steps=[
        ('fill_none_cols', FillNoneCols(none_fill_cols)),
        ('fill_lot_frontage', FillLotFrontageByNeighborhood()),
        ('fill_garage_cols', FillGarageCols()),
        ('fill_bsmt_cols', FillBsmtCols()),
        ('fill_masvnr_type', FillMasVnrType()),
        ('fill_masvnr_area', FillMasVnrArea()),
        ('fill_electrical', FillElectrical()),
    ]
)


class QualMappingTransformer(BaseEstimator, TransformerMixin):
    """Encode categorical columns as numbers using fixed lookup tables.

    Every mapped column goes through ``Series.map(<table>).fillna(0)``, so a
    value that is missing or absent from its table is encoded as 0.
    ``MiscVal`` is bucketed by ``misc_val_mapping`` instead of a table.
    """

    def __init__(self):
        # Shared quality scale used by all the *Qual / *Cond / *QC columns.
        self.qual_mapping = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
        self.bsmt_exposure_mapping = {'No': 0, 'Mn': 1, 'Av': 2, 'Gd': 3}
        self.bsmt_fin_type_mapping = {'Unf': 0, 'LwQ': 1, 'Rec': 2, 'BLQ': 3, 'ALQ': 4, 'GLQ': 5}
        self.functional_mapping = {'Maj2': 0, 'Maj1': 1, 'Mod': 2, 'Min2': 3, 'Min1': 4, 'Typ': 5}
        self.paved_drive_mapping = {'N': 0, 'P': 1, 'Y': 2}
        self.saletype_mapping = {'CWD': 0, 'ConLI': 1, 'ConLD': 2, 'COD': 3, 'New': 4, 'WD': 5}
        self.salecondition_mapping = {'Partial': 0, 'Family': 1, 'Alloca': 2, 'AdjLand': 3, 'Abnorml': 4, 'Normal': 5}
        self.heating_mapping = {'Floor': 1, 'OthW': 2, 'Wall': 3, 'Grav': 4, 'GasW': 5, 'GasA': 6}
        # Defined for completeness; ``transform`` does not read this table.
        self.exter_mapping = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
        self.misc_feature_mapping = {'None': 0, 'Shed': 1, 'Othr': 2, 'Gar2': 5, 'TenC': 10}
        self.garage_type_mapping = {'Basment': 0, 'CarPort': 1, '2Types': 2, 'BuiltIn': 3, 'Detchd': 4, 'Attchd': 5}
        self.garage_finish_mapping = {'Unf': 1, 'RFn': 2, 'Fin': 3}
        self.electrical_mapping = {'Mix': 1, 'FuseP': 2, 'FuseF': 3, 'FuseA': 4, 'SBrkr': 5}
        self.ms_zoning_mapping = {'C (all)': 0, 'RH': 1, 'RM': 2, 'RL': 3, 'FV': 4}
        self.street_mapping = {'Grvl': 0, 'Pave': 1}
        self.alley_mapping = {'None': 0, 'Grvl': 1, 'Pave': 2}
        self.lot_shape_mapping = {'IR3': 0, 'IR2': 1, 'IR1': 2, 'Reg': 3}
        self.land_contour_mapping = {'Low': 0, 'Bnk': 1, 'HLS': 2, 'Lvl': 3}
        self.utilities_mapping = {'NoSeWa': 0, 'AllPub': 1}
        self.lot_config_mapping = {'FR3': 0, 'FR2': 1, 'Inside': 2, 'Corner': 3, 'CulDSac': 4}
        self.land_slope_mapping = {'Sev': 0, 'Mod': 1, 'Gtl': 2}
        # BsmtFinType2 has its own table (different ordering of 'Rec' / 'LwQ').
        self.bsmt_fin_type2_mapping = {'Unf': 0, 'Rec': 1, 'LwQ': 2, 'BLQ': 3, 'ALQ': 4, 'GLQ': 5, 'None': 0}
        self.central_air_mapping = {'N': 0, 'Y': 1}
        self.fence_mapping = {'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}
        self.mas_vnr_type_mapping = {'None': 0, 'BrkCmn': 1, 'BrkFace': 2, 'Stone': 3}

    def misc_val_mapping(self, val):
        """Bucket a ``MiscVal`` amount: 0 for missing/zero, 1 below 1000, 2 below 5000, else 3."""
        if pd.isna(val) or val == 0:
            return 0
        if val < 1000:
            return 1
        return 2 if val < 5000 else 3

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the mapped columns replaced by their numeric codes."""
        encoded = X.copy()

        quality_columns = (
            'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
            'PoolQC', 'FireplaceQu', 'GarageQual', 'GarageCond', 'KitchenQual',
        )

        # Column -> lookup table. The quality columns all share one table.
        lookup_by_column = {name: self.qual_mapping for name in quality_columns}
        lookup_by_column.update({
            # MSZoning appears to have no nulls, but the fillna(0) below is a harmless safeguard.
            'MSZoning': self.ms_zoning_mapping,
            'Street': self.street_mapping,
            'Alley': self.alley_mapping,
            'LotShape': self.lot_shape_mapping,
            'LandContour': self.land_contour_mapping,
            'Utilities': self.utilities_mapping,
            'LotConfig': self.lot_config_mapping,
            'LandSlope': self.land_slope_mapping,
            'BsmtExposure': self.bsmt_exposure_mapping,
            'BsmtFinType1': self.bsmt_fin_type_mapping,
            'BsmtFinType2': self.bsmt_fin_type2_mapping,
            'CentralAir': self.central_air_mapping,
            'Fence': self.fence_mapping,
            'MasVnrType': self.mas_vnr_type_mapping,
            'Functional': self.functional_mapping,
            'PavedDrive': self.paved_drive_mapping,
            'GarageType': self.garage_type_mapping,
            'SaleType': self.saletype_mapping,
            'SaleCondition': self.salecondition_mapping,
            'Heating': self.heating_mapping,
            'MiscFeature': self.misc_feature_mapping,
            'GarageFinish': self.garage_finish_mapping,
            'Electrical': self.electrical_mapping,
        })

        # GarageType is the one column relabelled to 'None' before the lookup.
        encoded['GarageType'] = encoded['GarageType'].fillna('None')

        for name, table in lookup_by_column.items():
            encoded[name] = encoded[name].map(table).fillna(0)

        encoded['MiscVal'] = encoded['MiscVal'].apply(self.misc_val_mapping)

        return encoded

In [None]:
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """Add aggregate features derived from existing basement, bath, year and porch columns.

    Parameters:
        include_bsmt_quality_interaction: when true, and ``BsmtQual`` is
            present with a numeric dtype, also add the product of the total
            finished basement area and ``BsmtQual``.
    """

    def __init__(self, include_bsmt_quality_interaction=True):
        self.include_bsmt_quality_interaction = include_bsmt_quality_interaction

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` extended with the engineered columns."""
        out = X.copy()

        # Finished basement area across both finish types.
        out['TotalFinishedBsmtSF'] = out['BsmtFinSF1'] + out['BsmtFinSF2']

        # Optional interaction term; needs BsmtQual to be numerically encoded already.
        if self.include_bsmt_quality_interaction:
            if 'BsmtQual' in out.columns and pd.api.types.is_numeric_dtype(out['BsmtQual']):
                out['TotalFinishedBsmtSF_BsmtQual_Interaction'] = (
                    out['TotalFinishedBsmtSF'] * out['BsmtQual']
                )

        # Bathroom counts: basement plus above ground.
        out['TotalFullBaths'] = out['BsmtFullBath'] + out['FullBath']
        out['TotalHalfBaths'] = out['BsmtHalfBath'] + out['HalfBath']

        # Age at sale (not clamped) and years since remodel, where anything
        # that is not >= 0 (negative or missing) becomes 0.
        out['HouseAge'] = out['YrSold'] - out['YearBuilt']
        remodel_gap = out['YrSold'] - out['YearRemodAdd']
        out['YearsSinceRemodel'] = remodel_gap.where(remodel_gap >= 0, 0)

        # Total porch/deck area, accumulated in the original left-to-right order.
        porch_total = out['WoodDeckSF']
        for porch_column in ('OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch'):
            porch_total = porch_total + out[porch_column]
        out['TotalPorchArea'] = porch_total

        return out


class FeatureDropper(BaseEstimator, TransformerMixin):
    """Remove a configured set of columns, ignoring names that are not present.

    Parameters:
        features_to_drop: iterable of column names to remove.
    """

    def __init__(self, features_to_drop):
        self.features_to_drop = features_to_drop

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` without the configured columns that exist in it."""
        frame = X.copy()
        # Only names actually present in the frame are handed to ``drop``.
        removable = set(frame.columns).intersection(self.features_to_drop)
        return frame.drop(columns=list(removable), errors='ignore')


# Source columns of the engineered aggregates; dropped to avoid redundancy.
original_component_features = [
    # finished basement area
    'BsmtFinSF1',
    'BsmtFinSF2',
    # bathroom counts
    'BsmtFullBath',
    'FullBath',
    'BsmtHalfBath',
    'HalfBath',
    # construction / remodel years
    'YearBuilt',
    'YearRemodAdd',
    # porch and deck areas
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
]

# Columns considered weakly related to the target; dropped for simplicity.
weakly_related_features = [
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle',
    'RoofStyle',
    'RoofMatl',
    'Exterior1st',
    'Exterior2nd',
    'Foundation',
]

# Full drop list: component columns first, then the weakly related ones.
all_features_to_drop = [*original_component_features, *weakly_related_features]

## Test Data Preprocessing

In [None]:
# End-to-end preprocessing: imputation -> numeric encoding -> engineered
# features -> removal of redundant / weak columns.
pipeline_full = Pipeline(
    steps=[
        ('feature_imputation', feature_imputation_pipeline),
        ('qual_mapping', QualMappingTransformer()),
        ('engineer_features', FeatureEngineeringTransformer(include_bsmt_quality_interaction=True)),
        ('drop_features', FeatureDropper(features_to_drop=all_features_to_drop)),
    ]
)

# NOTE(review): the pipeline is fitted on the test set itself, so the learned
# medians / modes come from test data - confirm this matches how the training
# data was preprocessed.
test_data = pipeline_full.fit_transform(test_data)

In [None]:
# Preview the first rows of the preprocessed test data.
test_data.head()

In [None]:
# Print a per-column summary (dtype and non-null count) after preprocessing.
test_data.info()

In [None]:
# Print the number of missing values per column, in column order.
missing_counts = test_data.isnull().sum()
print(list(missing_counts))

In [None]:
import joblib
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

# Re-attach the raw Neighborhood column to the pipeline-processed test data.
# `test_data` must remain the frame produced by `pipeline_full` above: replacing
# it with the raw CSV would feed unprocessed features (string columns, 'Id',
# no engineered columns) to the imputer and the model.
test_data_org = pd.read_csv('../data/raw/test.csv')
test_data['Neighborhood'] = test_data_org['Neighborhood'].values

# Load processed training data and add the log-transformed target.
train_data = pd.read_csv('../data/processed/processed_train_data.csv')
train_data.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')
train_data['SalePrice_log'] = np.log1p(train_data['SalePrice'])  # log transform target

# Smoothed mean encoding for Neighborhood:
#   (neighborhood_mean * count + global_mean * alpha) / (count + alpha)
# so neighborhoods with few rows are pulled towards the global mean.
alpha = 10
neighborhood_stats = train_data.groupby('Neighborhood')['SalePrice_log'].agg(['mean', 'count'])
global_mean = train_data['SalePrice_log'].mean()

neighborhood_stats['smoothed'] = (
    neighborhood_stats['mean'] * neighborhood_stats['count'] + global_mean * alpha
) / (neighborhood_stats['count'] + alpha)

# Persist the encoding, then read it back so the applied values are exactly
# the ones stored on disk.
neighborhood_stats.reset_index()[['Neighborhood', 'smoothed']].to_csv('../models/neighborhood_encoding.csv', index=False)

neighborhood_encoding_df = pd.read_csv('../models/neighborhood_encoding.csv')
neighborhood_smoothed = dict(zip(neighborhood_encoding_df['Neighborhood'], neighborhood_encoding_df['smoothed']))

# Apply encoding to test data; neighborhoods without an encoding get the global mean.
# Assign the result instead of calling fillna(inplace=True) on a column
# selection, which is deprecated and has no effect under pandas Copy-on-Write.
test_data['Neighborhood_avg_price'] = test_data['Neighborhood'].map(neighborhood_smoothed).fillna(global_mean)
test_data.drop(columns=['Neighborhood'], inplace=True)

# Apply the same encoding to the training data.
train_data['Neighborhood_avg_price'] = train_data['Neighborhood'].map(neighborhood_smoothed).fillna(global_mean)
train_data.drop(columns=['Neighborhood', 'SalePrice'], inplace=True)

# Separate target variable
X_train = train_data.drop(columns=['SalePrice_log'])
y_train_log = train_data['SalePrice_log']

# Give the test features exactly the training columns, in the same order.
# A column missing from the test data raises a KeyError here rather than
# failing later inside the imputer.
X_test = test_data[X_train.columns]

# Impute remaining missing values with each column's most frequent training value.
# ('mod' is not a valid SimpleImputer strategy; the mode is 'most_frequent'.)
imputer = SimpleImputer(strategy='most_frequent')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Load trained model and make predictions.
# NOTE: joblib.load unpickles the file - only load model files from a trusted source.
XGB_model = joblib.load('../models/xgb_model_final.joblib')

predictions_log = XGB_model.predict(X_test_imputed)
predictions = np.expm1(predictions_log)  # reverse log transform

# Create submission file
sample_submission = pd.read_csv('../data/raw/sample_submission.csv')
sample_submission['SalePrice'] = predictions
# The stray index column is only dropped when it is actually present.
sample_submission.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')
sample_submission.to_csv('../submission/test_predictions.csv', index=False)

print("Test predictions saved.")