In [3]:
# house_price_prediction.ipynb

# Import essential libraries
import os
import pandas as pd
import numpy as np
import joblib
import pickle
import warnings
from scipy.stats import boxcox
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, ElasticNetCV, LassoCV
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Ignore warnings for clean output
warnings.filterwarnings('ignore')

# Define directories
data_dir = '/workspace/milestone-project-heritage-housing-issues/data'
models_dir = os.path.join(data_dir, 'models')  # Subdirectory for models
os.makedirs(models_dir, exist_ok=True)  # Ensure models directory exists

# Set file paths
house_data_file = os.path.join(data_dir, 'house_prices_records.csv')
inherited_houses_file = os.path.join(data_dir, 'inherited_houses.csv')

# Import datasets
house_data = pd.read_csv(house_data_file)
inherited_houses = pd.read_csv(inherited_houses_file)

print(f"House Data Shape: {house_data.shape}")
print(f"Inherited Houses Shape: {inherited_houses.shape}")

# Apply log transformation to SalePrice
house_data['SalePrice_Log'] = np.log1p(house_data['SalePrice'])

# Handle missing values in house_data
print("Handling missing values in house_data...")
zero_fill_features = ['2ndFlrSF', 'EnclosedPorch', 'MasVnrArea', 'WoodDeckSF',
                      'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', 'BsmtUnfSF']
for feature in zero_fill_features:
    house_data[feature].fillna(0, inplace=True)

house_data['BedroomAbvGr'].fillna(house_data['BedroomAbvGr'].mode()[0], inplace=True)
house_data['BsmtFinType1'].fillna('None', inplace=True)
house_data['GarageFinish'].fillna('Unf', inplace=True)
house_data['BsmtExposure'].fillna('No', inplace=True)
house_data['KitchenQual'].fillna('TA', inplace=True)

house_data['GarageYrBlt'].fillna(house_data['GarageYrBlt'].median(), inplace=True)
house_data['LotFrontage'].fillna(house_data['LotFrontage'].median(), inplace=True)
house_data['OverallQual'].fillna(house_data['OverallQual'].median(), inplace=True)
house_data['OverallCond'].fillna(house_data['OverallCond'].median(), inplace=True)
house_data['YearBuilt'].fillna(house_data['YearBuilt'].median(), inplace=True)
house_data['YearRemodAdd'].fillna(house_data['YearRemodAdd'].median(), inplace=True)

# Encode categorical features
print("Encoding categorical features in house_data...")
ordinal_mappings = {
    'BsmtFinType1': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'KitchenQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtExposure': {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
    'GarageFinish': {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}
}

for col, mapping in ordinal_mappings.items():
    if col in house_data.columns:
        house_data[col] = house_data[col].map(mapping)

# Identify numeric features
numeric_feats = house_data.select_dtypes(include=[np.number]).columns
skewness = house_data[numeric_feats].apply(lambda x: x.skew()).sort_values(ascending=False)
skewed_features = skewness[abs(skewness) > 0.75].index.tolist()

# Apply log or box-cox transformation
print("Transforming skewed features in house_data...")
lam_dict = {}
for feat in skewed_features:
    if (house_data[feat] <= 0).any():
        house_data[feat] = np.log1p(house_data[feat])
    else:
        try:
            transformed_data, lam = boxcox(house_data[feat])
            house_data[feat] = transformed_data
            lam_dict[feat] = lam
        except ValueError:
            house_data[feat] = np.log1p(house_data[feat])

with open(os.path.join(models_dir, 'skewed_features.pkl'), 'wb') as f:
    pickle.dump(skewed_features, f)
with open(os.path.join(models_dir, 'lam_dict.pkl'), 'wb') as f:
    pickle.dump(lam_dict, f)

# Feature engineering
print("Performing feature engineering in house_data...")
house_data['TotalSF'] = house_data['TotalBsmtSF'] + house_data['1stFlrSF'] + house_data['2ndFlrSF']
house_data['Qual_TotalSF'] = house_data['OverallQual'] * house_data['TotalSF']

# Prepare data for modeling
print("Preparing data for modeling...")
X = house_data.drop(['SalePrice', 'SalePrice_Log'], axis=1, errors='ignore')
y = house_data['SalePrice_Log']

# Define the features based on your metadata
feature_list = [
    '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageArea', 'GarageFinish',
    'GarageYrBlt', 'GrLivArea', 'KitchenQual', 'LotArea', 'LotFrontage',
    'MasVnrArea', 'EnclosedPorch', 'OpenPorchSF', 'OverallCond', 'OverallQual',
    'WoodDeckSF', 'YearBuilt', 'YearRemodAdd'
]

# Ensure the features are in X
X = X[feature_list]

# Feature selection using Random Forest
print("Performing feature selection using Random Forest...")
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
selected_features = importances[:20].index.tolist()

with open(os.path.join(models_dir, 'selected_features.pkl'), 'wb') as f:
    pickle.dump(selected_features, f)

X = X[selected_features]

# Split data
print("Splitting data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling
print("Scaling features...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, os.path.join(models_dir, 'scaler.joblib'))

# Model training
print("Training models...")
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': RidgeCV(alphas=np.logspace(-4, 4, 10)),
    'ElasticNet': ElasticNetCV(alphas=np.logspace(-4, -0.5, 30), l1_ratio=0.5, cv=5, max_iter=10000),
    'Lasso Regression': LassoCV(alphas=np.logspace(-4, -0.5, 30), cv=5, max_iter=10000),
    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=300, learning_rate=0.05, max_depth=3,
        min_samples_leaf=5, max_features=0.8, random_state=42),
    'Random Forest': RandomForestRegressor(
        n_estimators=100, max_depth=None, max_features='sqrt',
        min_samples_leaf=2, random_state=42),
    'XGBoost': XGBRegressor(
        n_estimators=300, learning_rate=0.05, max_depth=5,
        min_child_weight=3, subsample=0.8, colsample_bytree=0.8, random_state=42)
}

# Model evaluation
results = {'Model': [], 'MAE': [], 'RMSE': [], 'R² Score': []}
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_scaled, y_train)
    model_filename = f"{name.replace(' ', '_').lower()}_model.joblib"
    joblib.dump(model, os.path.join(models_dir, model_filename))
    predictions = model.predict(X_test_scaled)
    mae = mean_absolute_error(np.expm1(y_test), np.expm1(predictions))
    rmse = np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(predictions)))
    r2 = r2_score(np.expm1(y_test), np.expm1(predictions))
    results['Model'].append(name)
    results['MAE'].append(mae)
    results['RMSE'].append(rmse)
    results['R² Score'].append(r2)

results_df = pd.DataFrame(results)
print("Model Evaluation Results:")
print(results_df)
results_df.to_csv(os.path.join(models_dir, 'model_evaluation.csv'), index=False)

# Process inherited houses
print("Processing inherited houses...")

# Handle missing values in inherited_houses
for feature in zero_fill_features:
    inherited_houses[feature].fillna(0, inplace=True)
inherited_houses['BedroomAbvGr'].fillna(house_data['BedroomAbvGr'].mode()[0], inplace=True)
inherited_houses['BsmtFinType1'].fillna('None', inplace=True)
inherited_houses['GarageFinish'].fillna('Unf', inplace=True)
inherited_houses['BsmtExposure'].fillna('No', inplace=True)
inherited_houses['KitchenQual'].fillna('TA', inplace=True)
inherited_houses['GarageYrBlt'].fillna(house_data['GarageYrBlt'].median(), inplace=True)
inherited_houses['LotFrontage'].fillna(house_data['LotFrontage'].median(), inplace=True)
inherited_houses['OverallQual'].fillna(house_data['OverallQual'].median(), inplace=True)
inherited_houses['OverallCond'].fillna(house_data['OverallCond'].median(), inplace=True)
inherited_houses['YearBuilt'].fillna(house_data['YearBuilt'].median(), inplace=True)
inherited_houses['YearRemodAdd'].fillna(house_data['YearRemodAdd'].median(), inplace=True)

# Encode categorical features
for col, mapping in ordinal_mappings.items():
    if col in inherited_houses.columns:
        inherited_houses[col] = inherited_houses[col].map(mapping)

# Feature engineering on inherited houses
inherited_houses['TotalSF'] = inherited_houses['TotalBsmtSF'] + inherited_houses['1stFlrSF'] + inherited_houses['2ndFlrSF']
inherited_houses['Qual_TotalSF'] = inherited_houses['OverallQual'] * inherited_houses['TotalSF']

# Transform skewed features
print("Transforming skewed features in inherited_houses...")
for feat in skewed_features:
    if feat in inherited_houses.columns:
        if (inherited_houses[feat] <= 0).any():
            inherited_houses[feat] = np.log1p(inherited_houses[feat])
        else:
            lam = lam_dict.get(feat)
            if lam is not None:
                try:
                    inherited_houses[feat] = boxcox(inherited_houses[feat], lam)
                except ValueError:
                    inherited_houses[feat] = np.log1p(inherited_houses[feat])
            else:
                inherited_houses[feat] = np.log1p(inherited_houses[feat])

# Ensure the features match
inherited_houses = inherited_houses.reindex(columns=selected_features, fill_value=0)

# Scaling
inherited_houses_scaled = scaler.transform(inherited_houses)

# Predictions
print("Making predictions on inherited houses...")
predictions_df = pd.DataFrame()
for name, model in models.items():
    predictions_log = model.predict(inherited_houses_scaled)
    predictions_actual = np.expm1(predictions_log)
    # Handle infinite or NaN values
    predictions_actual = np.where(np.isfinite(predictions_actual), predictions_actual, np.nan)
    predictions_actual = np.nan_to_num(predictions_actual, nan=np.nanmean(predictions_actual))
    predictions_df[name] = predictions_actual

predictions_df.to_csv(os.path.join(models_dir, 'inherited_houses_predictions.csv'), index=False)
print("Predictions saved to 'inherited_houses_predictions.csv'.")

# Optional: Display the predictions
print("Predictions for Inherited Houses:")
print(predictions_df)

House Data Shape: (1460, 24)
Inherited Houses Shape: (4, 23)
Handling missing values in house_data...
Encoding categorical features in house_data...
Transforming skewed features in house_data...
Performing feature engineering in house_data...
Preparing data for modeling...
Performing feature selection using Random Forest...
Splitting data into training and test sets...
Scaling features...
Training models...
Training Linear Regression...
Training Ridge Regression...
Training ElasticNet...
Training Lasso Regression...
Training Gradient Boosting...
Training Random Forest...
Training XGBoost...
Model Evaluation Results:
               Model           MAE          RMSE  R² Score
0  Linear Regression  17819.464468  30869.803484  0.875762
1   Ridge Regression  17805.297367  30876.481047  0.875708
2         ElasticNet  17723.418937  30831.251125  0.876072
3   Lasso Regression  17735.609527  30813.435044  0.876215
4  Gradient Boosting  16964.888419  27959.914880  0.898080
5      Random Forest  