In [1]:
# house_price_prediction.ipynb

# Import essential libraries
import os
import pandas as pd
import numpy as np
import joblib
import pickle
import warnings
from scipy.stats import boxcox
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, ElasticNetCV, LassoCV
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Silence library warnings so the notebook output stays readable
warnings.filterwarnings('ignore')

# Directory layout
# NOTE(review): data_dir is not referenced in the visible part of this notebook.
data_dir = '/Users/conor/Desktop/JohnProject/notebook'
models_dir = 'models'  # Subdirectory (relative to the working directory) for model artifacts
os.makedirs(models_dir, exist_ok=True)  # No error if the directory is already there

# Locations of the two input CSV files
house_data_file = '/Users/conor/Desktop/JohnProject/data/house_prices_records.csv'
inherited_houses_file = '/Users/conor/Desktop/JohnProject/data/inherited_houses.csv'

# Load the training records first, then the houses to be priced
house_data, inherited_houses = (
    pd.read_csv(csv_path) for csv_path in (house_data_file, inherited_houses_file)
)

# Report the dimensions of both tables
print(f"House Data Shape: {house_data.shape}")
print(f"Inherited Houses Shape: {inherited_houses.shape}")

# Preview the top rows of each table (header line, then the frame)
print("First few rows of house_data:", house_data.head(), sep="\n")
print("First few rows of inherited_houses:", inherited_houses.head(), sep="\n")

# Apply log transformation to SalePrice
# The sale prices are right-skewed; log1p is used so the models are trained on a
# more symmetric target (predictions are mapped back with expm1 later on).
house_data['SalePrice_Log'] = np.log1p(house_data['SalePrice'])

# Handle missing values in house_data
print("\nHandling missing values in house_data...")

# List of features where missing values likely indicate absence of the feature
zero_fill_features = ['2ndFlrSF', 'EnclosedPorch', 'MasVnrArea', 'WoodDeckSF',
                      'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', 'BsmtUnfSF']

for feature in zero_fill_features:
    # Assign the filled column back rather than calling fillna(inplace=True) on
    # the column selection: the in-place form acts on an intermediate object,
    # is deprecated in pandas 2.x and leaves the DataFrame untouched once
    # Copy-on-Write is enabled.
    house_data[feature] = house_data[feature].fillna(0)
    print(f"Filled missing values in {feature} with 0.")

# Fill missing categorical features with mode or default value
categorical_mode_fill = {
    'BedroomAbvGr': house_data['BedroomAbvGr'].mode()[0],
    'BsmtFinType1': 'None',
    'GarageFinish': 'Unf',
    'BsmtExposure': 'No',
    'KitchenQual': 'TA'
}

for feature, value in categorical_mode_fill.items():
    house_data[feature] = house_data[feature].fillna(value)
    print(f"Filled missing values in {feature} with '{value}'.")

# Fill missing numerical features with median
numerical_median_fill = ['GarageYrBlt', 'LotFrontage', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd']

for feature in numerical_median_fill:
    # The median is taken over the non-missing values of the still untransformed column
    median_value = house_data[feature].median()
    house_data[feature] = house_data[feature].fillna(median_value)
    print(f"Filled missing values in {feature} with median value {median_value}.")

# Verify that there are no missing values left (prints only columns that still have gaps)
print("\nChecking for remaining missing values:")
missing_counts = house_data.isnull().sum()  # computed once and reused for the filter
print(missing_counts[missing_counts > 0])

# Convert the ordinal text categories of house_data into integer ranks
print("\nEncoding categorical features in house_data...")

# Rank tables for the ordinal features: a larger number means a better/more
# finished level, and 'None' (feature absent) ranks lowest where it is listed.
ordinal_mappings = {
    'BsmtFinType1': {
        'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6,
    },
    'KitchenQual': {
        'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5,
    },
    'BsmtExposure': {
        'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4,
    },
    'GarageFinish': {
        'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3,
    },
}

for col, mapping in ordinal_mappings.items():
    # Columns that are not present in the table are skipped silently
    if col not in house_data.columns:
        continue
    # NOTE: Series.map turns any label that is missing from the table into NaN
    house_data[col] = house_data[col].map(mapping)
    print(f"Encoded {col} using ordinal mapping.")

# Identify numeric features
numeric_feats = house_data.select_dtypes(include=[np.number]).columns

# Check skewness of numeric features
skewness = house_data[numeric_feats].apply(lambda x: x.skew()).sort_values(ascending=False)
print("\nSkewness of numeric features:")
print(skewness)

# Features with high skewness (threshold can be adjusted).
# The target columns are numeric too, so they are excluded explicitly: otherwise
# SalePrice would be overwritten in place by its Box-Cox transform (and
# SalePrice_Log could be transformed a second time), and the saved feature list
# would contain columns that are not model inputs.
target_columns = ['SalePrice', 'SalePrice_Log']
skewed_features = [
    col for col in skewness[skewness.abs() > 0.75].index
    if col not in target_columns
]
print("\nFeatures with high skewness (|skewness| > 0.75):")
print(skewed_features)

# Apply log or box-cox transformation to skewed features
print("\nTransforming skewed features in house_data...")

# Lambda fitted by Box-Cox for each feature it was applied to.
# A feature that is absent from this dict was transformed with log1p instead.
# NOTE(review): the lambdas are estimated on all rows of house_data, i.e. before
# the train/test split performed later in this script.
lam_dict = {}

for feat in skewed_features:
    if (house_data[feat] <= 0).any():
        # Box-Cox needs strictly positive data, so columns containing zero or
        # negative values fall back to log1p
        house_data[feat] = np.log1p(house_data[feat])
        print(f"Applied log1p transformation to {feat}.")
        continue

    # Only the call that can raise is kept inside the try block
    try:
        transformed_data, lam = boxcox(house_data[feat])
    except ValueError:
        # If box-cox fails, use log1p
        house_data[feat] = np.log1p(house_data[feat])
        print(f"Applied log1p transformation to {feat} (box-cox failed).")
    else:
        house_data[feat] = transformed_data
        lam_dict[feat] = lam
        print(f"Applied box-cox transformation to {feat} with lambda {lam:.4f}.")

# Save skewed features and lambda values for future use
with open(os.path.join(models_dir, 'skewed_features.pkl'), 'wb') as f:
    pickle.dump(skewed_features, f)
with open(os.path.join(models_dir, 'lam_dict.pkl'), 'wb') as f:
    pickle.dump(lam_dict, f)

# Derive combined features from existing columns
print("\nPerforming feature engineering in house_data...")

# Combined floor-area feature: basement + first floor + second floor.
# NOTE: the skew-transformation loop runs earlier in this script, so any of these
# columns that were listed in skewed_features are added on their transformed scale.
house_data['TotalSF'] = (
    house_data['TotalBsmtSF']
    .add(house_data['1stFlrSF'])
    .add(house_data['2ndFlrSF'])
)
print("Created TotalSF feature as sum of TotalBsmtSF, 1stFlrSF, and 2ndFlrSF.")

# Interaction between the overall quality rating and the combined area
house_data['Qual_TotalSF'] = house_data['OverallQual'].mul(house_data['TotalSF'])
print("Created Qual_TotalSF feature as product of OverallQual and TotalSF.")

# Prepare data for modeling
print("\nPreparing data for modeling...")

# Define the features based on the provided metadata
feature_list = [
    '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageArea', 'GarageFinish',
    'GarageYrBlt', 'GrLivArea', 'KitchenQual', 'LotArea', 'LotFrontage',
    'MasVnrArea', 'EnclosedPorch', 'OpenPorchSF', 'OverallCond', 'OverallQual',
    'WoodDeckSF', 'YearBuilt', 'YearRemodAdd', 'TotalSF', 'Qual_TotalSF'  # Include engineered features
]

# Model inputs are exactly the listed features (this also leaves out both target
# columns); the target is the log-transformed sale price.
X = house_data[feature_list]
y = house_data['SalePrice_Log']

# Split data into training and test sets.
# The split is done BEFORE feature selection so that the held-out rows play no
# part in choosing the features; ranking features on the full dataset would leak
# test information into the model and make the evaluation optimistic.
# With the same random_state and row count the row partition does not depend on
# which columns are present.
print("\nSplitting data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature selection using Random Forest
print("\nPerforming feature selection using Random Forest...")

# Use Random Forest (fitted on the training rows only) to estimate feature importances
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Get feature importances, most important first
importances = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print("Feature importances from Random Forest:")
print(importances)

# Select top features (e.g., top 20)
selected_features = importances.index[:20].tolist()
print("\nSelected top features for modeling:")
print(selected_features)

# Save selected features for future use
with open(os.path.join(models_dir, 'selected_features.pkl'), 'wb') as f:
    pickle.dump(selected_features, f)

# Keep only selected features, in importance order, in the full table and both splits
X = X[selected_features]
X_train = X_train[selected_features]
X_test = X_test[selected_features]

# Save train and test data for the dashboard
joblib.dump((X_train, X_test, y_train, y_test), os.path.join(models_dir, 'train_test_data.joblib'))

# Scaling features: the scaler statistics come from the training split only and
# are then applied unchanged to the test split.
print("\nScaling features...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler for future use
joblib.dump(scaler, os.path.join(models_dir, 'scaler.joblib'))

# Fit every candidate regressor and score it on the held-out split
print("\nTraining models...")

# Candidate models, keyed by display name. The CV-based linear models search
# over explicit alpha grids (chosen to avoid numerical instability).
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': RidgeCV(
        alphas=np.logspace(-3, 3, 7),
        cv=5,
    ),
    'ElasticNet': ElasticNetCV(
        alphas=np.logspace(-4, -0.5, 30),
        l1_ratio=[0.1, 0.5, 0.9],
        cv=5,
        max_iter=10000,
    ),
    'Lasso Regression': LassoCV(
        alphas=np.logspace(-3, -0.5, 30),
        cv=5,
        max_iter=10000,
    ),
    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
        min_samples_leaf=5,
        max_features=0.8,
        random_state=42,
    ),
    'Random Forest': RandomForestRegressor(
        n_estimators=100,
        max_depth=None,
        max_features='sqrt',
        min_samples_leaf=2,
        random_state=42,
    ),
    'XGBoost': XGBRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        min_child_weight=3,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    ),
}

# Score table: one list per column, one entry appended per model
print("\nEvaluating models...")
results = {'Model': [], 'MAE': [], 'RMSE': [], 'R² Score': []}

# Test targets mapped back from log1p space; identical for every model
y_test_exp = np.expm1(y_test)

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_scaled, y_train)

    # Persist the fitted estimator as <name in snake case>_model.joblib
    model_filename = f"{name.replace(' ', '_').lower()}_model.joblib"
    joblib.dump(model, os.path.join(models_dir, model_filename))

    # Predict in log space, undo the log1p and floor negative values at zero
    predictions = model.predict(X_test_scaled)
    predictions_exp = np.clip(np.expm1(predictions), 0, None)

    # Error metrics are computed on the back-transformed (expm1) values
    mae = mean_absolute_error(y_test_exp, predictions_exp)
    rmse = np.sqrt(mean_squared_error(y_test_exp, predictions_exp))
    r2 = r2_score(y_test_exp, predictions_exp)

    # Append this model's row to the score table
    metrics = {'Model': name, 'MAE': mae, 'RMSE': rmse, 'R² Score': r2}
    for column, score in metrics.items():
        results[column].append(score)
    print(f"{name} - MAE: {mae:.2f}, RMSE: {rmse:.2f}, R² Score: {r2:.4f}")

# Tabulate the scores
results_df = pd.DataFrame(results)
print("\nModel Evaluation Results:")
print(results_df)

# Write the score table next to the saved models
evaluation_path = os.path.join(models_dir, 'model_evaluation.csv')
results_df.to_csv(evaluation_path, index=False)

# Export the Random Forest importances (the 'importances' Series) as a
# two-column table: feature name and its importance score.
feature_importances = importances.rename_axis('Feature').reset_index(name='Importance')
importances_path = os.path.join(models_dir, 'feature_importances.csv')
feature_importances.to_csv(importances_path, index=False)
print("\nSaved feature importances to 'feature_importances.csv'.")

# Process inherited houses
# The steps below follow the same order as the training pipeline in this script:
# fill missing values -> ordinal encoding -> skew transformation -> feature
# engineering -> feature selection -> scaling -> prediction.
print("\nProcessing inherited houses...")

# Handle missing values in inherited_houses
print("Handling missing values in inherited_houses...")
for feature in zero_fill_features:
    # Assign back instead of fillna(inplace=True) on a column selection, which is
    # deprecated in pandas 2.x and has no effect under Copy-on-Write.
    inherited_houses[feature] = inherited_houses[feature].fillna(0)
    print(f"Filled missing values in {feature} with 0.")

for feature, value in categorical_mode_fill.items():
    inherited_houses[feature] = inherited_houses[feature].fillna(value)
    print(f"Filled missing values in {feature} with '{value}'.")

# Medians must be on the raw scale because inherited_houses is still untransformed
# here. By this point the columns of house_data may already have been log/Box-Cox
# transformed, so its medians can no longer be used; the raw training medians are
# recomputed from the source file instead.
raw_training_medians = pd.read_csv(house_data_file, usecols=numerical_median_fill).median()
for feature in numerical_median_fill:
    median_value = raw_training_medians[feature]
    inherited_houses[feature] = inherited_houses[feature].fillna(median_value)
    print(f"Filled missing values in {feature} with median value {median_value}.")

# Encode categorical features
print("Encoding categorical features in inherited_houses...")
for col, mapping in ordinal_mappings.items():
    if col in inherited_houses.columns:
        inherited_houses[col] = inherited_houses[col].map(mapping)
        print(f"Encoded {col} using ordinal mapping.")

# Transform skewed features.
# The choice of transformation replays what was done during training rather than
# inspecting the inherited data: a feature with an entry in lam_dict was
# Box-Cox transformed (reuse the fitted lambda), every other skewed feature was
# transformed with log1p.
print("\nTransforming skewed features in inherited_houses...")
for feat in skewed_features:
    if feat not in inherited_houses.columns:
        continue
    lam = lam_dict.get(feat)
    if lam is None:
        inherited_houses[feat] = np.log1p(inherited_houses[feat])
        print(f"Applied log1p transformation to {feat}.")
    elif (inherited_houses[feat] <= 0).any():
        # Box-Cox is undefined for non-positive values; keep the best-effort
        # log1p fallback so a prediction can still be produced.
        inherited_houses[feat] = np.log1p(inherited_houses[feat])
        print(f"Applied log1p transformation to {feat} (box-cox failed).")
    else:
        inherited_houses[feat] = boxcox(inherited_houses[feat], lam)
        print(f"Applied box-cox transformation to {feat} with lambda {lam:.4f}.")

# Feature engineering on inherited houses.
# Done AFTER the skew transformation, exactly as for house_data, so TotalSF and
# Qual_TotalSF are on the same scale the models and the scaler were fitted on.
print("Performing feature engineering on inherited_houses...")
inherited_houses['TotalSF'] = inherited_houses['TotalBsmtSF'] + inherited_houses['1stFlrSF'] + inherited_houses['2ndFlrSF']
print("Created TotalSF feature.")
inherited_houses['Qual_TotalSF'] = inherited_houses['OverallQual'] * inherited_houses['TotalSF']
print("Created Qual_TotalSF feature.")

# Ensure the features match: same columns, same order as used for training.
# Columns that are absent are created and filled with 0, which is reported so it
# does not go unnoticed.
missing_features = [col for col in selected_features if col not in inherited_houses.columns]
if missing_features:
    print(f"Warning: features missing from inherited_houses were filled with 0: {missing_features}")
inherited_houses = inherited_houses.reindex(columns=selected_features, fill_value=0)
print("\nReindexed inherited_houses to match selected features.")

# Scaling with the scaler fitted on the training split
print("Scaling inherited houses features...")
inherited_houses_scaled = scaler.transform(inherited_houses)

# Predictions: one column per model, values mapped back from log1p space
print("\nMaking predictions on inherited houses...")
predictions_df = pd.DataFrame()
for name, model in models.items():
    predictions_log = model.predict(inherited_houses_scaled)
    predictions_actual = np.expm1(predictions_log)
    # Handle negative predictions
    predictions_actual[predictions_actual < 0] = 0
    # Store predictions
    predictions_df[name] = predictions_actual
    print(f"Predictions made using {name}.")

# Save predictions to CSV
predictions_df.to_csv(os.path.join(models_dir, 'inherited_houses_predictions.csv'), index=False)
print("\nPredictions saved to 'inherited_houses_predictions.csv'.")

# Optional: Display the predictions
print("\nPredictions for Inherited Houses:")
print(predictions_df)

# Pick the model with the lowest test RMSE and store it as the final model
ranked_results = results_df.sort_values(by='RMSE')
best_model_name = ranked_results['Model'].iloc[0]
print(f"\nBest performing model is {best_model_name}. Saving as final_model.joblib.")
final_model_path = os.path.join(models_dir, 'final_model.joblib')
joblib.dump(models[best_model_name], final_model_path)


House Data Shape: (1460, 24)
Inherited Houses Shape: (4, 23)
First few rows of house_data:
   1stFlrSF  2ndFlrSF  BedroomAbvGr BsmtExposure  BsmtFinSF1 BsmtFinType1  \
0       856     854.0           3.0           No         706          GLQ   
1      1262       0.0           3.0           Gd         978          ALQ   
2       920     866.0           3.0           Mn         486          GLQ   
3       961       NaN           NaN           No         216          ALQ   
4      1145       NaN           4.0           Av         655          GLQ   

   BsmtUnfSF  EnclosedPorch  GarageArea GarageFinish  ...  LotFrontage  \
0        150            0.0         548          RFn  ...         65.0   
1        284            NaN         460          RFn  ...         80.0   
2        434            0.0         608          RFn  ...         68.0   
3        540            NaN         642          Unf  ...         60.0   
4        490            0.0         836          RFn  ...         84.0   

 

['/Users/conor/Desktop/JohnProject/notebook/data1/models1/final_model.joblib']