# House Price Prediction & Analysis

The objective is to understand the key drivers of house prices and to build a predictive model.

### 1. Set up & configuration

In [None]:
# import libraries
# Standard library
import os
from pathlib import Path

# Core libraries
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# Modeling Libraries
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor , GradientBoostingRegressor

# Utils
# Single seed reused by the models, the KFold splitter and the randomized search
RANDOM_STATE = 42

### 2. Load Data

In [None]:
# Function for loading the data
def wrangle(filepath):
    """Read the CSV found at ``filepath`` and return it as a DataFrame.

    No cleaning or filtering is applied; the table is returned exactly as
    ``pd.read_csv`` parsed it.
    """
    return pd.read_csv(filepath)

In [None]:
# Loading the train and the test data.
# The data directory can be overridden with the HOUSE_PRICE_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable is
# unset, the original location is used.
DATA_DIR = Path(
    os.environ.get(
        "HOUSE_PRICE_DATA_DIR",
        r"C:\Users\User\Desktop\Completed\House_Price_Prediction\project_files",
    )
)
df = wrangle(DATA_DIR / "train.csv")
df_test = wrangle(DATA_DIR / "test.csv")

## Targets and Basic Check

In [None]:
# Name of the column to predict
target = "SalePrice"

# Fail fast when the training data does not contain the target column
assert "SalePrice" in df.columns, "Expected, column 'SalePrice' as the target"

# Every column except the target and the row identifier is used as a feature
excluded = ("SalePrice", "Id")
features = [name for name in df.columns if name not in excluded]
X = df[features]
y = df[target].copy()

# Split the feature names by dtype: numeric vs. everything else
num_cols = X.select_dtypes(include=[np.number]).columns
cat_cols = X.select_dtypes(exclude=[np.number]).columns

# Quick size report
n_rows, n_features = len(df), X.shape[1]
print(f"Num rows: {n_rows}, Num features: {n_features}")
print(f"Numeric features: {len(num_cols)} | Categorical features: {len(cat_cols)}")

### EDA: Distribution and Missing Values

In [None]:
# Display the first rows of the training feature frame
X.head()

In [None]:
# Print pandas' summary of the feature frame (DataFrame.info)
X.info()

In [None]:
# Percentage of missing entries per column, largest first;
# columns without any missing value are left out.
missing_share = df.isna().mean().sort_values(ascending=False)
missing = missing_share.loc[missing_share > 0].mul(100)
missing.head(10)

In [None]:
# Horizontal bar chart of the 20 columns with the most missing values
top_missing = missing.head(20)

fig, ax = plt.subplots(figsize=(6, 4))
top_missing.sort_values().plot(kind="barh", ax=ax)

# Axis labels and title are set on the explicit axes object
ax.set_xlabel("Missing Frequency")
ax.set_ylabel("Features")
ax.set_title("Distribution of Missing Values")

fig.tight_layout()
plt.show();


In [None]:
# Pairwise correlation between the numeric features.
# A dedicated name is used for the numeric sub-frame so that `num_cols`
# (the numeric column index defined earlier) is not rebound to a DataFrame.
X_numeric = X.select_dtypes(include=[np.number])
corr = X_numeric.corr()

# Visualise the correlation matrix with a heatmap on a fixed [-1, 1] colour scale
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=False, vmin=-1, center=0, vmax=1, ax=ax)
ax.set_title("Correlation between numeric features");

In [None]:
# Correlation of every numeric feature with the target, strongest positive first
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
target_corr = df[numeric_cols + [target]].corr(numeric_only=True)[target]
corrs = target_corr.drop(target).sort_values(ascending=False)
corrs.head(10)

In [None]:
# Bar chart of the 15 numeric features most correlated with the target
top_correlated = corrs.head(15)

fig, ax = plt.subplots(figsize=(6, 4))
top_correlated.sort_values().plot(kind="barh", ax=ax)

# Axis labels and title
ax.set_xlabel("Correlation")
ax.set_ylabel("Feature")
ax.set_title("Top correlated Numeric features to House Saleprice");

In [None]:
# Scatter plot of key numerical features against the sale price.
# Candidate names are filtered against df.columns, so a misspelled name is
# silently skipped; "FullBath" previously carried a trailing space and was
# therefore never plotted.
candidate_cols = [
    "OverallQual", "GrLivArea",
    "GarageCars", "GarageArea",
    "TotalBsmtSF", "1stFlrSF",
    "FullBath", "TotRmsAbvGrd",
    "YearBuilt", "YearRemodAdd",
]
key_numeric = [col for col in candidate_cols if col in df.columns]

# One figure per feature
for col in key_numeric:
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.scatter(df[col], y)

    # Add a title
    ax.set_title(f" {col} vs Sales Price")

    # Label Axis
    ax.set_xlabel(col)
    ax.set_ylabel("Sales Price");

In [None]:
# Histogram (50 bins) with a KDE overlay of the sale price
sale_prices = df["SalePrice"]
fig, ax = plt.subplots(figsize=(15, 6))
sns.histplot(sale_prices, color="c", bins=50, kde=True, ax=ax, alpha=0.8)

# Title and axis labels, set directly on the axes
ax.set_title("Distribution of prices of different Houses")
ax.set_ylabel("Frequency")
ax.set_xlabel("Price")

# Render the figure
plt.show();

The distribution of house prices is **skewed to the right**. To bring the target closer to a normal distribution, a log transform (`log1p`) is applied to `SalePrice` during the model evaluation phase.

## Feature Engineering

In [None]:
# Work on a copy so the original feature frame X stays untouched
X_feature = X.copy()


def _has_all(*names):
    """Return True when every named column is present in X_feature."""
    return set(names).issubset(X_feature.columns)


# Area and age features; each is added only if all of its inputs exist
if _has_all("TotalBsmtSF", "1stFlrSF", "2ndFlrSF"):
    X_feature["TotalSF"] = X_feature["TotalBsmtSF"] + X_feature["1stFlrSF"] + X_feature["2ndFlrSF"]

if _has_all("YearBuilt", "YrSold"):
    X_feature["House_Age"] = X_feature["YrSold"] - X_feature["YearBuilt"]

if _has_all("YearRemodAdd", "YrSold"):
    X_feature["SinceRemodel"] = X_feature["YrSold"] - X_feature["YearRemodAdd"]

# Bathrooms: a missing column contributes 0; half baths count as 0.5
full, half, bfull, bhalf = (
    X_feature[name] if name in X_feature.columns else 0
    for name in ("FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath")
)

# TotalBath is only created when FullBath itself is available as a column
if isinstance(full, pd.Series):
    X_feature["TotalBath"] = full + 0.5 * half + bfull + 0.5 * bhalf

# Binary amenities: 1 when the underlying quantity is > 0 (missing counts as 0)
amenity_cols = ["PoolArea", "GarageArea", "TotalBsmtSF", "MasVnrArea", "Fireplaces"]
present_amenities = [name for name in amenity_cols if name in X_feature.columns]
for col in present_amenities:
    X_feature["Has_" + col] = X_feature[col].fillna(0).gt(0).astype(int)

print(f"Engineered features added. New shape: {X_feature.shape}")

In [None]:
 # Preview the engineered frame; head must be called, otherwise the cell shows the bound-method repr
 X_feature.head()

## Preprocessing and Model Pipelines

In [None]:
# Categorize numerical and categorical features.
# The column lists are taken from X_feature (not X) because the models are
# fitted on X_feature; a ColumnTransformer only passes on the columns it is
# given, so listing X's columns would drop every engineered feature.
numerical_cols = X_feature.select_dtypes(include = [np.number]).columns.tolist()
categorical_cols = X_feature.select_dtypes(exclude = [np.number]).columns.tolist()

# Numerical values transformer: fill missing values with the column median
numeric_transfomer = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy = "median"))
    ]
)

# Categorical values transformer: fill missing values with the most frequent
# category, then one-hot encode (handle_unknown = "ignore")
cat_transformer = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy = "most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown = "ignore"))
    ]
)

# Apply each transformer to its own group of columns
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transfomer, numerical_cols),
        ("cat", cat_transformer, categorical_cols)
    ]
)

# Candidate regressors, all seeded with the shared RANDOM_STATE
ridge_model, rf_model, gbr_model = (
    estimator_cls(random_state = RANDOM_STATE)
    for estimator_cls in (Ridge, RandomForestRegressor, GradientBoostingRegressor)
)

# Registry used to look the models up by display name
models = dict(
    zip(
        ("Ridge", "RandomForest", "GradientBoosting"),
        (ridge_model, rf_model, gbr_model),
    )
)

# Cross-validation setup: 5 shuffled folds with a fixed seed
cv = KFold(
    n_splits = 5,
    shuffle = True,
    random_state = RANDOM_STATE,
)

# Models are evaluated against log1p(SalePrice)
y_log = np.log1p(y)

# Evaluate each model
def cv_rsme(model):
    """Cross-validate ``model`` behind the shared preprocessing step.

    The model is wrapped in a pipeline with ``preprocess`` and scored on
    ``X_feature`` / ``y_log`` with the ``cv`` splitter, using the
    "neg_root_mean_squared_error" scorer.

    Returns
    -------
    tuple of (float, float)
        Mean and standard deviation of the per-fold RMSE (log scale).
        Both values are non-negative.
    """
    pipe = Pipeline(steps = [("preprocess", preprocess), ("model", model)])
    neg_rsme = cross_val_score(pipe, X_feature, y_log, scoring = "neg_root_mean_squared_error", cv = cv, n_jobs = -1)
    # The scorer returns negated RMSE values; flip the sign once, up front.
    # Negating the standard deviation (as before) produced a negative spread.
    rsme_per_fold = -neg_rsme
    return rsme_per_fold.mean(), rsme_per_fold.std()

# Cross-validate every candidate and tabulate the scores, best (lowest RMSE) first
results = dict((name, cv_rsme(estimator)) for name, estimator in models.items())
cv_summary = pd.DataFrame(results, index = ["Rsme_mean(log)", "Rsme_std(log)"]).T
cv_summary.sort_values("Rsme_mean(log)")

## Hyperparameter Tuning

In [None]:
# Pick the best model by CV: rows are sorted by mean RMSE, so the first is the best
cv_df = pd.DataFrame(results, index = ["Rsme_mean(log)", "Rsme_std(log)"]).T.sort_values("Rsme_mean(log)")
cv_best = cv_df.index[0]
# Interpolate the name into the message (it was previously printed as a set literal)
print(f"Best base cv Model: {cv_best}")

# Estimator object registered under the winning name
best_model = models[cv_best]

# Search spaces per model name; keys use the "model__" prefix because the
# estimator is the pipeline step named "model".
param_grids = {
    "Ridge": {
        # 20 values from 1e-3 to 1e2. The previous logspace(3, 2, 20) only
        # covered 1000 down to 100, i.e. very strong regularisation only.
        "model__alpha": np.logspace(-3, 2, 20)
    },
    "RandomForest":{
        "model__n_estimators": [200,400,800],
        "model__max_depth": [None, 10, 20, 30],
        "model__min_samples_split": [2,5,10],
        "model__min_samples_leaf":[1,2,4]
    },
    "GradientBoosting":{
        "model__n_estimators": [200, 400, 600],
        "model__learning_rate": [0.03, 0.05, 0.08, 0.1],
        "model__max_depth": [2, 3, 4],
        "model__subsample": [.8, 1.0]
    }
}

# Full pipeline for the winning model and its search space (empty if none is registered)
pipe = Pipeline(steps = [("preprocess",preprocess), ("model", best_model)])
param_grid = param_grids.get(cv_best, {})

if param_grid:
    # Number of distinct combinations in the grid; every entry is a finite
    # sequence of candidate values. Asking for more iterations than there are
    # combinations is pointless, so n_iter is capped at the grid size.
    n_candidates = int(np.prod([len(values) for values in param_grid.values()]))
    search = RandomizedSearchCV(
        pipe,
        param_distributions=param_grid,
        n_iter = min(25, n_candidates),
        cv = cv,
        scoring = "neg_root_mean_squared_error",
        n_jobs = -1,
        random_state = RANDOM_STATE,
        verbose = 1
    )
    search.fit(X_feature, y_log)
    best_pipe = search.best_estimator_
    print(f"Best params: {search.best_params_}")
    # best_score_ is a negated RMSE, so flip the sign for reporting
    print(f"Best CV Rsme(log): {-search.best_score_}")

else:
    print("There is no hyperpameters to tune for this model, using base pipeline")
    best_pipe = pipe

## Fit Final Model & Interpret Features

In [None]:
# Fit the selected pipeline on all training rows, against the log1p-transformed target
best_pipe.fit(X_feature, y_log)

In [None]:
# Get feature names after preprocessing
fitted_preprocess = best_pipe.named_steps["preprocess"]
ohe = fitted_preprocess.named_transformers_["cat"].named_steps["onehot"]
# Numerical features keep their names (first transformer entry: the "num" columns)
num_feat = fitted_preprocess.transformers_[0][2]
# Categorical features expand into one name per one-hot column
cat_feat = ohe.get_feature_names_out(fitted_preprocess.transformers_[1][2])
# All features, numeric first, in the same order as they were declared in the transformer
feature_names = np.concatenate([np.asarray(num_feat, dtype = object), cat_feat])

# Extract importances (tree models) or absolute coefficients (linear models)
model = best_pipe.named_steps["model"]
importance = None
if hasattr(model, "feature_importances_"):
    importance = model.feature_importances_
elif hasattr(model, "coef_"):
    # The attribute is `coef_`; the earlier "coe_" / "_coef_" spellings meant
    # linear models never reached this branch.
    importance = np.abs(np.ravel(model.coef_))
else:
    print("Model does not provide native coefficients")

# Create a DataFrame of the 25 most important features
if importance is not None:
    imp_df = (
        pd.DataFrame({"Feature": feature_names, "Importance": importance})
        .sort_values("Importance", ascending = False)
        .head(25)
    )

# A top-level trailing expression is what the notebook displays; an expression
# nested inside the `if` above would not be shown.
imp_df if importance is not None else None


In [None]:
# Plot feature importance with the pandas plotting API.
# Guarded because imp_df only exists when the model exposes importances.
# The figure size is passed to DataFrame.plot directly rather than opening a
# separate plt.figure first, and the bars are labelled with the feature names
# instead of the frame's row index.
if 'imp_df' in locals():
    ax = imp_df.sort_values("Importance").plot(
        kind = "barh", x = "Feature", y = "Importance", figsize = (8, 6), legend = False
    )
    ax.set_xlabel("Importance")
    ax.set_ylabel("Feature");

In [None]:
# Horizontal bar chart of the importance table (skipped when imp_df was never built)
if 'imp_df' in locals():
    # Reverse the row order so the largest value ends up at the top of the chart
    reversed_imp = imp_df.iloc[::-1]

    fig, ax = plt.subplots(figsize=(8,8))
    ax.barh(reversed_imp['Feature'], reversed_imp['Importance'])

    # Title and axis labels
    ax.set_title('Top Feature Importances / Coefficient')
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')

    fig.tight_layout()
    plt.show()