In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [51]:
# Raw string literal: the Windows path is full of backslashes, which a normal
# string literal parses as (invalid) escape sequences and reports as a SyntaxWarning.
df = pd.read_csv(r'C:\Harish\DATA SCIENCE COURSE\Course\Major Projects\House Price Prediction\data\processed\processed_data.csv')

  df = pd.read_csv('C:\Harish\DATA SCIENCE COURSE\Course\Major Projects\House Price Prediction\data\processed\processed_data.csv')


#### Feature Engineering groups for House Price prediction
- Size and Area features
- Age and Time features
- Quality and Condition features
- Other Miscellaneous features

#### **Size & Area Features**

##### Capture total usable space and living quality through area-based features.

| Feature Name      | Description                                                       |
|-------------------|-------------------------------------------------------------------|
| `GrLivArea`        | Above-ground living area                                         |  
| `TotalBsmtSF`      | Total basement area                                              |
| `1stFlrSF`         | First floor square footage                                       |
| `2ndFlrSF`         | Second floor square footage                                      |
| `GarageArea`       | Area of the garage                                               |
| `LotArea`          | Total area of the plot/land                                      |
| `MasVnrArea`       | Masonry veneer area                                              |
| `TotalPorchSF`     | Total porch area (sum of open, enclosed, 3-season, screen)       |
| `TotalHouseSF`     | 1st + 2nd floor + basement                                       |  
| `TotalSqFeet`      | GrLivArea + TotalBsmtSF                                          |  
| `BsmtFinishedSF`   | BsmtFin SF 1 + BsmtFin SF 2 (finished basement only)             |  
| `TotalBath`        | Full + half baths above and below ground (half bath = 0.5)       |  

In [52]:
# List every column name in the loaded frame.
print(list(df.columns))

['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area', 'Garage Qual', 'Garage Cond', 'Paved Drive', 'Wood Deck 

| Feature Name     | In Dataset? |
| ---------------- | ----------- |
| `Gr Liv Area`    | Yes         |
| `Total Bsmt SF`  | Yes         |
| `1st Flr SF`     | Yes         |
| `2nd Flr SF`     | Yes         |
| `Garage Area`    | Yes         |
| `Lot Area`       | Yes         |
| `Mas Vnr Area`   | Yes         |
| `Open Porch SF`  | Yes         |
| `Enclosed Porch` | Yes         |
| `3Ssn Porch`     | Yes         |
| `Screen Porch`   | Yes         |
| `BsmtFin SF 1`   | Yes         |
| `BsmtFin SF 2`   | Yes         |
| `Bsmt Full Bath` | Yes         |
| `Bsmt Half Bath` | Yes         |
| `Full Bath`      | Yes         |
| `Half Bath`      | Yes         |


##### Features to be created

| Feature Name     | How to Create                                                         |
| ---------------- | --------------------------------------------------------------------- |
| `TotalPorchSF`   | `Open Porch SF + Enclosed Porch + 3Ssn Porch + Screen Porch`          |
| `TotalHouseSF`   | `1st Flr SF + 2nd Flr SF + Total Bsmt SF`                             |
| `TotalSqFeet`    | `Gr Liv Area + Total Bsmt SF`                                         |
| `BsmtFinishedSF` | `BsmtFin SF 1 + BsmtFin SF 2`                                         |
| `TotalBath`      | `Full Bath + Half Bath * 0.5 + Bsmt Full Bath + Bsmt Half Bath * 0.5` |


In [53]:
def create_size_features(df):
    """Add aggregate size/area columns to ``df`` and return it.

    New columns are assigned directly on the frame that is passed in, so the
    caller's object is modified and the very same object is returned.

    Columns written:
        TotalPorchSF   - Open Porch SF + Enclosed Porch + 3Ssn Porch + Screen Porch
        TotalHouseSF   - 1st Flr SF + 2nd Flr SF + Total Bsmt SF
        TotalSqFeet    - Gr Liv Area + Total Bsmt SF
        BsmtFinishedSF - BsmtFin SF 1 + BsmtFin SF 2
        TotalBath      - full baths count 1, half baths count 0.5 (above and below ground)

    Raises:
        KeyError: if any of the source columns is missing from ``df``.

    NOTE(review): the sample output shown in this notebook contains negative
    values for these sums, which suggests the source columns were already
    scaled before this step - confirm that summing scaled columns is intended.
    """
    # Each engineered column is the left-to-right sum of its source columns.
    additive_features = {
        "TotalPorchSF": ("Open Porch SF", "Enclosed Porch", "3Ssn Porch", "Screen Porch"),
        "TotalHouseSF": ("1st Flr SF", "2nd Flr SF", "Total Bsmt SF"),
        "TotalSqFeet": ("Gr Liv Area", "Total Bsmt SF"),
        "BsmtFinishedSF": ("BsmtFin SF 1", "BsmtFin SF 2"),
    }
    for feature_name, (first_col, *remaining_cols) in additive_features.items():
        running_total = df[first_col]
        for source_col in remaining_cols:
            running_total = running_total + df[source_col]
        df[feature_name] = running_total

    # Bathrooms: a half bath contributes half a unit.
    half_bath_weight = 0.5
    bath_total = df["Full Bath"] + df["Half Bath"] * half_bath_weight
    bath_total = bath_total + df["Bsmt Full Bath"]
    bath_total = bath_total + df["Bsmt Half Bath"] * half_bath_weight
    df["TotalBath"] = bath_total

    return df


In [54]:
# Append the aggregate size/area columns to the working frame.
df = create_size_features(df)

In [55]:
# Preview the first two rows of the newly engineered size/area columns.
new__size_cols = ['TotalPorchSF', 'TotalHouseSF', 'TotalSqFeet', 'BsmtFinishedSF', 'TotalBath']
df[new__size_cols].head(2)

Unnamed: 0,TotalPorchSF,TotalHouseSF,TotalSqFeet,BsmtFinishedSF,TotalBath
0,-0.532918,0.548251,0.373698,0.137125,-0.443355
1,0.688065,-1.840844,-1.579442,0.613091,-2.349095


#### Age Features

##### Capture the age and renovation history of the house to reflect depreciation or modernization.

| Feature Name       | Description                                                              |
|--------------------|--------------------------------------------------------------------------|
| `YearBuilt`         | Year when the house was originally constructed                          |
| `YearRemodAdd`      | Year of last remodel (can be same as YearBuilt if never remodeled)      |
| `GarageYrBlt`       | Year when the garage was built                                           |
| `YrSold`            | Year when the house was sold                                             |
| `HouseAge`          | Age of house at the time of sale (YrSold - YearBuilt)                   |
| `RemodAge`          | Years since last remodel at time of sale (YrSold - YearRemodAdd)        |
| `GarageAge`         | Age of garage at the time of sale (YrSold - GarageYrBlt)                |


In [56]:
def create_age_features(df):
    """Add HouseAge, RemodAge and GarageAge columns to ``df`` and return it.

    Each age is ``Yr Sold`` minus the corresponding year column. The columns
    are assigned on the frame passed in, and that same object is returned.

    Raises:
        KeyError: if ``Yr Sold`` or any of the year columns is missing.

    NOTE(review): the sample output in this notebook shows ages around 2-3,
    which suggests the year columns were already scaled - confirm that the
    difference of scaled years is the intended feature.
    """
    sale_year = df['Yr Sold']
    origin_year_by_feature = {
        'HouseAge': 'Year Built',
        'RemodAge': 'Year Remod/Add',
        'GarageAge': 'Garage Yr Blt',
    }
    for age_col, year_col in origin_year_by_feature.items():
        df[age_col] = sale_year - df[year_col]
    return df

# Apply the age features, then preview the first two rows of the new columns.
df = create_age_features(df)
new_age_cols = ['HouseAge', 'RemodAge', 'GarageAge']
df[new_age_cols].head(2)

Unnamed: 0,HouseAge,RemodAge,GarageAge
0,2.054036,2.841987,2.388912
1,2.020967,2.794041,2.349733


#### Quality & Condition Features

##### Represent the construction quality and overall physical condition of the property.

| Feature Name        | Description                                                         |
|---------------------|---------------------------------------------------------------------|
| `OverallQual`        | Overall material and finish quality (ordinal 1–10)                 |
| `OverallCond`        | Overall condition of the house (ordinal 1–10)                      |
| `ExterQual`          | Exterior quality (Excellent/Good/Fair/Poor)                        |
| `ExterCond`          | Exterior condition (Excellent/Good/Fair/Poor)                      |
| `KitchenQual`        | Kitchen quality                                                    |
| `GarageQual`         | Garage quality                                                     |
| `GarageCond`         | Garage condition                                                   |
| `BsmtQual`           | Basement height/quality                                            |
| `BsmtCond`           | Basement general condition                                         |
| `HeatingQC`          | Heating quality and condition                                      |
| `FireplaceQu`        | Fireplace quality                                                  |
| `QualityScore`       | Combined score: OverallQual × OverallCond                         |


In [57]:
def create_quality_features(df):
    """Add a ``QualityScore`` column (Overall Qual x Overall Cond) and return ``df``.

    The column is assigned on the frame passed in, and that same object is
    returned.

    Raises:
        KeyError: if ``Overall Qual`` or ``Overall Cond`` is missing.

    NOTE(review): the sample output in this notebook shows fractional and
    negative scores, which suggests both inputs were already scaled - confirm
    that the product of scaled values is the intended feature.
    """
    overall_quality = df['Overall Qual']
    overall_condition = df['Overall Cond']
    df['QualityScore'] = overall_quality * overall_condition
    return df

# Apply the quality feature, then preview the first two rows of the new column.
df = create_quality_features(df)
new_quality_cols = ['QualityScore']
df[new_quality_cols].head(2)

Unnamed: 0,QualityScore
0,0.034079
1,-0.305069


#### Binary & Flag Features

##### Capture key boolean indicators and flags that describe useful property traits.

| Feature Name        | Description                                                             |
|---------------------|-------------------------------------------------------------------------|
| `HasPool`           | 1 if the property has a pool (`Pool Area` > 0), else 0                  |
| `HasGarage`         | 1 if the property has a garage (`Garage Area` > 0), else 0              |
| `HasFireplace`      | 1 if there is at least one fireplace, else 0                            |
| `HasPorch`          | 1 if `TotalPorchSF` > 0, else 0                                         |
| `HasDeck`           | 1 if `Wood Deck SF` > 0, else 0                                         |
| `HasBsmt`           | 1 if `Total Bsmt SF` > 0, else 0                                        |
| `HasRemodel`        | 1 if the house was remodeled (`Year Built` ≠ `Year Remod/Add`), else 0 |

In [58]:
# def create_binary_features(df):
#     df['HasPool'] = df['Pool Area'].apply(lambda x: 1 if x > 0 else 0)
#     df['HasGarage'] = df['Garage Area'].apply(lambda x: 1 if x > 0 else 0)
#     df['HasFireplace'] = df['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
#     df['HasPorch'] = df['TotalPorchSF'].apply(lambda x: 1 if x > 0 else 0)
#     df['HasDeck'] = df['Wood Deck SF'].apply(lambda x: 1 if x > 0 else 0)
#     df['HasBsmt'] = df['Total Bsmt SF'].apply(lambda x: 1 if x > 0 else 0)
#     df['HasRemodel'] = (df['Year Built'] != df['Year Remod/Add']).astype(int)
#     return df
import numpy as np

def create_binary_flags(df):
    """Add 0/1 indicator columns to ``df`` and return it.

    Six flags are 1 where their source column is strictly greater than 0, and
    ``HasRemodel`` is 1 where ``Year Built`` differs from ``Year Remod/Add``.
    The columns are assigned on the frame passed in, and that same object is
    returned.

    ``HasPorch`` reads ``TotalPorchSF``, so that column must already exist.

    Raises:
        KeyError: if any source column is missing.

    NOTE(review): other sample outputs in this notebook show negative values
    in numeric columns, which suggests the inputs were already scaled. If so,
    ``> 0`` no longer means "the property has this feature" - confirm the
    flags are computed on the intended (unscaled) values.
    """
    # flag column -> source column that must be > 0 for the flag to be set
    positive_flag_sources = {
        'HasPool': 'Pool Area',
        'HasGarage': 'Garage Area',
        'HasFireplace': 'Fireplaces',
        'HasPorch': 'TotalPorchSF',
        'HasDeck': 'Wood Deck SF',
        'HasBsmt': 'Total Bsmt SF',
    }
    for flag_name, source_col in positive_flag_sources.items():
        is_positive = df[source_col] > 0
        df[flag_name] = np.where(is_positive, 1, 0)

    # Remodel flag: the remodel year differs from the construction year.
    years_differ = df['Year Built'] != df['Year Remod/Add']
    df['HasRemodel'] = np.where(years_differ, 1, 0)

    return df

# Apply the binary flags, then preview the first two rows of the new columns.
df = create_binary_flags(df)
new_binary_cols = ['HasPool', 'HasGarage', 'HasFireplace', 'HasPorch', 'HasDeck', 'HasBsmt', 'HasRemodel']
df[new_binary_cols].head(2)

Unnamed: 0,HasPool,HasGarage,HasFireplace,HasPorch,HasDeck,HasBsmt,HasRemodel
0,0,1,1,0,1,1,1
1,0,1,0,1,1,0,1


In [59]:
# Column inventory after feature engineering: the names first, then how many.
print(list(df.columns))
print(len(df.columns))

['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area', 'Garage Qual', 'Garage Cond', 'Paved Drive', 'Wood Deck 

##### Drop columns that are no longer needed: those replaced by engineered features, and those already captured by binary flags or aggregates

In [60]:
def drop_redundant_columns(df):
    """Return ``df`` without the columns named in the list below.

    Names that are not present in ``df`` are skipped, so a missing column
    never raises. The result comes from ``DataFrame.drop`` (no ``inplace``),
    so it is a new frame rather than the object passed in.
    """
    redundant = (
        'Order', 'PID', 'Mo Sold', 'Yr Sold',
        # floor areas (summed into TotalHouseSF)
        '1st Flr SF', '2nd Flr SF',
        # basement area breakdown
        'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF',
        # inputs to TotalSqFeet
        'Gr Liv Area', 'Total Bsmt SF',
        # inputs to TotalBath
        'Full Bath', 'Half Bath', 'Bsmt Full Bath', 'Bsmt Half Bath',
        # sources of the Has* flags
        'Fireplaces', 'Pool Area', 'Garage Area',
        # deck and porch areas
        'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch',
        'Screen Porch', '3Ssn Porch',
    )
    present = set(df.columns)
    removable = [name for name in redundant if name in present]
    return df.drop(columns=removable)

In [61]:
# Replace the working frame with the version that has the redundant columns removed.
df = drop_redundant_columns(df)

In [62]:
# Number of columns remaining after the drop.
print(len(df.columns))

75


##### Ordinal Feature Encoding

| Column           | Description             | Values (Low -> High)                                               |
| ---------------- | ----------------------- | ----------------------------------------------------------------- |
| `Exter Qual`     | Exterior quality        | `Po` < `Fa` < `TA` < `Gd` < `Ex`                                  |
| `Exter Cond`     | Exterior condition      | `Po` < `Fa` < `TA` < `Gd` < `Ex`                                  |
| `Bsmt Qual`      | Basement height         | `NA` < `Po` < `Fa` < `TA` < `Gd` < `Ex`                           |
| `Bsmt Cond`      | Basement condition      | `NA` < `Po` < `Fa` < `TA` < `Gd` < `Ex`                           |
| `Bsmt Exposure`  | Walkout or garden level | `NA` < `No` < `Mn` < `Av` < `Gd`                                  |
| `BsmtFin Type 1` | Basement finish type    | `NA` < `Unf` < `LwQ` < `Rec` < `BLQ` < `ALQ` < `GLQ`              |
| `BsmtFin Type 2` | Second basement finish  | same as above                                                     |
| `Heating QC`     | Heating quality         | `Po` < `Fa` < `TA` < `Gd` < `Ex`                                  |
| `Kitchen Qual`   | Kitchen quality         | `Po` < `Fa` < `TA` < `Gd` < `Ex`                                  |
| `Garage Finish`  | Garage interior finish  | `NA` < `Unf` < `RFn` < `Fin`                                      |
| `Garage Qual`    | Garage quality          | `NA` < `Po` < `Fa` < `TA` < `Gd` < `Ex`                           |
| `Garage Cond`    | Garage condition        | same as above                                                     |
| `Paved Drive`    | Driveway paving         | `N` < `P` < `Y`                                                   |
| `Functional`     | Home functionality      | `Sal` < `Sev` < `Maj2` < `Maj1` < `Mod` < `Min2` < `Min1` < `Typ` |


In [63]:
def encode_ordinal_features(df):
    """Replace ordinal category labels with integer ranks and return ``df``.

    Each column listed in ``maps`` that exists in ``df`` is converted with
    ``Series.map`` using its label -> rank dictionary. Where a dictionary has
    an ``np.nan`` key, missing values become rank 0. Labels that are not in
    the dictionary come out as NaN (standard ``Series.map`` behaviour).

    Columns are reassigned on the frame passed in, and that same object is
    returned.

    The function is safe to call more than once: a column whose non-null
    values are already all valid rank codes is left untouched. Without that
    guard, re-running the notebook cell would look the integer codes up in
    the string-keyed dictionaries and turn the whole column into NaN.
    """
    quality_map = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, np.nan: 0}
    exposure_map = {'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4, np.nan: 0}
    bsmtfin_map = {'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6, np.nan: 0}
    garage_finish_map = {'Unf': 1, 'RFn': 2, 'Fin': 3, np.nan: 0}
    paved_drive_map = {'N': 1, 'P': 2, 'Y': 3}
    functional_map = {'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4, 'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8}

    maps = {
        'Exter Qual': quality_map,
        'Exter Cond': quality_map,
        'Bsmt Qual': quality_map,
        'Bsmt Cond': quality_map,
        'Bsmt Exposure': exposure_map,
        'BsmtFin Type 1': bsmtfin_map,
        'BsmtFin Type 2': bsmtfin_map,
        'Heating QC': quality_map,
        'Kitchen Qual': quality_map,
        'Garage Finish': garage_finish_map,
        'Garage Qual': quality_map,
        'Garage Cond': quality_map,
        'Paved Drive': paved_drive_map,
        'Functional': functional_map,
    }

    for col, mapping in maps.items():
        if col not in df.columns:
            continue

        # Idempotence guard: if every non-null value is already one of the
        # target codes, the column was encoded by an earlier call - skip it.
        observed = df[col].dropna()
        if not observed.empty and observed.isin(list(mapping.values())).all():
            continue

        df[col] = df[col].map(mapping)

    return df

In [64]:
# Convert the ordinal category labels in the working frame to integer ranks.
df = encode_ordinal_features(df)

In [65]:
# Columns that encode_ordinal_features converts to integer ranks.
ordinal_cols = [
    'Exter Qual',
    'Exter Cond',
    'Bsmt Qual',
    'Bsmt Cond',
    'Bsmt Exposure',
    'BsmtFin Type 1',
    'BsmtFin Type 2',
    'Heating QC',
    'Kitchen Qual',
    'Garage Finish',
    'Garage Qual',
    'Garage Cond',
    'Paved Drive',
    'Functional',
]

# Sanity check: show the distinct codes present in each encoded column.
for col in ordinal_cols:
    if col not in df.columns:
        continue
    print(f"{col}: {df[col].unique()}")
df[ordinal_cols].head(2)

Exter Qual: [3 4 5 2]
Exter Cond: [3 4 2 1 5]
Bsmt Qual: [3 4 5 0 2 1]
Bsmt Cond: [4 3 0 1 2 5]
Bsmt Exposure: [4 1 2 3 0]
BsmtFin Type 1: [4 3 5 6 1 2 0]
BsmtFin Type 2: [1 2 4 3 0 6 5]
Heating QC: [2 3 5 4 1]
Kitchen Qual: [3 4 5 2 1]
Garage Finish: [3 1 2 0]
Garage Qual: [3 0 2 4 5 1]
Garage Cond: [3 0 2 4 5 1]
Paved Drive: [2 3 1]
Functional: [8 5 7 6 4 3 2 1]


Unnamed: 0,Exter Qual,Exter Cond,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin Type 2,Heating QC,Kitchen Qual,Garage Finish,Garage Qual,Garage Cond,Paved Drive,Functional
0,3,3,3,4,4,4,1,2,3,3,3,3,2,8
1,3,3,3,3,1,3,2,3,3,1,3,3,3,8


In [66]:
# Snapshot the engineered frame so later edits to `df` do not affect what was saved.
final_df = df.copy()

final_df.to_csv(r"C:\Harish\DATA SCIENCE COURSE\Course\Major Projects\House Price Prediction\data\processed\processed_data_with_features.csv", index=False)

print("✅ Processed data with engineered features saved successfully.")

# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after the report.
final_df.info()


✅ Processed data with engineered features saved successfully.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 75 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MS SubClass      2930 non-null   float64
 1   MS Zoning        2930 non-null   object 
 2   Lot Frontage     2440 non-null   float64
 3   Lot Area         2930 non-null   float64
 4   Street           2930 non-null   object 
 5   Alley            198 non-null    object 
 6   Lot Shape        2930 non-null   object 
 7   Land Contour     2930 non-null   object 
 8   Utilities        2930 non-null   object 
 9   Lot Config       2930 non-null   object 
 10  Land Slope       2930 non-null   object 
 11  Neighborhood     2930 non-null   object 
 12  Condition 1      2930 non-null   object 
 13  Condition 2      2930 non-null   object 
 14  Bldg Type        2930 non-null   object 
 15  House Style      2930 non-null   object 
 16

In [67]:
# Preview the first two rows of the saved frame.
final_df.head(2)

Unnamed: 0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,...,RemodAge,GarageAge,QualityScore,HasPool,HasGarage,HasFireplace,HasPorch,HasDeck,HasBsmt,HasRemodel
0,-0.877005,RL,3.072506,2.744381,Pave,,IR1,Lvl,AllPub,Corner,...,2.841987,2.388912,0.034079,0,1,1,0,1,1,1
1,-0.877005,RH,0.461265,0.187097,Pave,,Reg,Lvl,AllPub,Inside,...,2.794041,2.349733,-0.305069,0,1,0,1,1,0,1
