In [1]:
# 📦 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Conv1D, MaxPooling1D, Flatten, Input, Reshape

In [18]:
df = pd.read_csv("/content/AmesHousing.csv")

In [19]:
df.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Order            2930 non-null   int64  
 1   PID              2930 non-null   int64  
 2   MS SubClass      2930 non-null   int64  
 3   MS Zoning        2930 non-null   object 
 4   Lot Frontage     2440 non-null   float64
 5   Lot Area         2930 non-null   int64  
 6   Street           2930 non-null   object 
 7   Alley            198 non-null    object 
 8   Lot Shape        2930 non-null   object 
 9   Land Contour     2930 non-null   object 
 10  Utilities        2930 non-null   object 
 11  Lot Config       2930 non-null   object 
 12  Land Slope       2930 non-null   object 
 13  Neighborhood     2930 non-null   object 
 14  Condition 1      2930 non-null   object 
 15  Condition 2      2930 non-null   object 
 16  Bldg Type        2930 non-null   object 
 17  House Style   

In [21]:
print(f"Shape: {df.shape}")


Shape: (2930, 82)


In [25]:

def data_clean(df):
    """Cleans the Ames Housing data and prints intermediate steps."""
    print("CLEANING DATA ===")

    # Columns to drop with reasoning
    cols_to_drop = {
        'Order': 'Redundant index column',
        'PID': 'Unique property identifier (not useful for modeling)',
        'Alley': '93% missing values',
        'Pool QC': '99% missing values',
        'Fence': '80% missing values',
        'Misc Feature': '96% missing values',
        'Garage Yr Blt': 'Highly correlated with YearBuilt',
        'Mo Sold': 'Potential target leakage',
        'Yr Sold': 'Potential target leakage'
    }

    print(f"\n→ Dropping columns: {list(cols_to_drop.keys())}")
    df_clean = df.drop(columns=list(cols_to_drop.keys()))

    # Show missing value summary before filling
    print("\n→ Missing values BEFORE filling (Top 10):")
    print(df_clean.isnull().sum().sort_values(ascending=False).head(10))

    # Fill missing values
    df_filled = df_clean.copy()
    for col in df_filled.columns:
        if df_filled[col].dtype == 'object':
            df_filled[col] = df_filled[col].fillna('None')
        else:
            df_filled[col] = df_filled[col].fillna(df_filled[col].median())

    # Show missing value summary after filling
    total_missing = df_filled.isnull().sum().sum()
    print(f"\n→ Missing values AFTER filling: {total_missing} total")

    print("\n→ Cleaned data shape:", df_filled.shape)
    print("\n→ Preview of cleaned data:")
    display(df_filled.head(3))

    return df_filled

# Run the cleaning function
cleaned_data = data_clean(df)


CLEANING DATA ===

→ Dropping columns: ['Order', 'PID', 'Alley', 'Pool QC', 'Fence', 'Misc Feature', 'Garage Yr Blt', 'Mo Sold', 'Yr Sold']

→ Missing values BEFORE filling (Top 10):
Mas Vnr Type      1775
Fireplace Qu      1422
Lot Frontage       490
Garage Qual        159
Garage Cond        159
Garage Finish      159
Garage Type        157
Bsmt Exposure       83
BsmtFin Type 2      81
Bsmt Qual           80
dtype: int64

→ Missing values AFTER filling: 0 total

→ Cleaned data shape: (2930, 73)

→ Preview of cleaned data:


Unnamed: 0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,...,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Sale Type,Sale Condition,SalePrice
0,20,RL,141.0,31770,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,210,62,0,0,0,0,0,WD,Normal,215000
1,20,RH,80.0,11622,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,140,0,0,0,120,0,0,WD,Normal,105000
2,20,RL,81.0,14267,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,393,36,0,0,0,0,12500,WD,Normal,172000


In [27]:
def feature_engineering(df):
    """Performs feature engineering with key printouts."""
    print("FEATURE ENGINEERING ===")
    CURRENT_YEAR = 2025

    df_eng = df.copy()

    # Create new engineered features
    df_eng['House_Age'] = CURRENT_YEAR - df_eng['Year Built']
    df_eng['Years_Since_Remodel'] = CURRENT_YEAR - df_eng['Year Remod/Add']
    df_eng['Total_SF'] = (
        df_eng['Total Bsmt SF'] +
        df_eng['1st Flr SF'] +
        df_eng['2nd Flr SF']
    )
    df_eng['Total_Bathrooms'] = (
        df_eng['Full Bath'] +
        0.5 * df_eng['Half Bath'] +
        df_eng['Bsmt Full Bath'] +
        0.5 * df_eng['Bsmt Half Bath']
    )
    df_eng['Has_Pool'] = (df_eng['Pool Area'] > 0).astype(int)

    # Identify and remove columns that were used to create new features
    cols_to_remove = [
        'Year Built', 'Year Remod/Add', 'Total Bsmt SF',
        '1st Flr SF', '2nd Flr SF', 'Full Bath', 'Half Bath',
        'Bsmt Full Bath', 'Bsmt Half Bath', 'Pool Area'
    ]

    # Display newly created features
    new_features = [col for col in df_eng.columns if col not in df.columns]
    print("\n→ Added features:")
    print(new_features)

    # Drop original component columns
    print("\n→ Dropping used columns:", cols_to_remove)
    df_final = df_eng.drop(columns=cols_to_remove)

    print("\n→ Final shape after engineering:", df_final.shape)
    print("\n→ Preview of engineered data:")
    display(df_final.head(3))

    return df_final

# Apply the feature engineering function
engineered_data = feature_engineering(cleaned_data)


FEATURE ENGINEERING ===

→ Added features:
['House_Age', 'Years_Since_Remodel', 'Total_SF', 'Total_Bathrooms', 'Has_Pool']

→ Dropping used columns: ['Year Built', 'Year Remod/Add', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Full Bath', 'Half Bath', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Pool Area']

→ Final shape after engineering: (2930, 68)

→ Preview of engineered data:


Unnamed: 0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,...,Screen Porch,Misc Val,Sale Type,Sale Condition,SalePrice,House_Age,Years_Since_Remodel,Total_SF,Total_Bathrooms,Has_Pool
0,20,RL,141.0,31770,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,WD,Normal,215000,65,65,2736.0,2.0,0
1,20,RH,80.0,11622,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,120,0,WD,Normal,105000,64,64,1778.0,1.0,0
2,20,RL,81.0,14267,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,0,12500,WD,Normal,172000,67,67,2658.0,1.5,0


In [29]:

def data_prepare(df, target='SalePrice', test_size=0.2, random_state=42):
    """
    Prepares data for modeling:
    - Splits into train/test sets
    - Separates numerical and categorical features
    - Displays dataset diagnostics
    """
    print("FINAL DATA PREPROCESSING")

    # Separate target and features
    print(f"\n→ Target variable: '{target}'")
    if target not in df.columns:
        raise ValueError(f"Target column '{target}' not found in DataFrame.")

    X = df.drop(columns=[target])
    y = df[target]

    # Identify column types
    numeric_cols = X.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"\n→ Detected {len(numeric_cols)} numeric features:")
    print(numeric_cols)

    print(f"\n→ Detected {len(categorical_cols)} categorical features:")
    print(categorical_cols)

    # Train/test split
    print(f"\n→ Splitting data: {int((1 - test_size) * 100)}% train / {int(test_size * 100)}% test")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    print("\n→ Split shapes:")
    print(f"  X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"  X_test : {X_test.shape}, y_test : {y_test.shape}")

    return X_train, X_test, y_train, y_test, numeric_cols, categorical_cols

# Execute preprocessing
X_train, X_test, y_train, y_test, numeric_cols, categorical_cols = data_prepare(engineered_data)


FINAL DATA PREPROCESSING

→ Target variable: 'SalePrice'

→ Detected 28 numeric features:
['MS SubClass', 'Lot Frontage', 'Lot Area', 'Overall Qual', 'Overall Cond', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces', 'Garage Cars', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch', 'Misc Val', 'House_Age', 'Years_Since_Remodel', 'Total_SF', 'Total_Bathrooms', 'Has_Pool']

→ Detected 39 categorical features:
['MS Zoning', 'Street', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC', 'Central Air', 'Electrical', 'Kitchen Qual',

In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, LSTM, Input, Dropout

# ==== 1. PREPROCESSING ====
def build_preprocessor(numeric_cols, categorical_cols):
    return ColumnTransformer([
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

preprocessor = build_preprocessor(numeric_cols, categorical_cols)

# ==== 2A. RANDOM FOREST ====
rf_pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])
rf_pipeline.fit(X_train, y_train)
rf_preds = rf_pipeline.predict(X_test)

# ==== 2B. SUPPORT VECTOR MACHINE ====
svm_pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', SVR(C=100, epsilon=0.2))
])
svm_pipeline.fit(X_train, y_train)
svm_preds = svm_pipeline.predict(X_test)

# ==== 2C. CNN-1D ====
# Prepare data for deep learning
X_train_nn = preprocessor.fit_transform(X_train)
X_test_nn = preprocessor.transform(X_test)

X_train_cnn = X_train_nn.toarray().reshape(-1, X_train_nn.shape[1], 1)
X_test_cnn = X_test_nn.toarray().reshape(-1, X_test_nn.shape[1], 1)

cnn_model = Sequential([
    Input(shape=(X_train_cnn.shape[1], 1)),
    Conv1D(32, kernel_size=3, activation='relu'),
    Conv1D(16, kernel_size=3, activation='relu'),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1)
])
cnn_model.compile(optimizer='adam', loss='mse')
cnn_model.fit(X_train_cnn, y_train, epochs=10, batch_size=32, verbose=0)
cnn_preds = cnn_model.predict(X_test_cnn).flatten()

# ==== 2D. RNN (LSTM) ====
rnn_model = Sequential([
    Input(shape=(X_train_cnn.shape[1], 1)),
    LSTM(32),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1)
])
rnn_model.compile(optimizer='adam', loss='mse')
rnn_model.fit(X_train_cnn, y_train, epochs=10, batch_size=32, verbose=0)
rnn_preds = rnn_model.predict(X_test_cnn).flatten()

# ==== 3. EVALUATION ====
def evaluate_model(name, y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return {
        'Model': name,
        'RMSE': round(rmse, 2),
        'R²': round(r2, 4)
    }

results = [
    evaluate_model("Random Forest", y_test, rf_preds),
    evaluate_model("SVM", y_test, svm_preds),
    evaluate_model("CNN-1D", y_test, cnn_preds),
    evaluate_model("RNN (LSTM)", y_test, rnn_preds),
]

results_df = pd.DataFrame(results)
print("\n🔍 Model Performance Comparison:")
print(results_df.sort_values(by='RMSE'))


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step

🔍 Model Performance Comparison:
           Model       RMSE      R²
0  Random Forest   25108.56  0.9214
2         CNN-1D   67488.20  0.4319
1            SVM   84714.01  0.1049
3     RNN (LSTM)  208871.36 -4.4415
