In [14]:
# PART 1: Downloading and Cleaning
import pandas as pd
import numpy as np

# Load the raw Ames housing data; the first CSV column becomes the row index.
df = pd.read_csv("AmesHousing.csv", index_col=0)

# Placeholder columns are recognised by name ('unnamed' / 'no meaning',
# case-insensitive); 'PID' is dropped together with them.
unnamed_cols = [
    name for name in df.columns
    if any(marker in name.lower() for marker in ('unnamed', 'no meaning'))
]
columns_to_drop = unnamed_cols + ['PID']

# errors='ignore' skips any listed column that is not present instead of raising.
df_cleaned = df.drop(columns=columns_to_drop, errors='ignore')

print(f"Dropped {len(unnamed_cols)} unnamed columns and PID")
print(f"DataFrame shape: {df_cleaned.shape}")
# Fixed: the split and the target are created in Part 3 of this notebook, not Part 2.
print("Note: Target creation and splitting moved to Part 3 to prevent data leakage.")

Dropped 0 unnamed columns and PID
DataFrame shape: (2930, 80)
Note: Target creation and splitting moved to Part 2 to prevent data leakage.


In [15]:
# Part 2 Missing Value Imputation

# Report the ten columns with the most missing values before any imputation.
missing_per_column = df_cleaned.isna().sum()
worst_ten = missing_per_column.sort_values(ascending=False).head(10)
print("Missing values before imputation:")
print(worst_ten)
print("-" * 30)

# Keep only rows whose dependent variable is present.
# .copy() makes df_sample1 independent of df_cleaned, so later column
# assignments do not raise chained-assignment warnings.
DV = 'SalePrice'
has_target = df_cleaned[DV].notna()
df_sample1 = df_cleaned.loc[has_target].copy()

# --- 1. Impute "Meaningful NA" Categoricals ---
# In these columns a missing value is itself a category (e.g. "No Basement"),
# so it is replaced by the explicit label 'None' instead of being imputed.
# Fixed: 'Fireplace Qu' was spelled 'FireplaceQu', which matched no column; the
# membership guard below then skipped it silently and its NaNs were later
# filled with the column mode (a real quality grade).
meaningful_na_columns = [
    'Alley', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
    'BsmtFin Type 1', 'BsmtFin Type 2', 'Fireplace Qu',
    'Garage Type', 'Garage Finish', 'Garage Qual', 'Garage Cond',
    'Pool QC', 'Fence', 'Misc Feature', 'Mas Vnr Type'
]

# Surface names that match no column so a typo cannot go unnoticed again.
# Columns that are genuinely absent are still skipped rather than raising.
unmatched_na_columns = [col for col in meaningful_na_columns if col not in df_sample1.columns]
if unmatched_na_columns:
    print(f"Warning: meaningful-NA columns not found (check spelling): {unmatched_na_columns}")

for col in meaningful_na_columns:
    if col in df_sample1.columns:
        df_sample1[col] = df_sample1[col].fillna('None')

# --- 2. Numerical Imputation ---
# df_sample2 is a copy of df_sample1 in which every numeric NaN ends up filled.
df_sample2 = df_sample1.copy()

# A. Context-aware zero fill
# Each key is an "anchor" categorical column. Where the anchor equals 'None'
# (the feature is absent), NaNs in the listed numeric columns become 0 rather
# than a median. Rows whose anchor is anything else are left for step C.
zero_fill_groups = {
    'Bsmt Qual': ['Total Bsmt SF', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Bsmt Full Bath', 'Bsmt Half Bath'],
    'Garage Type': ['Garage Cars', 'Garage Area', 'Garage Yr Blt'],
    'Mas Vnr Type': ['Mas Vnr Area'],
}

for anchor_col, dependent_cols in zero_fill_groups.items():
    if anchor_col not in df_sample2.columns:
        continue
    feature_absent = df_sample2[anchor_col].eq('None')
    for num_col in dependent_cols:
        if num_col not in df_sample2.columns:
            continue
        df_sample2.loc[feature_absent, num_col] = df_sample2.loc[feature_absent, num_col].fillna(0)

# B. Neighborhood-aware fill for 'Lot Frontage'
# Frontage is assumed to be similar within a neighborhood, so each NaN takes
# the median frontage of its own 'Neighborhood' group.
if {'Lot Frontage', 'Neighborhood'}.issubset(df_sample2.columns):
    frontage_by_hood = df_sample2.groupby('Neighborhood')['Lot Frontage']
    hood_filled = frontage_by_hood.transform(lambda grp: grp.fillna(grp.median()))
    # Anything still missing (e.g. a neighborhood with no observed frontage)
    # falls back to the overall median of the partially filled column.
    df_sample2['Lot Frontage'] = hood_filled.fillna(hood_filled.median())

# C. Generic median fallback
# Every numeric column gets its remaining NaNs replaced by that column's median.
all_numerical_cols = df_sample2.select_dtypes(include=np.number).columns
numeric_part = df_sample2[all_numerical_cols]
df_sample2[all_numerical_cols] = numeric_part.fillna(value=numeric_part.median())


# --- 3. Categorical Imputation ---
# df_sample4 is a copy of df_sample2 in which remaining categorical NaNs are filled.
df_sample4 = df_sample2.copy()

# Generic mode fallback: every object/category column has its remaining NaNs
# replaced by that column's most frequent value (first entry of .mode()).
all_categorical_cols = df_sample4.select_dtypes(include=['object', 'category']).columns

for cat_col in all_categorical_cols:
    column_values = df_sample4[cat_col]
    most_frequent = column_values.mode()[0]
    df_sample4[cat_col] = column_values.fillna(most_frequent)


# --- 4. Final Check ---
# Count every remaining NaN across the whole frame; the expected result is 0.
missing_by_column = df_sample4.isna().sum()
total_missing = missing_by_column.sum()
print("-" * 30)
print(f"Total missing values remaining in df_sample4: {total_missing}")



Missing values before imputation:
Pool QC          2917
Misc Feature     2824
Alley            2732
Fence            2358
Mas Vnr Type     1775
Fireplace Qu     1422
Lot Frontage      490
Garage Cond       159
Garage Yr Blt     159
Garage Finish     159
dtype: int64
------------------------------
------------------------------
Total missing values remaining in df_sample4: 0


In [16]:
# PART 3: Variable Transformation & Splitting
from sklearn.model_selection import train_test_split

# 1. Define Variable Lists
# nvar_list_original: predictors treated as numeric. They are standardized
# further down with the training-set mean and standard deviation.
# The order of both lists fixes the column order of X (built below as
# nvar_list_original + cvar_list_original).
nvar_list_original = [
    'Lot Frontage', 'Lot Area', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2',
    'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
    'Gr Liv Area', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch',
    '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Misc Val',
    'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Bsmt Full Bath',
    'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr',
    'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars', 'Mo Sold', 'Yr Sold'
]

# cvar_list_original: predictors treated as categorical. Every column listed
# here is cast to 'category' and dummy coded further down, whatever its dtype
# in the CSV.
cvar_list_original = [
    'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config',
    'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style',
    'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
    'Foundation', 'Heating', 'Central Air', 'Garage Type', 'Misc Feature',
    'Sale Type', 'Sale Condition', 'Lot Shape', 'Utilities', 'Land Slope', 
    'Exter Qual', 'Exter Cond', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 
    'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating QC', 'Electrical', 'Kitchen Qual', 
    'Functional', 'Fireplace Qu', 'Garage Finish', 'Garage Qual', 'Garage Cond', 
    'Paved Drive', 'Pool QC', 'Fence'
]

# 2. Handle Missing Values BEFORE Splitting
# Simple constant fill for demonstration so the math doesn't break: numeric
# NaNs become 0 and categorical NaNs become the label "Missing". No statistic
# is computed from the data here, so nothing is learned from the test rows.
# NOTE(review): both assignments write back into df_cleaned itself, so after
# this cell runs df_cleaned no longer holds NaNs in these columns (re-running
# Part 2 afterwards would see already-filled data).
# NOTE(review): the modelling frame is built from df_cleaned; the imputed
# frames from Part 2 (df_sample1/2/4) are not used in this cell - confirm
# that this is intended.
df_cleaned[nvar_list_original] = df_cleaned[nvar_list_original].fillna(0)
df_cleaned[cvar_list_original] = df_cleaned[cvar_list_original].fillna("Missing")

# 3. SPLIT DATA FIRST
# Predictors are the numeric columns followed by the categorical ones; the raw
# sale price is kept aside so the binary target can be derived after the split.
predictor_columns = nvar_list_original + cvar_list_original
X = df_cleaned[predictor_columns]
y_raw = df_cleaned["SalePrice"]

# 80/20 split with a fixed seed so the partition is reproducible.
split_parts = train_test_split(X, y_raw, test_size=0.2, random_state=1)
X_train, X_test, y_raw_train, y_raw_test = split_parts

# TARGET VARIABLE CREATION
# The threshold is the median sale price of the TRAINING rows only.
median_price_train = y_raw_train.median()
print(f"Training Median Price: ${median_price_train:,.2f}")

# The same training threshold labels both partitions:
# 1 = sale price at or below the training median, 0 = above it.
y_train = y_raw_train.le(median_price_train).astype(int)
y_test = y_raw_test.le(median_price_train).astype(int)

#  NUMERICAL STANDARDIZATION
# Work on copies so the unscaled X_train / X_test stay available (the
# dummy-coding step reads the category modes from the unscaled X_train).
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Mean and standard deviation are estimated on the TRAINING rows only.
train_means = X_train[nvar_list_original].mean()
train_stds = X_train[nvar_list_original].std()

# Guard against division by zero: a feature that is constant in the training
# split has std == 0 (and std is NaN when it cannot be computed), which would
# turn the whole column into NaN/inf and break the model fit. Such features
# are only centered, i.e. divided by 1 (the same convention scikit-learn's
# StandardScaler uses). Features with a positive std are unaffected.
train_stds = train_stds.where(train_stds > 0, 1.0)

# z-score the training rows
X_train_scaled[nvar_list_original] = (X_train[nvar_list_original] - train_means) / train_stds

# Apply the SAME training mean/std to the test rows (no test statistics used)
X_test_scaled[nvar_list_original] = (X_test[nvar_list_original] - train_means) / train_stds


#  CATEGORICAL DUMMY CODING

# 1. Cast every categorical predictor to the 'category' dtype (train and test
#    are converted independently, each from its own values).
for split_frame in (X_train_scaled, X_test_scaled):
    split_frame[cvar_list_original] = split_frame[cvar_list_original].astype('category')

# 2. One-hot encode each split separately with identical options.
dummy_options = dict(prefix_sep='_', dtype=int)
X_train_dummies = pd.get_dummies(X_train_scaled, **dummy_options)
X_test_dummies = pd.get_dummies(X_test_scaled, **dummy_options)

# 3. ALIGN COLUMNS
# The test split can lack levels seen in training, or contain levels that the
# training split never saw. Reindexing to the training columns adds the
# former as all-zero columns and discards the latter.
X_test_dummies = X_test_dummies.reindex(columns=X_train_dummies.columns, fill_value=0)

# 4. Drop Redundant Dummies (The Mode)
# For each categorical predictor the reference level is its most frequent
# value in TRAIN; that level's dummy is removed from BOTH splits. Names that
# do not correspond to an existing dummy column are skipped.
reference_dummy_names = (
    f"{cat_var}_{X_train[cat_var].mode()[0]}" for cat_var in cvar_list_original
)
cols_to_drop = [
    dummy_name for dummy_name in reference_dummy_names
    if dummy_name in X_train_dummies.columns
]

X_train_final = X_train_dummies.drop(columns=cols_to_drop, errors='ignore')
X_test_final = X_test_dummies.drop(columns=cols_to_drop, errors='ignore')

print("Train shape:", X_train_final.shape)
print("Test shape:", X_test_final.shape)
print("Columns are aligned and normalized without leakage.")

Training Median Price: $163,500.00
Train shape: (2344, 289)
Test shape: (586, 289)
Columns are aligned and normalized without leakage.


In [17]:
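# Our DV is the binary target built in Part 3 (y_train / y_test): 1 = SalePrice <= training median, 0 = SalePrice above it.
# NOTE(review): the earlier label "InBudget_Not in Budget (1 = Not in Budget, 0 = In Budget)" implies the opposite polarity - confirm which coding is intended.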


In [18]:
# PART 5: Neural Network
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
import numpy as np

# Baseline hyper-parameters: L2 penalty strength and a single hidden layer of 5 nodes.
alpha, hidden_layer_sizes = 0.1, (5,)

# Baseline network settings; random_state pins the weight initialisation so
# the fit is reproducible.
baseline_settings = {
    'solver': 'lbfgs',
    'alpha': alpha,
    'hidden_layer_sizes': hidden_layer_sizes,
    'max_iter': 2000,
    'random_state': 1,
}

# Build the classifier and train it on the processed training data
# (fit returns the estimator itself).
clf = MLPClassifier(**baseline_settings).fit(X_train_final, y_train)

# Custom summary function
def summary_nn(model):
    """Print a short text summary of a neural-network model.

    Prints the penalty level (``model.alpha``), the input -> hidden weights of
    at most the first five input nodes (read from ``model.coefs_[0]``) and the
    hidden-node biases (read from ``model.intercepts_[0]``). A section whose
    attribute is missing from ``model`` shows only its header.

    Returns None; all output goes to stdout.
    """
    print("\nNeural Network Model Summary")
    print("Section 0. Penalty level alpha:", model.alpha)

    # Limited to the first few input nodes to keep the output readable.
    print("\nSection 1. Input (I) -> Hidden (H) Weights (First 5):")
    if hasattr(model, 'coefs_'):
        input_to_hidden = model.coefs_[0]
        n_shown = min(5, input_to_hidden.shape[0])
        for input_no, weight_row in enumerate(input_to_hidden[:n_shown], start=1):
            for hidden_no, weight in enumerate(weight_row, start=1):
                print(f"I:{input_no} -> H:{hidden_no} - W: {weight:.4f}")

    print("\nSection 2. Hidden Node Biases:")
    if hasattr(model, 'intercepts_'):
        for hidden_no, bias in enumerate(model.intercepts_[0], start=1):
            print(f"H:{hidden_no} - B: {bias:.4f}")

# Show the penalty, leading weights and hidden biases of the baseline network.
summary_nn(clf)

# --- GRID SEARCH ---
# Candidate hidden-layer widths and L2 penalties; every combination is scored
# by 5-fold cross-validated ROC AUC on the training data only.
param_grid = {
    'hidden_layer_sizes': [(3,), (5,), (10,)],
    'alpha': [0.0001, 0.1, 1.0]
}

# Template estimator: the searched parameters are set on clones of it.
search_template = MLPClassifier(solver='lbfgs', max_iter=2000, random_state=1)
gridsearch = GridSearchCV(
    search_template,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1  # use every available CPU core
)

print("\nStarting Grid Search...")
gridsearch.fit(X_train_final, y_train)

# best_estimator_ is the winning configuration refitted on the full training set.
clf_best = gridsearch.best_estimator_

print("Best Neural Network Parameters:")
print(gridsearch.best_params_)

# --- EVALUATION ON TEST SET ---
# Score the manually processed hold-out split: take the second probability
# column (the class labelled 1) and compute ROC AUC against the test labels.
test_class_probabilities = clf_best.predict_proba(X_test_final)
y_test_proba = test_class_probabilities[:, 1]
auc_score = roc_auc_score(y_test, y_test_proba)
print(f"\nAUC on Test Set: {auc_score:.4f}")


Neural Network Model Summary
Section 0. Penalty level alpha: 0.1

Section 1. Input (I) -> Hidden (H) Weights (First 5):
I:1 -> H:1 - W: 0.0334
I:1 -> H:2 - W: 0.3868
I:1 -> H:3 - W: -0.2359
I:1 -> H:4 - W: 0.0631
I:1 -> H:5 - W: 0.0420
I:2 -> H:1 - W: -0.6904
I:2 -> H:2 - W: 0.2444
I:2 -> H:3 - W: -0.4050
I:2 -> H:4 - W: -0.1350
I:2 -> H:5 - W: -0.8950
I:3 -> H:1 - W: -0.0375
I:3 -> H:2 - W: 0.0024
I:3 -> H:3 - W: -0.2864
I:3 -> H:4 - W: -0.0647
I:3 -> H:5 - W: -0.7760
I:4 -> H:1 - W: -0.7697
I:4 -> H:2 - W: -0.1639
I:4 -> H:3 - W: -0.3054
I:4 -> H:4 - W: -0.2636
I:4 -> H:5 - W: -0.2750
I:5 -> H:1 - W: 0.1651
I:5 -> H:2 - W: 0.0237
I:5 -> H:3 - W: 0.6485
I:5 -> H:4 - W: 0.0811
I:5 -> H:5 - W: 0.2240

Section 2. Hidden Node Biases:
H:1 - B: 0.0210
H:2 - B: 0.8960
H:3 - B: 1.2738
H:4 - B: -0.3911
H:5 - B: 0.6765

Starting Grid Search...
Best Neural Network Parameters:
{'alpha': 1.0, 'hidden_layer_sizes': (5,)}

AUC on Test Set: 0.9868
