# Ames Housing Dataset - Preprocessing

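This notebook downloads, cleans, and label-encodes the **Ames Housing Dataset** (a real-world house-price dataset with a large number of features) and saves the result for use in downstream regression notebooks.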

**Dataset**: Ames Housing (via sklearn/OpenML)  
**Samples**: ~1,460  
**Features**: 79 (large feature space!)  
**Target**: SalePrice

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.datasets import fetch_openml
import warnings
# Suppress every Python warning for the rest of the session to keep cell output compact.
# NOTE(review): this blanket filter also hides deprecation notices raised by the
# libraries used below — consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

# Apply the whitegrid plotting style to all figures created in this notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require matplotlib >= 3.6 — confirm.
plt.style.use('seaborn-v0_8-whitegrid')
print("Libraries loaded!")

## 1. Load Dataset from OpenML

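The Ames Housing dataset is available directly through scikit-learn's `fetch_openml` under the OpenML dataset name `house_prices`. The first call downloads the data (network access required); subsequent calls reuse scikit-learn's local cache.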

In [None]:
# Fetch the "house_prices" dataset from OpenML as a pandas-backed bundle
print("Downloading Ames Housing dataset...")
ames = fetch_openml(name="house_prices", as_frame=True, parser='auto')

# `frame` is the bundle's DataFrame; every later cell works on it through `df`
df = ames.frame
n_rows, n_cols = df.shape
print(f"Dataset loaded: {n_rows} samples, {n_cols} columns")

In [None]:
# Define paths (relative to the notebook's working directory)
DATA_DIR = Path('../data')
PROCESSED_DIR = DATA_DIR / 'processed'
# parents=True also creates DATA_DIR when it does not exist yet; without it,
# mkdir raises FileNotFoundError on a fresh checkout and the raw-data save
# below would fail as well.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Save raw data so the exact downloaded table can be inspected later
raw_csv_path = DATA_DIR / 'ames_housing_raw.csv'
df.to_csv(raw_csv_path, index=False)
print(f"Raw data saved to {raw_csv_path}")

In [None]:
# Preview the first rows of the raw frame (rendered as this cell's output)
df.head()

In [None]:
# Overall dimensions of the raw frame, followed by a tally of columns per dtype
frame_shape = df.shape
print(f"Shape: {frame_shape}")
print("\nColumn types:")
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

## 2. Data Exploration

In [None]:
# Name of the column to predict; later cells refer to it through this constant
TARGET = 'SalePrice'

sale_price = df[TARGET]
print(f"Target: {TARGET}")
# Summary statistics, printed as whole dollars with thousands separators
for stat_label, stat_value in (
    ("Min", sale_price.min()),
    ("Max", sale_price.max()),
    ("Mean", sale_price.mean()),
    ("Median", sale_price.median()),
):
    print(f"  {stat_label}: ${stat_value:,.0f}")

In [None]:
# Side-by-side histograms of the target: raw values (left) and log1p-transformed (right)
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
panels = (
    (ax[0], df[TARGET], 'steelblue', 'Sale Price ($)', 'Sale Price Distribution'),
    (ax[1], np.log1p(df[TARGET]), 'coral', 'Log(Sale Price)', 'Log-transformed Distribution'),
)
for panel, values, colour, x_label, heading in panels:
    panel.hist(values, bins=50, color=colour, edgecolor='white')
    panel.set_xlabel(x_label)
    panel.set_title(heading)
plt.tight_layout()
plt.show()

In [None]:
# Per-column null counts and their share of all rows (percent, one decimal place)
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100).round(1)

# Keep only columns that actually have gaps, worst offenders first.
# `missing_df` is reused by the preprocessing section to decide which columns to drop.
missing_df = (
    pd.DataFrame({'Missing': missing, 'Percent': missing_pct})
    .loc[lambda table: table['Missing'] > 0]
    .sort_values('Missing', ascending=False)
)

print(f"Columns with missing values: {len(missing_df)}")
print(missing_df.head(20))

In [None]:
# Split the raw columns by dtype; the target is excluded from the numerical feature list
numeric_frame = df.select_dtypes(include=[np.number])
numerical_cols = numeric_frame.columns.tolist()
if TARGET in numerical_cols:
    numerical_cols.remove(TARGET)

categorical_frame = df.select_dtypes(include=['object', 'category'])
categorical_cols = categorical_frame.columns.tolist()

n_numerical = len(numerical_cols)
n_categorical = len(categorical_cols)
print(f"Numerical features: {n_numerical}")
print(f"Categorical features: {n_categorical}")
print(f"Total features: {n_numerical + n_categorical}")

## 3. Preprocessing

In [None]:
# Work on a copy so the raw frame `df` stays untouched
df_processed = df.copy()

# Columns whose (rounded) missing percentage exceeds 50 are removed outright
too_sparse = missing_df['Percent'] > 50
drop_cols = missing_df.index[too_sparse].tolist()
if len(drop_cols) > 0:
    print(f"Dropping columns with >50% missing: {drop_cols}")
    df_processed = df_processed.drop(columns=drop_cols)

# The 'Id' column, when present, is dropped as well
if 'Id' in df_processed.columns:
    df_processed = df_processed.drop(columns='Id')
    print("Dropped 'Id' column")

print(f"Shape after dropping: {df_processed.shape}")

In [None]:
# Recompute the column lists against the reduced frame (some columns were just dropped)
numerical_cols = [
    name
    for name in df_processed.select_dtypes(include=[np.number]).columns.tolist()
    if name != TARGET
]
categorical_cols = list(
    df_processed.select_dtypes(include=['object', 'category']).columns.tolist()
)

print(f"Numerical: {len(numerical_cols)}, Categorical: {len(categorical_cols)}")

In [None]:
# Fill missing numerical values with each column's median
for col in numerical_cols:
    if df_processed[col].isnull().any():
        median_val = df_processed[col].median()
        df_processed[col] = df_processed[col].fillna(median_val)

# Fill missing categorical values with the literal label 'None'
# (a missing entry is treated as its own level; the mode is not used).
for col in categorical_cols:
    col_values = df_processed[col]
    if not col_values.isnull().any():
        continue
    if (
        isinstance(col_values.dtype, pd.CategoricalDtype)
        and 'None' not in col_values.cat.categories
    ):
        # fillna() on a category-dtype column raises TypeError unless the fill
        # value is already a declared category, so register it first.
        col_values = col_values.cat.add_categories(['None'])
    df_processed[col] = col_values.fillna('None')

print(f"Missing values after filling: {df_processed.isnull().sum().sum()}")

In [None]:
# Encode categorical variables as integer codes
print(f"Encoding {len(categorical_cols)} categorical columns...")

for col in categorical_cols:
    # Label encode: pd.factorize returns (codes, uniques); only the integer codes are kept.
    # NOTE(review): the codes carry no ordinal meaning — fine for tree models,
    # but confirm before feeding them to a linear or distance-based model.
    df_processed[col] = pd.factorize(df_processed[col])[0]

print(f"Final shape: {df_processed.shape}")
# Verify the claim against every non-numeric dtype (object, category, bool,
# datetime, string, ...), not just `object`, so the check cannot report True
# while a non-numeric column is still present.
non_numeric_cols = df_processed.select_dtypes(exclude=[np.number]).columns
print(f"All columns numerical: {len(non_numeric_cols) == 0}")

In [None]:
# Final sanity report on the processed frame
n_samples, n_columns = df_processed.shape
remaining_nulls = df_processed.isnull().sum().sum()

print("\nFinal Dataset:")
print(f"  Samples: {n_samples}")
# One column is the target, so it is not counted as a feature
print(f"  Features: {n_columns - 1}")
print(f"  Target: {TARGET}")
print(f"  Missing values: {remaining_nulls}")

# Last expression: rendered as the cell's output
df_processed.describe().round(2)

## 4. Correlation Analysis

In [None]:
# Top correlations with target
corr_with_target = df_processed.corr()[TARGET].drop(TARGET).sort_values(ascending=False)
print("Top 15 features correlated with SalePrice:")
print(corr_with_target.head(15))

In [None]:
# Horizontal bar chart of the 15 highest correlations, strongest at the top
top_corr = corr_with_target.head(15)
bar_positions = range(len(top_corr))

corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
corr_ax.barh(bar_positions, top_corr.values, color='steelblue')
corr_ax.set_yticks(bar_positions)
corr_ax.set_yticklabels(top_corr.index)
corr_ax.set_xlabel('Correlation with SalePrice')
corr_ax.set_title('Top 15 Features by Correlation')
# Flip the y-axis so the first (largest) entry is drawn at the top
corr_ax.invert_yaxis()
corr_fig.tight_layout()

# Make sure the output folder exists before writing the image
figures_dir = Path('../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)
corr_fig.savefig(figures_dir / 'ames_correlation.png', dpi=150)
plt.show()

## 5. Save Processed Data

In [None]:
# Save CSV
df_processed.to_csv(PROCESSED_DIR / 'ames_housing_processed.csv', index=False)

# Split into feature matrix, target vector, and the matching feature-name list
features_frame = df_processed.drop(columns=[TARGET])
X = features_frame.values
y = df_processed[TARGET].values
feature_names = features_frame.columns.tolist()

# Save numpy arrays, one .npy file per artefact
for suffix, payload in (
    ('X', X),
    ('y', y),
    ('feature_names', np.array(feature_names)),
):
    np.save(PROCESSED_DIR / f'ames_housing_{suffix}.npy', payload)

n_feature_columns = X.shape[1]
print("Saved:")
print(f"  X: {X.shape} ({n_feature_columns} features - large feature space!)")
print(f"  y: {y.shape}")
print(f"  Features: {len(feature_names)}")

In [None]:
# Closing summary: banner, dataset size, and a pointer to the downstream notebook
banner = "=" * 60
n_saved_samples, n_saved_features = X.shape[0], X.shape[1]

print(f"\n{banner}")
print("PREPROCESSING COMPLETE!")
print(banner)
print("\nDataset: Ames Housing")
print(f"Samples: {n_saved_samples}")
print(f"Features: {n_saved_features} (LARGE FEATURE SPACE)")
print("\nTo use in regression_tree.ipynb:")
print("  Change load_dataset to use 'ames_housing'")