<a href="https://colab.research.google.com/github/karenlc4/Spoon-Knife/blob/main/housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install scikit-learn==1.5.2

Collecting scikit-learn==1.5.2
  Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed scikit-learn-1.5.2


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
import seaborn as sns

# Load the housing dataset from a URL into a pandas DataFrame
housing = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv')

# --- Data Exploration ---
# Jupyter only renders the value of the LAST expression in a cell, so each
# exploratory view below is passed to display() explicitly; as bare mid-cell
# expressions their results would be computed and silently discarded.
# (display is injected into the namespace by the IPython/Jupyter kernel.)

# First few rows, to understand the dataset's structure
display(housing.head())
# Data types and non-null counts for each column (info() prints on its own)
housing.info()
# Number of missing values in each column
display(housing.isnull().sum())
# Summary statistics (mean, min, max, etc.) for numerical columns
display(housing.describe())
# Distribution of house prices: histogram with a kernel density estimate
sns.histplot(housing['price'], kde=True)

# --- Data Cleaning ---
# A listing is kept only when it has at most 10 bedrooms (larger counts are
# treated as unrealistic outliers) and a non-zero bathroom count (zero is
# treated as invalid data).
plausible_rows = (housing['bedrooms'] <= 10) & (housing['bathrooms'] != 0)
housing = housing[plausible_rows]

# --- Prepare Features and Target ---
# Target: the sale price, copied so it is independent of the cleaned frame
y_full = housing['price'].copy()
# Features: every remaining column; drop() returns a new frame, leaving
# `housing` itself intact
X_full = housing.drop(columns='price')

# --- Train-Test Split ---
# 80% training / 20% testing; the fixed random_state makes the split repeatable
split_parts = train_test_split(X_full, y_full, test_size=0.2, random_state=42)
X_train_full, X_test_full, y_train_price, y_test_price = split_parts

# --- Feature Engineering Function ---
def engineer_features(df, zipcode_price_map=None, fit_scaler=False, scaler=None):
    """Build the model's feature frame from raw housing rows.

    The input frame is never modified; all work happens on a copy.

    Derived columns:
      * ``year_sold`` / ``month_sold`` - parsed from ``date``.
      * ``was_renovated`` - 1 when ``yr_renovated`` > 0, else 0.
      * ``zipcode_price`` - ``zipcode`` looked up in ``zipcode_price_map``
        (only when a map is given). Zipcodes absent from the map receive the
        mean of the map's values, so the column has no gaps caused by unseen
        zipcodes.
      * ``age`` - ``year_sold`` minus ``yr_built``.

    The source columns ``date``, ``yr_renovated``, ``zipcode``, ``yr_built``
    and ``id`` are removed, and the square-footage columns plus ``age`` are
    passed through the scaler.

    Args:
        df: DataFrame holding the raw columns listed above together with the
            ``sqft_*`` columns named in ``scale_cols``.
        zipcode_price_map: Optional zipcode -> price lookup (e.g. a Series
            indexed by zipcode). When None, no ``zipcode_price`` column is
            created by this function.
        fit_scaler: When True a new MinMaxScaler is fitted on ``df`` and used;
            when False the supplied ``scaler`` is applied as-is.
        scaler: Already-fitted scaler exposing ``transform``; required when
            ``fit_scaler`` is False.

    Returns:
        Tuple ``(features, scaler)``: the engineered DataFrame and the scaler
        that was applied (newly fitted, or the one passed in).

    Raises:
        ValueError: If ``fit_scaler`` is False and no ``scaler`` is supplied.
    """
    # Fail early with a clear message instead of an AttributeError on None
    if not fit_scaler and scaler is None:
        raise ValueError("scaler must be provided when fit_scaler is False")

    # Work on a copy so the caller's frame stays untouched
    df = df.copy()

    # Sale date -> year and month features
    sale_date = pd.to_datetime(df['date'])
    df['year_sold'] = sale_date.dt.year
    df['month_sold'] = sale_date.dt.month

    # Binary renovation flag (vectorised comparison instead of a row-wise apply)
    df['was_renovated'] = (df['yr_renovated'] > 0).astype('int64')

    # Price looked up per zipcode, with a fallback for zipcodes not in the map
    if zipcode_price_map is not None:
        zip_price = df['zipcode'].map(zipcode_price_map)
        if zip_price.isna().any():
            fallback_price = pd.Series(zipcode_price_map).mean()
            zip_price = zip_price.fillna(fallback_price)
        df['zipcode_price'] = zip_price

    # Age of the house at the time of sale
    df['age'] = df['year_sold'] - df['yr_built']

    # The raw columns are now encoded by the derived features ('id' carries no
    # predictive signal), so remove them in a single pass
    df = df.drop(columns=['date', 'yr_renovated', 'zipcode', 'yr_built', 'id'])

    # Numerical features with very different ranges that get scaled
    scale_cols = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement',
                  'sqft_living15', 'sqft_lot15', 'age']

    if fit_scaler:
        # Training data: fit a fresh scaler, then transform
        scaler = MinMaxScaler()
        df[scale_cols] = scaler.fit_transform(df[scale_cols])
    else:
        # Test / holdout data: reuse the scaler fitted on the training data
        df[scale_cols] = scaler.transform(df[scale_cols])

    return df, scaler

# --- Create Zipcode Price Mapping ---
# Temporary frame: training features plus the training prices. The prices are
# attached positionally (.values); assign() returns a new frame, so
# X_train_full itself does not gain a 'price' column.
X_train_with_price = X_train_full.assign(price=y_train_price.values)
# Mean training price per zipcode
zipcode_price_map = X_train_with_price.groupby('zipcode')['price'].mean()

# --- Apply Feature Engineering ---
# Training data: the scaler is fitted here and handed back for reuse
X_train, scaler = engineer_features(
    X_train_full,
    zipcode_price_map=zipcode_price_map,
    fit_scaler=True,
)
# Test data: same zipcode map, and the scaler fitted on the training data
X_test, _ = engineer_features(
    X_test_full,
    zipcode_price_map=zipcode_price_map,
    fit_scaler=False,
    scaler=scaler,
)

# --- Transform Target Variable ---
# Natural log of the price reduces the skew of the target
y_train, y_test = (np.log(prices) for prices in (y_train_price, y_test_price))

# --- Train XGBoost Model ---
# Hyperparameters are collected in one mapping so they can be read (and tuned)
# in a single place before the regressor is built
xgb_params = {
    'n_estimators': 300,       # number of boosting rounds (trees)
    'max_depth': 5,            # maximum depth of each tree
    'learning_rate': 0.1,      # step size for updates during training
    'subsample': 0.8,          # fraction of samples used per tree
    'colsample_bytree': 0.8,   # fraction of features used per tree
    'random_state': 42,        # fixed seed for reproducibility
}
model = XGBRegressor(**xgb_params)
# Fit on the engineered training features against the log-transformed prices
model.fit(X_train, y_train)

# --- Make Predictions and Evaluate ---
# The model outputs log-prices; exponentiate both the predictions and the
# log-transformed test targets so every metric is computed on the original
# price scale
y_pred_log = model.predict(X_test)
y_pred, y_true = np.exp(y_pred_log), np.exp(y_test)

# Evaluation metrics
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)                            # root mean squared error
mae = mean_absolute_error(y_true, y_pred)      # mean absolute error
medae = median_absolute_error(y_true, y_pred)  # median absolute error
r2 = r2_score(y_true, y_pred)                  # R-squared score

# Report: one metric per line
report_lines = (
    f"RMSE: {rmse:.2f}",
    f"MAE: {mae:.2f}",
    f"Median Abs Error: {medae:.2f}",
    f"R² Score: {r2:.4f}",
)
print("\n".join(report_lines))

RMSE: 104731.43
MAE: 61476.17
Median Abs Error: 36314.11
R² Score: 0.9137


In [5]:
# Load the holdout rows that need price predictions
holdout = pd.read_csv("https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing_holdout_test_mini.csv")

# Apply the same feature engineering as used in training.
# fit_scaler=False applies the scaler fitted on the training data without
# refitting it; zipcode_price_map is the per-zipcode mean price computed from
# the training data.
X_holdout, _ = engineer_features(holdout, zipcode_price_map, fit_scaler=False, scaler=scaler)

# engineer_features derives 'zipcode_price' from the 'zipcode' column, so a
# value filled in on `holdout` beforehand would simply be overwritten. The
# fallback for zipcodes that never appeared in the training data is therefore
# applied AFTER the call: they receive the mean of the training zipcode prices.
X_holdout['zipcode_price'] = X_holdout['zipcode_price'].fillna(zipcode_price_map.mean())

# Ensure the holdout features are in the same column order as the training
# features, since the model is given the same layout it was trained on
X_holdout = X_holdout[X_train.columns]

# Predictions come back in log scale (the model was trained on log prices)
log_predictions = model.predict(X_holdout)
# Convert back to the original price scale
predictions = np.exp(log_predictions)

# Single-column frame named "price" holding the predictions
output = pd.DataFrame(predictions, columns=["price"])
# Export to CSV; index=False keeps row indices out of the file
output.to_csv("team3-module3-predictions.csv", index=False)