In [1]:
import pandas as pd

file_path = '/Users/jorgemartinez/Documents/NYDSA #3 Machine Learning Project/Machine Learning Project Proposal/Ames_HousePrice.csv'
df = pd.read_csv(file_path)

In [2]:
# First, let's drop columns that are unnamed or have no meaning, and also drop PID
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Identify unnamed columns and PID
unnamed_cols = [col for col in df.columns if 'unnamed' in col.lower() or 'no meaning' in col.lower()]
columns_to_drop = unnamed_cols + ['PID']

# Drop identified columns
df_cleaned = df.drop(columns=columns_to_drop, errors='ignore')

print(f"Dropped {len(unnamed_cols)} unnamed columns and PID")
print(f"DataFrame shape after dropping columns: {df_cleaned.shape}")

Dropped 1 unnamed columns and PID
DataFrame shape after dropping columns: (2580, 80)


In [3]:
# List of categorical variables where NA means something specific
meaningful_na_columns = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 
                        'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu',
                        'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                        'PoolQC', 'Fence', 'MiscFeature', 'MasVnrType']

# Fill NA values in these columns with appropriate labels
for col in meaningful_na_columns:
    if col in df.columns:
        df[col] = df[col].fillna('None')  # Replace NA with 'None'

In [4]:
# Step 1: For basement-related measurements
for col in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF']:
    if col in df.columns:
        # Find rows where BsmtQual was originally NA (indicating no basement)
        mask = df['BsmtQual'] == 'None'  # Now 'None' after our replacement above
        # Fill NAs with 0 for houses with no basement
        df.loc[mask, col] = df.loc[mask, col].fillna(0)
        # Fill any remaining NAs with 0
        df[col] = df[col].fillna(0)

# Step 2: For garage-related measurements
if 'GarageArea' in df.columns and 'GarageCars' in df.columns:
    mask = df['GarageType'] == 'None'  # Now 'None' after our replacement above
    # Fill NAs with 0 for houses with no garage
    df.loc[mask, 'GarageArea'] = df.loc[mask, 'GarageArea'].fillna(0)
    df.loc[mask, 'GarageCars'] = df.loc[mask, 'GarageCars'].fillna(0)
    # Fill any remaining NAs with 0
    df['GarageArea'] = df['GarageArea'].fillna(0)
    df['GarageCars'] = df['GarageCars'].fillna(0)

# Step 3: For Masonry veneer area
if 'MasVnrArea' in df.columns:
    mask = df['MasVnrType'] == 'None'
    # Fill NAs with 0 for houses with no masonry veneer
    df.loc[mask, 'MasVnrArea'] = df.loc[mask, 'MasVnrArea'].fillna(0)
    # Fill any remaining NAs with 0
    df['MasVnrArea'] = df['MasVnrArea'].fillna(0)

In [5]:
# Select numerical columns
numerical_df = df.select_dtypes(include='number')

# Drop numerical columns with any REMAINING missing values
numerical_df_no_na = numerical_df.dropna(axis=1)

# Select categorical columns
categorical_df = df.select_dtypes(include='object')

print("Numerical columns with no missing values:", numerical_df_no_na.shape)
print("Categorical columns:", categorical_df.shape)

Numerical columns with no missing values: (2580, 35)
Categorical columns: (2580, 43)


In [6]:
from sklearn.preprocessing import OneHotEncoder

def drop_least_frequent_category(df_cat):
    new_df = pd.DataFrame()
    for col in df_cat.columns:
        counts = df_cat[col].value_counts()
        # Get the category with the fewest instances
        drop_cat = counts.idxmin()

        # Perform one-hot encoding and drop the least frequent category
        dummies = pd.get_dummies(df_cat[col], prefix=col)
        dummies = dummies.drop(f"{col}_{drop_cat}", axis=1)

        new_df = pd.concat([new_df, dummies], axis=1)
    return new_df

In [7]:
# Apply the custom one-hot encoding
categorical_encoded = drop_least_frequent_category(categorical_df)

print("Shape after encoding categorical variables:", categorical_encoded.shape)

Shape after encoding categorical variables: (2580, 231)


In [8]:
# Drop the target column from numerical_df if it's included
X = pd.concat([numerical_df_no_na.drop(columns=['SalePrice'], errors='ignore'),
               categorical_encoded], axis=1)

# Target variable
y = df['SalePrice']

print("Final shape of X:", X.shape)

Final shape of X: (2580, 265)


In [9]:
# Import necessary libraries
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import numpy as np
import matplotlib.pyplot as plt

# Set up 5-fold cross-validation
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

In [10]:
# Function for cross-validation
def evaluate_model_cv(model, X, y):
    # Get R² scores
    r2_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    
    # Get RMSE scores
    mse_scores = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    rmse_scores = np.sqrt(mse_scores)
    
    # Print results
    print(f"Model: {model.__class__.__name__}")
    print(f"Average RMSE: ${rmse_scores.mean():,.2f}")
    print(f"Average R²: {r2_scores.mean():.4f}")
    print("-" * 30)
    
    return {
        'model_name': model.__class__.__name__,
        'rmse_mean': rmse_scores.mean(),
        'r2_mean': r2_scores.mean()
    }

In [11]:
# 1. Multiple Linear Regression with cross-validation
print("Multiple Linear Regression:")
mlr = LinearRegression()
mlr_results = evaluate_model_cv(mlr, X, y)

Multiple Linear Regression:
Model: LinearRegression
Average RMSE: $22,623.88
Average R²: 0.9078
------------------------------


In [12]:
# 2. Ridge Regression with cross-validation
print("Ridge Regression (alpha=1.0):")
ridge = Ridge(alpha=1.0) 
ridge_results = evaluate_model_cv(ridge, X, y)

Ridge Regression (alpha=1.0):
Model: Ridge
Average RMSE: $21,847.25
Average R²: 0.9140
------------------------------


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [13]:
# 3. Lasso Regression with cross-validation
print("Lasso Regression (alpha=0.01):")
lasso = Lasso(alpha=0.01)
lasso_results = evaluate_model_cv(lasso, X, y)

Lasso Regression (alpha=0.01):


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Model: Lasso
Average RMSE: $22,552.23
Average R²: 0.9084
------------------------------


  model = cd_fast.enet_coordinate_descent(


In [14]:
# Collect all results
all_results = [mlr_results, ridge_results, lasso_results]

# Create comparison table
results_df = pd.DataFrame(all_results)
print("\nModel Comparison:")
print(results_df)


Model Comparison:
         model_name     rmse_mean   r2_mean
0  LinearRegression  22623.882732  0.907784
1             Ridge  21847.248131  0.913960
2             Lasso  22552.232152  0.908403
