In [39]:
import pandas as pd
try:
    import sklearn
    from sklearn import tree, model_selection, metrics
except ImportError:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn"])


file_path = '/Users/jorgemartinez/Documents/NYDSA #3 Machine Learning Project/Machine Learning Project Proposal/Ames_HousePrice.csv'
df = pd.read_csv(file_path)

In [40]:
# First, let's drop columns that are unnamed or have no meaning, and also drop PID
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Identify unnamed columns and PID
unnamed_cols = [col for col in df.columns if 'unnamed' in col.lower() or 'no meaning' in col.lower()]
columns_to_drop = unnamed_cols + ['PID']

# Drop identified columns
df_cleaned = df.drop(columns=columns_to_drop, errors='ignore')

print(f"Dropped {len(unnamed_cols)} unnamed columns and PID")
print(f"DataFrame shape after dropping columns: {df_cleaned.shape}")

Dropped 1 unnamed columns and PID
DataFrame shape after dropping columns: (2580, 80)


In [41]:
# List of categorical variables where NA means something specific
meaningful_na_columns = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 
                        'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu',
                        'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                        'PoolQC', 'Fence', 'MiscFeature', 'MasVnrType']

# Fill NA values in these columns with appropriate labels
for col in meaningful_na_columns:
    if col in df.columns:
        df[col] = df[col].fillna('None')  # Replace NA with 'None'

In [None]:
# how did I come to the conclusion below?

In [43]:
# Impute the single missing 'Electrical' value with the most common category
df['Electrical'] = df['Electrical'].fillna('SBrkr')  # or use df['Electrical'].mode()[0]

print("Filled missing 'Electrical' value with 'SBrkr'")


Filled missing 'Electrical' value with 'SBrkr'


In [44]:
# Imputing Numerical Columns with 0 for Specific Cases
# Step 1: For basement-related measurements
for col in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF']:
    if col in df.columns:
        # Find rows where BsmtQual was originally NA (indicating no basement)
        mask = df['BsmtQual'] == 'None'  # Now 'None' after our replacement above
        # Fill NAs with 0 for houses with no basement
        df.loc[mask, col] = df.loc[mask, col].fillna(0)
        # Fill any remaining NAs with 0
        df[col] = df[col].fillna(0)

# Step 2: For garage-related measurements
if 'GarageArea' in df.columns and 'GarageCars' in df.columns:
    mask = df['GarageType'] == 'None'  # Now 'None' after our replacement above
    # Fill NAs with 0 for houses with no garage
    df.loc[mask, 'GarageArea'] = df.loc[mask, 'GarageArea'].fillna(0)
    df.loc[mask, 'GarageCars'] = df.loc[mask, 'GarageCars'].fillna(0)
    # Fill any remaining NAs with 0
    df['GarageArea'] = df['GarageArea'].fillna(0)
    df['GarageCars'] = df['GarageCars'].fillna(0)

# Step 3: For Masonry veneer area
if 'MasVnrArea' in df.columns:
    mask = df['MasVnrType'] == 'None'
    # Fill NAs with 0 for houses with no masonry veneer
    df.loc[mask, 'MasVnrArea'] = df.loc[mask, 'MasVnrArea'].fillna(0)
    # Fill any remaining NAs with 0
    df['MasVnrArea'] = df['MasVnrArea'].fillna(0)

In [45]:
# Review the cell below to make sure the imputation worked as expected
# Make sure there aren't any features left with missing values

In [46]:
# Select numerical columns
df_columns = df.select_dtypes(include='number')

# Drop numerical columns with any REMAINING missing values
df_columns_no_na = df_columns.dropna(axis=1)

# Select categorical columns
categorical_df = df.select_dtypes(include='object')

print("Numerical columns with no missing values:", df_columns_no_na.shape)
print("Categorical columns:", categorical_df.shape)

Numerical columns with no missing values: (2580, 35)
Categorical columns: (2580, 43)


In [63]:
categorical_df

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Corner,Gtl,SWISU,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,Norm,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
2,C (all),Pave,,Reg,Lvl,AllPub,Inside,Gtl,IDOTRR,Norm,...,Detchd,Unf,TA,Po,N,,,,WD,Normal
3,RL,Pave,,Reg,Lvl,AllPub,Corner,Gtl,OldTown,Norm,...,Detchd,Unf,TA,TA,N,,,,WD,Normal
4,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2575,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,BrkSide,Norm,...,Detchd,Unf,Fa,Po,P,,,,WD,Normal
2576,RL,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Edwards,Norm,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal
2577,RH,Pave,,Reg,HLS,AllPub,Inside,Gtl,Crawfor,Norm,...,2Types,Unf,TA,TA,Y,,,,WD,Normal
2578,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal


In [None]:
#See which categorical columns are ordinal versus nominal (ordinal ones get encded normally, 
#nominal ones have to get specifically encoded in the way they are ranked)

In [47]:
# Select categorical columns
categorical_df = df.select_dtypes(include='object')

In [48]:
# Add mroe docstring to the below to make it clearer what I am doing

In [49]:
# Impute missing values for specific columns

# 1. LotFrontage - impute using overall median
lot_frontage_median = df['LotFrontage'].median()
df['LotFrontage'] = df['LotFrontage'].fillna(lot_frontage_median)
print(f"Filled LotFrontage missing values with median: {lot_frontage_median}")

# 2. Basement bathrooms - fill with 0
df['BsmtFullBath'] = df['BsmtFullBath'].fillna(0)
df['BsmtHalfBath'] = df['BsmtHalfBath'].fillna(0)

#Remove the eda code below
# 3. GarageYrBlt - fill differently based on GarageType
# First, get some statistics about houses with garages and missing GarageYrBlt
mask_has_garage_missing_yr = (df['GarageType'] != 'None') & (df['GarageYrBlt'].isna())
houses_with_garage_missing_yr = df[mask_has_garage_missing_yr]
count_has_garage_missing_yr = len(houses_with_garage_missing_yr)

print(f"\nStatistics for houses with garages but missing GarageYrBlt:")
print(f"Number of houses: {count_has_garage_missing_yr}")

if count_has_garage_missing_yr > 0:
    print("Details of these houses:")
    for i, (idx, row) in enumerate(houses_with_garage_missing_yr.iterrows()):
        print(f"House {i+1} (Index {idx}):")
        print(f"  GarageType: {row['GarageType']}")
        print(f"  YearBuilt: {row['YearBuilt']}")
        print(f"  GarageArea: {row['GarageArea']}")
        print(f"  GarageCars: {row['GarageCars']}")
    
    # Get garage type distribution
    garage_type_counts = houses_with_garage_missing_yr['GarageType'].value_counts()
    print(f"\nGarageType distribution for these houses:")
    for garage_type, count in garage_type_counts.items():
        print(f"  {garage_type}: {count} houses")

# Now impute the missing values
if count_has_garage_missing_yr > 0:
    # Calculate median from houses that have garages
    garage_yr_median = df[df['GarageType'] != 'None']['GarageYrBlt'].median()
    df.loc[mask_has_garage_missing_yr, 'GarageYrBlt'] = garage_yr_median
    print(f"\nFilled GarageYrBlt for {count_has_garage_missing_yr} houses with garages using median: {garage_yr_median}")

# For houses with no garage, use 0
mask_no_garage = (df['GarageType'] == 'None') & (df['GarageYrBlt'].isna())
count_no_garage = mask_no_garage.sum()
df.loc[mask_no_garage, 'GarageYrBlt'] = 0
print(f"Filled GarageYrBlt for {count_no_garage} houses without garages with 0")

# Check if we've addressed the missing values in our target columns
check_cols = ['LotFrontage', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt']
missing_counts = df[check_cols].isna().sum()
print("\nMissing values after imputation:")
print(missing_counts)

Filled LotFrontage missing values with median: 68.0

Statistics for houses with garages but missing GarageYrBlt:
Number of houses: 2
Details of these houses:
House 1 (Index 433):
  GarageType: Detchd
  YearBuilt: 1923
  GarageArea: 0.0
  GarageCars: 0.0
House 2 (Index 531):
  GarageType: Detchd
  YearBuilt: 1910
  GarageArea: 360.0
  GarageCars: 1.0

GarageType distribution for these houses:
  Detchd: 2 houses

Filled GarageYrBlt for 2 houses with garages using median: 1978.0
Filled GarageYrBlt for 127 houses without garages with 0

Missing values after imputation:
LotFrontage     0
BsmtFullBath    0
BsmtHalfBath    0
GarageYrBlt     0
dtype: int64


In [50]:
# The code below shows that we don't have any missing values left at this point

In [51]:
df.isnull().sum().sum()

np.int64(0)

In [52]:
# For below:  THe feature you dropped is one you reference, but computationally it does not matter

In [53]:
from sklearn.preprocessing import OneHotEncoder

def drop_most_frequent_category(df_cat):
    new_df = pd.DataFrame()
    for col in df_cat.columns:
        counts = df_cat[col].value_counts()
        # Get the most frequent category
        drop_cat = counts.idxmax()

        # Perform one-hot encoding and drop the most frequent category
        dummies = pd.get_dummies(df_cat[col], prefix=col)
        dummies = dummies.drop(f"{col}_{drop_cat}", axis=1)

        new_df = pd.concat([new_df, dummies], axis=1)
    return new_df


In [54]:
# Apply the custom one-hot encoding
categorical_encoded = drop_most_frequent_category(categorical_df)

print("Shape after encoding categorical variables:", categorical_encoded.shape)

Shape after encoding categorical variables: (2580, 231)


In [55]:
X = pd.concat([df_columns_no_na.drop(columns=['SalePrice'], errors='ignore'),
               categorical_encoded], axis=1)

# Target variable
y = df['SalePrice']

print("Final shape of X:", X.shape)

Final shape of X: (2580, 265)


In [56]:
# Import necessary libraries
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import numpy as np
import matplotlib.pyplot as plt

# Set up 5-fold cross-validation
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

In [57]:
# Function for cross-validation
def evaluate_model_cv(model, X, y):
    # Get R² scores
    r2_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    
    # Get RMSE scores
    mse_scores = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    rmse_scores = np.sqrt(mse_scores)
    
    # Print results
    print(f"Model: {model.__class__.__name__}")
    print(f"Average RMSE: ${rmse_scores.mean():,.2f}")
    print(f"Average R²: {r2_scores.mean():.4f}")
    print("-" * 30)
    
    return {
        'model_name': model.__class__.__name__,
        'rmse_mean': rmse_scores.mean(),
        'r2_mean': r2_scores.mean()
    }

In [58]:
# 1. Multiple Linear Regression with cross-validation
print("Multiple Linear Regression:")
mlr = LinearRegression()
mlr_results = evaluate_model_cv(mlr, X, y)

Multiple Linear Regression:
Model: LinearRegression
Average RMSE: $22,746.18
Average R²: 0.9068
------------------------------


In [59]:
# 2. Ridge Regression with cross-validation
print("Ridge Regression (alpha=1.0):")
ridge = Ridge(alpha=1.0) 
ridge_results = evaluate_model_cv(ridge, X, y)

Ridge Regression (alpha=1.0):
Model: Ridge
Average RMSE: $21,862.91
Average R²: 0.9138
------------------------------


  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)


In [62]:
# 3. Lasso Regression with cross-validation
print("Lasso Regression (alpha=1):")
lasso = Lasso(alpha=1)
lasso_results = evaluate_model_cv(lasso, X, y)

Lasso Regression (alpha=1):


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Model: Lasso
Average RMSE: $22,991.88
Average R²: 0.9042
------------------------------


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [None]:
# For pre-processing, do separate notebooks for decision treess, random forests, gradient and xgboosting


In [None]:
# ===============================================================
# Tree-based regressors + cross-validation with THREADING backend

# ===============================================================


from sklearn.model_selection import cross_val_score  # k-fold CV helper
from sklearn.tree import DecisionTreeRegressor        # single decision tree regressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor  # ensembles
from joblib import parallel_backend                   # let us force thread-based parallelism
import sys, subprocess                                # used only to auto-install xgboost if missing

# 5-fold setup here ---
try:
    kf  # do we already have it?
except NameError:
    from sklearn.model_selection import KFold
    # KFold controls how CV splits your data. Shuffle for randomness; fix random_state for reproducibility.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

# --- XGBoost is not in scikit-learn, so we import separately and install if needed ---
try:
    import xgboost as xgb
except Exception:
    # If import fails, install it into the current Python environment, then import.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "xgboost"])
    import xgboost as xgb

# ---------------------------------------------------------------
# Helper: cross-validate a model using your same `kf` splits.
# We force thread-based parallelism to avoid Python 3.13 cleanup
# errors that appear with process-based parallelism.
# ---------------------------------------------------------------



def evaluate_model_cv(model, X, y, cv=kf):
    """
    Trains/evaluates `model` with k-fold CV:
      - R² (bigger is better; variance explained)
      - RMSE (smaller is better; error in target units, e.g., dollars)
    Uses joblib's threading backend so you don't hit the 3.13 bug.
    """
    # Force joblib to use threads (not processes) for parallel loops.
    # This preserves parallel speedups in most sklearn ops without the cleanup crash.
    with parallel_backend("threading"):
        # cross_val_score trains/evaluates model across `cv` splits and returns the score for each fold.
        # scoring='r2' gives us the R² scores.
        r2_scores  = cross_val_score(model, X, y, cv=cv, scoring="r2", n_jobs=-1)

        # For RMSE, sklearn offers negative MSE so that "higher is better" still holds.
        # We negate it back to MSE, then take sqrt to get RMSE.
        mse_scores = -cross_val_score(model, X, y, cv=cv, scoring="neg_mean_squared_error", n_jobs=-1)

    rmse_scores = np.sqrt(mse_scores)  # convert MSE -> RMSE (same units as SalePrice)

    # Nice on-screen summary for you while training.
    print(f"Model: {model.__class__.__name__}")
    print(f"Average RMSE: ${rmse_scores.mean():,.2f}")   # money-style formatting
    print(f"Average R²: {r2_scores.mean():.4f}")
    print("-" * 30)

    # Return a small dict for the comparison table later.
    return {
        "model_name": model.__class__.__name__,
        "rmse_mean": float(rmse_scores.mean()),
        "r2_mean": float(r2_scores.mean()),
    }

# ============================
# Models: defaults + why/when
# ============================

# 1) Decision Tree (single tree)
#    - `max_depth` is the main "knob" to prevent overfitting.
#    - Lower depth -> simpler tree (higher bias, lower variance).
#    - Try values like 3, 5, 7, 10 and see how CV R²/RMSE change.
dt = DecisionTreeRegressor(
    max_depth=5,     # <- TUNE: increase for more complexity
    random_state=42  # reproducible results
)
dt_results = evaluate_model_cv(dt, X, y, cv=kf)

# 2) Random Forest
#    - An ensemble of decision trees, trained on bootstrapped samples.
#    - Key knobs:
#        n_estimators: more trees generally better (to a point), slower training
#        max_features: how many features to consider when splitting ("sqrt" is common for high-dim)
#        min_samples_leaf: increases leaf size => smoother predictions, less overfit
#    - NOTE: n_jobs=1 avoids nested processes since our CV already parallelizes with threads.
rf = RandomForestRegressor(
    n_estimators=300,   # <- TUNE: 200–1000 (tradeoff speed/accuracy)
    max_features="sqrt",# <- TUNE: "sqrt", "log2", or a float (0.3–0.8)
    min_samples_leaf=1, # <- TUNE: 1–10; larger reduces overfitting
    random_state=42,
    n_jobs=1            # avoid process-based forks inside the model
)
rf_results = evaluate_model_cv(rf, X, y, cv=kf)

# 3) Gradient Boosting (scikit-learn)
#    - Builds trees sequentially to correct previous errors.
#    - Key knobs:
#        n_estimators: more boosting rounds (default=100)
#        learning_rate: step size per round (smaller -> need more rounds; often 0.05–0.1)
#        max_depth or max_leaf_nodes via base_estimator params (controls tree complexity)
#    - Good baseline booster without extra installs.
gbr = GradientBoostingRegressor(
    random_state=42
    # <- TUNE: learning_rate=0.05, n_estimators=300, max_depth=3 via max_depth in base tree
)
gbr_results = evaluate_model_cv(gbr, X, y, cv=kf)

# 4) XGBoost Regressor
#    - Often stronger/faster with better regularization and handling of sparsity.
#    - Key knobs:
#        n_estimators: boosting rounds (200–1000 typical)
#        learning_rate: 0.03–0.1 common
#        max_depth: tree depth (3–10 typical; 6 is a sweet spot often)
#        subsample, colsample_bytree: stochasticity to reduce overfit (0.5–0.9 typical)
#        reg_lambda, reg_alpha: L2 / L1 regularization
#    - NOTE: n_jobs=1 to avoid nested processes.
xgb_model = xgb.XGBRegressor(
    n_estimators=500,        # <- TUNE: try 300–1000 with matching learning_rate
    learning_rate=0.05,      # <- TUNE: 0.03–0.1; lower needs more n_estimators
    max_depth=6,             # <- TUNE: 3–10 depending on data complexity
    subsample=0.8,           # <- TUNE: 0.6–0.9; lower can reduce overfitting
    colsample_bytree=0.8,    # <- TUNE: 0.6–0.9; feature subsampling
    objective="reg:squarederror",
    reg_lambda=1.0,          # <- TUNE: increase if overfitting
    random_state=42,
    n_jobs=1                 # avoid nested process-based parallelism
)
xgb_results = evaluate_model_cv(xgb_model, X, y, cv=kf)

# ==========================================
# Combine with your earlier linear models
# (mlr_results, ridge_results, lasso_results)
# ==========================================
all_results = [
    mlr_results, ridge_results, lasso_results,  # from your earlier cell
    dt_results, rf_results, gbr_results, xgb_results
]

# Make a clean comparison table sorted by R² (bigger is better).
results_df = pd.DataFrame(all_results).sort_values("r2_mean", ascending=False).reset_index(drop=True)

print("\nModel Comparison (5-fold CV with threading backend):")
display(results_df)

# ----------------------------------------------------------
# What to adjust next (quick guide):
#   - If train scores >> CV scores: you're likely overfitting → reduce max_depth,
#     increase min_samples_leaf, add regularization (XGB: raise reg_lambda, lower max_depth),
#     or add more data.
#   - If both train & CV are low: underfitting → allow more complexity
#     (deeper trees, more estimators, higher learning rate—but carefully).
#   - If runtime is too long: reduce n_estimators first.
# ----------------------------------------------------------


Model: DecisionTreeRegressor
Average RMSE: $38,836.56
Average R²: 0.7301
------------------------------
Model: RandomForestRegressor
Average RMSE: $24,967.57
Average R²: 0.8885
------------------------------
Model: GradientBoostingRegressor
Average RMSE: $23,455.31
Average R²: 0.9007
------------------------------
Model: XGBRegressor
Average RMSE: $21,946.84
Average R²: 0.9134
------------------------------

Model Comparison (5-fold CV with threading backend):


Unnamed: 0,model_name,rmse_mean,r2_mean
0,Ridge,21862.909063,0.913821
1,XGBRegressor,21946.840027,0.913386
2,Lasso,22592.859707,0.908062
3,LinearRegression,22746.179244,0.906764
4,GradientBoostingRegressor,23455.308049,0.900662
5,RandomForestRegressor,24967.571171,0.888512
6,DecisionTreeRegressor,38836.559523,0.730058
