In [34]:
# Pandas is the Python package for data frames
# Seaborn is for data visualization

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt



In [35]:
# Part 1 Data Acquisition

# Read data from a CSV file into a data frame
df = pd.read_csv("AmesHousing.csv", index_col=0)

# Display the first ten rows of the data frame to examine if it is an individual-level data set
print(df.head(10))

# Display the variable list
print(df.columns.values)

# Display the number of rows and the number of columns in the data set to confirm the portrait shape
# The first element of the output is the number of rows and the second is the number of columns 
df.shape

# Let's drop columns that are unnamed or have no meaning, and also drop PID
# Identify unnamed columns and PID, which has no meaning
unnamed_cols = [col for col in df.columns if 'unnamed' in col.lower() or 'no meaning' in col.lower()]
columns_to_drop = unnamed_cols + ['PID']

# Drop identified columns
df_cleaned = df.drop(columns=columns_to_drop, errors='ignore')

print(f"Dropped {len(unnamed_cols)} unnamed columns and PID")
print(f"DataFrame shape after dropping columns: {df_cleaned.shape}")

df_cleaned.shape

             PID  MS SubClass MS Zoning  Lot Frontage  Lot Area Street Alley  \
Order                                                                          
1      526301100           20        RL         141.0     31770   Pave   NaN   
2      526350040           20        RH          80.0     11622   Pave   NaN   
3      526351010           20        RL          81.0     14267   Pave   NaN   
4      526353030           20        RL          93.0     11160   Pave   NaN   
5      527105010           60        RL          74.0     13830   Pave   NaN   
6      527105030           60        RL          78.0      9978   Pave   NaN   
7      527127150          120        RL          41.0      4920   Pave   NaN   
8      527145080          120        RL          43.0      5005   Pave   NaN   
9      527146030          120        RL          39.0      5389   Pave   NaN   
10     527162130           60        RL          60.0      7500   Pave   NaN   

      Lot Shape Land Contour Utilities 

(2930, 80)

In [36]:
# Part 3 Missing Value Imputation

# Show the number of missing values before we start
print("Missing values before imputation:")
print(df_cleaned.isnull().sum().sort_values(ascending=False).head(10))
print("-" * 30)

# Drop rows where the dependent variable is missing
DV = 'SalePrice'
df_sample1 = df_cleaned.dropna(subset=[DV]).copy() # Use .copy() to avoid warnings

# --- 1. Impute "Meaningful NA" Categoricals ---
# These are columns where 'NA' is a category (e.g., "No Basement"), not missing data.
meaningful_na_columns = [
    'Alley', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 
    'BsmtFin Type 1', 'BsmtFin Type 2', 'FireplaceQu',
    'Garage Type', 'Garage Finish', 'Garage Qual', 'Garage Cond',
    'Pool QC', 'Fence', 'Misc Feature', 'Mas Vnr Type'
]

for col in meaningful_na_columns:
    if col in df_sample1.columns:
        df_sample1[col] = df_sample1[col].fillna('None')

# --- 2. Numerical Imputation ---
# We create df_sample2 by filling all numerical NAs
df_sample2 = df_sample1.copy()

# A. Smart Imputation (Context-Aware): Fill with 0
# If a house has no basement, its basement-related numericals should be 0, not a median.

# Basement-related numericals, if there is no Basement, then the other Basement columns get 0.
if 'Bsmt Qual' in df_sample2.columns:
    mask = (df_sample2['Bsmt Qual'] == 'None')
    bsmt_num_cols = ['Total Bsmt SF', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Bsmt Full Bath', 'Bsmt Half Bath']
    for col in bsmt_num_cols:
        if col in df_sample2.columns:
            df_sample2.loc[mask, col] = df_sample2.loc[mask, col].fillna(0)

# Garage-related numericals: If there is no Garage, then the other Garage columns get 0.
if 'Garage Type' in df_sample2.columns:
    mask = (df_sample2['Garage Type'] == 'None')
    garage_num_cols = ['Garage Cars', 'Garage Area', 'Garage Yr Blt']
    for col in garage_num_cols:
        if col in df_sample2.columns:
            df_sample2.loc[mask, col] = df_sample2.loc[mask, col].fillna(0)

# Masonry veneer numericals. If there is no Masonry Veneer Numerical, then the other Masonry columns get 0.
if 'Mas Vnr Type' in df_sample2.columns:
    mask = (df_sample2['Mas Vnr Type'] == 'None')
    if 'Mas Vnr Area' in df_sample2.columns:
         df_sample2.loc[mask, 'Mas Vnr Area'] = df_sample2.loc[mask, 'Mas Vnr Area'].fillna(0)
 
# 'Lot Frontage' is likely similar for houses in the same 'Neighborhood', so I use the group median of lot frontage for the neighborhood to impute..
if 'Lot Frontage' in df_sample2.columns and 'Neighborhood' in df_sample2.columns:
    # Fill NAs with the median Lot Frontage of that specific neighborhood
    df_sample2['Lot Frontage'] = df_sample2.groupby('Neighborhood')['Lot Frontage'].transform(lambda x: x.fillna(x.median()))
    # If any NAs remain (e.g., a whole neighborhood was NA), fill with the overall median
    df_sample2['Lot Frontage'] = df_sample2['Lot Frontage'].fillna(df_sample2['Lot Frontage'].median())

# C. Generic Median Imputation (Fallback)
# Now, find ALL remaining numerical columns and fill them with their median.
# This will handle columns like 'Lot Area' and any NAs our previous logic missed.
all_numerical_cols = df_sample2.select_dtypes(include=np.number).columns
df_sample2[all_numerical_cols] = df_sample2[all_numerical_cols].fillna(value=df_sample2[all_numerical_cols].median())


# --- 3. Categorical Imputation ---
# We create df_sample4 by filling all remaining categorical NAs
df_sample4 = df_sample2.copy()


# B. Generic Mode Imputation (Fallback)
# Find ALL remaining categorical/object columns and fill with their mode.
all_categorical_cols = df_sample4.select_dtypes(include=['object', 'category']).columns

for col in all_categorical_cols:
     df_sample4[col] = df_sample4[col].fillna(df_sample4[col].mode()[0])


# --- 4. Final Check ---
# This command should now return 0. There should now be no missing values in our dataset.
total_missing = df_sample4.isnull().sum().sum()
print("-" * 30)
print(f"Total missing values remaining in df_sample4: {total_missing}")



Missing values before imputation:
Pool QC          2917
Misc Feature     2824
Alley            2732
Fence            2358
Mas Vnr Type     1775
Fireplace Qu     1422
Lot Frontage      490
Garage Cond       159
Garage Yr Blt     159
Garage Finish     159
dtype: int64
------------------------------
------------------------------
Total missing values remaining in df_sample4: 0


In [37]:
# Part 4 Variable transformation

# Below our two lists that encapsulate all columns in the dataset. 
nvar_list_original = [
    # Continuous
    'Lot Frontage', 'Lot Area', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2',
    'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
    'Gr Liv Area', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch',
    '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Misc Val', 'SalePrice',
    
    # Discrete
    'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Bsmt Full Bath',
    'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr',
    'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars', 'Mo Sold',
    'Yr Sold'
]

cvar_list_original = [
    # Nominal
    'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config',
    'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style',
    'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
    'Foundation', 'Heating', 'Central Air', 'Garage Type', 'Misc Feature',
    'Sale Type', 'Sale Condition',
    
    # Ordinal
    'Lot Shape', 'Utilities', 'Land Slope', 'Exter Qual', 'Exter Cond',
    'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2',
    'Heating QC', 'Electrical', 'Kitchen Qual', 'Functional', 'Fireplace Qu',
    'Garage Finish', 'Garage Qual', 'Garage Cond', 'Paved Drive', 'Pool QC',
    'Fence'
]

# Make a dataset of only our dependent and indepedent variables
model_cols = cvar_list_original + nvar_list_original
df_sample5 = df_sample4[model_cols].copy()

# --- CHANGE 1: Standardization Removed ---
# We intentionally DO NOT standardize numericals for Tree models.
# We just keep the original values.
# df_sample5[nvar_list_original] = (df_sample4[nvar_list_original] - df_sample4[nvar_list_original].mean())/df_sample4[nvar_list_original].std()

# Set the datatype for the variables in the cvar_list to be categorical in Python
# Set the datatype for the variables in the nvar_list to be numerical in Python 
df_sample6 = df_sample5.copy()
df_sample6[cvar_list_original] = df_sample5[cvar_list_original].astype('category')
df_sample6[nvar_list_original] = df_sample5[nvar_list_original].astype('float64')

# Convert the categorical variables into dummies (Step 1 of dummy coding)
df_sample7 = df_sample6.copy()
# drop_first=False (default) keeps ALL categories (One-Hot Encoding), which is better for Trees.
df_sample7 = pd.get_dummies(df_sample6, prefix_sep='_', dtype=int)

# --- CHANGE 2: "Drop One" Loop Removed ---
# We intentionally keep all dummies for Tree models.
df_sample8 = df_sample7.copy()

# (Loop removed)

# Check resulting columns
print("\nRemaining columns (Full One-Hot Encoding):")
print(df_sample8.columns.values)

# Display the milestone dataframe. Compare it with the original dataframe.
print(df_sample8)
print(df)


Remaining columns (Full One-Hot Encoding):
['Lot Frontage' 'Lot Area' 'Mas Vnr Area' 'BsmtFin SF 1' 'BsmtFin SF 2'
 'Bsmt Unf SF' 'Total Bsmt SF' '1st Flr SF' '2nd Flr SF' 'Low Qual Fin SF'
 'Gr Liv Area' 'Garage Area' 'Wood Deck SF' 'Open Porch SF'
 'Enclosed Porch' '3Ssn Porch' 'Screen Porch' 'Pool Area' 'Misc Val'
 'SalePrice' 'Overall Qual' 'Overall Cond' 'Year Built' 'Year Remod/Add'
 'Bsmt Full Bath' 'Bsmt Half Bath' 'Full Bath' 'Half Bath' 'Bedroom AbvGr'
 'Kitchen AbvGr' 'TotRms AbvGrd' 'Fireplaces' 'Garage Yr Blt'
 'Garage Cars' 'Mo Sold' 'Yr Sold' 'MS SubClass_20' 'MS SubClass_30'
 'MS SubClass_40' 'MS SubClass_45' 'MS SubClass_50' 'MS SubClass_60'
 'MS SubClass_70' 'MS SubClass_75' 'MS SubClass_80' 'MS SubClass_85'
 'MS SubClass_90' 'MS SubClass_120' 'MS SubClass_150' 'MS SubClass_160'
 'MS SubClass_180' 'MS SubClass_190' 'MS Zoning_A (agr)'
 'MS Zoning_C (all)' 'MS Zoning_FV' 'MS Zoning_I (all)' 'MS Zoning_RH'
 'MS Zoning_RL' 'MS Zoning_RM' 'Street_Grvl' 'Street_Pave' 'All

In [38]:
# Part 5 Data Partition
from sklearn.model_selection import train_test_split

# Define lists (No standardization for trees)
nvar_list_original = [
    'Lot Frontage', 'Lot Area', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2',
    'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
    'Gr Liv Area', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch',
    '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Misc Val', 'SalePrice',
    'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Bsmt Full Bath',
    'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr',
    'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars', 'Mo Sold', 'Yr Sold'
]

cvar_list_original = [
    'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config',
    'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style',
    'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
    'Foundation', 'Heating', 'Central Air', 'Garage Type', 'Misc Feature',
    'Sale Type', 'Sale Condition', 'Lot Shape', 'Utilities', 'Land Slope', 
    'Exter Qual', 'Exter Cond', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 
    'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating QC', 'Electrical', 'Kitchen Qual', 
    'Functional', 'Fireplace Qu', 'Garage Finish', 'Garage Qual', 'Garage Cond', 
    'Paved Drive', 'Pool QC', 'Fence'
]

# Combine and Partition
model_columns = cvar_list_original + nvar_list_original
df_sample5 = df_sample4[model_cols].copy()

# Convert to proper types
df_sample6 = df_sample5.copy()
df_sample6[cvar_list_original] = df_sample5[cvar_list_original].astype('category')
df_sample6[nvar_list_original] = df_sample5[nvar_list_original].astype('float64')

# One-Hot Encoding (Keep all columns)
df_sample7 = pd.get_dummies(df_sample6, prefix_sep='_', dtype=int)

# Split
target = 'SalePrice'
X = df_sample7.drop(columns=[target])
y = df_sample7[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

print(f"Train Shape: {X_train.shape}")
print(f"Test Shape: {X_test.shape}")

Train Shape: (2344, 332)
Test Shape: (586, 332)


In [None]:
# PART 7: Regression Tree 

from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from io import StringIO
import pydotplus
from IPython.display import Image
import numpy as np

# 1. User-Defined Function: Summary Tree (Visualization)
# ADAPTED: Removed 'class_names' because regression predicts values, not classes.
def summary_tree(model_object):
    dot_data = StringIO()
    export_graphviz(model_object, out_file=dot_data, filled=True,
                  rounded=True, special_characters=True, 
                  feature_names=X_train.columns.values) # class_names removed
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    output_imagefile = 'tree.png'
    graph.write_png(output_imagefile)
    return output_imagefile

# 3. Run Regression Tree with GridSearchCV

kfolds = 5
# We tune Max Depth just like the professor, but we also tune min_samples_leaf
# to prevent the tree from getting too messy for the path printer.
param_grid = {
    'max_depth': list(range(1, 21)), # Checked 1 to 20 (100 is too slow/complex for full text output)
    'min_samples_leaf': [5, 10, 20]  # Added to ensure leaf nodes represent groups of houses
}

# ADAPTED: Used DecisionTreeRegressor and scoring='neg_mean_squared_error'
gridsearch = GridSearchCV(
    DecisionTreeRegressor(random_state=1), 
    param_grid, 
    scoring='neg_mean_squared_error', 
    cv=kfolds, 
    n_jobs=-1 
)

# Fit the model
gridsearch.fit(X_train, y_train)
clf_BPT = gridsearch.best_estimator_

# 4. Display the Tree and Stats

# Display the best pruned tree image
try:
    display(Image(summary_tree(clf_BPT)))
except:
    print("Could not generate image (pydotplus might be missing), skipping image.")

# Display stats
print(f"Best Depth: {clf_BPT.get_depth()}")
# Calculate RMSE manually since predict_proba doesn't exist for regression
y_pred = clf_BPT.predict(X_test)
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")
print(f"Test R2: {r2_score(y_test, y_pred):.4f}")


# 5. Leaf Node Statistics
# ADAPTED: Modified to print "Average Value" instead of "Class Counts"

def get_treepaths_regression(dtc, df):
    rules_list = []
    values_path = []
    values = dtc.tree_.value

    def RevTraverseTree(tree, node, rules, pathValues):
        try:
            prevnode = tree[2].index(node)
            leftright = '<='
            pathValues.append(values[prevnode])
        except ValueError:
            # failed, so find it as a right node
            prevnode = tree[3].index(node)
            leftright = '>'
            pathValues.append(values[prevnode])

        # Get the rule
        p1 = df.columns[tree[0][prevnode]]
        p2 = tree[1][prevnode]
        rules.append(str(p1) + ' ' + leftright + ' ' + str(p2))

        # If not at top, go up one step
        if prevnode != 0:
            RevTraverseTree(tree, prevnode, rules, pathValues)

    # Get leaf nodes
    leaves = dtc.tree_.children_left == -1
    leaves = np.arange(0,dtc.tree_.node_count)[leaves]

    # Build simplified tree structure
    thistree = [dtc.tree_.feature.tolist()]
    thistree.append(dtc.tree_.threshold.tolist())
    thistree.append(dtc.tree_.children_left.tolist())
    thistree.append(dtc.tree_.children_right.tolist())

    # Apply rules
    for (ind,nod) in enumerate(leaves):
        rules = []
        pathValues = []
        RevTraverseTree(thistree, nod, rules, pathValues)

        pathValues.insert(0, values[nod])
        pathValues = list(reversed(pathValues))
        rules = list(reversed(rules))
        rules_list.append(rules)
        values_path.append(pathValues)

    # Print results
    for i in range(len(rules_list)):
        print('\nLeaf node ID =', i+1)
        print('Path =', rules_list[i])
        samples = dtc.tree_.n_node_samples[leaves[i]]
        
        # --- ADAPTATION START ---
        # In Regression, values_path contains the MEAN value of the node
        predicted_value = values_path[i][-1][0][0]
        
        print('samples =', int(samples))
        print(f'Predicted Sale Price = ${predicted_value:,.2f}')
        # --- ADAPTATION END ---

    return None

# Run the function (Limiting depth for print readability if needed)
print("generating tree rules...")
get_treepaths_regression(dtc=clf_BPT, df=X_train)

ModuleNotFoundError: No module named 'pydotplus'

In [None]:
# --- PART 8: Random Forest ---
from sklearn.ensemble import RandomForestRegressor

# GridSearch for CV (finding the "optimal alpha")
param_grid_rf = {
    'n_estimators': [100],
    'max_depth': [10, 20, None],
    'min_samples_leaf': [1, 5]
}

grid_rf = GridSearchCV(RandomForestRegressor(random_state=1), param_grid_rf, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_rf.fit(X_train, y_train)

best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)
print(f"\n--- Random Forest Results ---")
print(f"Best Params: {grid_rf.best_params_}")
print(f"Test R2: {r2_score(y_test, y_pred_rf):.4f}")

# --- PART 9: Gradient Boosting ---
from sklearn.ensemble import GradientBoostingRegressor

# GridSearch for CV
param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1], # This is the ALPHA (Regularization)
    'max_depth': [3, 5]
}

grid_gb = GridSearchCV(GradientBoostingRegressor(random_state=1), param_grid_gb, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_gb.fit(X_train, y_train)

best_gb = grid_gb.best_estimator_
y_pred_gb = best_gb.predict(X_test)
print(f"\n--- Gradient Boosting Results ---")
print(f"Best Params: {grid_gb.best_params_}")
print(f"Test R2: {r2_score(y_test, y_pred_gb):.4f}")

# --- PART 10: XGBoost ---
import xgboost as xgb

# GridSearch for CV
param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'reg_alpha': [0, 1.0] # Explicit Alpha
}

grid_xgb = GridSearchCV(xgb.XGBRegressor(objective='reg:squarederror', random_state=1), param_grid_xgb, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_xgb.fit(X_train, y_train)

best_xgb = grid_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)
print(f"\n--- XGBoost Results ---")
print(f"Best Params: {grid_xgb.best_params_}")
print(f"Test R2: {r2_score(y_test, y_pred_xgb):.4f}")