In [4]:
# Dependencies
%pip install pandas numpy seaborn matplotlib scikit-learn pydotplus ipython joblib

Collecting pydotplus
  Using cached pydotplus-2.0.2.tar.gz (278 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [14 lines of output]
      ERROR: Can not execute `setup.py` since setuptools failed to import in the build environment with exception:
      Traceback (most recent call last):
        File "<pip-setuptools-caller>", line 14, in <module>
        File "c:\Users\jmart0509\AppData\Local\anaconda3\Lib\site-packages\setuptools\__init__.py", line 26, in <module>
          from .dist import Distribution
        File "c:\Users\jmart0509\AppData\Local\anaconda3\Lib\site-packages\setuptools\dist.py", line 20, in <module>
          from . import (
        File "c:\Users\jmart0509\AppData\Local\anaconda3\Lib\site-packages\setuptools\_entry_points.py", line 6, in <module>
          from jaraco.text import yield_lines
        File "c:\Users\jmart0509\AppData\Local\anaconda3\Lib\site-packages\setuptools\_vendor\jaraco\text\__init__.py", line 12, in <module>
          from jaraco.cont

In [5]:
# Part 1 Download dataset
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

#Load data
df = pd.read_csv("AmesHousing.csv", index_col=0)

#Display first few rows
print(df.head())

#Identify unnamed columns and PID, which has no meaning
unnamed_cols = [col for col in df.columns if 'unnamed' in col.lower() or 'no meaning' in col.lower()]
columns_to_drop = unnamed_cols + ['PID']

#Drop identified columns
df_cleaned = df.drop(columns=columns_to_drop, errors='ignore')

print(f"Dropped {len(unnamed_cols)} unnamed columns and PID")
print(f"DataFrame shape after dropping columns: {df_cleaned.shape}")

print(df_cleaned.head())

             PID  MS SubClass MS Zoning  Lot Frontage  Lot Area Street Alley  \
Order                                                                          
1      526301100           20        RL         141.0     31770   Pave   NaN   
2      526350040           20        RH          80.0     11622   Pave   NaN   
3      526351010           20        RL          81.0     14267   Pave   NaN   
4      526353030           20        RL          93.0     11160   Pave   NaN   
5      527105010           60        RL          74.0     13830   Pave   NaN   

      Lot Shape Land Contour Utilities  ... Pool Area Pool QC  Fence  \
Order                                   ...                            
1           IR1          Lvl    AllPub  ...         0     NaN    NaN   
2           Reg          Lvl    AllPub  ...         0     NaN  MnPrv   
3           IR1          Lvl    AllPub  ...         0     NaN    NaN   
4           Reg          Lvl    AllPub  ...         0     NaN    NaN   
5      

In [7]:
# Part 2 Missing Value Imputation

# Show the number of missing values before we start
print("Missing values before imputation:")
print(df_cleaned.isnull().sum().sort_values(ascending=False).head(10))
print("-" * 30)

# --- CHANGE: We CANNOT drop rows where DV is missing, because DV doesn't exist yet.
# We proceed using df_cleaned directly.
df_sample1 = df_cleaned.copy() 

# --- 1. Impute "Meaningful NA" Categoricals ---
# These are columns where 'NA' is a category (e.g., "No Basement"), not missing data.
meaningful_na_columns = [
    'Alley', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 
    'BsmtFin Type 1', 'BsmtFin Type 2', 'FireplaceQu',
    'Garage Type', 'Garage Finish', 'Garage Qual', 'Garage Cond',
    'Pool QC', 'Fence', 'Misc Feature', 'Mas Vnr Type'
]

for col in meaningful_na_columns:
    if col in df_sample1.columns:
        df_sample1[col] = df_sample1[col].fillna('None')

# --- 2. Numerical Imputation ---
# We create df_sample2 by filling all numerical NAs
df_sample2 = df_sample1.copy()

# A. Smart Imputation (Context-Aware): Fill with 0
# If a house has no basement, its basement-related numericals should be 0, not a median.

# Basement-related numericals, if there is no Basement, then the other Basement columns get 0.
if 'Bsmt Qual' in df_sample2.columns:
    mask = (df_sample2['Bsmt Qual'] == 'None')
    bsmt_num_cols = ['Total Bsmt SF', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Bsmt Full Bath', 'Bsmt Half Bath']
    for col in bsmt_num_cols:
        if col in df_sample2.columns:
            df_sample2.loc[mask, col] = df_sample2.loc[mask, col].fillna(0)

# Garage-related numericals: If there is no Garage, then the other Garage columns get 0.
if 'Garage Type' in df_sample2.columns:
    mask = (df_sample2['Garage Type'] == 'None')
    garage_num_cols = ['Garage Cars', 'Garage Area', 'Garage Yr Blt']
    for col in garage_num_cols:
        if col in df_sample2.columns:
            df_sample2.loc[mask, col] = df_sample2.loc[mask, col].fillna(0)

# Masonry veneer numericals. If there is no Masonry Veneer Numerical, then the other Masonry columns get 0.
if 'Mas Vnr Type' in df_sample2.columns:
    mask = (df_sample2['Mas Vnr Type'] == 'None')
    if 'Mas Vnr Area' in df_sample2.columns:
         df_sample2.loc[mask, 'Mas Vnr Area'] = df_sample2.loc[mask, 'Mas Vnr Area'].fillna(0)
 
# 'Lot Frontage' is likely similar for houses in the same 'Neighborhood', so I use the group median of lot frontage for the neighborhood to impute..
if 'Lot Frontage' in df_sample2.columns and 'Neighborhood' in df_sample2.columns:
    # Fill NAs with the median Lot Frontage of that specific neighborhood
    df_sample2['Lot Frontage'] = df_sample2.groupby('Neighborhood')['Lot Frontage'].transform(lambda x: x.fillna(x.median()))
    # If any NAs remain (e.g., a whole neighborhood was NA), fill with the overall median
    df_sample2['Lot Frontage'] = df_sample2['Lot Frontage'].fillna(df_sample2['Lot Frontage'].median())

# C. Generic Median Imputation (Fallback)
# Now, find ALL remaining numerical columns and fill them with their median.
# This will handle columns like 'Lot Area' and any NAs our previous logic missed.
all_numerical_cols = df_sample2.select_dtypes(include=np.number).columns
df_sample2[all_numerical_cols] = df_sample2[all_numerical_cols].fillna(value=df_sample2[all_numerical_cols].median())


# --- 3. Categorical Imputation ---
# We create df_sample4 by filling all remaining categorical NAs
df_sample4 = df_sample2.copy()


# B. Generic Mode Imputation (Fallback)
# Find ALL remaining categorical/object columns and fill with their mode.
all_categorical_cols = df_sample4.select_dtypes(include=['object', 'category']).columns

for col in all_categorical_cols:
     df_sample4[col] = df_sample4[col].fillna(df_sample4[col].mode()[0])


# --- 4. Final Check ---
# This command should now return 0. There should now be no missing values in our dataset.
total_missing = df_sample4.isnull().sum().sum()
print("-" * 30)
print(f"Total missing values remaining in df_sample4: {total_missing}")

Missing values before imputation:
Pool QC          2917
Misc Feature     2824
Alley            2732
Fence            2358
Mas Vnr Type     1775
Fireplace Qu     1422
Lot Frontage      490
Garage Cond       159
Garage Yr Blt     159
Garage Finish     159
dtype: int64
------------------------------
------------------------------
Total missing values remaining in df_sample4: 0


In [8]:
# Part 3 Variable Transformation

# 1) Define variable lists
nvar_list = [
    'Lot Frontage', 'Lot Area', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2',
    'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
    'Gr Liv Area', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch',
    '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Misc Val',
    'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Bsmt Full Bath',
    'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr',
    'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars', 'Mo Sold', 'Yr Sold'
]

cvar_list = [
    'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config',
    'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style',
    'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
    'Foundation', 'Heating', 'Central Air', 'Garage Type', 'Misc Feature',
    'Sale Type', 'Sale Condition', 
    'Lot Shape', 'Utilities', 'Land Slope', 'Exter Qual', 'Exter Cond',
    'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2',
    'Heating QC', 'Electrical', 'Kitchen Qual', 'Functional', 'Fireplace Qu',
    'Garage Finish', 'Garage Qual', 'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence'
]


# 2) Make a modeling df that includes predictors.
needed_cols = nvar_list + cvar_list + ['SalePrice']
df_model = df_sample4[needed_cols].copy()

# 4) Type-cast
# SalePrice is numerical, but not in nvar_list, so it remains float/int automatically.
df_model[cvar_list] = df_model[cvar_list].astype('category')
df_model[nvar_list] = df_model[nvar_list].astype('float64')

# 5) Dummy-code ONLY predictor categoricals
X_num = df_model[nvar_list]
X_cat = pd.get_dummies(df_model[cvar_list], prefix_sep='_', dtype=int)

# Drop one baseline dummy PER predictor variable
for var in cvar_list:
    # drop the most frequent category’s dummy for each predictor
    mode_val = df_sample4[var].mode()[0]
    colname = f"{var}_{mode_val}"
    if colname in X_cat.columns:
        X_cat = X_cat.drop(columns=[colname])

# Combine predictors; 
X_full = pd.concat([X_num, X_cat, df_model[['SalePrice']]], axis=1)

# We do not define 'y' here yet.
print(f"Data prepared. Columns including SalePrice: {X_full.shape[1]}")

Data prepared. Columns including SalePrice: 289


In [9]:
# PART 4: Feature Selection and Partition as we do not use every variable in the dataset for our model 

# 1. Define the specific features you want to use
my_nvar_list = [
    'Lot Area', 'Overall Qual', 'Year Built', 'Total Bsmt SF', 'Gr Liv Area',
    'Full Bath', 'Garage Cars', 'Mas Vnr Area', '1st Flr SF'
]

my_cvar_list = [
    'Neighborhood_Blmngtn', 'Neighborhood_Blueste', 'Neighborhood_BrDale',
    'Neighborhood_BrkSide', 'Neighborhood_ClearCr', 'Neighborhood_CollgCr',
    'Neighborhood_Crawfor', 'Neighborhood_Edwards', 'Neighborhood_Gilbert',
    'Neighborhood_Greens', 'Neighborhood_GrnHill', 'Neighborhood_IDOTRR',
    'Neighborhood_Landmrk', 'Neighborhood_MeadowV', 'Neighborhood_Mitchel',
    'Neighborhood_NPkVill', 'Neighborhood_NWAmes', 'Neighborhood_NoRidge',
    'Neighborhood_NridgHt', 'Neighborhood_OldTown', 'Neighborhood_SWISU',
    'Neighborhood_Sawyer', 'Neighborhood_SawyerW', 'Neighborhood_Somerst',
    'Neighborhood_StoneBr', 'Neighborhood_Timber', 'Neighborhood_Veenker',
    'House Style_1.5Fin', 'House Style_1.5Unf', 'House Style_2.5Fin',
    'House Style_2.5Unf', 'House Style_2Story', 'House Style_SFoyer',
    'House Style_SLvl', 'Bldg Type_2fmCon', 'Bldg Type_Duplex',
    'Bldg Type_Twnhs', 'Bldg Type_TwnhsE', 'Kitchen Qual_Ex', 'Kitchen Qual_Fa',
    'Kitchen Qual_Gd', 'Kitchen Qual_Po', 'Exter Qual_Ex', 'Exter Qual_Fa',
    'Exter Qual_Gd', 'Foundation_BrkTil', 'Foundation_CBlock', 'Foundation_Slab',
    'Foundation_Stone', 'Foundation_Wood' 
]

# 2. Filter your 'X' from Part 3 to include ONLY these columns PLUS SalePrice
desired_columns = my_nvar_list + my_cvar_list + ['SalePrice']
valid_columns = X_full.columns.intersection(desired_columns)

print(f"Original Feature Count: {X_full.shape[1]}")
X_subset = X_full[valid_columns]
print(f"Filtered Feature Count (including SalePrice): {X_subset.shape[1]}")

# 3. Partition the data
from sklearn.model_selection import train_test_split

# We split the dataset FIRST
train_df, test_df = train_test_split(
    X_subset, test_size=0.2, random_state=1
)

# --- 4. CREATE DV USING TRAINING MEDIAN ---

# Calculate Median on Training Data Only
train_median = train_df['SalePrice'].median()
print(f"Training Median Sale Price: ${train_median:,.0f}")

# Create 'DV' column: 1 if In Budget (<= Median), 0 if Not
# Applying this logic to both Train and Test using train_median
y_train = (train_df['SalePrice'] <= train_median).astype(int)
y_test = (test_df['SalePrice'] <= train_median).astype(int)

# Remove 'SalePrice' from the predictors so the model doesn't cheat
X_train = train_df.drop(columns=['SalePrice'])
X_test = test_df.drop(columns=['SalePrice'])

print("\nPartition Complete.")
print(f"Training Data X: {X_train.shape}")
print(f"Testing Data X: {X_test.shape}")
print(f"Training Data y: {y_train.shape}")

Original Feature Count: 289
Filtered Feature Count (including SalePrice): 60
Training Median Sale Price: $163,500

Partition Complete.
Training Data X: (2344, 59)
Testing Data X: (586, 59)
Training Data y: (2344,)


In [12]:
# PART 5: Classification Tree

from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from io import StringIO
import pydotplus
from IPython.display import Image
import numpy as np

# 1. User-Defined Function: Summary Tree (Visualization)

def summary_tree(model_object):
  # Note: We use X_train.columns.values to get feature names
  dot_data = StringIO()
  export_graphviz(model_object, out_file=dot_data, filled=True,
                  rounded=True, special_characters=True, 
                  feature_names=X_train.columns.values,
                  class_names=['0', '1'])
  graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
  output_imagefile = 'tree.png'
  graph.write_png(output_imagefile)
  return output_imagefile


# 3. Run Classification Tree with GridSearchCV

kfolds = 5
maximum_depth = 100
minimum_depth = 1
param_grid = {'max_depth': list(range(minimum_depth, maximum_depth+1))}


gridsearch = GridSearchCV(
    DecisionTreeClassifier(criterion='entropy', random_state=1), 
    param_grid, 
    scoring='roc_auc', 
    cv=kfolds, 
    n_jobs=1 
)

# Fit the model using the training data 
gridsearch.fit(X_train, y_train)
clf_BPT = gridsearch.best_estimator_

# 4. Display the Tree and Stats

# Display the best pruned tree image
display(Image(summary_tree(clf_BPT)))

# Display depth and AUC
print(f"Best Depth: {clf_BPT.get_depth()}")
print(f"Test AUC: {roc_auc_score(y_test, clf_BPT.predict_proba(X_test)[:,1])}")


# 5. Leaf Node Statistics
# Modified by Reid Lapekas during Nov. 2024

def get_treepaths(dtc, df):
    rules_list = []
    values_path = []
    values = dtc.tree_.value

    def RevTraverseTree(tree, node, rules, pathValues):
        try:
            prevnode = tree[2].index(node)
            leftright = '<='
            pathValues.append(values[prevnode])
        except ValueError:
            # failed, so find it as a right node
            prevnode = tree[3].index(node)
            leftright = '>'
            pathValues.append(values[prevnode])

        # Get the rule
        p1 = df.columns[tree[0][prevnode]]
        p2 = tree[1][prevnode]
        rules.append(str(p1) + ' ' + leftright + ' ' + str(p2))

        # If not at top, go up one step
        if prevnode != 0:
            RevTraverseTree(tree, prevnode, rules, pathValues)

    # Get leaf nodes
    leaves = dtc.tree_.children_left == -1
    leaves = np.arange(0,dtc.tree_.node_count)[leaves]

    # Build simplified tree structure
    thistree = [dtc.tree_.feature.tolist()]
    thistree.append(dtc.tree_.threshold.tolist())
    thistree.append(dtc.tree_.children_left.tolist())
    thistree.append(dtc.tree_.children_right.tolist())

    # Apply rules
    for (ind,nod) in enumerate(leaves):
        rules = []
        pathValues = []
        RevTraverseTree(thistree, nod, rules, pathValues)

        pathValues.insert(0, values[nod])
        pathValues = list(reversed(pathValues))
        rules = list(reversed(rules))
        rules_list.append(rules)
        values_path.append(pathValues)

    # Print results
    for i in range(len(rules_list)):
      print('\nLeaf node ID =', i+1)
      print('Path =', rules_list[i])
      samples = dtc.tree_.n_node_samples[leaves[i]]
      # Probability * Samples = Count
      class_counts = np.round(values_path[i][-1][0] * samples).astype(int)
      
      # Note: Because of how sklearn stores values, we normalize to be safe
      # (Sometimes values_path contains raw counts, sometimes weighted)
      current_val = values_path[i][-1][0]
      if np.sum(current_val) > 0:
          normalized_probs = current_val / np.sum(current_val)
          class_counts = np.round(normalized_probs * samples).astype(int)
      
      print('sample =', int(samples))
      print('value =', list(class_counts))
      
      predicted_class = np.argmax(class_counts)
      print('class = ', predicted_class)

    return None

# Run the function on the Training data
get_treepaths(dtc=clf_BPT, df=X_train)


# Part 6: English Rule Descriptions


def extract_english_rules(tree_model, feature_names, class_labels=['0', '1']):
    tree_ = tree_model.tree_
    feature_name = [feature_names[i] if i != -2 else "undefined!" for i in tree_.feature]
    paths = []
    path = []

    def recurse(node, path, paths):
        if tree_.feature[node] != -2:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            recurse(tree_.children_left[node], path + [f"{name} <= {threshold:.2f}"], paths)
            recurse(tree_.children_right[node], path + [f"{name} > {threshold:.2f}"], paths)
        else:
            value = tree_.value[node][0]
            n_samples = int(np.sum(value))
            class_counts = [int(v) for v in value]
            p1 = class_counts[1] / n_samples if n_samples > 0 else 0
            paths.append({
                "rule": " AND ".join(path),
                "samples": n_samples,
                "class_0": class_counts[0],
                "class_1": class_counts[1],
                "predicted_probability": round(p1, 3),
                "predicted_class": class_labels[int(np.argmax(value))]
            })

    recurse(0, path, paths)
    return pd.DataFrame(paths).sort_values(by=["predicted_probability"], ascending=False)

# Generate and save
rules_df = extract_english_rules(clf_BPT, X_train.columns.values)
print("\nTop 5 Rules:")
display(rules_df.head(5))
rules_df.to_csv("tree_rules.csv", index=False)

ModuleNotFoundError: No module named 'pydotplus'