# Tree-Based Methods

## Agenda
- Decision Trees
- Bagging
- Random Forests
- Trees for Regression


In [40]:
import urllib.request
import warnings
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn.model_selection import train_test_split

# Suppress all warnings
warnings.filterwarnings('ignore')



## Plot Functions

In [4]:
def plot_tree(model, features):
    """Draw a fitted decision tree as a node diagram and show the figure.

    Parameters
    ----------
    model : fitted scikit-learn decision tree
        The estimator handed to ``sklearn.tree.plot_tree``.
    features : list of str
        Names used to label the split features in the diagram.
    """
    # Node values are shown as proportions with two decimals, and boxes are
    # colour-filled.
    drawing_options = dict(
        feature_names=features,
        proportion=True,
        precision=2,
        filled=True,
    )
    plt.figure(figsize=(15, 10))
    tree.plot_tree(model, **drawing_options)
    plt.show()


In [5]:
def plot_tree_scatter(model, df, target, feat1, feat2, scatterplot=True):
    """Plot the rectangular regions a fitted tree carves out of a 2-D feature plane.

    Each split is drawn as a dashed line (black for splits on model feature 0,
    red for splits on model feature 1). Each leaf region is shaded with a
    ``coolwarm`` colour: for models exposing ``predict_proba`` the colour is
    the predicted probability of the second class; otherwise it is the
    prediction rescaled by the min/max of ``df[target]``.

    Parameters
    ----------
    model : fitted scikit-learn decision tree
        Must have been trained on exactly two features in the order
        ``[feat1, feat2]``: feature index 0 is drawn along the x axis and
        feature index 1 along the y axis.
    df : pandas.DataFrame
        Supplies the plot extent (min/max of ``feat1`` and ``feat2``), the
        optional scatter points and, for regressors, the colour scale.
    target : str
        Column used as the scatter hue and for the regressor colour scale.
    feat1, feat2 : str
        Columns plotted on the x and y axes respectively.
    scatterplot : bool, default True
        Whether to overlay the observations in ``df``.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the partition was drawn on.

    Raises
    ------
    ValueError
        If the model reports (via ``n_features_in_``) a feature count other
        than two.
    """
    # Fail early with a clear message instead of an obscure KeyError / shape
    # error further down when the model was not fit on exactly two features.
    n_features = getattr(model, "n_features_in_", None)
    if n_features is not None and n_features != 2:
        raise ValueError(
            f"plot_tree_scatter needs a model fit on exactly 2 features, got {n_features}"
        )

    fig, ax = plt.subplots(figsize=(10, 6))

    if scatterplot:
        sns.scatterplot(x=feat1, y=feat2, hue=target, data=df, ax=ax)

    tree_ = model.tree_
    is_classifier = hasattr(model, "predict_proba")
    if not is_classifier:
        # The colour scale for regressors is loop-invariant: compute it once.
        pred_min = df[target].min()
        pred_range = df[target].max() - pred_min

    x_min = df[feat1].min()
    x_max = df[feat1].max()
    y_min = df[feat2].min()
    y_max = df[feat2].max()
    # Maps node id -> (x_min, x_max, y_min, y_max) of the region it covers.
    node_bounds = {0: (x_min, x_max, y_min, y_max)}

    # Nodes are visited in id order; this relies on every parent being
    # visited before its children so that the child's bounds already exist.
    for node in range(tree_.node_count):
        curr_x_min, curr_x_max, curr_y_min, curr_y_max = node_bounds[node]
        feature = tree_.feature[node]

        # scikit-learn stores -2 as the split feature of leaf nodes.
        if feature == -2:
            center_x = (curr_x_min + curr_x_max) / 2
            center_y = (curr_y_min + curr_y_max) / 2

            # Predicting on a bare list can emit a UserWarning (presumably the
            # missing-feature-names warning when the model was fit on a
            # DataFrame); silence it only for this call.
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=UserWarning)
                if is_classifier:
                    prob_positive = model.predict_proba([[center_x, center_y]])[0][1]
                    color = plt.cm.coolwarm(prob_positive)
                else:
                    prediction = model.predict([[center_x, center_y]])[0]
                    # Normalize prediction to [0, 1] for coloring. A constant
                    # target has zero range; use the colormap midpoint rather
                    # than dividing by zero.
                    if pred_range > 0:
                        normalized_pred = (prediction - pred_min) / pred_range
                    else:
                        normalized_pred = 0.5
                    color = plt.cm.coolwarm(normalized_pred)

            ax.add_patch(
                plt.Rectangle(
                    (curr_x_min, curr_y_min),
                    curr_x_max - curr_x_min,
                    curr_y_max - curr_y_min,
                    color=color,
                    alpha=0.3,
                )
            )
        else:
            threshold = tree_.threshold[node]
            left_child = tree_.children_left[node]
            right_child = tree_.children_right[node]

            if feature == 0:
                # Split on the x-axis feature: vertical line, children share
                # the parent's y extent.
                ax.plot([threshold, threshold], [curr_y_min, curr_y_max], "k--")
                node_bounds[left_child] = (
                    curr_x_min,
                    threshold,
                    curr_y_min,
                    curr_y_max,
                )
                node_bounds[right_child] = (
                    threshold,
                    curr_x_max,
                    curr_y_min,
                    curr_y_max,
                )
            elif feature == 1:
                # Split on the y-axis feature: horizontal line, children share
                # the parent's x extent.
                ax.plot([curr_x_min, curr_x_max], [threshold, threshold], "r--")
                node_bounds[left_child] = (
                    curr_x_min,
                    curr_x_max,
                    curr_y_min,
                    threshold,
                )
                node_bounds[right_child] = (
                    curr_x_min,
                    curr_x_max,
                    threshold,
                    curr_y_max,
                )
    return ax

## Preparing the data

The dataset can only be downloaded as a zip file. Let's download, unzip, and open it.


In [6]:
url = "https://archive.ics.uci.edu/static/public/2/adult.zip"
urllib.request.urlretrieve(url, "adult.zip")

# The archive is unpacked into the "adults" directory, so the CSV files must
# be read from there (not from the current working directory).
with zipfile.ZipFile("adult.zip", "r") as zip_ref:
    zip_ref.extractall("adults")

# The raw files carry no header row; these are the column names to assign.
columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

# the dataset is pre-split into train and test
# I want to do the split myself, so let's join the datasets
df1 = pd.read_csv("adults/adult.data", header=None, names=columns)
# first row of adult.test is weird, let's remove it
df2 = pd.read_csv("adults/adult.test", header=None, names=columns)[1:]
df = pd.concat([df1, df2])

In [7]:
df.sample(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
14281,43,Private,483450.0,9th,5.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,40.0,Mexico,<=50K
1166,46,Private,224582.0,Some-college,10.0,Never-married,Machine-op-inspct,Not-in-family,White,Female,0.0,0.0,52.0,United-States,<=50K.
6388,35,Local-gov,182074.0,HS-grad,9.0,Separated,Protective-serv,Not-in-family,White,Male,0.0,0.0,42.0,United-States,<=50K.
2277,28,Private,437994.0,Some-college,10.0,Never-married,Other-service,Not-in-family,Black,Male,0.0,0.0,60.0,United-States,<=50K.
4487,20,Private,22966.0,Some-college,10.0,Never-married,Other-service,Own-child,White,Male,0.0,0.0,12.0,Canada,<=50K


There's one mysterious and undocumented column, `fnlwgt`; let's drop it.

In [8]:
# Discard the undocumented weight column
df = df.drop("fnlwgt", axis="columns")

Some columns have dashes (-) instead of underscores in their names. I don't like that, so I'll rename the columns to use underscores.

In [9]:
# Swap every dash in a column name for an underscore
df = df.rename(columns={name: name.replace("-", "_") for name in df.columns})

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48842 entries, 0 to 16281
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             48842 non-null  object 
 1   workclass       48842 non-null  object 
 2   education       48842 non-null  object 
 3   education_num   48842 non-null  float64
 4   marital_status  48842 non-null  object 
 5   occupation      48842 non-null  object 
 6   relationship    48842 non-null  object 
 7   race            48842 non-null  object 
 8   sex             48842 non-null  object 
 9   capital_gain    48842 non-null  float64
 10  capital_loss    48842 non-null  float64
 11  hours_per_week  48842 non-null  float64
 12  native_country  48842 non-null  object 
 13  income          48842 non-null  object 
dtypes: float64(4), object(10)
memory usage: 5.6+ MB


There are no null values in the columns, but from the column description [here](https://archive.ics.uci.edu/dataset/2/adult) I see that some of them have missing values. Maybe the missing values are encoded differently? 

In [11]:
df.sample(10)

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
16523,79,?,HS-grad,9.0,Married-civ-spouse,?,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
14776,35,State-gov,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Male,0.0,0.0,45.0,United-States,<=50K.
15722,50,Private,HS-grad,9.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,40.0,United-States,<=50K
21926,28,Private,HS-grad,9.0,Never-married,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,42.0,United-States,<=50K
12575,38,Self-emp-inc,Some-college,10.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,7298.0,0.0,40.0,United-States,>50K
25798,22,Private,Some-college,10.0,Never-married,Sales,Own-child,White,Female,0.0,0.0,35.0,United-States,<=50K
23865,67,?,10th,6.0,Never-married,?,Not-in-family,Black,Female,0.0,0.0,35.0,United-States,<=50K
5093,57,Self-emp-not-inc,11th,7.0,Divorced,Other-service,Unmarried,White,Male,4650.0,0.0,50.0,United-States,<=50K.
15869,64,Private,11th,7.0,Divorced,Machine-op-inspct,Not-in-family,White,Female,0.0,0.0,40.0,United-States,<=50K.
10014,24,Private,Some-college,10.0,Never-married,Sales,Own-child,White,Female,0.0,0.0,40.0,United-States,<=50K


Indeed, "?" seems to represent a missing value. In fact, it's " ?" (with a leading space).

In [12]:
# Replace the " ?" placeholder (note the leading space) with NaN in all columns.
# np.nan is passed explicitly: the meaning of value=None in DataFrame.replace
# differs between pandas versions (older releases fall back to forward-filling
# instead of inserting a null).
df = df.replace(" ?", np.nan)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48842 entries, 0 to 16281
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             48842 non-null  object 
 1   workclass       46043 non-null  object 
 2   education       48842 non-null  object 
 3   education_num   48842 non-null  float64
 4   marital_status  48842 non-null  object 
 5   occupation      46033 non-null  object 
 6   relationship    48842 non-null  object 
 7   race            48842 non-null  object 
 8   sex             48842 non-null  object 
 9   capital_gain    48842 non-null  float64
 10  capital_loss    48842 non-null  float64
 11  hours_per_week  48842 non-null  float64
 12  native_country  47985 non-null  object 
 13  income          48842 non-null  object 
dtypes: float64(4), object(10)
memory usage: 5.6+ MB


In [14]:
df["income"].value_counts()

income
<=50K     24720
<=50K.    12435
>50K       7841
>50K.      3846
Name: count, dtype: int64

In [15]:
# map target to more usable 0/1
# Some labels carry a trailing period ("<=50K." / ">50K."), so remove it.
# regex=False makes the literal match explicit; the default for `regex`
# changed across pandas versions and "." is a regex metacharacter.
df["income"] = df["income"].str.strip().str.replace(".", "", regex=False)
df["income"] = df["income"].map({"<=50K": 0, ">50K": 1})
# Series.map turns unknown labels into NaN silently; fail loudly instead.
assert df["income"].notna().all(), "unexpected income label"

# Convert age column to integer type
df["age"] = df["age"].astype(int)

In [16]:
df["sex"].value_counts()

sex
Male      32650
Female    16192
Name: count, dtype: int64

In [17]:
# Encode sex as 1 (Male) / 0 (Female). The raw values carry a leading space,
# so strip whitespace first (as done for income) rather than hard-coding the
# padding into the mapping keys.
df["sex"] = df["sex"].str.strip().map({"Male": 1, "Female": 0})

The dataset is now ready for analysis and modelling!

In [18]:
# Split the dataset into training and testing sets: 20% of the rows are held
# out for testing, and the fixed random_state keeps the split reproducible.
# (train_test_split is imported in the imports cell at the top of the notebook.)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=10)

## Regression Data

In [None]:
# Load the Hitters data and keep only the rows that have a Salary value
df_hit = pd.read_csv(
    "https://raw.githubusercontent.com/intro-stat-learning/ISLP/main/ISLP/data/Hitters.csv"
).dropna(subset=["Salary"])