# Term Project - Isolation Forest for Anomaly Detection
> Leigh Goetsch <br>
> CSC 5601 - Theory of Machine Learning <br>
> Milwaukee School of Engineering <br>
> Fall 2025


In [None]:
# imports
from isolation_forest import IsolationForest
import pandas as pd
from scipy import io as sio
import numpy as np
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Load and view dataset
Each patient is represented in the data set by six biomechanical attributes derived from the shape and orientation of the pelvis and lumbar spine (in this order): pelvic incidence, pelvic tilt, lumbar lordosis angle, sacral slope, pelvic radius and grade of spondylolisthesis. The following convention is used for the class labels: Normal (NO) and Abnormal (AB). Note: this description applies to the vertebral column data (`vertebral.mat`) analyzed in the final cell of this notebook; the cell immediately below loads `TUANDROMD.csv`.

In [None]:
# Load the CSV, clean it, and split it into a feature matrix X and labels y.
# data_path = "../Data/IRIS.csv"
data_path = "../Data/TUANDROMD.csv"
loaded = pd.read_csv(data_path)

# Columns holding a single distinct value carry no information: remove them,
# then discard every row that still contains a null.
unique_counts = loaded.nunique()
constant_columns = unique_counts.index[unique_counts == 1]
df_data = loaded.drop(columns=constant_columns).dropna()

# Re-encode the label so that anomalies are 1: original label 1 -> 0,
# every other label -> 1.
is_label_one = df_data["target"] == 1
df_data["target"] = np.where(is_label_one, 0, 1)

# Feature matrix (everything except the label) and label vector.
y = df_data["target"].to_numpy()
X = df_data.drop(columns=["target"]).to_numpy()

# Cell output: absolute and relative class frequencies.
target_column = df_data["target"]
(target_column.value_counts(), target_column.value_counts(normalize=True))

# plot distribution of classes vs features
# sns.pairplot(df_data, hue='target', diag_kind='kde')

## Isolation Forest setup

In [None]:
# Fit the project's IsolationForest on the full feature matrix and label every
# sample in one step.
# NOTE(review): the report below presumes fit_predict returns 0/1 labels that
# match y's encoding (1 = anomaly) -- confirm against isolation_forest.py.
iso_forest = IsolationForest(contamination=0.205, random_state=42)
predictions = iso_forest.fit_predict(X)

# Per-class precision / recall / F1 of the predictions against the true labels.
report_text = classification_report(
    y,
    predictions,
    target_names=["Inlier", "Outlier"],
)
print(report_text)


In [None]:
# Pair each feature with the importance reported by the fitted forest.
# NOTE(review): assumes feature_importances_() yields one value per column of
# X, in column order -- confirm against isolation_forest.py.
feature_importances = iso_forest.feature_importances_()
feature_names = df_data.columns.drop("target")
importance_lookup = dict(zip(feature_names, feature_importances))

# Features ranked from most to least important.
importance_df = pd.DataFrame(
    list(importance_lookup.items()), columns=["Feature", "Importance"]
).sort_values(by="Importance", ascending=False)


# Build one long table with a row per (feature, distinct value) holding the
# inlier / outlier / total sample counts for that value and the feature's
# importance.
rows = []

for feature in feature_names:
    counts = (
        df_data.groupby([feature, "target"])
        .size()
        .unstack("target", fill_value=0)
        # Guarantee both class columns exist even if one class is absent from
        # the data; otherwise "outliers" / "inliers" would be missing later.
        .reindex(columns=[0, 1], fill_value=0)
        .rename(columns={0: "inliers", 1: "outliers"})
        # Name the index explicitly so reset_index() always produces a
        # "value" column, independent of how index names propagate through
        # value_counts()/merge() in the installed pandas version.
        .rename_axis(index="value", columns=None)
    )

    # Every row belongs to exactly one class, so the per-value total is the
    # sum of the two class counts (no separate value_counts + merge needed).
    counts["total_count"] = counts["inliers"] + counts["outliers"]
    counts["feature"] = feature
    counts["importance"] = importance_lookup[feature]

    rows.append(counts.reset_index())

info_df = pd.concat(rows, ignore_index=True)
info_df


In [None]:

# Feature importance vs. outlier ratio: one point per (feature, value) pair.
# The ratio is the share of samples with that value that are labelled outliers.
info_df["fraction_outliers"] = info_df["outliers"].div(info_df["total_count"])

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=info_df,
    x="importance",
    y="fraction_outliers",
    hue="value",          # colour encodes the feature value
    size="total_count",   # marker size encodes how many samples share it
    alpha=0.5,
    ax=ax,
)
ax.set_title("Feature Importance vs Outlier Ratio")
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Outlier Ratio")
fig.tight_layout()
plt.show()

In [None]:
# NOTE: disabled draft, kept for reference only. It will not run as written:
#   - `top_features` is never defined anywhere in this notebook;
#   - `merged_df` would only contain the columns Feature, Importance, value and
#     Frequency, so `hue="target"` has no column to refer to.
# feature_value_counts = df_data.melt(
#     id_vars=["target"],
#     value_vars=importance_df["Feature"].tolist(),
#     var_name="Feature",
#     value_name="value",
# ).groupby(["Feature", "value"]).size().reset_index(name="Frequency")

# merged_df = pd.merge(
#     importance_df, feature_value_counts, on=["Feature"]
# )
# feature_value_counts
# # pairplot of top 3 features colored by target
# sns.pairplot(
#     merged_df[merged_df["Feature"].isin(top_features["Feature"])],
#     hue="target",
#     diag_kind="kde",
# )

In [None]:
# Confusion-matrix heatmap: true labels on the rows, predicted labels on the
# columns, with the integer sample count annotated in each cell.
confusion_counts = pd.crosstab(
    y,
    predictions,
    rownames=["Actual"],
    colnames=["Predicted"],
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(confusion_counts, annot=True, fmt="d", ax=ax)
ax.set_title("Heatmap of Actual vs Predicted Outliers")
plt.show()

In [None]:
# Second experiment: load a .mat file and rerun the same pipeline.
data_path = "../Data/vertebral.mat"
# data_path = "../Data/satellite.mat"
# data_path = "../Data/satimage-2.mat"
mat_contents = sio.loadmat(data_path)

# The file stores the feature matrix under "X" and the labels under "y";
# the labels are flattened to a 1-D vector.
X = mat_contents["X"]
y = mat_contents["y"].flatten()

# The file provides no column names here, so generate positional ones.
feature_names = [f"feature_{col}" for col in range(X.shape[1])]
df_vertebral = pd.DataFrame(X, columns=feature_names).assign(target=y)


iso_forest = IsolationForest(contamination=0.205, random_state=42)
predictions = iso_forest.fit_predict(X)
print(classification_report(y, predictions, target_names=["Inlier", "Outlier"]))

# Scatter grid of all feature pairs, drawn twice with identical styling:
# first coloured by the true labels, then by the model's predictions.
scatter_style = {
    "figsize": (10, 10),
    "marker": "o",
    "hist_kwds": {"bins": 20},
    "s": 60,
    "alpha": 0.8,
}
for point_colors in (y, predictions):
    pd.plotting.scatter_matrix(
        df_vertebral[feature_names],
        c=point_colors,
        **scatter_style,
    )
    plt.show()