In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

In [2]:
# ========== 1) Load Train/Test Splits ==========

train_path = '../data/processed/train_local_80pct.csv'
test_path = '../data/processed/test_local_20pct.csv'

# The first CSV column becomes the index and is parsed as dates.
train_df = pd.read_csv(train_path, index_col=0, parse_dates=True)
test_df = pd.read_csv(test_path, index_col=0, parse_dates=True)

# For anomaly detection we only keep numeric columns; any remaining NaNs are
# replaced with 0 (forward-filling would be an alternative policy).
train_data = train_df.select_dtypes(include='number').fillna(0)

# Align the test features to the training feature list. Running select_dtypes
# independently on each split gives no guarantee that both matrices end up
# with the same columns in the same order, and everything fitted on
# train_data (scaler, model) expects exactly that layout.
# NOTE: a training column that is missing or non-numeric in the test split
# comes out as all-NaN here and is therefore filled with 0 like any other gap.
test_data = (
    test_df.select_dtypes(include='number')
    .reindex(columns=train_data.columns)
    .fillna(0)
)

In [3]:
# ========== 2) Scale the Data ==========

# Fit the scaler on the training split only, then push both splits through
# that same fitted transform so the test split never influences the scaling.
scaler = StandardScaler().fit(train_data)
train_scaled = scaler.transform(train_data)
test_scaled = scaler.transform(test_data)

In [4]:
# ========== 3) Train Isolation Forest ==========

# Hyper-parameters gathered in one place so they are easy to spot and tune.
forest_params = {
    'n_estimators': 100,
    'max_samples': 'auto',
    'contamination': 0.05,  # assumed fraction of outliers
    'random_state': 42,     # fixed seed for reproducible trees
    'n_jobs': -1,           # run on all CPU cores
}
iso_forest = IsolationForest(**forest_params)

# Fit on the scaled training split; as the cell's last expression this also
# displays the fitted estimator.
iso_forest.fit(train_scaled)

In [5]:
# ========== 4) Predict Anomalies on Test ==========

# Hard label for every test row: +1 means normal, -1 means anomaly.
test_labels = iso_forest.predict(test_scaled)

# Continuous outlier score for the same rows: the lower the value, the more
# anomalous the point (higher = more normal).
test_scores = iso_forest.decision_function(test_scaled)

In [6]:
# ========== 5) Combine Results into a DataFrame ==========

# Attach the model outputs to a new frame built from the test split;
# test_df itself is left unmodified.
results_df = test_df.assign(
    anomaly_score=test_scores,
    anomaly_label=test_labels,
)

# Keep only the rows the model flagged as anomalous (label -1).
is_anomaly = results_df['anomaly_label'].eq(-1)
anomalies = results_df.loc[is_anomaly]

print(f"Found {len(anomalies)} anomalies out of {len(results_df)} total points.")
results_df.head(10)

Found 604 anomalies out of 14026 total points.


Unnamed: 0,MT_001,MT_002,MT_003,MT_004,MT_005,MT_006,MT_007,MT_008,MT_009,MT_010,...,MT_368_rollmean_daily_cumsum,MT_368_rollstd_daily_cumsum,MT_369_daily_cumsum,MT_369_rollmean_daily_cumsum,MT_369_rollstd_daily_cumsum,MT_370_daily_cumsum,MT_370_rollmean_daily_cumsum,MT_370_rollstd_daily_cumsum,anomaly_score,anomaly_label
2014-03-14 19:30:00,,,,,,,,,,,...,,,,,,,,,0.088412,1
2014-03-14 20:00:00,,,,,,,,,,,...,,,,,,,,,0.083948,1
2014-03-14 20:30:00,,,,,,,,,,,...,,,,,,,,,0.084165,1
2014-03-14 21:00:00,,,,,,,,,,,...,,,,,,,,,0.081989,1
2014-03-14 21:30:00,,,,,,,,,,,...,,,,,,,,,0.074798,1
2014-03-14 22:00:00,,,,,,,,,,,...,,,,,,,,,0.069687,1
2014-03-14 22:30:00,,,,,,,,,,,...,,,,,,,,,0.061415,1
2014-03-14 23:00:00,,,,,,,,,,,...,,,,,,,,,0.061143,1
2014-03-14 23:30:00,,,,,,,,,,,...,,,,,,,,,0.045029,1
2014-03-15 00:00:00,,,,,,,,,,,...,,,,,,,,,0.092262,1
