In [1]:
pip install pandas scikit-learn numpy matplotlib


Defaulting to user installation because normal site-packages is not writeable
Collecting matplotlib
  Downloading matplotlib-3.9.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.59.0-cp39-cp39-macosx_10_9_universal2.whl.metadata (107 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.7-cp39-cp39-macosx_11_0_arm64.whl.metadata (6.3 kB)
Collecting pillow>=8 (from matplotlib)
  Downloading pillow-11.3.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (9.0 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)
Collecting importlib-resources>=3.2.0 (from matplotlib)
  Downloading importlib_resources-6.5.2-py3-

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import precision_score, recall_score, f1_score
import joblib

# Load data
df = pd.read_csv("dataset.csv")

# Strip single quotes from the ends of string values. Only non-numeric columns
# are visited (numeric cells have nothing to strip) and non-string cells such
# as NaN pass through untouched. Series.map is used per column because
# DataFrame.applymap is deprecated since pandas 2.1.
text_columns = [c for c in df.columns if not pd.api.types.is_numeric_dtype(df[c])]
for col in text_columns:
    df[col] = df[col].map(lambda x: x.strip("'") if isinstance(x, str) else x)

# Force the two numeric model features to concrete dtypes.
df = df.astype({"amount": float, "step": int})

# Downsample to ~10k rows while preserving fraud ratio
# Every fraud class is sampled at the same fraction, so the class ratio of the
# full data carries over. The fraction is capped at 1.0: sampling without
# replacement raises when asked for more rows than a group holds, which is
# what happened for datasets with fewer than 10k rows.
sample_frac = min(1.0, 10000 / len(df))

# Iterate the groups instead of using GroupBy.apply: apply's handling of the
# grouping column is deprecated since pandas 2.2, and losing "fraud" from the
# result would break the evaluation step. Re-seeding per group draws exactly
# the rows the apply-based version drew.
df_sample = pd.concat(
    [group.sample(frac=sample_frac, random_state=42) for _, group in df.groupby("fraud")]
)

# One-hot encode categorical features
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# keyword in 1.4, where passing it raises TypeError. Prefer the new name and
# fall back only for releases that predate it.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
cat_merchant_ohe = encoder.fit_transform(df_sample[["category", "merchant"]])

# Combine numeric + encoded
# Column layout of X: amount, step, then the one-hot columns.
X = np.hstack([df_sample[["amount", "step"]].to_numpy(), cat_merchant_ohe])

# Train Isolation Forest
# Hyper-parameters are gathered in one mapping: 100 trees, a contamination
# setting of 0.01, and a fixed seed so repeated runs build the same forest.
isoforest_params = {"n_estimators": 100, "contamination": 0.01, "random_state": 42}
# fit() returns the estimator itself, so construction and fitting can be chained.
model = IsolationForest(**isoforest_params).fit(X)

# Score
# decision_function yields the continuous anomaly score for each row;
# predict yields the hard label, where -1 marks a row flagged as anomalous.
anomaly_scores = model.decision_function(X)
is_anomaly = np.equal(model.predict(X), -1)

df_sample["anomaly_score"] = anomaly_scores
df_sample["predicted_anomaly"] = is_anomaly

# Evaluate
# Compare the model's anomaly flags against the fraud labels. Note that the
# metrics are computed on the same rows the model was fitted on.
y_true = df_sample["fraud"]
y_pred = df_sample["predicted_anomaly"]
precision, recall, f1 = (
    metric(y_true, y_pred) for metric in (precision_score, recall_score, f1_score)
)

# One print call, one metric per line.
print(
    f"Precision: {precision:.3f}",
    f"Recall:    {recall:.3f}",
    f"F1 Score:  {f1:.3f}",
    sep="\n",
)

# Save model + encoder
# Both the fitted forest and the fitted encoder are persisted, each to its
# own file in the current working directory.
MODEL_PATH = "financeai_isoforest.pkl"
ENCODER_PATH = "financeai_encoder.pkl"

joblib.dump(value=model, filename=MODEL_PATH)
joblib.dump(value=encoder, filename=ENCODER_PATH)