In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.ensemble import IsolationForest

pd.set_option("display.max_columns", 200)
np.random.seed(42)

In [2]:
df = pd.read_csv("ffiec_v2.csv")  # if needed: r"C:\path\to\ffiec.csv"
df.shape
df.head(3)

Unnamed: 0,IDRSSD,quarter,RCON0010_qoq,RCON0020_qoq,RCON0080_qoq,RCON0090_qoq,RCON0352_qoq,RCON1420_qoq,RCON1460_qoq,RCON1545_qoq,RCON1590_qoq,RCON1763_qoq,RCON1764_qoq,RCON1797_qoq,RCON2107_qoq,RCON2122_qoq,RCON2123_qoq,RCON2165_qoq,RCON2200_qoq,RCON2202_qoq,RCON2203_qoq,RCON2210_qoq,RCON2213_qoq,RCON2215_qoq,RCON2216_qoq,RCON2236_qoq,RCON2365_qoq,RCON2377_qoq,RCON2385_qoq,RCON2520_qoq,RCON2530_qoq,RCON3360_qoq,RCON3386_qoq,RCON3387_qoq,RCON3485_qoq,RCON3493_qoq,RCON3494_qoq,RCON3495_qoq,RCON3499_qoq,RCON3500_qoq,RCON3501_qoq,RCON5367_qoq,RCON5368_qoq,RCON5370_qoq,RCON5398_qoq,RCON5399_qoq,RCON5400_qoq,RCON5508_qoq,RCON5509_qoq,RCON5510_qoq,RCON5511_qoq,RCON5512_qoq,RCON5597_qoq,RCON6631_qoq,RCON6636_qoq,RCON6648_qoq,RCON6810_qoq,RCON6835_qoq,RCON9999_qoq,RCONA564_qoq,RCONA565_qoq,RCONA566_qoq,RCONA567_qoq,RCONA568_qoq,RCONA569_qoq,RCONB534_qoq,RCONB538_qoq,RCONB539_qoq,RCONB549_qoq,RCONB550_qoq,RCONB551_qoq,RCONB552_qoq,RCONB561_qoq,RCONB562_qoq,RCONB563_qoq,RIAD0093_qoq,RIAD3123_qoq,RIAD3196_qoq,RIAD3210_qoq,RIAD3217_qoq,RIAD3521_qoq,RIAD3584_qoq,RIAD3585_qoq,RIAD3588_qoq,RIAD3589_qoq,RIAD4010_qoq,RIAD4012_qoq,RIAD4020_qoq,RIAD4024_qoq,RIAD4060_qoq,RIAD4065_qoq,RIAD4070_qoq,RIAD4073_qoq,RIAD4074_qoq,RIAD4079_qoq,RIAD4080_qoq,RIAD4092_qoq,RIAD4093_qoq,RIAD4107_qoq,RIAD4115_qoq,RIAD4135_qoq,RIAD4150_qoq,RIAD4180_qoq,RIAD4185_qoq,RIAD4200_qoq,RIAD4217_qoq,RIAD4230_qoq,RIAD4300_qoq,RIAD4301_qoq,RIAD4302_qoq,RIAD4313_qoq,RIAD4340_qoq,RIAD4356_qoq,RIAD4415_qoq,RIAD4460_qoq,RIAD4470_qoq,RIAD4507_qoq,RIAD4508_qoq,RIAD4513_qoq,RIAD4518_qoq,RIAD4605_qoq,RIAD4618_qoq,RIAD4628_qoq,RIAD4635_qoq,RIAD4644_qoq,RIAD4646_qoq,RIAD4652_qoq,RIAD4655_qoq,RIAD4662_qoq,RIAD4665_qoq,RIAD5409_qoq,RIAD5410_qoq,RIAD5411_qoq,RIAD5412_qoq,RIAD5415_qoq,RIAD5416_qoq,RIAD8431_qoq,RIAD8757_qoq,RIAD8758_qoq,RIAD8759_qoq,RIAD8760_qoq,RIADA220_qoq,RIADA251_qoq,RIADB485_qoq,RIADB486_qoq,RIADB488_qoq,RIADB489_qoq,RIADB491_qoq,RIADB492_qoq,RIADB493_qoq,RIADB496_qoq,RIADB497_qoq,RIADB507_qoq,RIADB508_qoq,RIADB509_qoq,RIADB511_qoq,RIADB514_qoq,RIADB515_qoq,RIADB522_qoq
0,451965,03/31/2002,-1.625823,0.560538,-18.271955,-47.692308,17.665109,1.80624,11.295681,85.901639,6.570743,2.845451,-79.824561,89.144883,2.290076,21.491026,0.0,-18.148906,11.810797,20.0,14.141414,8.044144,0.0,7.636923,0.0,0.0,100.0,0.0,13.086239,0.0,-27.272727,38.230929,8.411215,4.044944,24.497992,-66.666667,-66.666667,500.0,425.0,0.0,-20.0,75.482929,-44.79385,981.037277,-24.324324,-66.666667,0.0,0.0,0.0,-18.181818,-100.0,-13.513514,100.0,1.406074,17.62619,-31.420861,12.775842,-13.14554,0.049973,30.017435,-84.390609,-79.711538,653.644989,433.048744,-2.586207,0.0,966.666667,23.866348,7.716205,13.169895,5.741388,0.0,900.0,42.547638,17.417065,-55.747126,-4.651163,-89.130435,14.547811,17.819188,0.0,0.0,0.0,0.0,0.0,3.066202,-11.25,-98.944823,-40.909091,-4.347826,-8.62069,-12.676056,-50.982962,35.528942,-45.424054,8.0,-12.323944,1.451279,-1.869688,-58.139535,25.082508,10.937112,-53.04878,-36.923077,-48.809524,10.179641,7.142857,-53.401899,-41.147132,4.411765,0.0,-56.408228,0.0,-100.0,-100.0,0.0,0.0,-62.5,0.0,0.0,41.666667,0.0,100.0,25.454545,-63.636364,-100.0,0.0,-80.0,0.0,-100.0,0.0,0.0,0.0,0.0,-400.0,-3923.657143,-70.625,-36.363636,28.571429,0.0,0.0,-6.557377,0.0,100.0,10.97561,-20.0,-23.148148,100.0,50.931677,100.0,-108.160622,11.075949,0.0,17.819188,0.0,-55.384615,0.0,0.0,-4.114134
1,451965,03/31/2003,-29.071267,4.252269,-17.157712,-14.705882,54.150451,-9.83871,17.313433,-36.155203,6.120612,6.431017,-39.130435,318.575779,79.850746,63.15867,0.0,-20.5,36.178956,-16.666667,-60.619469,-52.535868,0.0,-49.562632,0.0,0.0,446.308571,0.0,61.117116,100.0,655.542857,52.076444,7.801724,4.131244,19.193548,225.0,100.0,-8.333333,-90.47619,100.0,-16.666667,120.8,142.468354,66.844078,110.714286,177.065637,57.142857,0.0,100.0,22.222222,0.0,-68.75,92.332993,15.585136,46.102012,-9.136014,23.059639,-2.822823,0.049949,210.981152,-82.88,572.037915,220.94532,25.378685,258.505408,0.0,-87.5,28.131021,-55.684486,60.309373,-4.296506,0.0,-96.666667,81.505219,35.248744,-12.987013,-7.038328,100.0,10.634117,9.502039,0.0,0.0,0.0,0.0,0.0,32.319135,-10.56338,100.0,-7.692308,45.454545,-27.358491,12.903226,3.475936,30.044183,19.841967,11.111111,-1.204819,15.190736,24.307159,-61.111111,29.419525,14.315687,5.194805,-24.390244,-20.930233,5.978261,44.444444,40.747029,39.830508,38.309859,-50.0,50.453721,0.0,0.0,0.0,0.0,0.0,-66.666667,0.0,0.0,-17.647059,0.0,0.0,18.115942,0.0,0.0,0.0,200.0,0.0,100.0,0.0,0.0,300.0,0.0,100.0,-33.510638,-57.446809,4.761905,8.333333,0.0,0.0,7.017544,0.0,-100.0,67.032967,-25.0,-28.915663,-100.0,-276.131687,0.0,155.555556,124.786325,0.0,9.502039,0.0,-51.724138,0.0,0.0,-7.404844
2,451965,03/31/2004,116.935667,55.698136,109.609777,509.195402,195.916402,127.191413,63.867684,144.475138,48.727735,41.618849,250.0,147.39493,36.929461,69.684521,0.0,-25.447508,127.65412,80.0,184.269663,118.845628,0.0,129.007028,0.0,0.0,-17.877913,0.0,127.530937,250.0,655.542857,75.836604,41.303479,41.928508,269.688769,53.846154,50.0,163.636364,0.0,-100.0,20.0,39.549275,76.011485,95.439637,120.338983,140.0,313.636364,0.0,0.0,122.727273,0.0,620.0,103.956834,169.043049,111.876609,155.884014,157.878119,156.860321,0.049924,-35.773663,245.794392,231.946403,385.532775,131.211909,23.889194,0.0,96904.0,66.691729,144.377176,123.623503,74.296991,100.0,462800.0,125.49435,175.762046,155.223881,97.076462,170.0,75.222001,32.673211,0.0,0.0,0.0,100.0,0.0,60.909555,19.947507,-55.555556,45.833333,81.25,-45.454545,135.849624,34.108527,74.631937,60.43956,120.0,174.593496,82.140745,67.347887,142.857143,38.430173,72.602056,-29.62963,41.935484,29.411765,70.25641,43.076923,55.609168,54.166667,51.731161,200.0,55.609168,100.0,0.0,100.0,0.0,300.0,100.0,0.0,100.0,196.428571,0.0,200.0,73.619632,0.0,0.0,0.0,66.666667,0.0,100.0,0.0,0.0,250.0,100.0,0.0,102.788845,195.0,-4.545455,25.641026,100.0,100.0,19.672131,0.0,100.0,104.605263,200.0,418.644068,0.0,138.084112,2500.0,-65.714286,-34.790875,0.0,32.673211,0.0,1457.142857,100.0,100.0,16.143498


In [3]:
df.dtypes.value_counts()
df[["IDRSSD", "quarter"]].head(10)

Unnamed: 0,IDRSSD,quarter
0,451965,03/31/2002
1,451965,03/31/2003
2,451965,03/31/2004
3,451965,03/31/2005
4,451965,03/31/2006
5,451965,03/31/2007
6,451965,03/31/2008
7,451965,03/31/2009
8,451965,03/31/2010
9,451965,03/31/2011


In [4]:
non_feature_cols = ["IDRSSD", "quarter"]
X = df.drop(columns=non_feature_cols)

X.shape

(557, 157)

In [5]:
# sanity check: all numeric? / Empty index?
X.select_dtypes(exclude=["number"]).columns

Index([], dtype='object')

In [6]:
#Train / Test split (Unsupervised Holdout)
X_train, X_test = train_test_split(X, test_size=0.25, random_state=42)
X_train.shape, X_test.shape

((417, 157), (140, 157))

In [7]:
#Preprocess
preprocess = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

X_train_p = preprocess.fit_transform(X_train)
X_test_p = preprocess.transform(X_test)

X_train_p.shape, X_test_p.shape

((417, 157), (140, 157))

In [8]:
#Fit Isolation Forest
iso = IsolationForest(
    n_estimators=300,
    contamination=0.02,
    random_state=42,
    n_jobs=-1
)

iso.fit(X_train_p)

In [9]:
#Create Anomaly Scores (continous) + predictions
test_raw = iso.score_samples(X_test_p)
test_anom_score = -test_raw  # higher = more anomalous

test_pred = (iso.predict(X_test_p) == -1).astype(int)  # 1 = anomaly, 0 = normal

test_anom_score[:5], test_pred[:20], test_pred.mean()

(array([0.35192108, 0.3406255 , 0.34835613, 0.34925402, 0.36648349]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 np.float64(0.04285714285714286))

In [10]:
# Rank the most anomalous rows ( with IDRSSD & Quarter attached )
test_index = X_test.index

results = df.loc[test_index, ["IDRSSD", "quarter"]].copy()
results["anom_score"] = test_anom_score
results["is_anomaly"] = test_pred

results.sort_values("anom_score", ascending=False).head(25)

Unnamed: 0,IDRSSD,quarter,anom_score,is_anomaly
33,451965,06/30/2010,0.548259,1
153,476810,09/30/2007,0.536801,1
172,476810,12/31/2001,0.515593,1
2,451965,03/31/2004,0.509643,1
177,476810,12/31/2006,0.509612,1
83,451965,12/31/2010,0.50878,1
322,852218,06/30/2005,0.502417,0
245,480228,09/30/2001,0.48721,0
220,480228,06/30/2001,0.484799,0
277,480228,12/31/2008,0.478926,0


In [11]:
top_idx = results.sort_values("anom_score", ascending=False).index[0]
df.loc[top_idx, ["IDRSSD", "quarter"]].to_frame().T

Unnamed: 0,IDRSSD,quarter
33,451965,06/30/2010


In [12]:
# show the feature values for that same record
# Look on feature values
df.loc[top_idx, X.columns].sort_values(ascending=False).head(20)

RCON0090_qoq    24210.38254
RIAD4356_qoq    7576.866585
RCON5511_qoq         4791.0
RCON3500_qoq         1555.2
RCON2202_qoq    1441.205742
RCON2213_qoq    1385.644587
RIAD4646_qoq          800.0
RCON5508_qoq     786.597938
RCONB552_qoq     747.379455
RCON3494_qoq     739.952361
RIAD4460_qoq          700.0
RIAD8760_qoq          680.0
RCON5370_qoq     625.262362
RCON2203_qoq     618.761726
RIAD4185_qoq     568.604651
RCON2377_qoq     510.658307
RCON5512_qoq          500.0
RCON0010_qoq     447.234298
RCON6835_qoq     435.637078
RCON5510_qoq      427.61194
Name: 33, dtype: object

In [13]:
# Cell 13 — Data scientist sanity checks + interpret top anomalies (builds on Cell 9)

# --- (A) Data quality diagnostics (Step 2 / Step 3) ---
print("Shape of X:", X.shape)

# Missingness
missing = X.isna().mean().sort_values(ascending=False)
print("\nTop 15 columns by missing %:")
display((missing.head(15) * 100).to_frame("missing_%"))

# Low-variance / near-constant features
variances = X.var(numeric_only=True).sort_values()
print("\nLowest 15 variances (possible near-constant features):")
display(variances.head(15).to_frame("variance"))

# Extreme ranges (helps decide clipping/winsorizing before modeling)
desc = X.describe().T[["mean", "std", "min", "max"]]
print("\nTop 20 columns by absolute max value (extremes):")
display(desc.assign(abs_max=desc["max"].abs()).sort_values("abs_max", ascending=False).head(20))

# --- (B) Interpret the top anomalies using standardized feature contributions (Step 6) ---
# We'll compute z-scores using the SAME preprocessing scaler stats (median impute + standardize)
# and show which features are most extreme for a chosen anomaly row.

# Pick which anomaly row to inspect:
# Uses your existing top_idx from Cell 9; if it doesn't exist, default to the top-ranked anomaly in `results`.
try:
    _ = top_idx
except NameError:
    top_idx = results.sort_values("anom_score", ascending=False).index[0]

row_raw = df.loc[top_idx, ["IDRSSD", "quarter"]].to_dict()
print("\nInspecting anomaly row:", row_raw)

# Recreate the processed version of that one row using the fitted preprocess pipeline
x_row = X.loc[[top_idx]]
x_row_p = preprocess.transform(x_row)

# Get feature names (still matches X.columns because we only did impute+scale on numeric columns)
feat_names = X.columns.to_list()

# Convert the standardized row to a Series and rank by absolute z-score
z = pd.Series(np.asarray(x_row_p).ravel(), index=feat_names)
z_abs = z.abs().sort_values(ascending=False)

print("\nTop 20 most extreme standardized features (|z|) for this anomaly row:")
display(pd.DataFrame({
    "z_score": z.loc[z_abs.index[:20]],
    "abs_z": z_abs.head(20),
    "raw_value": x_row.iloc[0][z_abs.index[:20]].values
}).sort_values("abs_z", ascending=False))

# Optional: show the raw values for those features compared to overall medians
overall_median = X.median(numeric_only=True)
overall_iqr = X.quantile(0.75) - X.quantile(0.25)

print("\nSame top 10 features vs overall median and IQR (rough scale context):")
top10 = z_abs.index[:10]
display(pd.DataFrame({
    "raw_value": x_row.iloc[0][top10],
    "overall_median": overall_median[top10],
    "overall_IQR": overall_iqr[top10],
    "z_score": z[top10]
}))


Shape of X: (557, 157)

Top 15 columns by missing %:


Unnamed: 0,missing_%
RCON0010_qoq,0.0
RCON0020_qoq,0.0
RCON0080_qoq,0.0
RCON0090_qoq,0.0
RCON0352_qoq,0.0
RCON1420_qoq,0.0
RCON1460_qoq,0.0
RCON1545_qoq,0.0
RCON1590_qoq,0.0
RCON1763_qoq,0.0



Lowest 15 variances (possible near-constant features):


Unnamed: 0,variance
RCON9999_qoq,0.04355
RIAD4070_qoq,523.789078
RCON0080_qoq,642.263006
RIAD4150_qoq,674.682259
RIAD4080_qoq,679.94296
RIAD4470_qoq,704.902294
RIAD3210_qoq,720.610466
RIADB508_qoq,796.608364
RIAD3217_qoq,798.661786
RCON0020_qoq,809.240723



Top 20 columns by absolute max value (extremes):


Unnamed: 0,mean,std,min,max,abs_max
RCONB561_qoq,6350.269676,52784.007169,-100.0,463988.0,463988.0
RCONB538_qoq,1390.154239,11245.843596,-100.0,96904.0,96904.0
RIAD4356_qoq,735.932609,4786.612824,-161.228969,39491.56539,39491.56539
RIAD4415_qoq,592.418512,3168.860514,-876.83261,25830.0,25830.0
RCON0090_qoq,499.647458,2931.195052,-99.900597,24210.38254,24210.38254
RIADA251_qoq,309.083312,1730.914856,-309.777778,14410.20232,14410.20232
RCONB552_qoq,275.169107,1520.807857,-100.0,13156.76674,13156.76674
RIADB492_qoq,119.381899,689.76662,-505.898039,5378.0,5378.0
RCON3501_qoq,139.159878,659.323809,-100.0,5049.500066,5049.500066
RCON1460_qoq,121.182793,663.635421,-100.0,4799.369335,4799.369335



Inspecting anomaly row: {'IDRSSD': 451965, 'quarter': '06/30/2010'}

Top 20 most extreme standardized features (|z|) for this anomaly row:


Unnamed: 0,z_score,abs_z,raw_value
RCON5511_qoq,8.850551,8.850551,4791.0
RCON3500_qoq,8.352037,8.352037,1555.2
RCON3494_qoq,8.33763,8.33763,739.952361
RCON2213_qoq,8.273434,8.273434,1385.644587
RCON0090_qoq,7.920918,7.920918,24210.38254
RCON6835_qoq,6.493689,6.493689,435.637078
RCON2202_qoq,6.403808,6.403808,1441.205742
RIAD4460_qoq,6.23191,6.23191,700.0
RCON5508_qoq,5.39538,5.39538,786.597938
RCON2203_qoq,5.138976,5.138976,618.761726



Same top 10 features vs overall median and IQR (rough scale context):


Unnamed: 0,raw_value,overall_median,overall_IQR,z_score
RCON5511_qoq,4791.0,0.0,0.0,8.850551
RCON3500_qoq,1555.2,0.0,0.0,8.352037
RCON3494_qoq,739.952361,0.0,0.0,8.33763
RCON2213_qoq,1385.644587,0.0,22.385817,8.273434
RCON0090_qoq,24210.38254,2.508203,99.738008,7.920918
RCON6835_qoq,435.637078,0.078003,17.203742,6.493689
RCON2202_qoq,1441.205742,0.0,52.083333,6.403808
RIAD4460_qoq,700.0,0.0,95.539216,6.23191
RCON5508_qoq,786.597938,0.0,18.89366,5.39538
RCON2203_qoq,618.761726,0.0,26.25464,5.138976


In [14]:
# Cell 14 — Clip extreme feature values (winsorization) before modeling

X_clipped = X.copy()

lower_q = 0.01
upper_q = 0.99

for col in X_clipped.columns:
    low = X_clipped[col].quantile(lower_q)
    high = X_clipped[col].quantile(upper_q)
    X_clipped[col] = X_clipped[col].clip(lower=low, upper=high)

X_clipped.describe().T[["min", "max"]].head()


Unnamed: 0,min,max
RCON0010_qoq,-94.15241,937.77728
RCON0020_qoq,-65.860095,123.903458
RCON0080_qoq,-86.390686,109.277855
RCON0090_qoq,-99.880514,22110.159895
RCON0352_qoq,-100.0,674.146717


In [15]:
#Train/ Test Split again with cleaned data
X_train_c, X_test_c = train_test_split(
    X_clipped, test_size=0.25, random_state=42
)


In [16]:
#Second Preprocess
preprocess = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

X_train_p = preprocess.fit_transform(X_train_c)
X_test_p = preprocess.transform(X_test_c)


In [17]:
#Retrain
iso = IsolationForest(
    n_estimators=300,
    contamination=0.02,
    random_state=42,
    n_jobs=-1
)

iso.fit(X_train_p)


In [18]:
#Score and Rank anomalies again
test_raw = iso.score_samples(X_test_p)
test_anom_score = -test_raw
test_pred = (iso.predict(X_test_p) == -1).astype(int)

results = df.loc[X_test_c.index, ["IDRSSD", "quarter"]].copy()
results["anom_score"] = test_anom_score
results["is_anomaly"] = test_pred

results.sort_values("anom_score", ascending=False).head(20)


Unnamed: 0,IDRSSD,quarter,anom_score,is_anomaly
33,451965,06/30/2010,0.551307,1
153,476810,09/30/2007,0.537787,1
177,476810,12/31/2006,0.513787,1
172,476810,12/31/2001,0.511718,1
2,451965,03/31/2004,0.511272,1
322,852218,06/30/2005,0.508746,1
83,451965,12/31/2010,0.504195,0
245,480228,09/30/2001,0.489485,0
220,480228,06/30/2001,0.485073,0
277,480228,12/31/2008,0.477649,0
