In [1]:

import os, glob, json, math, warnings
import numpy as np
import pandas as pd

import tensorflow_data_validation as tfdv

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, StringTensorType, Int64TensorType
import onnxruntime as ort

# Suppress every library warning so the cell outputs below stay readable.
warnings.filterwarnings("ignore")

print("Lib versions:")
import sys
import tensorflow as tf
import sklearn
import onnx
import tfx

# Report the interpreter and key package versions, one "<label>: <version>" line each.
for label, version in (
    ("Python", sys.version),
    ("TF", tf.__version__),
    ("TFDV", tfdv.__version__),
    ("tfx", tfx.__version__),
    ("sklearn", sklearn.__version__),
    ("pandas", pd.__version__),
    ("onnx", onnx.__version__),
    ("onnxruntime", ort.__version__),
):
    print(f"{label}:", version)

2025-09-19 16:51:21.536358: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-19 16:51:21.536861: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-09-19 16:51:21.539169: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-09-19 16:51:21.566589: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Lib versions:
Python: 3.10.18 (main, Sep  8 2025, 22:51:44) [GCC 12.2.0]
TF: 2.16.1
TFDV: 1.16.1
tfx: 1.16.0
sklearn: 1.5.1
pandas: 1.5.3
onnx: 1.17.0
onnxruntime: 1.22.1


In [2]:
# Root directory that is searched (recursively) for *.parquet input files.
DATA_DIR = "/data"
PARQUET_GLOB = os.path.join(DATA_DIR, "**", "*.parquet")
parquet_files = sorted(glob.glob(PARQUET_GLOB, recursive=True))

if parquet_files:
    # Only the first match (in sorted path order) is loaded; other files are ignored.
    data_path = parquet_files[0]
    print(f"Found Parquet: {data_path}")
    # For very large files, you can load a sample. Here we try to read everything; adjust if needed.
    df = pd.read_parquet(data_path)
else:
    print("No Parquet found in /data; generating a synthetic demo dataset.")
    rng = np.random.default_rng(7)  # fixed seed keeps the demo data reproducible
    n = 5000
    df = pd.DataFrame({
        "amount": rng.normal(100, 30, size=n).round(2),
        "tx_count_30d": rng.integers(0, 50, size=n),
        "category": rng.choice(["grocery", "fuel", "online", "travel"], size=n, p=[0.5, 0.2, 0.2, 0.1]),
        # The label must be called "isFraud": the later cells select the target
        # with target_col = "isFraud", so any other name breaks the fallback path.
        "isFraud": rng.choice([0,1], size=n, p=[0.92, 0.08])
    })

df.head(5)

Found Parquet: /data/test_parquet_data.parquet


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
# Name of the label column; every other column of df is used as a model feature.
target_col = "isFraud"
# Compare column names for equality. `c not in target_col` would be a substring
# test against the string "isFraud": it would silently drop any column whose name
# is a substring of it and raise TypeError for non-string column labels.
feature_cols = [c for c in df.columns if c != target_col]

In [5]:
# Display the list of feature column names for a quick visual check.
feature_cols

['step',
 'type',
 'amount',
 'nameOrig',
 'oldbalanceOrg',
 'newbalanceOrig',
 'nameDest',
 'oldbalanceDest',
 'newbalanceDest',
 'isFlaggedFraud']

In [6]:
# Compute TFDV dataset statistics over the whole DataFrame.
stats = tfdv.generate_statistics_from_dataframe(df)

In [7]:
# Infer a schema from the statistics and report how many features it describes.
schema = tfdv.infer_schema(stats)
print(f"Inferred schema with {len(schema.feature)} features.")

Inferred schema with 11 features.


In [14]:
# Render the inferred schema (feature types, presence, and string domains).
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'step',INT,required,,-
'type',STRING,required,,'type'
'amount',FLOAT,required,,-
'nameOrig',STRING,required,,'nameOrig'
'oldbalanceOrg',FLOAT,required,,-
'newbalanceOrig',FLOAT,required,,-
'nameDest',STRING,required,,'nameDest'
'oldbalanceDest',FLOAT,required,,-
'newbalanceDest',FLOAT,required,,-
'isFraud',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'type',"'CASH_OUT', 'DEBIT', 'PAYMENT', 'TRANSFER'"
'nameOrig',"'C1026483832', 'C1231006815', 'C1237762639', 'C1265012928', 'C1305486145', 'C154988899', 'C1648232591', 'C1666544295', 'C1670993182', 'C1716932897', 'C1900366749', 'C1912850431', 'C2033524545', 'C2048537720', 'C249177573', 'C712410124', 'C761750706', 'C840083671', 'C90045638', 'C905080434'"
'nameDest',"'C1100439041', 'C195600860', 'C38997010', 'C476402209', 'C553264065', 'C997608398', 'M1176932104', 'M1230701703', 'M1635378213', 'M1731217984', 'M1877062907', 'M1979787155', 'M2044282225', 'M2096539129', 'M408069119', 'M473053293', 'M573487274', 'M633326333', 'M801569151', 'M972865270'"


In [9]:
# Validate the dataset statistics against the schema.
# NOTE: the schema was inferred from these very statistics, so this check is
# expected to report no anomalies; it becomes meaningful once statistics of a
# new/serving dataset are validated against the persisted schema.
anomalies = tfdv.validate_statistics(statistics=stats, schema=schema)
# Print the affected feature names as a plain sorted list instead of the opaque
# "KeysView({...})" repr of the protobuf map.
print("Anomalies found:", sorted(anomalies.anomaly_info.keys()))

Anomalies found: KeysView({})


In [13]:
# Display the anomalies found by the validation step.
tfdv.display_anomalies(anomalies)


In [10]:
# Best-effort visualization: any failure is reported and the notebook carries on.
try:
    tfdv.visualize_statistics(stats)
    # Only show the anomaly table when there is at least one anomaly.
    if len(anomalies.anomaly_info):
        tfdv.display_anomalies(anomalies)
except Exception as exc:
    print("Visualization skipped:", exc)

In [11]:
# Persist schema for future runs (protobuf text format, reloadable with
# tfdv.load_schema_text).
schema_path = "/metadata/schema.pbtxt"
# Derive the directory from the path so the two cannot drift apart.
os.makedirs(os.path.dirname(schema_path), exist_ok=True)
# Use TFDV's own writer rather than hand-writing str(schema) to a file.
tfdv.write_schema_text(schema, schema_path)
print("Schema saved to", schema_path)

Schema saved to /metadata/schema.pbtxt


In [12]:
X = df[feature_cols].copy()
y = df[target_col].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y if y.nunique()==2 else None
)

# Preprocess: impute missing + scale numerics; impute + onehot categoricals
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ],
    remainder="drop"
)

clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)

model = Pipeline(steps=[("preprocess", preprocess), ("rf", clf)])

%time model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_proba = None
try:
    y_proba = model.predict_proba(X_test)[:,1]
except Exception:
    pass

print("Accuracy:", accuracy_score(y_test, y_pred))
if y_proba is not None and y.nunique()==2:
    try:
        print("ROC AUC:", roc_auc_score(y_test, y_proba))
    except Exception as e:
        print("ROC AUC unavailable:", e)

print("\nClassification report:\n", classification_report(y_test, y_pred))

NameError: name 'numeric_cols' is not defined

In [None]:
# Build ONNX input schema (initial_types) based on feature dtypes:
# one [None, 1] tensor per raw input column, keyed by the column name.
initial_types = []
for c in numeric_cols:
    initial_types.append((c, FloatTensorType([None, 1])))
for c in categorical_cols:
    # Use String for non-numeric; integer-typed categorical columns are declared as Int64
    if pd.api.types.is_integer_dtype(df[c]):
        initial_types.append((c, Int64TensorType([None, 1])))
    else:
        initial_types.append((c, StringTensorType([None, 1])))

onnx_model = convert_sklearn(model, initial_types=initial_types, target_opset=16)

os.makedirs("/artifacts/onnx", exist_ok=True)
onnx_path = "/artifacts/onnx/random_forest_pipeline.onnx"
with open(onnx_path, "wb") as f:
    f.write(onnx_model.SerializeToString())

print("ONNX model saved to:", onnx_path)

# Quick sanity-check with onnxruntime
sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])

# Prepare a small batch from X_test in the same per-column dict format
def to_onnx_inputs(df_batch):
    """Convert a feature DataFrame into the per-column feed dict for the session.

    Each column becomes an (n_rows, 1) array whose dtype mirrors the tensor
    type declared in ``initial_types``: float32 for numeric columns, int64 for
    integer-typed categorical columns, and strings for everything else.

    Args:
        df_batch: DataFrame containing every column listed in ``numeric_cols``
            and ``categorical_cols``.

    Returns:
        dict mapping column name to a 2-D numpy array.
    """
    feed = {}
    for c in numeric_cols:
        feed[c] = df_batch[[c]].astype(np.float32).values
    for c in categorical_cols:
        if pd.api.types.is_integer_dtype(df_batch[c]):
            feed[c] = df_batch[[c]].astype(np.int64).values
        else:
            # String tensors are fed as an object array of Python str values.
            feed[c] = df_batch[[c]].astype(str).values
    return feed

sample = X_test.head(5).copy()
inputs = to_onnx_inputs(sample)
output_names = [o.name for o in sess.get_outputs()]
preds = sess.run(output_names, inputs)

print("ONNX outputs:", {name: np.array(arr).shape for name, arr in zip(output_names, preds)})

# Actually verify the export: compare the ONNX labels with the sklearn pipeline
# on the same rows. The first output of a converted classifier is the predicted
# label. Rare disagreements are possible because ONNX runs the numeric features
# in float32, so this is reported rather than asserted.
onnx_labels = np.asarray(preds[0]).ravel()
sk_labels = np.asarray(model.predict(sample)).ravel()
n_match = int((onnx_labels == sk_labels).sum())
print(f"ONNX vs sklearn label agreement: {n_match}/{len(sk_labels)}")