In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset (pandas infers the bz2 compression from the file extension)
df = pd.read_csv('../data/fraud.csv.bz2')

# Parse the time column as datetime.
# errors='coerce' turns values that do not match the format into NaT instead of
# raising, so report how many rows are affected rather than losing them silently.
df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
print(f"Rows with missing or unparseable time: {df['time'].isna().sum()}")
print("=" * 60)

# Display basic information
print("Dataset Shape:")
print(df.shape)
print("=" * 60)

print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would append a stray "None" line.
df.info()
print("=" * 60)


In [None]:
# setup train / test split
from sklearn.model_selection import train_test_split

# Drop every row that has a missing value in any column (this includes rows
# whose timestamp is NaT). Copy so the loaded frame `df` stays untouched.
df_cleaned = df.dropna().copy()

# drop() returns a new frame, so X can be extended below without affecting
# df_cleaned; y is copied explicitly for the same reason.
X = df_cleaned.drop('fraud', axis=1)
y = df_cleaned['fraud'].copy()

# encode categorical variables
from sklearn.preprocessing import OneHotEncoder

# add hour of day as a feature
X['hour'] = X['time'].dt.hour

# One encoder per categorical column.
# NOTE: the encoders are fitted on the full cleaned dataset, i.e. BEFORE the
# train/test split at the end of this cell, so categories that occur only in
# test rows still get their own column.
# handle_unknown='ignore' encodes categories unseen at fit time as all zeros.
enc_product = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
enc_gender = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
enc_state = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
enc_hour = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform - the double brackets select a one-column DataFrame
# because OneHotEncoder expects 2D input
product_encoded = enc_product.fit_transform(X[['product_category']])
gender_encoded = enc_gender.fit_transform(X[['gender']])
state_encoded = enc_state.fit_transform(X[['address_state']])
hour_encoded = enc_hour.fit_transform(X[['hour']])

# Get feature names for the encoded columns
product_cols = enc_product.get_feature_names_out(['product_category'])
gender_cols = enc_gender.get_feature_names_out(['gender'])
state_cols = enc_state.get_feature_names_out(['address_state'])
hour_cols = enc_hour.get_feature_names_out(['hour'])

# Create DataFrames from encoded arrays; reuse X's index so concat aligns row by row
product_df = pd.DataFrame(product_encoded, columns=product_cols, index=X.index)
gender_df = pd.DataFrame(gender_encoded, columns=gender_cols, index=X.index)
state_df = pd.DataFrame(state_encoded, columns=state_cols, index=X.index)
hour_df = pd.DataFrame(hour_encoded, columns=hour_cols, index=X.index)

# Drop original columns and concatenate encoded ones.
# NOTE: the numeric 'hour' column is not dropped, so the feature matrix holds
# the hour both as a raw number and as one-hot columns.
X = X.drop(['product_category', 'gender', 'address_state', 'time'], axis=1)
X = pd.concat([X, product_df, gender_df, state_df, hour_df], axis=1)
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
X.info()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=566571358)

In [None]:
import pickle

# Persist the fitted encoders together with the column names they produce,
# bundled into a single pickled dict.
encoder_bundle = dict(
    enc_product=enc_product,
    enc_hour=enc_hour,
    enc_gender=enc_gender,
    enc_state=enc_state,
    product_cols=product_cols,
    hour_cols=hour_cols,
    gender_cols=gender_cols,
    state_cols=state_cols,
)

with open('../model/encoders.pkl', 'wb') as out_file:
    pickle.dump(encoder_bundle, out_file)

print("Encoders saved successfully!")

In [None]:
# Setup validation set
# Carve the validation rows out of the TRAINING partition only: splitting the
# full X / y would put the held-out test rows (X_test / y_test) into the data
# the models are trained and validated on.
# Reading from X_train / y_train (which this cell never overwrites) also keeps
# the cell idempotent when it is re-run. random_state makes the split
# reproducible, and stratify keeps the fraud ratio equal in both parts.
X, X_valid, y, y_valid = train_test_split(
    X_train, y_train, test_size=0.1, random_state=566571358, stratify=y_train
)

print(X.shape)
print(X_valid.shape)
print(y.value_counts())
print(y_valid.value_counts())

In [None]:
# balance X/y on training set: keep all True, sample smaller number of False
# (y is used as a boolean mask here, so it must be a boolean Series)
pos_idx = y[y].index
neg_pool = y[~y]

# Aim for 5 negatives per positive, but never ask for more negatives than
# exist: Series.sample raises when n exceeds the population size.
n_neg = min(len(pos_idx) * 5, len(neg_pool))
# Fixed seed so the same negatives are drawn on every run.
neg_idx = neg_pool.sample(n=n_neg, random_state=566571358).index

balanced_idx = pos_idx.union(neg_idx)
X = X.loc[balanced_idx]
y = y.loc[balanced_idx]

print(y.value_counts())

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
import mlflow
import datetime
import logging
import numpy as np
from scipy.stats import norm
import pickle
import tempfile

# Set MLflow tracking server
mlflow.set_tracking_uri('http://host.docker.internal:5050')

# Enable autologging
mlflow.sklearn.autolog()

# remove noisy warnings
logging.getLogger("mlflow.utils.autologging_utils").setLevel(logging.ERROR)


run_name = f"fraud logistic-regression {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
with mlflow.start_run(run_name=run_name):
  model = LogisticRegression(max_iter=1000, class_weight="balanced")
  model.fit(X, y)

  # validation performance
  y_valid_pred = model.predict(X_valid)
  recall = recall_score(y_valid, y_valid_pred)
  precision = precision_score(y_valid, y_valid_pred)
  f1 = f1_score(y_valid, y_valid_pred)
  # ROC AUC measures ranking quality, so it is computed from the predicted
  # probability of the positive class (second column) rather than hard labels.
  y_valid_score = model.predict_proba(X_valid)[:, 1]
  auc = roc_auc_score(y_valid, y_valid_score)
  t = pd.crosstab(y_valid, y_valid_pred, rownames=['Actual'], colnames=['Predicted'])

  # Count the confusion-matrix cells from boolean masks. Unlike indexing into
  # the crosstab, this cannot raise KeyError when a class is never predicted.
  actual = np.asarray(y_valid, dtype=bool)
  predicted = np.asarray(y_valid_pred, dtype=bool)
  tp = np.sum(actual & predicted)
  fn = np.sum(actual & ~predicted)
  fp = np.sum(~actual & predicted)
  tn = np.sum(~actual & ~predicted)

  # d' = z(hit rate) - z(false-alarm rate), where
  #   hit rate         = TP / (TP + FN)   (share of actual frauds that were flagged)
  #   false-alarm rate = FP / (FP + TN)   (share of actual non-frauds that were flagged)
  # Both rates are clipped to [.001, .999] so norm.ppf stays finite.
  hit_rate = min(.999,max(.001,tp / (tp + fn)))
  fa_rate = min(.999,max(.001,fp / (fp + tn)))
  d_prime = norm.ppf(hit_rate) - norm.ppf(fa_rate)
  mlflow.log_metric('d_prime', d_prime)
  # logged explicitly so the score-based AUC is recorded with the run
  mlflow.log_metric('roc_auc', auc)

  print(t)
  print(f"f1={f1:.3f}, auc={auc:.3f}, recall={recall:.3f}, d_prime={d_prime:.3f}")

  # Register the model of this run ("model" is the artifact path used in the run URI)
  run_id = mlflow.active_run().info.run_id
  model_uri = f"runs:/{run_id}/model"
  mlflow.register_model(model_uri, "fraud-logistic-regression")

In [None]:
# try to get the model from the registry and predict with it
# Use the "latest" suffix instead of a hard-coded version number: a fixed
# version may not exist on a fresh registry, or may refer to a model from an
# older run whose predictions legitimately differ.
name = "fraud-logistic-regression/latest"
model_uri = f"models:/{name}"
loaded_model = mlflow.sklearn.load_model(model_uri)
y_valid_pred_loaded = loaded_model.predict(X_valid)
# NOTE: y_valid_pred is whatever the most recently executed model cell left
# behind, so this comparison is only meaningful when this cell runs right
# after the logistic-regression cell.
print("Predictions from loaded model match original:", np.array_equal(y_valid_pred, y_valid_pred_loaded))



In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
import mlflow
import datetime
import logging
import numpy as np
import os
from scipy.stats import norm
import pickle
import tempfile

# Set MLflow tracking server
mlflow.set_tracking_uri('http://host.docker.internal:5050')

# Enable autologging
mlflow.sklearn.autolog()

# remove noisy warnings
logging.getLogger("mlflow.utils.autologging_utils").setLevel(logging.ERROR)

run_name = f"fraud random-forest {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
with mlflow.start_run(run_name=run_name):
  # One nested run (and one registered model) per forest size, 1..19 trees
  for n_esti in range(1, 20):
    with mlflow.start_run(nested=True, run_name=f"{n_esti} estimators"):
      # random_state fixed so differences between runs come from n_estimators,
      # not from the random tree construction
      model = RandomForestClassifier(n_estimators=n_esti, class_weight='balanced', random_state=566571358)
      model.fit(X, y)

      # validation performance
      y_valid_pred = model.predict(X_valid)
      recall = recall_score(y_valid, y_valid_pred)
      precision = precision_score(y_valid, y_valid_pred)
      f1 = f1_score(y_valid, y_valid_pred)
      # ROC AUC measures ranking quality, so it is computed from the predicted
      # probability of the positive class (second column) rather than hard labels.
      y_valid_score = model.predict_proba(X_valid)[:, 1]
      auc = roc_auc_score(y_valid, y_valid_score)
      t = pd.crosstab(y_valid, y_valid_pred, rownames=['Actual'], colnames=['Predicted'])

      # Count the confusion-matrix cells from boolean masks. Unlike indexing
      # into the crosstab, this cannot raise KeyError when a class is never predicted.
      actual = np.asarray(y_valid, dtype=bool)
      predicted = np.asarray(y_valid_pred, dtype=bool)
      tp = np.sum(actual & predicted)
      fn = np.sum(actual & ~predicted)
      fp = np.sum(~actual & predicted)
      tn = np.sum(~actual & ~predicted)

      # d' = z(hit rate) - z(false-alarm rate), where
      #   hit rate         = TP / (TP + FN)
      #   false-alarm rate = FP / (FP + TN)
      # Both rates are clipped to [.001, .999] so norm.ppf stays finite.
      hit_rate = min(.999,max(.001,tp / (tp + fn)))
      fa_rate = min(.999,max(.001,fp / (fp + tn)))
      d_prime = norm.ppf(hit_rate) - norm.ppf(fa_rate)
      mlflow.log_metric('d_prime', d_prime)
      # logged explicitly so the score-based AUC is recorded with the run
      mlflow.log_metric('roc_auc', auc)

      print(t)
      print(f"f1={f1:.3f}, auc={auc:.3f}, recall={recall:.3f}, d_prime={d_prime:.3f}")

      run_id = mlflow.active_run().info.run_id
      model_uri = f"runs:/{run_id}/model"
      mlflow.register_model(model_uri, f"fraud-random-forest-{n_esti}-estimators")

  # Save and log encoders as artifact in the parent run.
  # The hour encoder is included as well: the models are trained on the
  # one-hot hour columns, so the bundle is incomplete without it.
  encoders_data = {
      'enc_product': enc_product,
      'enc_hour': enc_hour,
      'enc_gender': enc_gender,
      'enc_state': enc_state,
      'product_cols': product_cols,
      'hour_cols': hour_cols,
      'gender_cols': gender_cols,
      'state_cols': state_cols,
      'model_version': '0.1',
      'train_date': datetime.datetime.now().isoformat()
  }
  # Write to a temporary directory that is removed after the upload, under a
  # stable file name, instead of leaving a randomly named temp file behind.
  with tempfile.TemporaryDirectory() as tmp_dir:
      encoders_path = os.path.join(tmp_dir, 'encoders.pkl')
      with open(encoders_path, 'wb') as f:
          pickle.dump(encoders_data, f)
      mlflow.log_artifact(encoders_path, artifact_path='encoders')

In [None]:
# SVM
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
import mlflow
import datetime
import logging
import numpy as np
import os
from scipy.stats import norm
import pickle
import tempfile

# Set MLflow tracking server
mlflow.set_tracking_uri('http://host.docker.internal:5050')

# Enable autologging
mlflow.sklearn.autolog()

# remove noisy warnings
logging.getLogger("mlflow.utils.autologging_utils").setLevel(logging.ERROR)


run_name = f"fraud support-vector-machine {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
with mlflow.start_run(run_name=run_name):
  # 3 x 3 grid: C in {0.1, 1, 10}, gamma in {0.001, 0.01, 0.1}; one nested run per pair
  C_range = np.logspace(-1, 1, 3, base=10)
  gamma_range = np.logspace(-3, -1, 3, base=10)
  for C in C_range:
    for gamma in gamma_range:
      with mlflow.start_run(nested=True, run_name=f"C={C}, gamma={gamma}"):
        model = SVC(kernel='rbf', C=C, gamma=gamma, class_weight="balanced", max_iter=int(1e8), verbose=False)
        model.fit(X, y)

        # validation performance
        y_valid_pred = model.predict(X_valid)
        recall = recall_score(y_valid, y_valid_pred)
        precision = precision_score(y_valid, y_valid_pred)
        f1 = f1_score(y_valid, y_valid_pred)
        # ROC AUC measures ranking quality, so it is computed from the
        # continuous decision values rather than hard labels. The SVC is built
        # without probability=True, hence decision_function, not predict_proba.
        y_valid_score = model.decision_function(X_valid)
        auc = roc_auc_score(y_valid, y_valid_score)
        t = pd.crosstab(y_valid, y_valid_pred, rownames=['Actual'], colnames=['Predicted'])

        # Count the confusion-matrix cells from boolean masks. Unlike indexing
        # into the crosstab, this cannot raise KeyError when a class is never predicted.
        actual = np.asarray(y_valid, dtype=bool)
        predicted = np.asarray(y_valid_pred, dtype=bool)
        tp = np.sum(actual & predicted)
        fn = np.sum(actual & ~predicted)
        fp = np.sum(~actual & predicted)
        tn = np.sum(~actual & ~predicted)

        # d' = z(hit rate) - z(false-alarm rate), where
        #   hit rate         = TP / (TP + FN)
        #   false-alarm rate = FP / (FP + TN)
        # Both rates are clipped to [.001, .999] so norm.ppf stays finite.
        hit_rate = min(.999,max(.001,tp / (tp + fn)))
        fa_rate = min(.999,max(.001,fp / (fp + tn)))
        d_prime = norm.ppf(hit_rate) - norm.ppf(fa_rate)
        mlflow.log_metric('d_prime', d_prime)
        # logged explicitly so the score-based AUC is recorded with the run
        mlflow.log_metric('roc_auc', auc)

        print(t)
        print(f"f1={f1:.3f}, auc={auc:.3f}, recall={recall:.3f}, d_prime={d_prime:.3f}")

  # Save and log encoders as artifact in the parent run.
  # The hour encoder is included as well: the models are trained on the
  # one-hot hour columns, so the bundle is incomplete without it.
  encoders_data = {
      'enc_product': enc_product,
      'enc_hour': enc_hour,
      'enc_gender': enc_gender,
      'enc_state': enc_state,
      'product_cols': product_cols,
      'hour_cols': hour_cols,
      'gender_cols': gender_cols,
      'state_cols': state_cols,
      'model_version': '0.1',
      'train_date': datetime.datetime.now().isoformat()
  }
  # Write to a temporary directory that is removed after the upload, under a
  # stable file name, instead of leaving a randomly named temp file behind.
  with tempfile.TemporaryDirectory() as tmp_dir:
      encoders_path = os.path.join(tmp_dir, 'encoders.pkl')
      with open(encoders_path, 'wb') as f:
          pickle.dump(encoders_data, f)
      mlflow.log_artifact(encoders_path, artifact_path='encoders')

In [None]:
# Neural Net
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
import mlflow
import datetime
import logging
import numpy as np
import os
from scipy.stats import norm
import pickle
import tempfile

# Set MLflow tracking server
mlflow.set_tracking_uri('http://host.docker.internal:5050')

# Enable autologging
mlflow.sklearn.autolog()

# remove noisy warnings
logging.getLogger("mlflow.utils.autologging_utils").setLevel(logging.ERROR)


run_name = f"fraud neural-net {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
with mlflow.start_run(run_name=run_name):
  # 4 x 4 grid of hidden-layer widths: 8, 16, 32, 64 units for each of the two layers
  layer1_range = np.logspace(3, 6, 4, base=2)
  layer2_range = np.logspace(3, 6, 4, base=2)
  for n1 in layer1_range:
    for n2 in layer2_range:
      with mlflow.start_run(nested=True, run_name=f"n1={int(n1)}, n2={int(n2)}"):
        # logspace yields floats, hence the int() casts; random_state fixed so
        # differences between runs come from the layer sizes, not the random
        # weight initialisation
        model = MLPClassifier(hidden_layer_sizes=[int(n1), int(n2)], learning_rate = "adaptive", verbose=False, random_state=566571358)
        model.fit(X, y)

        # validation performance
        y_valid_pred = model.predict(X_valid)
        recall = recall_score(y_valid, y_valid_pred)
        precision = precision_score(y_valid, y_valid_pred)
        f1 = f1_score(y_valid, y_valid_pred)
        # ROC AUC measures ranking quality, so it is computed from the predicted
        # probability of the positive class (second column) rather than hard labels.
        y_valid_score = model.predict_proba(X_valid)[:, 1]
        auc = roc_auc_score(y_valid, y_valid_score)
        t = pd.crosstab(y_valid, y_valid_pred, rownames=['Actual'], colnames=['Predicted'])

        # Count the confusion-matrix cells from boolean masks. Unlike indexing
        # into the crosstab, this cannot raise KeyError when a class is never predicted.
        actual = np.asarray(y_valid, dtype=bool)
        predicted = np.asarray(y_valid_pred, dtype=bool)
        tp = np.sum(actual & predicted)
        fn = np.sum(actual & ~predicted)
        fp = np.sum(~actual & predicted)
        tn = np.sum(~actual & ~predicted)

        # d' = z(hit rate) - z(false-alarm rate), where
        #   hit rate         = TP / (TP + FN)
        #   false-alarm rate = FP / (FP + TN)
        # Both rates are clipped to [.001, .999] so norm.ppf stays finite.
        hit_rate = min(.999,max(.001,tp / (tp + fn)))
        fa_rate = min(.999,max(.001,fp / (fp + tn)))
        d_prime = norm.ppf(hit_rate) - norm.ppf(fa_rate)
        mlflow.log_metric('d_prime', d_prime)
        # logged explicitly so the score-based AUC is recorded with the run
        mlflow.log_metric('roc_auc', auc)

        print(t)
        print(f"f1={f1:.3f}, auc={auc:.3f}, recall={recall:.3f}, d_prime={d_prime:.3f}")

  # Save and log encoders as artifact in the parent run.
  # The hour encoder is included as well: the models are trained on the
  # one-hot hour columns, so the bundle is incomplete without it.
  encoders_data = {
      'enc_product': enc_product,
      'enc_hour': enc_hour,
      'enc_gender': enc_gender,
      'enc_state': enc_state,
      'product_cols': product_cols,
      'hour_cols': hour_cols,
      'gender_cols': gender_cols,
      'state_cols': state_cols,
      'model_version': '0.1',
      'train_date': datetime.datetime.now().isoformat()
  }
  # Write to a temporary directory that is removed after the upload, under a
  # stable file name, instead of leaving a randomly named temp file behind.
  with tempfile.TemporaryDirectory() as tmp_dir:
      encoders_path = os.path.join(tmp_dir, 'encoders.pkl')
      with open(encoders_path, 'wb') as f:
          pickle.dump(encoders_data, f)
      mlflow.log_artifact(encoders_path, artifact_path='encoders')