## Provider Fraud Detection - Exploratory Data Analysis

In [2]:
# Cell 1: Imports + SparkSession (creates one only if not present)
import os
import builtins as py
import math
import pandas as pd

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col, lit

# plotting (small samples only)
import matplotlib.pyplot as plt
import seaborn as sns

# Reuse the kernel's SparkSession when one is already bound to `spark`;
# otherwise build a small local session tuned for laptop-scale EDA.
if 'spark' not in globals():
    _local_settings = {
        "spark.driver.memory": "6g",          # lower for laptops
        "spark.sql.shuffle.partitions": "4",  # small for local dev
        "spark.network.timeout": "600s",
    }
    _builder = SparkSession.builder.appName("EDA_ProviderFraud").master("local[*]")
    for _setting, _value in _local_settings.items():
        _builder = _builder.config(_setting, _value)
    spark = _builder.getOrCreate()

# Report where the Spark UI is served, or "n/a" when no URL is available.
_ui_url = spark.sparkContext.uiWebUrl
print("Spark UI:", _ui_url if _ui_url else "n/a")


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/15 20:23:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/15 20:23:05 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Spark UI: http://mac.lan:4041


In [4]:
# Cell 2 (robust): Load merged parquet and quick preview (handles different Spark versions)
MERGED_PATH = "curated/training/providers_merged_asof_2023-12-31_ever.parquet"

# Fail fast with an actionable message before handing the path to Spark.
if not os.path.exists(MERGED_PATH):
    _missing_msg = f"Expected merged parquet at {MERGED_PATH} — ensure you ran the wrangling step or adjust the path."
    raise FileNotFoundError(_missing_msg)

df = spark.read.parquet(MERGED_PATH)

# Column names and types as stored in the parquet file.
print("Schema (preview):")
df.printSchema()

# Show only the identifier and the label for the first 20 rows.
print("\nPreview rows:")
_preview_cols = ["npi", "is_fraud"]
df.select(*_preview_cols).limit(20).show(truncate=False)

# ---- robust approx total rows extraction ----
def get_count_approx(rdd, timeout_ms=2000, allow_fallback_to_exact=True):
    """Estimate the number of elements in ``rdd`` as an ``int``.

    Calls ``rdd.countApprox(timeout_ms)`` and normalises whatever comes back:
    a dict result contributes its first value, anything else is passed to
    ``int()`` directly.

    Args:
        rdd: object exposing ``countApprox(timeout)`` and ``count()``.
        timeout_ms: timeout forwarded unchanged to ``countApprox``.
        allow_fallback_to_exact: when True, an exact ``rdd.count()`` is used
            if ``countApprox`` raises or its result cannot be converted.

    Returns:
        The estimate as ``int``; the exact count when the fallback is taken;
        ``None`` when no estimate is available and the fallback is disabled.
    """
    try:
        approx = rdd.countApprox(timeout_ms)
    except Exception as err:
        print("countApprox raised:", str(err))
        if not allow_fallback_to_exact:
            return None
        print("Falling back to exact count() — this may take a while.")
        return rdd.count()

    # Conversion candidates in priority order: the first value of a dict-like
    # result ({deadline: estimate} or {"1": estimate}), then the raw result.
    candidates = []
    if isinstance(approx, dict):
        candidates.extend(list(approx.values())[:1])
    candidates.append(approx)

    for candidate in candidates:
        try:
            return int(candidate)
        except Exception:
            # Not int-convertible; try the next candidate, if any.
            continue

    # Nothing converted: last resort is the optional exact count.
    if not allow_fallback_to_exact:
        return None
    print("countApprox returned unexpected type; doing exact count() as fallback.")
    return rdd.count()

# Use the robust function. The exact-count fallback is disabled here so the
# cell stays cheap; a None result triggers the sampled estimate below.
approx_total = get_count_approx(df.rdd, timeout_ms=2000, allow_fallback_to_exact=False)
if approx_total is None:
    # fallback to a tiny sampled estimate if countApprox failed and we don't want an exact count
    print("countApprox unavailable; using tiny sample to estimate total rows (cheap but noisy).")
    SAMPLE_F = 0.001
    SAMPLE_CAP = 10_000  # ceiling on sampled rows that get counted, to keep the job small
    sample_count = df.sample(SAMPLE_F, seed=42).limit(SAMPLE_CAP).count()
    # round() rather than int(): the float quotient may land just below a whole number.
    approx_total = int(round(sample_count / SAMPLE_F)) if SAMPLE_F > 0 and sample_count > 0 else sample_count
    if sample_count >= SAMPLE_CAP:
        # limit() truncated the sample, so scaling it up can only give a floor
        # (SAMPLE_CAP / SAMPLE_F rows) and not an estimate of the real total.
        print(f"Sample hit the {SAMPLE_CAP}-row cap; the value below is a lower bound, not an estimate.")

print("\nApprox total rows:", approx_total)


                                                                                

Schema (preview):
root
 |-- npi: string (nullable = true)
 |-- total_services: double (nullable = true)
 |-- total_beneficiaries: double (nullable = true)
 |-- total_bene_day_services: double (nullable = true)
 |-- w_avg_submitted_charge: double (nullable = true)
 |-- w_avg_allowed: double (nullable = true)
 |-- w_avg_payment: double (nullable = true)
 |-- charge_allowed_ratio: double (nullable = true)
 |-- payment_allowed_ratio: double (nullable = true)
 |-- num_unique_procedures: long (nullable = true)
 |-- stddev_submitted_charge: double (nullable = true)
 |-- frac_drug_services: double (nullable = true)
 |-- frac_missing_zip: double (nullable = true)
 |-- services_per_bene: double (nullable = true)
 |-- bene_days_per_bene: double (nullable = true)
 |-- primary_taxonomy: string (nullable = true)
 |-- is_fraud: integer (nullable = true)
 |-- label_is_inferred: integer (nullable = true)
 |-- state_abbr: string (nullable = true)


Preview rows:
+----------+--------+
|npi       |is_frau




Approx total rows: 950347




In [5]:
# Cell 3: Null counts and approximate distinct counts for key columns
cols_to_check = [
    "is_fraud", "total_services", "total_beneficiaries", "w_avg_allowed",
    "charge_allowed_ratio", "state_abbr", "primary_taxonomy", "frac_drug_services"
]
# Only inspect columns that actually exist in the loaded frame.
cols_present = [c for c in cols_to_check if c in df.columns]

# Null counts: one conditional sum per column, all evaluated in a single pass.
# NOTE: isNull() does not match NaN, so NaN values in double columns are not counted here.
null_exprs = [F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c + "_nulls") for c in cols_present]
if null_exprs:
    df.select(*null_exprs).show(truncate=False)

# Approx distinct counts for categorical-ish columns.
# All columns are aggregated in ONE job instead of one scan per column;
# the printed lines are the same as before.
cat_cols = [c for c in ["state_abbr", "primary_taxonomy", "is_fraud"] if c in df.columns]
if cat_cols:  # select() with no expressions would not yield an aggregate row
    approx_dist_row = df.select(*[F.approx_count_distinct(c).alias(c) for c in cat_cols]).collect()[0]
    for c in cat_cols:
        print(f"Approx distinct {c}: {approx_dist_row[c]}")


25/08/15 20:27:43 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+--------------+--------------------+-------------------------+-------------------+--------------------------+----------------+----------------------+------------------------+
|is_fraud_nulls|total_services_nulls|total_beneficiaries_nulls|w_avg_allowed_nulls|charge_allowed_ratio_nulls|state_abbr_nulls|primary_taxonomy_nulls|frac_drug_services_nulls|
+--------------+--------------------+-------------------------+-------------------+--------------------------+----------------+----------------------+------------------------+
|0             |0                   |0                        |0                  |0                         |2880            |3126                  |0                       |
+--------------+--------------------+-------------------------+-------------------+--------------------------+----------------+----------------------+------------------------+

Approx distinct state_abbr: 62
Approx distinct primary_taxonomy: 622
Approx distinct is_fraud: 2


In [None]:
# Cell 4: Numeric summary and approximate quantiles (cheap)
num_cols = [c for c in ["total_services","total_beneficiaries","w_avg_allowed","charge_allowed_ratio","frac_drug_services","services_per_bene"] if c in df.columns]

if num_cols:
    # mean & std for every numeric column, computed in one aggregate pass
    agg_exprs = [F.mean(c).alias(c + "_mean") for c in num_cols] + [F.stddev(c).alias(c + "_std") for c in num_cols]
    df.select(*agg_exprs).show(truncate=False)

    # approximate quantiles (third argument is the relative error: 1%)
    qs = [0.01,0.05,0.25,0.5,0.75,0.95,0.99]
    quantiles = df.stat.approxQuantile(num_cols, qs, 0.01)
    for c, qvals in zip(num_cols, quantiles):
        print(f"\n{c} quantiles:")
        if not qvals:
            # approxQuantile yields an empty list for a column with no non-null values;
            # say so instead of printing a bare header.
            print("  (no non-null values)")
            continue
        for q, v in zip(qs, qvals):
            # round(), not int(): q*100 can land just below the integer
            # (e.g. 0.29*100 == 28.999...), which int() would label as 28%.
            print(f"  {round(q * 100)}% -> {v}")
else:
    print("No numeric columns found for summary.")


In [None]:
# Cell 5: Top outliers (limit) — quick checks
if "charge_allowed_ratio" in df.columns:
    print("Top charge_allowed_ratio (top 20):")
    # NOTE: Spark orders NaN above every other double, so NaN ratios (if any) would lead this list.
    (df.select("npi", "charge_allowed_ratio", "total_services", "total_beneficiaries")
       .orderBy(F.col("charge_allowed_ratio").desc())
       .limit(20)
       .show(truncate=False))

# Providers with very high services but tiny beneficiaries (possible anomalies)
if all(c in df.columns for c in ["total_services","total_beneficiaries"]):
    print("\nProviders with total_services > 1000 and total_beneficiaries < 5 (top 20 by total_services):")
    # Order before limiting: limit() on an unordered frame returns an arbitrary,
    # run-to-run unstable set of rows, which is neither "top" nor reproducible.
    (df.filter((col("total_beneficiaries") < 5) & (col("total_services") > 1000))
       .select("npi", "total_services", "total_beneficiaries")
       .orderBy(F.col("total_services").desc())
       .limit(20)
       .show(truncate=False))


In [None]:
# Cell 6: Aggregations by state and by primary_taxonomy
if "state_abbr" not in df.columns:
    print("No state_abbr column found.")
else:
    # Per-state provider count, service volume, mean charge ratio and fraud-label sum.
    _state_aggs = [
        F.count("*").alias("providers"),
        F.sum("total_services").alias("sum_services"),
        F.avg("charge_allowed_ratio").alias("mean_charge_ratio"),
        F.sum("is_fraud").alias("n_fraud"),
    ]
    by_state = df.groupBy("state_abbr").agg(*_state_aggs).orderBy(F.desc("providers"))
    display_rows = 50
    print("By-state summary (top):")
    by_state.show(display_rows, truncate=False)

if "primary_taxonomy" in df.columns:
    # Same profile per taxonomy code, largest groups first.
    _taxonomy_aggs = [
        F.count("*").alias("providers"),
        F.avg("charge_allowed_ratio").alias("mean_charge_ratio"),
        F.sum("is_fraud").alias("n_fraud"),
    ]
    by_tax = df.groupBy("primary_taxonomy").agg(*_taxonomy_aggs).orderBy(F.desc("providers"))
    print("\nBy-primary_taxonomy summary (top 50):")
    by_tax.show(50, truncate=False)


In [None]:
# Cell 7: Compare fraud vs non-fraud for several metrics (medians/means)
_candidate_metrics = ["charge_allowed_ratio","w_avg_allowed","total_services","total_beneficiaries","frac_drug_services","services_per_bene"]
metrics = [name for name in _candidate_metrics if name in df.columns]

if not metrics:
    print("No metrics available for fraud profiling.")
else:
    # For each metric emit its approximate median followed by its mean,
    # so the two statistics sit side by side in the output.
    agg_exprs = []
    for m in metrics:
        agg_exprs += [
            F.expr(f"percentile_approx({m}, 0.5)").alias(m + "_median"),
            F.mean(m).alias(m + "_mean"),
        ]
    # One output row per value of the is_fraud label.
    profile = df.groupBy("is_fraud").agg(*agg_exprs)
    print("Fraud vs non-fraud profile (median/mean):")
    profile.show(truncate=False)
