
    # Logical Data Quality Detection with Spark MLlib
    This notebook demonstrates clustering, anomaly detection, and classification using synthetic data to identify aggregator behavior, test data pollution, and classify real vs synthetic users in Microsoft Fabric.
    

In [3]:

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col

    spark = SparkSession.builder.getOrCreate()

    # Synthetic data: user_id, weekly_purchases, purchase_diversity, geo_dispersion, is_test_user
    data = [
        ("user_001", 1000, 50, 10, 0),
        ("user_002", 5, 1, 1, 0),
        ("user_003", 500, 30, 5, 0),
        ("user_004", 10, 2, 1, 1),
        ("user_005", 800, 40, 8, 0),
        ("user_006", 3, 1, 1, 1),
        ("user_007", 1200, 60, 12, 0),
        ("user_008", 6, 2, 1, 1)
    ]

    columns = ["user_id", "weekly_purchases", "purchase_diversity", "geo_dispersion", "is_test_user"]
    df = spark.createDataFrame(data, columns)
    display(df)
    

StatementMeta(, bc817dc7-52ba-42dc-b86b-634b65b32cf7, 5, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a7dcef9e-063d-4978-972b-59ea68bc74b2)

In [4]:

    from pyspark.ml.feature import VectorAssembler

    assembler = VectorAssembler(
        inputCols=["weekly_purchases", "purchase_diversity", "geo_dispersion"],
        outputCol="features"
    )
    feature_df = assembler.transform(df)
    

StatementMeta(, bc817dc7-52ba-42dc-b86b-634b65b32cf7, 6, Finished, Available, Finished)

In [5]:

    # Clustering with KMeans
    from pyspark.ml.clustering import KMeans

    kmeans = KMeans(k=2, seed=1, featuresCol="features", predictionCol="cluster")
    model = kmeans.fit(feature_df)
    clustered_df = model.transform(feature_df)
    display(clustered_df.select("user_id", "cluster"))
    

StatementMeta(, bc817dc7-52ba-42dc-b86b-634b65b32cf7, 7, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, b034a96c-8b56-4269-8d41-e0be012cf1eb)

In [6]:

    # Anomaly Detection using distance from cluster center
    from pyspark.sql.functions import udf
    from pyspark.ml.linalg import Vectors, DenseVector
    from pyspark.sql.types import DoubleType

    centers = model.clusterCenters()

    def compute_distance(features, cluster):
        center = DenseVector(centers[cluster])
        return float(Vectors.squared_distance(features, center))

    distance_udf = udf(compute_distance, DoubleType())
    clustered_df = clustered_df.withColumn("anomaly_score", distance_udf(col("features"), col("cluster")))
    display(clustered_df.select("user_id", "cluster", "anomaly_score"))
    

StatementMeta(, bc817dc7-52ba-42dc-b86b-634b65b32cf7, 8, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 16801b54-1d25-47e8-b36c-0d7e0c3d1ad7)

In [39]:

    # Classification: Predicting test users
    from pyspark.ml.classification import LogisticRegression

    classifier_df = clustered_df.select("features", "is_test_user")

    lr = LogisticRegression(featuresCol="features", labelCol="is_test_user", predictionCol="prediction")
    lr_model = lr.fit(classifier_df)
    predictions = lr_model.transform(classifier_df)
    display(predictions.select("features", "is_test_user", "prediction"))
    display(predictions)
    

StatementMeta(, bc817dc7-52ba-42dc-b86b-634b65b32cf7, 48, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, c8e4adb0-960d-4f88-a3bc-baad6d43805f)

SynapseWidget(Synapse.DataFrame, 9615a580-4324-40f5-ac71-29793eb689ee)

In [36]:
%%sql

-- DROP TABLE lk_cdsa_silver.silver_db.logical_quality_results;
 
CREATE TABLE IF NOT EXISTS silver_db.logical_quality_results (
    features ARRAY<DOUBLE>,
    is_test_user BIGINT,
    rawPrediction ARRAY<DOUBLE>,
    probability ARRAY<DOUBLE>,
    prediction DOUBLE
)
USING DELTA;


SHOW TABLES IN lk_cdsa_silver.silver_db;

StatementMeta(, bc817dc7-52ba-42dc-b86b-634b65b32cf7, 45, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 1 rows and 3 fields>

In [38]:
#spark.sql("DROP TABLE IF EXISTS lk_cdsa_silver.silver_db.logical_quality_results_v2")
display(predictions);
predictions.printSchema();

predictions = predictions.withColumn("features", df["features"].toArray())
predictions = predictions.withColumn("rawPrediction", df["rawPrediction"].toArray())
predictions = predictions.withColumn("probability", df["probability"].toArray())

predictions.write.format("delta").mode("overwrite").saveAsTable("lk_cdsa_silver.silver_db.logical_quality_results");

#display(spark.sql(f"""
#            SELECT * 
#            FROM lk_cdsa_silver.silver_db.logical_quality_results
#        """))


StatementMeta(, bc817dc7-52ba-42dc-b86b-634b65b32cf7, 47, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a0848dc9-ae43-455c-8059-8bf3c07caae9)

root
 |-- features: vector (nullable = true)
 |-- is_test_user: long (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `features` cannot be resolved. Did you mean one of the following? [`user_id`, `weekly_purchases`, `purchase_diversity`, `geo_dispersion`, `is_test_user`].