In [1]:
%pip install soda-core-spark-df

Collecting soda-core-spark-df
  Using cached soda_core_spark_df-3.0.39-py3-none-any.whl (3.1 kB)
Collecting soda-core-spark==3.0.39 (from soda-core-spark-df)
  Using cached soda_core_spark-3.0.39-py3-none-any.whl (5.3 kB)
Collecting soda-core==3.0.39 (from soda-core-spark==3.0.39->soda-core-spark-df)
  Using cached soda_core-3.0.39-py3-none-any.whl (177 kB)
Collecting markupsafe<=2.1.1,>=2.0.1 (from soda-core==3.0.39->soda-core-spark==3.0.39->soda-core-spark-df)
  Using cached MarkupSafe-2.1.1-cp311-cp311-linux_x86_64.whl
Collecting antlr4-python3-runtime~=4.11.1 (from soda-core==3.0.39->soda-core-spark==3.0.39->soda-core-spark-df)
  Using cached antlr4_python3_runtime-4.11.1-py3-none-any.whl (144 kB)
Collecting opentelemetry-api~=1.16.0 (from soda-core==3.0.39->soda-core-spark==3.0.39->soda-core-spark-df)
  Using cached opentelemetry_api-1.16.0-py3-none-any.whl (57 kB)
Collecting opentelemetry-exporter-otlp-proto-http~=1.16.0 (from soda-core==3.0.39->soda-core-spark==3.0.39->soda-core

In [2]:
from pyspark.sql import SparkSession
from soda.scan import Scan
from pyspark.sql.types import LongType, TimestampType, StructType, StringType, DoubleType, IntegerType, StructField
from pyspark import SparkConf

## Carregando DataFrames

In [3]:
spark = SparkSession.builder.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").getOrCreate()

In [6]:
spark.conf.set("spark.sql.parquet.enableVectorizedReader","false")

yellow_df = spark.read.option('inferSchema',True).parquet('data/raw/yellow/*/*')
yellow_df.createOrReplaceTempView("yellow_df")

green_df = spark.read.option('inferSchema',True).parquet('data/raw/green/*/*')
green_df.createOrReplaceTempView("green_df")

## Análise dos DataSets

In [7]:
yellow_df.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [9]:
green_df.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- lpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- lpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: integer (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- trip_type: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



## Execução de validação de dados com SODA

In [10]:
checks  ="""
checks for yellow_df:
  - row_count > 0
  - max(trip_distance) <= 30
  - min(trip_distance) > 0
  - min(passenger_count) > 0
  - max(passenger_count) < 5
  - no_datetime_less_than_2018 = 0:
      no_datetime_less_than_2018 query: SELECT COUNT(*) FROM yellow_df WHERE tpep_pickup_datetime < '01-01-2018' or tpep_dropoff_datetime < '01-01-2018'
  - pickup_lower_than_dropoff = 0:
      pickup_lower_than_dropoff query: SELECT COUNT(*) FROM yellow_df WHERE tpep_pickup_datetime > tpep_dropoff_datetime
  - duplicate_lines_lower_than = 0:
      duplicate_lines_lower_than query: SELECT COUNT(foo.*) FROM (SELECT VendorID ,tpep_pickup_datetime ,tpep_dropoff_datetime ,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag ,PULocationID ,DOLocationID ,payment_type ,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee, COUNT(*) FROM yellow_df GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19 HAVING COUNT(*) > 1) as foo
  - schema:
      name: Confirm that required columns are present
      fail:
        when required column missing: [VendorID ,tpep_pickup_datetime ,tpep_dropoff_datetime ,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag ,PULocationID ,DOLocationID ,payment_type ,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee]
        when wrong column type:
          VendorID: bigint
          tpep_pickup_datetime: timestamp_ntz
          tpep_dropoff_datetime: timestamp_ntz
          passenger_count: double
          trip_distance: double
          RatecodeID: double
          store_and_fwd_flag: string
          PULocationID: bigint
          DOLocationID: bigint
          payment_type: bigint
          fare_amount: double
          extra: double
          mta_tax: double
          tip_amount: double
          tolls_amount: double
          improvement_surcharge: double
          total_amount: double
          congestion_surcharge: double
          airport_fee: double


checks for green_df:
  - row_count > 0
  - min(passenger_count) > 0
  - max(passenger_count) < 5
  - no_datetime_less_than_2018 = 0:
      no_datetime_less_than_2018 query: SELECT COUNT(*) FROM green_df WHERE lpep_pickup_datetime < '01-01-2018' or lpep_dropoff_datetime < '01-01-2018'
  - pickup_lower_than_dropoff = 0:
      pickup_lower_than_dropoff query: SELECT COUNT(*) FROM green_df WHERE lpep_pickup_datetime > lpep_dropoff_datetime 
  - duplicate_lines_lower_than = 0:
      duplicate_lines_lower_than query: SELECT COUNT(foo.*) FROM (SELECT VendorID, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, RatecodeID, PULocationID, DOLocationID, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, improvement_surcharge, total_amount, payment_type, trip_type, congestion_surcharge, COUNT(*) FROM green_df GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20 HAVING COUNT(*) > 1) as foo
  - schema:
      name: Confirm that required columns are present
      fail:
        when required column missing: [VendorID, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, RatecodeID, PULocationID, DOLocationID, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, improvement_surcharge, total_amount, payment_type, trip_type, congestion_surcharge]
        when wrong column type:
            VendorID: bigint
            lpep_pickup_datetime: timestamp_ntz
            lpep_dropoff_datetime: timestamp_ntz
            store_and_fwd_flag: string
            RatecodeID: double
            PULocationID: bigint
            DOLocationID: bigint
            passenger_count: double
            trip_distance: double
            fare_amount: double
            extra: double
            mta_tax: double
            tip_amount: double
            tolls_amount: double
            ehail_fee: int
            improvement_surcharge: double
            total_amount: double
            payment_type: double
            trip_type: double
            congestion_surcharge: double
"""

In [11]:
scan = Scan()

scan.set_scan_definition_name("Datasets validation")
scan.set_data_source_name("spark_df")
scan.add_spark_session(spark)

scan.add_sodacl_yaml_str(checks)

scan.execute()
print(scan.get_logs_text())

INFO   | Soda Core 3.0.39
INFO   | Scan summary:
INFO   | 5/16 checks PASSED: 
INFO   |     yellow_df in spark_df
INFO   |       row_count > 0 [PASSED]
INFO   |       no_datetime_less_than_2018 = 0 [PASSED]
INFO   |     green_df in spark_df
INFO   |       row_count > 0 [PASSED]
INFO   |       no_datetime_less_than_2018 = 0 [PASSED]
INFO   |       duplicate_lines_lower_than = 0 [PASSED]
INFO   | 11/16 checks FAILED: 
INFO   |     yellow_df in spark_df
INFO   |       pickup_lower_than_dropoff = 0 [FAILED]
INFO   |         check_value: 73525.0
INFO   |       duplicate_lines_lower_than = 0 [FAILED]
INFO   |         check_value: 1.0
INFO   |       Confirm that required columns are present [FAILED]
INFO   |         fail_column_type_mismatch[passenger_count] expected(double) actual(bigint)
INFO   |         fail_column_type_mismatch[RatecodeID] expected(double) actual(bigint)
INFO   |         schema_measured = [VendorID bigint, tpep_pickup_datetime timestamp_ntz, tpep_dropoff_datetime timestam