### Multi-Tenant Streaming from nested json files then performs mapping, validation and writes to CosmosDB/Postgres

#### Installation: This cluster needs to be installed with following libraries:
1. org.apache.kafka:kafka-clients:3.3.1 (maven)
2. azure-cosmos (pypi)
3. psycopg2 (pypi)

### Connect to Event Hubs

In [3]:
from pyspark.sql.functions import from_json, col,explode, split,get_json_object
from pyspark.sql.types import *
con_str = dbutils.secrets.get("scope1", "ehns001-con")
EH_SASL = f"org.apache.kafka.common.security.plain.PlainLoginModule required username='$ConnectionString' password='{con_str}';"
GROUP_ID = "$Default"

data_schema = StructType([
    StructField("tenant", StringType(), True),
    StructField("data", ArrayType(StringType()), True),
]

)
orders = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "ehns001.servicebus.windows.net:9093") \
  .option("subscribe", "sales_orders") \
  .option("kafka.sasl.mechanism","PLAIN") \
  .option("kafka.security.protocol","SASL_SSL") \
  .option("kafka.sasl.jaas.config", EH_SASL ) \
  .option("kafka.request.timeout.ms", "60000") \
  .option("kafka.session.timeout.ms", "60000") \
  .option("kafka.group.id", GROUP_ID) \
  .option("failOnDataLoss", "false") \
  .load() \
  .select(from_json(col("value").cast("string"), data_schema).alias("value"), "partition") \
  .select("value.tenant",explode("value.data").alias("data"), "partition") \
  .select("tenant","data", "partition") \

orders.createOrReplaceTempView("temp_sales_orders")



In [4]:
%sql select * from temp_sales_orders

tenant,data,partition
tenant15,"{""ORDER_NUMBER"":10167,""QTY_ORDERED"":40,""PRICE_EACH"":41.71,""ORDER_LINE"":4,""SALES"":1668.4,""ORDERDATE"":""10/23/2003 0:00"",""STATUS"":""Cancelled"",""QTR_ID"":4,""MONTH_ID"":10,""YEAR_ID"":2003,""PRODUCTLINE"":""Planes"",""MSRP"":49,""PRODUCTCODE"":""S72_1253"",""CUSTOMERNAME"":""Scandinavian Gift Ideas"",""PHONE"":""0695-34 6555"",""ADDRESSLINE1"":""?kergatan 24"",""ADDRESSLINE2"":""NaN"",""CITY"":""Boras"",""STATE"":""NaN"",""POSTALCODE"":""S-844 67"",""COUNTRY"":""Sweden"",""TERRITORY"":""EMEA"",""CONTACTLASTNAME"":""Larsson"",""CONTACTFIRSTNAME"":""Maria"",""DEALSIZE"":""Small""}",0
tenant15,"{""ORDER_NUMBER"":10423,""QTY_ORDERED"":31,""PRICE_EACH"":53.72,""ORDER_LINE"":3,""SALES"":1665.32,""ORDERDATE"":""5/30/2005 0:00"",""STATUS"":""In Process"",""QTR_ID"":2,""MONTH_ID"":5,""YEAR_ID"":2005,""PRODUCTLINE"":""Vintage Cars"",""MSRP"":62,""PRODUCTCODE"":""S18_2957"",""CUSTOMERNAME"":""Petit Auto"",""PHONE"":""(02) 5554 67"",""ADDRESSLINE1"":""Rue Joseph-Bens 532"",""ADDRESSLINE2"":""NaN"",""CITY"":""Bruxelles"",""STATE"":""NaN"",""POSTALCODE"":""B-1180"",""COUNTRY"":""Belgium"",""TERRITORY"":""EMEA"",""CONTACTLASTNAME"":""Dewey"",""CONTACTFIRSTNAME"":""Catherine"",""DEALSIZE"":""Small""}",0
tenant15,"{""ORDER_NUMBER"":10266,""QTY_ORDERED"":22,""PRICE_EACH"":100.0,""ORDER_LINE"":12,""SALES"":2454.54,""ORDERDATE"":""7/6/2004 0:00"",""STATUS"":""Shipped"",""QTR_ID"":3,""MONTH_ID"":7,""YEAR_ID"":2004,""PRODUCTLINE"":""Classic Cars"",""MSRP"":117,""PRODUCTCODE"":""S12_3380"",""CUSTOMERNAME"":""L'ordine Souveniers"",""PHONE"":""0522-556555"",""ADDRESSLINE1"":""Strada Provinciale 124"",""ADDRESSLINE2"":""NaN"",""CITY"":""Reggio Emilia"",""STATE"":""NaN"",""POSTALCODE"":""42100"",""COUNTRY"":""Italy"",""TERRITORY"":""EMEA"",""CONTACTLASTNAME"":""Moroni"",""CONTACTFIRSTNAME"":""Maurizio"",""DEALSIZE"":""Small""}",0
tenant15,"{""ORDER_NUMBER"":10414,""QTY_ORDERED"":34,""PRICE_EACH"":100.0,""ORDER_LINE"":13,""SALES"":3533.62,""ORDERDATE"":""5/6/2005 0:00"",""STATUS"":""On Hold"",""QTR_ID"":2,""MONTH_ID"":5,""YEAR_ID"":2005,""PRODUCTLINE"":""Ships"",""MSRP"":86,""PRODUCTCODE"":""S700_1938"",""CUSTOMERNAME"":""Gifts4AllAges.com"",""PHONE"":""6175559555"",""ADDRESSLINE1"":""8616 Spinnaker Dr."",""ADDRESSLINE2"":""NaN"",""CITY"":""Boston"",""STATE"":""MA"",""POSTALCODE"":""51003"",""COUNTRY"":""USA"",""TERRITORY"":""NaN"",""CONTACTLASTNAME"":""Yoshido"",""CONTACTFIRSTNAME"":""Juri"",""DEALSIZE"":""Medium""}",0
tenant15,"{""ORDER_NUMBER"":10340,""QTY_ORDERED"":55,""PRICE_EACH"":87.75,""ORDER_LINE"":7,""SALES"":4826.25,""ORDERDATE"":""11/24/2004 0:00"",""STATUS"":""Shipped"",""QTR_ID"":4,""MONTH_ID"":11,""YEAR_ID"":2004,""PRODUCTLINE"":""Planes"",""MSRP"":99,""PRODUCTCODE"":""S700_2466"",""CUSTOMERNAME"":""Enaco Distributors"",""PHONE"":""(93) 203 4555"",""ADDRESSLINE1"":""Rambla de Catalu¤a, 23"",""ADDRESSLINE2"":""NaN"",""CITY"":""Barcelona"",""STATE"":""NaN"",""POSTALCODE"":""8022"",""COUNTRY"":""Spain"",""TERRITORY"":""EMEA"",""CONTACTLASTNAME"":""Saavedra"",""CONTACTFIRSTNAME"":""Eduardo"",""DEALSIZE"":""Medium""}",0
tenant15,"{""ORDER_NUMBER"":10285,""QTY_ORDERED"":39,""PRICE_EACH"":78.92,""ORDER_LINE"":2,""SALES"":3077.88,""ORDERDATE"":""8/27/2004 0:00"",""STATUS"":""Shipped"",""QTR_ID"":3,""MONTH_ID"":8,""YEAR_ID"":2004,""PRODUCTLINE"":""Motorcycles"",""MSRP"":81,""PRODUCTCODE"":""S50_4713"",""CUSTOMERNAME"":""Marta's Replicas Co."",""PHONE"":""6175558555"",""ADDRESSLINE1"":""39323 Spinnaker Dr."",""ADDRESSLINE2"":""NaN"",""CITY"":""Cambridge"",""STATE"":""MA"",""POSTALCODE"":""51247"",""COUNTRY"":""USA"",""TERRITORY"":""NaN"",""CONTACTLASTNAME"":""Hernandez"",""CONTACTFIRSTNAME"":""Marta"",""DEALSIZE"":""Medium""}",0
tenant15,"{""ORDER_NUMBER"":10259,""QTY_ORDERED"":34,""PRICE_EACH"":99.41,""ORDER_LINE"":7,""SALES"":3379.94,""ORDERDATE"":""6/15/2004 0:00"",""STATUS"":""Shipped"",""QTR_ID"":2,""MONTH_ID"":6,""YEAR_ID"":2004,""PRODUCTLINE"":""Trucks and Buses"",""MSRP"":122,""PRODUCTCODE"":""S18_2319"",""CUSTOMERNAME"":""Handji Gifts& Co"",""PHONE"":""+65 224 1555"",""ADDRESSLINE1"":""Village Close - 106 Linden Road Sandown"",""ADDRESSLINE2"":""2nd Floor"",""CITY"":""Singapore"",""STATE"":""NaN"",""POSTALCODE"":""69045"",""COUNTRY"":""Singapore"",""TERRITORY"":""APAC"",""CONTACTLASTNAME"":""Victorino"",""CONTACTFIRSTNAME"":""Wendy"",""DEALSIZE"":""Medium""}",0
tenant15,"{""ORDER_NUMBER"":10245,""QTY_ORDERED"":44,""PRICE_EACH"":69.16,""ORDER_LINE"":5,""SALES"":3043.04,""ORDERDATE"":""5/4/2004 0:00"",""STATUS"":""Shipped"",""QTR_ID"":2,""MONTH_ID"":5,""YEAR_ID"":2004,""PRODUCTLINE"":""Trucks and Buses"",""MSRP"":64,""PRODUCTCODE"":""S32_3522"",""CUSTOMERNAME"":""Super Scale Inc."",""PHONE"":""2035559545"",""ADDRESSLINE1"":""567 North Pendale Street"",""ADDRESSLINE2"":""NaN"",""CITY"":""New Haven"",""STATE"":""CT"",""POSTALCODE"":""97823"",""COUNTRY"":""USA"",""TERRITORY"":""NaN"",""CONTACTLASTNAME"":""Murphy"",""CONTACTFIRSTNAME"":""Leslie"",""DEALSIZE"":""Medium""}",0
tenant15,"{""ORDER_NUMBER"":10201,""QTY_ORDERED"":-5,""PRICE_EACH"":100.0,""ORDER_LINE"":5,""SALES"":3025.92,""ORDERDATE"":""12/1/2003 0:00"",""STATUS"":""Shipped"",""QTR_ID"":4,""MONTH_ID"":12,""YEAR_ID"":2003,""PRODUCTLINE"":""Motorcycles"",""MSRP"":118,""PRODUCTCODE"":""S10_2016"",""CUSTOMERNAME"":""Mini Wheels Co."",""PHONE"":""6505555787"",""ADDRESSLINE1"":""5557 North Pendale Street"",""ADDRESSLINE2"":""NaN"",""CITY"":""San Francisco"",""STATE"":""CA"",""POSTALCODE"":""NaN"",""COUNTRY"":""USA"",""TERRITORY"":""NaN"",""CONTACTLASTNAME"":""Murphy"",""CONTACTFIRSTNAME"":""Julie"",""DEALSIZE"":""Medium""}",0
tenant15,"{""ORDER_NUMBER"":10273,""QTY_ORDERED"":40,""PRICE_EACH"":100.0,""ORDER_LINE"":13,""SALES"":5026.4,""ORDERDATE"":""7/21/2004 0:00"",""STATUS"":""Shipped"",""QTR_ID"":3,""MONTH_ID"":7,""YEAR_ID"":2004,""PRODUCTLINE"":""Vintage Cars"",""MSRP"":136,""PRODUCTCODE"":""S18_3140"",""CUSTOMERNAME"":""Petit Auto"",""PHONE"":""(02) 5554 67"",""ADDRESSLINE1"":""Rue Joseph-Bens 532"",""ADDRESSLINE2"":""NaN"",""CITY"":""Bruxelles"",""STATE"":""NaN"",""POSTALCODE"":""B-1180"",""COUNTRY"":""Belgium"",""TERRITORY"":""EMEA"",""CONTACTLASTNAME"":""Dewey"",""CONTACTFIRSTNAME"":""Catherine"",""DEALSIZE"":""Medium""}",0


### Mapping rule and validation rule per tenant. Easy onboarding new tenant

In [6]:
import pandas as pd
rule1 = {
  "mapping" :{"ORDER_NUMBER":"ORDERNUMBER","QTY_ORDERED":"QUANTITYORDERED","PRICE_EACH":"PRICEEACH","ORDER_LINE":"ORDERLINENUMBER","SALES":"SALES","ORDERDATE":"ORDERDATE","STATUS":"STATUS","QTR_ID":"QTR_ID","MONTH_ID":"MONTH_ID","YEAR_ID":"YEAR_ID","PRODUCTLINE":"PRODUCTLINE","MSRP":"MSRP","PRODUCTCODE":"PRODUCTCODE","CUSTOMERNAME":"CUSTOMERNAME","PHONE":"PHONE","ADDRESSLINE1":"ADDRESSLINE1","ADDRESSLINE2":"ADDRESSLINE2","CITY":"CITY","STATE":"STATE","POSTALCODE":"POSTALCODE","COUNTRY":"COUNTRY","TERRITORY":"TERRITORY","CONTACTLASTNAME":"CONTACTLASTNAME","CONTACTFIRSTNAME":"CONTACTFIRSTNAME","DEALSIZE":"DEALSIZE"},
"data_validation"
  :{"CITY":"NOT_NULL", "QUANTITYORDERED":"POSITIVE_NUMBER"}
}
rule2 = {
  "mapping" :{"ORDER_NUMBER":"ORDERNUMBER","QTY_ORDERED":"QUANTITYORDERED","PRICE_EACH":"PRICEEACH","ORDER_LINE":"ORDERLINENUMBER","SALES":"SALES","ORDERDATE":"ORDERDATE","STATUS":"STATUS","QTR_ID":"QTR_ID","MONTH_ID":"MONTH_ID","YEAR_ID":"YEAR_ID","PRODUCTLINE":"PRODUCTLINE","MSRP":"MSRP","PRODUCTCODE":"PRODUCTCODE","CUSTOMERNAME":"CUSTOMERNAME","PHONE":"PHONE","ADDRESSLINE1":"ADDRESSLINE1","ADDRESSLINE2":"ADDRESSLINE2","CITY":"CITY","STATE":"STATE","POSTALCODE":"POSTALCODE","COUNTRY":"COUNTRY","TERRITORY":"TERRITORY","CONTACTLASTNAME":"CONTACTLASTNAME","CONTACTFIRSTNAME":"CONTACTFIRSTNAME","DEALSIZE":"DEALSIZE"},
"data_validation"
  :{"CITY":"NOT_NULL", "QUANTITYORDERED":"POSITIVE_NUMBER"}
}
rule3 = {
  "mapping" :{"ORDER_NUMBER":"ORDERNUMBER","QTY_ORDERED":"QUANTITYORDERED","PRICE_EACH":"PRICEEACH","ORDER_LINE":"ORDERLINENUMBER","SALES":"SALES","ORDERDATE":"ORDERDATE","STATUS":"STATUS","QTR_ID":"QTR_ID","MONTH_ID":"MONTH_ID","YEAR_ID":"YEAR_ID","PRODUCTLINE":"PRODUCTLINE","MSRP":"MSRP","PRODUCTCODE":"PRODUCTCODE","CUSTOMERNAME":"CUSTOMERNAME","PHONE":"PHONE","ADDRESSLINE1":"ADDRESSLINE1","ADDRESSLINE2":"ADDRESSLINE2","CITY":"CITY","STATE":"STATE","POSTALCODE":"POSTALCODE","COUNTRY":"COUNTRY","TERRITORY":"TERRITORY","CONTACTLASTNAME":"CONTACTLASTNAME","CONTACTFIRSTNAME":"CONTACTFIRSTNAME","DEALSIZE":"DEALSIZE"},
"data_validation"
  :{"CITY":"NOT_NULL", "QUANTITYORDERED":"POSITIVE_NUMBER"}
}
tenant_rules = {"tenant":['tenant11','tenant15','tenant2','tenant3','tenant4','tenant5','tenant6','tenant7','tenant8','tenant9','tenant10'], "rule":[rule2, rule1, rule2, rule3, rule2, rule1, rule2, rule3, rule1, rule3, rule1]}
tenant_rules = pd.DataFrame(tenant_rules)
tenant_rule_df = spark.createDataFrame(tenant_rules)
tenant_rule_df.cache()



  Unable to convert the field rule. If this column is not necessary, you may consider dropping it or converting to primitive type before the conversion.
Direct cause: Nested StructType not supported in conversion from Arrow: struct<data_validation: struct<CITY: string, QUANTITYORDERED: string>, mapping: struct<ADDRESSLINE1: string, ADDRESSLINE2: string, CITY: string, CONTACTFIRSTNAME: string, CONTACTLASTNAME: string, COUNTRY: string, CUSTOMERNAME: string, DEALSIZE: string, MONTH_ID: string, MSRP: string, ORDERDATE: string, ORDER_LINE: string, ORDER_NUMBER: string, PHONE: string, POSTALCODE: string, PRICE_EACH: string, PRODUCTCODE: string, PRODUCTLINE: string, QTR_ID: string, QTY_ORDERED: string, SALES: string, STATE: string, STATUS: string, TERRITORY: string, YEAR_ID: string>>
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)
Out[3]: DataFrame[tenant: string, rule: map<string,map<string,string>>]

In [7]:
display(tenant_rule_df)

tenant,rule
tenant11,"Map(data_validation -> Map(CITY -> NOT_NULL, QUANTITYORDERED -> POSITIVE_NUMBER), mapping -> Map(PRODUCTLINE -> PRODUCTLINE, CUSTOMERNAME -> CUSTOMERNAME, MONTH_ID -> MONTH_ID, MSRP -> MSRP, ORDERDATE -> ORDERDATE, CITY -> CITY, PRICE_EACH -> PRICEEACH, CONTACTLASTNAME -> CONTACTLASTNAME, TERRITORY -> TERRITORY, ADDRESSLINE1 -> ADDRESSLINE1, PRODUCTCODE -> PRODUCTCODE, STATUS -> STATUS, QTY_ORDERED -> QUANTITYORDERED, SALES -> SALES, YEAR_ID -> YEAR_ID, PHONE -> PHONE, ORDER_NUMBER -> ORDERNUMBER, POSTALCODE -> POSTALCODE, DEALSIZE -> DEALSIZE, CONTACTFIRSTNAME -> CONTACTFIRSTNAME, COUNTRY -> COUNTRY, ADDRESSLINE2 -> ADDRESSLINE2, STATE -> STATE, QTR_ID -> QTR_ID, ORDER_LINE -> ORDERLINENUMBER))"
tenant15,"Map(data_validation -> Map(CITY -> NOT_NULL, QUANTITYORDERED -> POSITIVE_NUMBER), mapping -> Map(PRODUCTLINE -> PRODUCTLINE, CUSTOMERNAME -> CUSTOMERNAME, MONTH_ID -> MONTH_ID, MSRP -> MSRP, ORDERDATE -> ORDERDATE, CITY -> CITY, PRICE_EACH -> PRICEEACH, CONTACTLASTNAME -> CONTACTLASTNAME, TERRITORY -> TERRITORY, ADDRESSLINE1 -> ADDRESSLINE1, PRODUCTCODE -> PRODUCTCODE, STATUS -> STATUS, QTY_ORDERED -> QUANTITYORDERED, SALES -> SALES, YEAR_ID -> YEAR_ID, PHONE -> PHONE, ORDER_NUMBER -> ORDERNUMBER, POSTALCODE -> POSTALCODE, DEALSIZE -> DEALSIZE, CONTACTFIRSTNAME -> CONTACTFIRSTNAME, COUNTRY -> COUNTRY, ADDRESSLINE2 -> ADDRESSLINE2, STATE -> STATE, QTR_ID -> QTR_ID, ORDER_LINE -> ORDERLINENUMBER))"
tenant2,"Map(data_validation -> Map(CITY -> NOT_NULL, QUANTITYORDERED -> POSITIVE_NUMBER), mapping -> Map(PRODUCTLINE -> PRODUCTLINE, CUSTOMERNAME -> CUSTOMERNAME, MONTH_ID -> MONTH_ID, MSRP -> MSRP, ORDERDATE -> ORDERDATE, CITY -> CITY, PRICE_EACH -> PRICEEACH, CONTACTLASTNAME -> CONTACTLASTNAME, TERRITORY -> TERRITORY, ADDRESSLINE1 -> ADDRESSLINE1, PRODUCTCODE -> PRODUCTCODE, STATUS -> STATUS, QTY_ORDERED -> QUANTITYORDERED, SALES -> SALES, YEAR_ID -> YEAR_ID, PHONE -> PHONE, ORDER_NUMBER -> ORDERNUMBER, POSTALCODE -> POSTALCODE, DEALSIZE -> DEALSIZE, CONTACTFIRSTNAME -> CONTACTFIRSTNAME, COUNTRY -> COUNTRY, ADDRESSLINE2 -> ADDRESSLINE2, STATE -> STATE, QTR_ID -> QTR_ID, ORDER_LINE -> ORDERLINENUMBER))"
tenant3,"Map(data_validation -> Map(CITY -> NOT_NULL, QUANTITYORDERED -> POSITIVE_NUMBER), mapping -> Map(PRODUCTLINE -> PRODUCTLINE, CUSTOMERNAME -> CUSTOMERNAME, MONTH_ID -> MONTH_ID, MSRP -> MSRP, ORDERDATE -> ORDERDATE, CITY -> CITY, PRICE_EACH -> PRICEEACH, CONTACTLASTNAME -> CONTACTLASTNAME, TERRITORY -> TERRITORY, ADDRESSLINE1 -> ADDRESSLINE1, PRODUCTCODE -> PRODUCTCODE, STATUS -> STATUS, QTY_ORDERED -> QUANTITYORDERED, SALES -> SALES, YEAR_ID -> YEAR_ID, PHONE -> PHONE, ORDER_NUMBER -> ORDERNUMBER, POSTALCODE -> POSTALCODE, DEALSIZE -> DEALSIZE, CONTACTFIRSTNAME -> CONTACTFIRSTNAME, COUNTRY -> COUNTRY, ADDRESSLINE2 -> ADDRESSLINE2, STATE -> STATE, QTR_ID -> QTR_ID, ORDER_LINE -> ORDERLINENUMBER))"
tenant4,"Map(data_validation -> Map(CITY -> NOT_NULL, QUANTITYORDERED -> POSITIVE_NUMBER), mapping -> Map(PRODUCTLINE -> PRODUCTLINE, CUSTOMERNAME -> CUSTOMERNAME, MONTH_ID -> MONTH_ID, MSRP -> MSRP, ORDERDATE -> ORDERDATE, CITY -> CITY, PRICE_EACH -> PRICEEACH, CONTACTLASTNAME -> CONTACTLASTNAME, TERRITORY -> TERRITORY, ADDRESSLINE1 -> ADDRESSLINE1, PRODUCTCODE -> PRODUCTCODE, STATUS -> STATUS, QTY_ORDERED -> QUANTITYORDERED, SALES -> SALES, YEAR_ID -> YEAR_ID, PHONE -> PHONE, ORDER_NUMBER -> ORDERNUMBER, POSTALCODE -> POSTALCODE, DEALSIZE -> DEALSIZE, CONTACTFIRSTNAME -> CONTACTFIRSTNAME, COUNTRY -> COUNTRY, ADDRESSLINE2 -> ADDRESSLINE2, STATE -> STATE, QTR_ID -> QTR_ID, ORDER_LINE -> ORDERLINENUMBER))"
tenant5,"Map(data_validation -> Map(CITY -> NOT_NULL, QUANTITYORDERED -> POSITIVE_NUMBER), mapping -> Map(PRODUCTLINE -> PRODUCTLINE, CUSTOMERNAME -> CUSTOMERNAME, MONTH_ID -> MONTH_ID, MSRP -> MSRP, ORDERDATE -> ORDERDATE, CITY -> CITY, PRICE_EACH -> PRICEEACH, CONTACTLASTNAME -> CONTACTLASTNAME, TERRITORY -> TERRITORY, ADDRESSLINE1 -> ADDRESSLINE1, PRODUCTCODE -> PRODUCTCODE, STATUS -> STATUS, QTY_ORDERED -> QUANTITYORDERED, SALES -> SALES, YEAR_ID -> YEAR_ID, PHONE -> PHONE, ORDER_NUMBER -> ORDERNUMBER, POSTALCODE -> POSTALCODE, DEALSIZE -> DEALSIZE, CONTACTFIRSTNAME -> CONTACTFIRSTNAME, COUNTRY -> COUNTRY, ADDRESSLINE2 -> ADDRESSLINE2, STATE -> STATE, QTR_ID -> QTR_ID, ORDER_LINE -> ORDERLINENUMBER))"
tenant6,"Map(data_validation -> Map(CITY -> NOT_NULL, QUANTITYORDERED -> POSITIVE_NUMBER), mapping -> Map(PRODUCTLINE -> PRODUCTLINE, CUSTOMERNAME -> CUSTOMERNAME, MONTH_ID -> MONTH_ID, MSRP -> MSRP, ORDERDATE -> ORDERDATE, CITY -> CITY, PRICE_EACH -> PRICEEACH, CONTACTLASTNAME -> CONTACTLASTNAME, TERRITORY -> TERRITORY, ADDRESSLINE1 -> ADDRESSLINE1, PRODUCTCODE -> PRODUCTCODE, STATUS -> STATUS, QTY_ORDERED -> QUANTITYORDERED, SALES -> SALES, YEAR_ID -> YEAR_ID, PHONE -> PHONE, ORDER_NUMBER -> ORDERNUMBER, POSTALCODE -> POSTALCODE, DEALSIZE -> DEALSIZE, CONTACTFIRSTNAME -> CONTACTFIRSTNAME, COUNTRY -> COUNTRY, ADDRESSLINE2 -> ADDRESSLINE2, STATE -> STATE, QTR_ID -> QTR_ID, ORDER_LINE -> ORDERLINENUMBER))"
tenant7,"Map(data_validation -> Map(CITY -> NOT_NULL, QUANTITYORDERED -> POSITIVE_NUMBER), mapping -> Map(PRODUCTLINE -> PRODUCTLINE, CUSTOMERNAME -> CUSTOMERNAME, MONTH_ID -> MONTH_ID, MSRP -> MSRP, ORDERDATE -> ORDERDATE, CITY -> CITY, PRICE_EACH -> PRICEEACH, CONTACTLASTNAME -> CONTACTLASTNAME, TERRITORY -> TERRITORY, ADDRESSLINE1 -> ADDRESSLINE1, PRODUCTCODE -> PRODUCTCODE, STATUS -> STATUS, QTY_ORDERED -> QUANTITYORDERED, SALES -> SALES, YEAR_ID -> YEAR_ID, PHONE -> PHONE, ORDER_NUMBER -> ORDERNUMBER, POSTALCODE -> POSTALCODE, DEALSIZE -> DEALSIZE, CONTACTFIRSTNAME -> CONTACTFIRSTNAME, COUNTRY -> COUNTRY, ADDRESSLINE2 -> ADDRESSLINE2, STATE -> STATE, QTR_ID -> QTR_ID, ORDER_LINE -> ORDERLINENUMBER))"
tenant8,"Map(data_validation -> Map(CITY -> NOT_NULL, QUANTITYORDERED -> POSITIVE_NUMBER), mapping -> Map(PRODUCTLINE -> PRODUCTLINE, CUSTOMERNAME -> CUSTOMERNAME, MONTH_ID -> MONTH_ID, MSRP -> MSRP, ORDERDATE -> ORDERDATE, CITY -> CITY, PRICE_EACH -> PRICEEACH, CONTACTLASTNAME -> CONTACTLASTNAME, TERRITORY -> TERRITORY, ADDRESSLINE1 -> ADDRESSLINE1, PRODUCTCODE -> PRODUCTCODE, STATUS -> STATUS, QTY_ORDERED -> QUANTITYORDERED, SALES -> SALES, YEAR_ID -> YEAR_ID, PHONE -> PHONE, ORDER_NUMBER -> ORDERNUMBER, POSTALCODE -> POSTALCODE, DEALSIZE -> DEALSIZE, CONTACTFIRSTNAME -> CONTACTFIRSTNAME, COUNTRY -> COUNTRY, ADDRESSLINE2 -> ADDRESSLINE2, STATE -> STATE, QTR_ID -> QTR_ID, ORDER_LINE -> ORDERLINENUMBER))"
tenant9,"Map(data_validation -> Map(CITY -> NOT_NULL, QUANTITYORDERED -> POSITIVE_NUMBER), mapping -> Map(PRODUCTLINE -> PRODUCTLINE, CUSTOMERNAME -> CUSTOMERNAME, MONTH_ID -> MONTH_ID, MSRP -> MSRP, ORDERDATE -> ORDERDATE, CITY -> CITY, PRICE_EACH -> PRICEEACH, CONTACTLASTNAME -> CONTACTLASTNAME, TERRITORY -> TERRITORY, ADDRESSLINE1 -> ADDRESSLINE1, PRODUCTCODE -> PRODUCTCODE, STATUS -> STATUS, QTY_ORDERED -> QUANTITYORDERED, SALES -> SALES, YEAR_ID -> YEAR_ID, PHONE -> PHONE, ORDER_NUMBER -> ORDERNUMBER, POSTALCODE -> POSTALCODE, DEALSIZE -> DEALSIZE, CONTACTFIRSTNAME -> CONTACTFIRSTNAME, COUNTRY -> COUNTRY, ADDRESSLINE2 -> ADDRESSLINE2, STATE -> STATE, QTR_ID -> QTR_ID, ORDER_LINE -> ORDERLINENUMBER))"


### Perform data mapping and validation and write output to CosmosDB (and Delta table)

In [9]:
from azure.cosmos import CosmosClient
import psycopg2
import pandas as pd
import ast
URL ="https://cosmosdbnative01.documents.azure.com:443/"
KEY =dbutils.secrets.get("scope1", "cosmosdbnative01-con")
postgres_pass = dbutils.secrets.get("scope1", "cosmospostgres01-pass")
# schema = "status int"
# {"ORDER_NUMBER":10266,"QTY_ORDERED":22,"PRICE_EACH":100.0,"ORDER_LINE":12,"SALES":2454.54,"ORDERDATE":"7/6/2004 0:00","STATUS":"Shipped","QTR_ID":3,"MONTH_ID":7,"YEAR_ID":2004,"PRODUCTLINE":"Classic Cars","MSRP":117,"PRODUCTCODE":"S12_3380","CUSTOMERNAME":"L'ordine Souveniers","PHONE":"0522-556555","ADDRESSLINE1":"Strada Provinciale 124","ADDRESSLINE2":"NaN","CITY":"Reggio Emilia","STATE":"NaN","POSTALCODE":"42100","COUNTRY":"Italy","TERRITORY":"EMEA","CONTACTLASTNAME":"Moroni","CONTACTFIRSTNAME...

schema = "tenant string, ORDERNUMBER int,QUANTITYORDERED int,PRICEEACH float,ORDERLINENUMBER int,SALES float,ORDERDATE string,STATUS string,QTR_ID int,MONTH_ID int,YEAR_ID int,PRODUCTLINE string,MSRP int,PRODUCTCODE string,CUSTOMERNAME string,PHONE string,ADDRESSLINE1 string,ADDRESSLINE2 string,CITY string,STATE string,POSTALCODE string,COUNTRY string,TERRITORY string,CONTACTLASTNAME string,CONTACTFIRSTNAME string,DEALSIZE string, valid_flag string, reason string"

# schema = "tenant string, valid_flag string"
def write_cosmos(data):
  client = CosmosClient(URL, credential=KEY)
  DATABASE_NAME = 'sales'
  database = client.get_database_client(DATABASE_NAME)
  CONTAINER_NAME = 'sales_orders'
  container = database.get_container_client(CONTAINER_NAME)
  for item in data:
    container.upsert_item(item)
def write_postgress(data):
  columns = ",".join(data.columns)
  columns = "("+columns + ")"
  data = data.to_dict(orient="records")
  connection = psycopg2.connect(user="citus",
                                password=postgres_pass,
                                host="c.cosmospostgres01.postgres.database.azure.com",
                                port="5432",
                                database="citus",
                                sslmode='require')
  values = [tuple([value for value in row.values()]) for row in data ]
  cursor = connection.cursor()
  args = ','.join(cursor.mogrify("(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", i).decode('utf-8')
                  for i in values)
                      
  # executing the sql statement
  cursor.execute(f"INSERT INTO sales_orders {columns} VALUES " + (args))
  connection.commit()
def process_validation(inputdf):
  rule = inputdf.iloc[0]['rule']
  mapping = rule['mapping']
  mapping = {item[0]:item[1] for item in mapping }
  source_columns = list(mapping.keys())
  target_columns = list(mapping.values())
  exploded_df = inputdf["data"].to_list()
  exploded_df = [ast.literal_eval(item) for item in exploded_df]
  exploded_df = pd.DataFrame(exploded_df)
  exploded_df['tenant'] = inputdf['tenant']

  #doing data mapping
  if set(source_columns).issubset(exploded_df.columns):
    source_columns.append("tenant")
    outputdf = exploded_df[source_columns]
    outputdf.rename(columns = mapping, inplace=True)
  else:
    target_columns.append("tenant")
    outputdf = pd.DataFrame([["9999"]*len(target_columns)], columns=target_columns)
    outputdf["valid_flag"] = "false"
    outputdf["reason"] = "column mapping failed"
    write_cosmos(outputdf.to_dict(orient="records"))
#     write_postgress(outputdf)
    return outputdf
  ###doing data validation
  
  data_validation = rule['data_validation']
  data_validation = {item[0]:item[1] for item in data_validation }
  outputdf['valid_flag'] = "true"
  outputdf['reason'] = ""
  #process data mapping
  for column in data_validation.keys():
    rule = data_validation[column]
    if rule == "POSITIVE_NUMBER":
      outputdf.loc[outputdf[column] <= 0, 'valid_flag'] = "false"
      outputdf.loc[outputdf[column] <= 0, 'reason'] = f"{column} has negative value"
  write_cosmos(outputdf.to_dict(orient="records"))
#   write_postgress(outputdf)

  return outputdf
def process_batch(batchdf, batchid):
  
  joined_df = batchdf.join(tenant_rule_df, on = "tenant")
  outputdf = joined_df.groupby("tenant").applyInPandas(process_validation, schema) 
#   outputdf.collect()
  outputdf.write.format("delta").mode("append").saveAsTable("val_orders")
writer = orders.writeStream.foreachBatch(process_batch)
writer.start()

Out[22]: <pyspark.sql.streaming.query.StreamingQuery at 0x7f3d4f7817c0>

## Cross referential integrity check (join with sales order table )

In [11]:
# import dlt
# from pyspark.sql.functions import *
# from pyspark.sql.types import *
# @dlt.table(
#   comment="The validated table from validation process."
# )
# def val_orders_stream():
#   return (spark.read.format("delta").table("val_orders"))
  
# @dlt.table(
#   comment="final valid table"
# )
# @dlt.expect("saved_ORDERNUMBER is NULL", "saved_tenant IS  NULL")
# def final_sales_orders():
#   saved_orders = spark.sql("select tenant saved_tenant, ORDERNUMBER saved_ORDERNUMBER from final_orders where STATUS = 'Shipped'")
#   joined_orders = valid_orders.join(saved_orders, expr("ORDERNUMBER= saved_ORDERNUMBER and tenant = saved_tenant"), "leftouter")
#   return joined_orders.drop("saved_ORDERNUMBER", "saved_tenant")
  



In [12]:
from pyspark.sql.functions import expr

valid_orders = spark.readStream.format("delta").table("val_orders")
saved_orders = spark.sql("select tenant saved_tenant, ORDERNUMBER saved_ORDERNUMBER from final_orders where STATUS = 'Shipped'")
joined_orders = valid_orders.join(saved_orders, expr("ORDERNUMBER= saved_ORDERNUMBER and tenant = saved_tenant"), "leftouter")
final_orders = joined_orders.filter("saved_ORDERNUMBER is null").drop("saved_ORDERNUMBER", "saved_tenant")
final_orders.writeStream.format("delta").option("checkpointLocation","tmp/checkpoint/final_orders").table("final_orders")

Out[47]: <pyspark.sql.streaming.query.StreamingQuery at 0x7f3d23906040>