#### Data Reading

**Read the Bronze data from ADLS and store it in a DataFrame**

In [0]:
# Load the Parquet data under the bronze container's Raw folder into a Spark DataFrame.
df_main = spark.read.parquet("abfss://bronze@claimcraftadls.dfs.core.windows.net/Raw")

In [0]:
# Render the freshly loaded Bronze DataFrame for a visual check.
display(df_main)

**Write the Bronze/Raw data to a table so its data quality can be analysed in a dashboard (later)**

In [0]:
# Save the Bronze DataFrame as a table, replacing whatever the table held before.
(
    df_main.write
    .mode("overwrite")
    .saveAsTable("`Pharma_Claim_Craft`.Claim_craft_Bronze.bronze_raw_claimsdata")
)

In [0]:
%sql
-- Return every row and column of the Bronze table written above.
SELECT *
FROM `Pharma_Claim_Craft`.claim_craft_bronze.bronze_raw_claimsdata

### Data Cleaning 

**Type casting & Round off the value**

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import to_date, col


In [0]:
# Parse Claim_Date into a date using the yyyy-MM-dd pattern.
df_main = df_main.withColumn("Claim_Date", to_date(col("Claim_Date"), "yyyy-MM-dd"))

# Cast each amount column to double and round it to two decimal places.
# `round` is the pyspark.sql.functions version (wildcard import), not the Python builtin.
for amount_column in ("Copay_Amount", "Billed_Amount", "Paid_Amount"):
    df_main = df_main.withColumn(
        amount_column, round(col(amount_column).cast("double"), 2)
    )

display(df_main)

**Normalise the values (fill missing text values with the placeholder string "NULL" and missing amounts with 0)**

In [0]:

# Text columns whose missing values are replaced with a placeholder.
text_columns = [
    "Pharmacy_Name",
    "Coverage_Status",
    "Plan_Type",
    "Channel",
    "Territory",
    "Prior_Auth",
    "Duplicate_Claim",
    "Submission_Source",
    "Notes",
]

# na.fill only touches null cells: each one receives the literal string "NULL".
# NOTE(review): values such as 'unknown' or '' are not modified by this call --
# confirm whether they should also be normalised.
df_main = df_main.na.fill(dict.fromkeys(text_columns, "NULL"))
display(df_main)

In [0]:
# Replace missing amounts with 0 so the later arithmetic never sees a null operand.
amount_columns = ["Copay_Amount", "Billed_Amount", "Paid_Amount"]
df_main = df_main.na.fill(dict.fromkeys(amount_columns, 0))
display(df_main)

#### Drop Duplicate

In [0]:
# Keep one row per Claim_ID, then show how many rows remain.
df_main = df_main.drop_duplicates(subset=["Claim_ID"])
df_main.count()


#### Calculation/Operations

**Calculate total claim, copay percentage and outstanding amount, and set the duplicate flag**

In [0]:
# Derive the claim-level measures:
#   Total_Claim        = Copay_Amount + Paid_Amount
#   Copay_Percentage   = Copay_Amount as a percentage of Billed_Amount
#   Outstanding_Amount = Billed_Amount - Total_Claim
#   Duplicate_Flag     = 1 when Duplicate_Claim is "Yes", otherwise 0
# All monetary results are rounded to two decimal places.
df_main = (
    df_main.withColumn("Total_Claim", round(col("Copay_Amount") + col("Paid_Amount"), 2))
    .withColumn(
        "Copay_Percentage",
        # Missing Billed_Amount values were filled with 0 earlier, so the divisor
        # can be zero. Only divide when it is non-zero; the remaining rows get a
        # null percentage instead of failing when ANSI mode is enabled.
        when(
            col("Billed_Amount") != 0,
            round((col("Copay_Amount") / col("Billed_Amount")) * 100, 2),
        ),
    )
    .withColumn("Outstanding_Amount", round(col("Billed_Amount") - col("Total_Claim"), 2))
    .withColumn(
        "Duplicate_Flag",
        # withColumn already names the column, so no alias is needed here.
        when(col("Duplicate_Claim") == "Yes", 1).otherwise(0),
    )
)

display(df_main)

#### Null value cleanup and Write the Log

**This section shows how to log data-quality metrics so they can be captured in an observability dashboard**

In [0]:
from pyspark.sql.functions import col, round
from pyspark.sql.functions import current_timestamp
from datetime import datetime


In [0]:
# Step 1: Clean the data.
# Rows missing Rep_ID or Patient_Zip are dropped; the row counts before and
# after are kept for the metrics record.
total_records = df_main.count()
df_clean = df_main.dropna(subset=["Rep_ID", "Patient_Zip"])
valid_records = df_clean.count()

# Step 2: Create a one-row DataFrame for the metrics.
# The timestamp is taken from the driver's clock via datetime.now().
metrics_df = spark.createDataFrame(
    [
        {
            "Metric_Timestamp": datetime.now(),
            "Total_Records": total_records,
            "Valid_Records": valid_records,
        }
    ]
)

# Step 3: Calculate the derived metrics and add them to the DataFrame.
# Columns are referenced with the same casing they were created with, so the
# lookup does not depend on Spark's case-insensitive column resolution.
metrics_df = metrics_df.withColumn(
    "dropped_records", col("Total_Records") - col("Valid_Records")
).withColumn(
    "Claim_Completion_Rate",
    # An empty input gives Total_Records = 0; leave the rate null in that case
    # rather than dividing by zero.
    when(
        col("Total_Records") > 0,
        round(col("Valid_Records") / col("Total_Records"), 4),
    ),
)

# Step 4: Append the metrics row to the Delta audit table.
metrics_df.write.mode("append").format("delta").saveAsTable(
    "`Pharma_Claim_Craft`.Claim_craft_Audit.claim_quality_metrics"
)

In [0]:
# Show the metrics row that was just appended to the audit table.
display(metrics_df)

In [0]:
# Show the rows that remain after dropping records with missing Rep_ID or Patient_Zip.
display(df_clean)

#### Data Writing in Silver Container(ADLS) 

In [0]:
# Write the cleaned DataFrame as Parquet to the silver container, replacing any existing output.
df_clean.write.mode("overwrite").parquet(
    "abfss://silver@claimcraftadls.dfs.core.windows.net/Claimcraft_Cleaned_data"
)


In [0]:
# Read the Parquet output back from the silver container to verify the write.
df_test_silver = spark.read.parquet(
    "abfss://silver@claimcraftadls.dfs.core.windows.net/Claimcraft_Cleaned_data"
)

In [0]:
# Show the data that was read back from the silver container.
display(df_test_silver)

#### Write the cleaned data into the Silver table

In [0]:
# Save the cleaned DataFrame as a Delta table, replacing whatever the table held before.
(
    df_clean.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("Pharma_Claim_Craft.Claim_craft_Silver.Claimcraft_Cleaned_data")
)

In [0]:
%sql
-- Return every row and column of the Silver table written above.
SELECT *
FROM `Pharma_Claim_Craft`.claim_craft_silver.claimcraft_cleaned_data