## Module 2: Perform Data Cleansing and Preparation using Apache Spark

#### Reading data from our delta table

In [1]:
data_df = spark.read.format("delta").load("Tables/diabetes")
display(data_df)

StatementMeta(, 8dceec53-b5b9-4846-aabb-3370f6da5bee, 3, Finished, Available)

SynapseWidget(Synapse.DataFrame, 0905f793-7557-4a77-8018-2727921b96fb)

#### Checking if datatypes are numerical

In [2]:
data_df.dtypes

StatementMeta(, 8dceec53-b5b9-4846-aabb-3370f6da5bee, 4, Finished, Available)

[('pregnancies', 'int'),
 ('plasma_glucose', 'int'),
 ('blood_pressure', 'int'),
 ('triceps_skin_thickness', 'int'),
 ('insulin', 'int'),
 ('bmi', 'double'),
 ('diabetes_pedigree', 'double'),
 ('age', 'int'),
 ('diabetes', 'int')]

#### Summarize dataframe

In [3]:
display(data_df.summary())

StatementMeta(, 8dceec53-b5b9-4846-aabb-3370f6da5bee, 5, Finished, Available)

SynapseWidget(Synapse.DataFrame, 49896354-a3c2-4e2b-929f-4affa653aeb4)

observations from above

- blood_pressure and BMI is 0 for a few entries

In [4]:
display(data_df.select("age").summary())

StatementMeta(, 8dceec53-b5b9-4846-aabb-3370f6da5bee, 6, Finished, Available)

SynapseWidget(Synapse.DataFrame, ed88e8b6-c483-48e4-83c4-4b0f90df2135)

In [5]:
display(data_df.groupBy("age").count())

StatementMeta(, 8d7c0321-042c-4f30-86fd-b8fb1cd0d87c, 9, Finished, Available)

SynapseWidget(Synapse.DataFrame, 139678ee-4571-4ccc-8031-c6ab97fa2fce)

####  Missing Observation Analysis

We saw on df.head() that some features contain "0", it doesn't make sense here and this indicates missing value below so we will replace "0" value by NaN:

In [5]:
data_df_fillna = data_df.replace(0,None,['plasma_glucose','blood_pressure','triceps_skin_thickness','insulin','bmi'])

StatementMeta(, 8dceec53-b5b9-4846-aabb-3370f6da5bee, 7, Finished, Available)

In [6]:
display(data_df_fillna.filter("blood_pressure IS NULL"))

StatementMeta(, 8dceec53-b5b9-4846-aabb-3370f6da5bee, 8, Finished, Available)

SynapseWidget(Synapse.DataFrame, 1b02990d-181c-4385-bae4-f36427112086)

In [7]:
data_df_fillna.summary("count").show()

StatementMeta(, 8dceec53-b5b9-4846-aabb-3370f6da5bee, 9, Finished, Available)

+-------+-----------+--------------+--------------+----------------------+-------+---+-----------------+---+--------+
|summary|pregnancies|plasma_glucose|blood_pressure|triceps_skin_thickness|insulin|bmi|diabetes_pedigree|age|diabetes|
+-------+-----------+--------------+--------------+----------------------+-------+---+-----------------+---+--------+
|  count|        768|           763|           733|                   541|    394|757|              768|768|     768|
+-------+-----------+--------------+--------------+----------------------+-------+---+-----------------+---+--------+



## feature engineering
- adding obesity levels based on BMI

In [8]:
from pyspark.sql.functions import when

data_df_newbmi = data_df_fillna.withColumn('obesity_level', when(data_df_fillna.bmi <= 18.5, 'underweight')
                    .when((data_df_fillna.bmi > 18.5) & (data_df_fillna.bmi <= 24.9), 'normal')
                    .when((data_df_fillna.bmi > 24.9) & (data_df_fillna.bmi <= 29.9), 'overweight')
                    .otherwise('obese'))

display(data_df_newbmi)

StatementMeta(, 8dceec53-b5b9-4846-aabb-3370f6da5bee, 10, Finished, Available)

SynapseWidget(Synapse.DataFrame, acf1688c-1cfd-4fc0-9184-b80fadcc01b4)

In [9]:
data_df_processed = data_df_newbmi.withColumn('insulin_level', 
              when(data_df_newbmi.insulin <= 16, 'normal')
              .otherwise('abnormal'))

StatementMeta(, 8dceec53-b5b9-4846-aabb-3370f6da5bee, 11, Finished, Available)

In [10]:
display(data_df_processed)

StatementMeta(, 8dceec53-b5b9-4846-aabb-3370f6da5bee, 12, Finished, Available)

SynapseWidget(Synapse.DataFrame, d0a33e28-63ee-4853-a750-921844ae32b3)

#### Save processed data to a Delta Table

In [11]:
spark.conf.set("sprk.sql.parquet.vorder.enabled", "true") # Enable Verti-Parquet write
spark.conf.set("spark.microsoft.delta.optimizeWrite.enabled", "true") # Enable automatic delta optimized write

StatementMeta(, 8dceec53-b5b9-4846-aabb-3370f6da5bee, 13, Finished, Available)

In [12]:
table_name = "diabetes_processed"
data_df_processed.write.mode("overwrite").format("delta").save(f"Tables/{table_name}")
print(f"Spark dataframe saved to delta table: {table_name}")

StatementMeta(, 8dceec53-b5b9-4846-aabb-3370f6da5bee, 14, Finished, Available)

Spark dataframe saved to delta table: diabetes_processed


In [13]:
%%sql

select * from diabetes_processed limit 100;

StatementMeta(, 8dceec53-b5b9-4846-aabb-3370f6da5bee, 15, Finished, Available)

<Spark SQL result set with 100 rows and 11 fields>

In [14]:
%%sql 

select obesity_level, diabetes, count(*) as count
from diabetes_processed 
--where diabetes = 1
GROUP By obesity_level, diabetes

StatementMeta(, 8dceec53-b5b9-4846-aabb-3370f6da5bee, 16, Finished, Available)

<Spark SQL result set with 7 rows and 3 fields>