In [1]:
import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE: this also hides deprecation notices; remove it while debugging.
warnings.filterwarnings('ignore')

In [2]:
import findspark
# NOTE(review): presumably locates the local Spark installation so that the
# `pyspark` import in the next cell resolves — confirm for this environment.
findspark.init()

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Obtain the session used by every later cell (reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName("ETL using Spark")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/02 15:53:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### Load data from previous notebook ####

In [5]:
# Read the CSV written by the previous notebook: the first row holds the
# column names and column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("student_transformed.csv")
)
df.show()

                                                                                

+--------+---------+---------+
| Student|Height_cm|Weight_kg|
+--------+---------+---------+
|student6|   157.48|    38.56|
|student7|    165.1|    36.29|
|student2|   149.86|    45.36|
|student1|   162.56|    40.82|
|student3|   175.26|    43.09|
|student5|    152.4|    36.29|
+--------+---------+---------+



#### Transform

In [6]:
from pyspark.sql.functions import col
# NOTE: `expr` is not used by any cell visible in this notebook.
from pyspark.sql.functions import expr
# NOTE: this shadows Python's built-in round() for the rest of the notebook;
# every later round(...) call is the Spark column function.
from pyspark.sql.functions import round

In [7]:
# Add Height_meters: Height_cm divided by 100, rounded to two decimals.
df = df.withColumn(
    "Height_meters",
    round(col("Height_cm") / 100, 2),
)
df.show()

+--------+---------+---------+-------------+
| Student|Height_cm|Weight_kg|Height_meters|
+--------+---------+---------+-------------+
|student6|   157.48|    38.56|         1.57|
|student7|    165.1|    36.29|         1.65|
|student2|   149.86|    45.36|          1.5|
|student1|   162.56|    40.82|         1.63|
|student3|   175.26|    43.09|         1.75|
|student5|    152.4|    36.29|         1.52|
+--------+---------+---------+-------------+



In [10]:
# Create BMI column
# BMI = Weight / (Height * Height)
# Weight must be in kg and Height in meters

df = df.withColumn("BMI", round(col("Weight_kg")/(col("Height_meters") * col("Height_meters")),2))
df.show()

+--------+---------+---------+-------------+-----+
| Student|Height_cm|Weight_kg|Height_meters|  BMI|
+--------+---------+---------+-------------+-----+
|student6|   157.48|    38.56|         1.57|15.64|
|student7|    165.1|    36.29|         1.65|13.33|
|student2|   149.86|    45.36|          1.5|20.16|
|student1|   162.56|    40.82|         1.63|15.36|
|student3|   175.26|    43.09|         1.75|14.07|
|student5|    152.4|    36.29|         1.52|15.71|
+--------+---------+---------+-------------+-----+



In [11]:
df = df.drop("Height_cm")
df.show()

+--------+---------+-------------+-----+
| Student|Weight_kg|Height_meters|  BMI|
+--------+---------+-------------+-----+
|student6|    38.56|         1.57|15.64|
|student7|    36.29|         1.65|13.33|
|student2|    45.36|          1.5|20.16|
|student1|    40.82|         1.63|15.36|
|student3|    43.09|         1.75|14.07|
|student5|    36.29|         1.52|15.71|
+--------+---------+-------------+-----+



In [12]:
df.write.mode("overwrite").parquet("student_transformed.parquet")

                                                                                

In [13]:
df = spark.read.parquet("./student_transformed.parquet/")
df.show()

+--------+---------+-------------+-----+
| Student|Weight_kg|Height_meters|  BMI|
+--------+---------+-------------+-----+
|student6|    38.56|         1.57|15.64|
|student7|    36.29|         1.65|13.33|
|student2|    45.36|          1.5|20.16|
|student1|    40.82|         1.63|15.36|
|student3|    43.09|         1.75|14.07|
|student5|    36.29|         1.52|15.71|
+--------+---------+-------------+-----+



In [14]:
# Shut down the Spark session created at the top of the notebook;
# no cell after this one can use `spark` or `df`.
spark.stop()