In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Run findspark before the pyspark import in the next cell.
# NOTE(review): findspark.init() is presumably what makes the local Spark
# installation importable as `pyspark` — confirm it is still needed in this
# environment.
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Obtain the Spark session used by every cell below; the application is
# registered under the name "ETL using Spark".
spark = (
    SparkSession.builder
    .appName("ETL using Spark")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/02 15:23:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Sample records as (student name, height in inches, weight in pounds).
# The set contains one blank name and two repeated rows, which the cleaning
# cells further down remove.
data = [
    ("student1", 64, 90),
    ("student2", 59, 100),
    ("student3", 69, 95),
    ("", 70, 110),          # blank student name
    ("student5", 60, 80),
    ("student3", 69, 95),   # duplicate of the earlier student3 row
    ("student6", 62, 85),
    ("student7", 65, 80),
    ("student7", 65, 80),   # duplicate of the previous row
]

In [6]:
# Turn the in-memory records into a Spark DataFrame with three named columns,
# then print it to check the contents.
df = spark.createDataFrame(
    data,
    schema=["Student", "Height_inches", "Weight_lbs"],
)
df.show()

                                                                                

+--------+-------------+----------+
| Student|Height_inches|Weight_lbs|
+--------+-------------+----------+
|student1|           64|        90|
|student2|           59|       100|
|student3|           69|        95|
|        |           70|       110|
|student5|           60|        80|
|student3|           69|        95|
|student6|           62|        85|
|student7|           65|        80|
|student7|           65|        80|
+--------+-------------+----------+



In [7]:
# Persist the raw records as CSV with a header row.
# mode("overwrite") keeps this cell re-runnable: with the default save mode a
# second run fails because the output path "student-hw.csv" already exists.
# This also matches every other write in this notebook.
df.write.mode("overwrite").csv("student-hw.csv", header=True)

                                                                                

#### Read from a CSV file and write to a Parquet file ####

In [8]:
# Load the CSV back: the first line supplies the column names and Spark infers
# the column types. The blank Student value written earlier is read back as
# NULL (visible in the output below).
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('student-hw.csv')
)
df.show()

+--------+-------------+----------+
| Student|Height_inches|Weight_lbs|
+--------+-------------+----------+
|student6|           62|        85|
|student7|           65|        80|
|student7|           65|        80|
|student1|           64|        90|
|student2|           59|       100|
|student5|           60|        80|
|student3|           69|        95|
|student3|           69|        95|
|    NULL|           70|       110|
+--------+-------------+----------+



In [9]:
# Row count before any cleaning (all 9 sample records).
df.count()

9

In [10]:
# Remove rows that are exact duplicates across all columns; the repeated
# student3 and student7 records collapse to a single row each.
df = df.dropDuplicates()

In [11]:
# Display the de-duplicated rows (the NULL-name record is still present here).
df.show()

+--------+-------------+----------+
| Student|Height_inches|Weight_lbs|
+--------+-------------+----------+
|student6|           62|        85|
|student7|           65|        80|
|student2|           59|       100|
|student1|           64|        90|
|student3|           69|        95|
|student5|           60|        80|
|    NULL|           70|       110|
+--------+-------------+----------+



In [12]:
# Row count after de-duplication (9 -> 7).
df.count()

7

In [13]:
# Discard every row that has at least one NULL column; in this data set that
# is the record whose Student name was read back from CSV as NULL.
df = df.dropna(how="any")

In [14]:
# Row count after dropping rows with NULLs (7 -> 6).
df.count()

6

In [15]:
# Display the cleaned data: no duplicate rows and no NULL values remain.
df.show()

+--------+-------------+----------+
| Student|Height_inches|Weight_lbs|
+--------+-------------+----------+
|student6|           62|        85|
|student7|           65|        80|
|student2|           59|       100|
|student1|           64|        90|
|student3|           69|        95|
|student5|           60|        80|
+--------+-------------+----------+



In [16]:
# Save the cleaned rows in Parquet format, replacing any output left at this
# path by a previous run.
df.write.parquet("student-hw.parquet", mode="overwrite")

                                                                                

In [17]:
# Read the Parquet output back in and print it to confirm the round trip.
df = spark.read.format("parquet").load("./student-hw.parquet")
df.show()

+--------+-------------+----------+
| Student|Height_inches|Weight_lbs|
+--------+-------------+----------+
|student6|           62|        85|
|student7|           65|        80|
|student2|           59|       100|
|student1|           64|        90|
|student3|           69|        95|
|student5|           60|        80|
+--------+-------------+----------+



In [18]:
# Inspect what was written: "student-hw.parquet" is a directory containing a
# _SUCCESS marker and a snappy-compressed part file, not a single file.
!ls -l student-hw.parquet

total 8
-rw-r--r--  1 mike  staff     0 Dec  2 15:23 _SUCCESS
-rw-r--r--  1 mike  staff  1053 Dec  2 15:23 part-00000-94537c2f-338d-4494-b19d-15b678f95dd0-c000.snappy.parquet


#### Condense Parquet to a single file ####

In [19]:
# Bring all rows into a single partition so that the following write emits
# exactly one part file.
df = df.repartition(numPartitions=1)

In [20]:
# Write the single-partition DataFrame as Parquet, replacing any earlier
# output at this path.
df.write.parquet("student-hw-single.parquet", mode="overwrite")

In [21]:
# Confirm the output directory holds one part file next to the _SUCCESS marker.
!ls -l student-hw-single.parquet

total 8
-rw-r--r--  1 mike  staff     0 Dec  2 15:23 _SUCCESS
-rw-r--r--  1 mike  staff  1053 Dec  2 15:23 part-00000-6132e7e1-6fa3-4701-a679-db2827570ad5-c000.snappy.parquet


#### Read from a Parquet file and write to a CSV file ####

In [22]:
# Load the single-file Parquet output; this DataFrame is the input to the
# transformation cells that follow.
df = spark.read.format("parquet").load("student-hw-single.parquet")
df.show()

+--------+-------------+----------+
| Student|Height_inches|Weight_lbs|
+--------+-------------+----------+
|student6|           62|        85|
|student7|           65|        80|
|student2|           59|       100|
|student1|           64|        90|
|student3|           69|        95|
|student5|           60|        80|
+--------+-------------+----------+



#### Transform the data ####

In [23]:
# Column-expression helpers used by the transformation cells below.
from pyspark.sql.functions import expr
from pyspark.sql.functions import col
# NOTE(review): this binds the name `round` in the notebook namespace, so
# Python's built-in round() is shadowed from this cell onwards.
from pyspark.sql.functions import round

In [24]:
# Add the height in centimetres, computed from inches (1 in = 2.54 cm) with a
# SQL expression; the result is displayed with two decimals (e.g. 165.10).
df = df.withColumn(
    "Height_cm",
    expr("Height_inches * 2.54"),
)
df.show()

+--------+-------------+----------+---------+
| Student|Height_inches|Weight_lbs|Height_cm|
+--------+-------------+----------+---------+
|student6|           62|        85|   157.48|
|student7|           65|        80|   165.10|
|student2|           59|       100|   149.86|
|student1|           64|        90|   162.56|
|student3|           69|        95|   175.26|
|student5|           60|        80|   152.40|
+--------+-------------+----------+---------+



In [26]:
# Add the weight in kilograms: pounds multiplied by 0.453592, rounded to two
# decimal places. `round` is the Spark column function imported earlier in the
# notebook, not Python's built-in.
df = df.withColumn(
    "Weight_kg",
    round(col("Weight_lbs") * 0.453592, 2),
)
df.show()

+--------+-------------+----------+---------+---------+
| Student|Height_inches|Weight_lbs|Height_cm|Weight_kg|
+--------+-------------+----------+---------+---------+
|student6|           62|        85|   157.48|    38.56|
|student7|           65|        80|   165.10|    36.29|
|student2|           59|       100|   149.86|    45.36|
|student1|           64|        90|   162.56|    40.82|
|student3|           69|        95|   175.26|    43.09|
|student5|           60|        80|   152.40|    36.29|
+--------+-------------+----------+---------+---------+



In [27]:
# The imperial-unit columns are no longer needed once the metric ones exist;
# remove them so only Student, Height_cm and Weight_kg remain.
df = df.drop(
    "Height_inches",
    "Weight_lbs",
)
df.show()

+--------+---------+---------+
| Student|Height_cm|Weight_kg|
+--------+---------+---------+
|student6|   157.48|    38.56|
|student7|   165.10|    36.29|
|student2|   149.86|    45.36|
|student1|   162.56|    40.82|
|student3|   175.26|    43.09|
|student5|   152.40|    36.29|
+--------+---------+---------+



In [28]:
# Export the transformed data as CSV with a header row, replacing any output
# left at this path by a previous run.
df.write.csv("student_transformed.csv", mode="overwrite", header=True)

In [29]:
# Read the exported CSV back to verify it. Column types are inferred again, so
# trailing zeros are not preserved in the display (165.10 is shown as 165.1).
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('student_transformed.csv')
)
df.show()

+--------+---------+---------+
| Student|Height_cm|Weight_kg|
+--------+---------+---------+
|student6|   157.48|    38.56|
|student7|    165.1|    36.29|
|student2|   149.86|    45.36|
|student1|   162.56|    40.82|
|student3|   175.26|    43.09|
|student5|    152.4|    36.29|
+--------+---------+---------+



In [30]:
# Stop the Spark session now that the ETL run is complete; `spark` and any
# DataFrames derived from it should not be used after this point.
spark.stop()