### Extracting

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col, explode
import pyspark.sql.functions as f

In [2]:
# Build (or reuse) the Spark session for this pipeline. The PostgreSQL JDBC
# jar is added to the driver classpath so the JDBC cells can load the driver.
spark = (
    SparkSession.builder
    .appName('ETL Pipeline')
    .config("spark.driver.extraClassPath", "/share_folder/postgresql-42.6.0.jar")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

# Load the raw text file; the resulting frame has a single string column "value".
df = spark.read.text('data/WordData_etl.txt')
df.show(n=5)

23/05/30 07:23:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


+--------------------+
|               value|
+--------------------+
|This is a Japanes...|
|The team members ...|
|As the years pass...|
|If you don't like...|
|He was disappoint...|
+--------------------+
only showing top 5 rows



                                                                                

### Transforming

In [3]:
# Split every line on a single space into an array column of tokens.
df2 = df.withColumn("splitedData", f.split(col("value"), " "))

In [4]:
# Flatten the token arrays: one output row per array element, stored in "words".
df3 = df2.withColumn("words", f.explode(col("splitedData")))

In [5]:
# Keep only the exploded token column, then count occurrences of each token.
wordsDF = df3.select(col("words"))
wordCount = wordsDF.groupBy(col("words")).count()

# Preview the counts (20 rows is the default for show()).
wordCount.show(n=20)

+-----------+-----+
|      words|count|
+-----------+-----+
|   Tomorrow|    4|
|         If|    8|
|      leave|    4|
|      corny|    4|
|        day|    4|
|preoccupied|    4|
|       even|    8|
|      crazy|    4|
|    bananas|    4|
|     priest|    4|
|        did|    4|
|    whether|    4|
|     Having|    4|
|        I'm|    4|
|      crime|    4|
|  surprised|    4|
|      James|    4|
|      could|    8|
|        buy|    4|
|        him|    8|
+-----------+-----+
only showing top 20 rows



### Loading

In [8]:
# JDBC connection settings for the target PostgreSQL database. These names are
# reused by the read-back cell below.
# NOTE(review): the credentials are hardcoded here; consider loading them from
# the environment or a secrets store before sharing this notebook.
driver = "org.postgresql.Driver"
url = "jdbc:postgresql://host.docker.internal:5432/spark"
table = "word_count"
user = "postgres"
password = "1"

# Write the word counts to PostgreSQL.
# The save mode must be set through .mode(); option("mode", ...) does not
# configure it, so the writer previously kept its default error-if-exists
# behaviour and this cell failed whenever the table already existed.
# With "append", each run adds another copy of the rows to the table.
(
    wordCount.write.format("jdbc")
    .option("driver", driver)
    .option("url", url)
    .option("dbtable", table)
    .option("user", user)
    .option("password", password)
    .mode("append")
    .save()
)

                                                                                

In [9]:
# Read the table back over JDBC to verify the load, reusing the connection
# settings (driver, url, table, user, password) defined in the write cell.
# The former option("mode", "append") was dropped: a save mode has no meaning
# when reading.
df_read = (
    spark.read.format("jdbc")
    .option("driver", driver)
    .option("url", url)
    .option("dbtable", table)
    .option("user", user)
    .option("password", password)
    .load()
)

In [10]:
# Preview the first rows read back from PostgreSQL.
df_read.show(n=5)

+---------+-----+
|    words|count|
+---------+-----+
|surprised|    4|
|     even|    8|
|    James|    4|
|    crazy|    4|
|  bananas|    4|
+---------+-----+
only showing top 5 rows

