In [1]:
import os

import findspark

# findspark has to put Spark on sys.path before any pyspark module is imported.
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, expr, from_json, date_format, to_timestamp
from pyspark.sql.types import *

# Build (or reuse) a local Spark session. The PostgreSQL JDBC driver jar is
# registered here so the JDBC cells further down can load the driver class.
session_builder = (
    SparkSession.builder
    .appName("Data Sources")
    .master("local")
    .config("spark.jars", "jars/postgresql-42.2.19.jar")
)
spark = session_builder.getOrCreate()

# Number of partitions Spark SQL uses when it shuffles data.
spark.conf.set("spark.sql.shuffle.partitions", 14)



# Read/Write DataFrame with file system, HDFS, S3, FTP

In [None]:
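# Optional memory settings: insert these into the SparkSession builder chain
# before getOrCreate() if the defaults are too small.
#     config("spark.python.worker.memory", "8g"). \
#     config("spark.driver.memory", "8g"). \
#     config("spark.executor.memory", "8g"). \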


In [2]:
# Load the cars dataset from JSON. With mode "failFast" the read raises as
# soon as it meets a malformed record instead of silently nulling it.
cars_reader = (
    spark.read
    .format("json")
    .option("inferSchema", "true")
    .option("mode", "failFast")
    .option("path", "data/cars")
)
cars_df = cars_reader.load()

cars_df.show()



+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|                Name|Origin|Weight_in_lbs|      Year|
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|         3504|      null|
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|         3504|1970-01-01|
|        11.5|        8|       350.0|       165|            15.0|   buick skylark 320|   USA|         3693|1970-01-01|
|        11.0|        8|       318.0|       150|            18.0|  plymouth satellite|   USA|         3436|1970-01-01|
|        12.0|        8|       304.0|       150|            16.0|       amc rebel sst|   USA|         3433|1970-01-01|
|        10.5|        8|       302.0|       140|

In [None]:
# Alternative "path" values for other storage systems:

# HDFS
# option("path", "hdfs://nn1home:8020/sources/cars"). \

# FTP
# option("path", "ftp://user:pwd@192.168.1.5/sources/cars"). \

# S3
# option("path", "s3://bucket-name/sources/cars"). \


In [3]:
# Read the cars JSON again, this time passing every reader option through a
# single options() call instead of chained option() calls.
cars_read_options = {"mode": "failFast", "path": "data/cars", "inferSchema": "true"}
cars_df_v2 = spark.read.format("json").options(**cars_read_options).load()

cars_df_v2.show()

# Sketch of how a distributed source such as /sources/cars is turned into tasks:
#
# 10.1.1.1 node1 -> block1     S3 NETWORK                              -> partition1 -> task1
# 10.1.1.2 node2 -> block2 -> Spark Driver -> Name Node -> ip address -> partition2 -> task2
# 10.1.1.3 node3 -> block2                                            -> partition3 -> task3


+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|                Name|Origin|Weight_in_lbs|      Year|
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|         3504|      null|
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|         3504|1970-01-01|
|        11.5|        8|       350.0|       165|            15.0|   buick skylark 320|   USA|         3693|1970-01-01|
|        11.0|        8|       318.0|       150|            18.0|  plymouth satellite|   USA|         3436|1970-01-01|
|        12.0|        8|       304.0|       150|            16.0|       amc rebel sst|   USA|         3433|1970-01-01|
|        10.5|        8|       302.0|       140|

In [4]:
# Print the first 10 rows without truncating long cell values.
cars_df.show(n=10, truncate=False)

# Redistribute the rows into 3 partitions and write them as snappy-compressed
# Parquet, replacing whatever is already at the target path.
cars_parquet_writer = (
    cars_df
    .repartition(3)
    .write
    .mode("overwrite")
    .option("compression", "snappy")
)
cars_parquet_writer.parquet("../sources/parquet")


# Variants of the write chain to experiment with:
#
#    repartition(3). \
#    write. \
#    partitionBy("Year"). \
#
#    repartition("Year"). \
#    write. \
#    partitionBy("Year"). \


# Notes on the "lots of small files" problem:
# repartition(3) => round robin
# repartition(col("field")) => hash partitioning
# repartition(3) + partitionBy("Year") NOT GOOD
# repartition(col("field")) + partitionBy("Year") GOOD

+------------+---------+------------+----------+----------------+-------------------------+------+-------------+----------+
|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|Name                     |Origin|Weight_in_lbs|Year      |
+------------+---------+------------+----------+----------------+-------------------------+------+-------------+----------+
|12.0        |8        |307.0       |130       |18.0            |chevrolet chevelle malibu|USA   |3504         |null      |
|12.0        |8        |307.0       |130       |18.0            |chevrolet chevelle malibu|USA   |3504         |1970-01-01|
|11.5        |8        |350.0       |165       |15.0            |buick skylark 320        |USA   |3693         |1970-01-01|
|11.0        |8        |318.0       |150       |18.0            |plymouth satellite       |USA   |3436         |1970-01-01|
|12.0        |8        |304.0       |150       |16.0            |amc rebel sst            |USA   |3433         |1970-01-01|
|10.5   

In [5]:
# stocks_df.write.save("data/stocks_parquet")

# The text source produces a DataFrame with a SINGLE string column ("value"),
# one row per line of input.
text_df = spark.read.format("text").load("data/lipsum")
text_df.show()

# Open question - what is the DIFFERENCE between
# saveAsTable()
# write + save()
# parquet() + ???

+--------------------+
|               value|
+--------------------+
|Lorem ipsum dolor...|
|                    |
|Vestibulum sed di...|
|                    |
|Aliquam vestibulu...|
|                    |
|Fusce in accumsan...|
|                    |
|Nullam ultrices q...|
|                    |
|Nam tempor nisi a...|
|                    |
|Nunc vulputate ac...|
|                    |
|Proin blandit ero...|
|                    |
|Ut bibendum ac du...|
|                    |
|Vestibulum ac var...|
|                    |
+--------------------+
only showing top 20 rows



# data_formats_json_avro_parquet

In [24]:
# Read the state-names CSV: the first line is the header and column types
# are inferred from the data.
csv_read_options = {"header": "true", "inferSchema": "true"}
state_names_df = spark.read.options(**csv_read_options).csv("data/statenames")

state_names_df.show(n=10, truncate=False)
state_names_df.printSchema()

# Collapse to one partition before writing, then overwrite the Parquet target.
single_partition_df = state_names_df.coalesce(1)
single_partition_df.write.mode("overwrite").parquet("data/target/statenames_parquet")

# Parquet = binary data, high compression, low CPU usage, very fast
# also contains the schema
# the default data format in Spark


# Problem: which format is best for read speed, write speed and storage size?

# TODO - ADD COMPARING DATA FORMATS FOR CLUSTER


+-----+------+----+--------+---+
|State|Gender|Year|Name    |Cnt|
+-----+------+----+--------+---+
|IN   |F     |1910|Mary    |619|
|IN   |F     |1910|Helen   |324|
|IN   |F     |1910|Ruth    |238|
|IN   |F     |1910|Dorothy |215|
|IN   |F     |1910|Mildred |200|
|IN   |F     |1910|Margaret|196|
|IN   |F     |1910|Thelma  |137|
|IN   |F     |1910|Edna    |113|
|IN   |F     |1910|Martha  |112|
|IN   |F     |1910|Hazel   |108|
+-----+------+----+--------+---+
only showing top 10 rows

root
 |-- State: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Cnt: integer (nullable = true)



# jdbc_postgres_oracle
# WORK WITH JDBC FROM PyCharm projects https://github.com/vadopolski/eas-017-RDD-py

In [7]:
# JDBC connection settings. The URL and credentials are read from the
# environment when set, so real secrets never have to be committed with the
# notebook; the fallbacks are the local development defaults.
driver = "org.postgresql.Driver"
url = os.environ.get("POSTGRES_JDBC_URL", "jdbc:postgresql://localhost:5432/spark")
user = os.environ.get("POSTGRES_USER", "docker")
password = os.environ.get("POSTGRES_PASSWORD", "docker")

# Connection properties in the form expected by DataFrameReader.jdbc().
DBPARAMS = {
    "user": user,
    "password": password,
    "driver": driver
}


#
# PROBLEM HOW TO READ/WRITE DATA FROM JDBC


# Troubleshooting: if the read below fails with
# Py4JJavaError: An error occurred while calling o73.jdbc.
# : java.lang.ClassNotFoundException: org.postgresql.Driver
# 	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:476)
# then the PostgreSQL driver jar is not on the classpath.
# NOTE(review): presumably the "spark.jars" path is wrong relative to the
# working directory, or an older session was reused by getOrCreate() - verify.

# The `table` argument accepts either a table name or an aliased subquery.
employees = "public.employees"
employees_pruned = """(select e.first_name, e.last_name, e.hire_date from public.employees e where e.gender = 'F') as new_emp"""

# 10101        99999
# 10102        99998
# 10103        10103

# No partitioning arguments are passed to this read.
df = spark.read.jdbc(url=url, table=employees, properties=DBPARAMS)

# print("GET NUM PARTITIONS")
# print(df.rdd.getNumPartitions())

df.show()



+------+----------+----------+-------------+------+----------+
|emp_no|birth_date|first_name|    last_name|gender| hire_date|
+------+----------+----------+-------------+------+----------+
| 10010|1963-06-01| Duangkaew|     Piveteau|     F|1989-08-24|
| 10020|1952-12-24|    Mayuko|      Warwick|     M|1991-01-26|
| 10030|1958-07-14|     Elvis|      Demeyer|     M|1994-02-17|
| 10040|1959-09-13|     Weiyi|      Meriste|     F|1993-02-14|
| 10050|1958-05-21|   Yinghua|       Dredge|     M|1990-12-25|
| 10060|1961-10-15|  Breannda|  Billingsley|     M|1987-11-02|
| 10070|1955-08-20|    Reuven|   Garigliano|     M|1985-10-14|
| 10080|1957-12-03|    Premal|         Baek|     M|1985-11-19|
| 10090|1961-05-30|    Kendra|      Hofting|     M|1986-03-14|
| 10100|1953-04-21|  Hironobu|    Haraldson|     F|1987-09-21|
| 10110|1957-03-07|    Xuejia|       Ullian|     F|1986-08-22|
| 10120|1960-03-26|    Armond|   Fairtlough|     F|1996-07-06|
| 10130|1955-04-27|    Nishit|    Casperson|     M|1988

In [8]:
# df.printSchema()
# df.agg(F.max(F.col("emp_no")), F.min(F.col("emp_no"))).show()


# Option 1: range partitioning on a numeric column. Spark splits
# [lowerBound, upperBound] into numPartitions strides and issues one query per
# stride. The bounds only define the stride; they do not filter rows.
df = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS,
                     column="emp_no", lowerBound=10010, upperBound=499990, numPartitions=10)

# lowerBound = 10010
# upperBound = 499990
#
# Each partition runs its own query with a WHERE clause on emp_no, roughly:
# part1 => select * from public.employees e where e.emp_no < x1
# part2 => select * from public.employees e where e.emp_no >= x1 and e.emp_no < x2
# ...

# Option 2: one partition per predicate. Predicates must not overlap:
# every row matching more than one predicate is read once per match and
# therefore shows up duplicated in the DataFrame. Rows matching no
# predicate (e.g. a NULL gender) are not read at all.
pred = ["gender = 'F'", "gender = 'M'"]
# Be careful with borders: use half-open ranges so that a boundary value
# (50000 here) belongs to exactly one partition and the minimum is included.
pred2 = ["emp_no >= 10010 and emp_no < 50000", "emp_no >= 50000 and emp_no <= 100000"]

df = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS, predicates=pred)
df.show()

# lowerBound = 10010,
# upperBound = 499990,
# numPartitions = 20,

# Killer joins => optimised UDF

# print("GET NUM PARTITIONS")
# print(df.rdd.getNumPartitions())
#
# df.show()


+------+----------+----------+----------+------+----------+
|emp_no|birth_date|first_name| last_name|gender| hire_date|
+------+----------+----------+----------+------+----------+
| 10010|1963-06-01| Duangkaew|  Piveteau|     F|1989-08-24|
| 10040|1959-09-13|     Weiyi|   Meriste|     F|1993-02-14|
| 10100|1953-04-21|  Hironobu| Haraldson|     F|1987-09-21|
| 10110|1957-03-07|    Xuejia|    Ullian|     F|1986-08-22|
| 10120|1960-03-26|    Armond|Fairtlough|     F|1996-07-06|
| 10140|1957-03-11|     Yucel|     Auria|     F|1991-03-14|
| 10150|1955-01-29|  Zhenbing|     Perng|     F|1986-11-16|
| 10170|1960-10-03|   Kasturi|  Jenevein|     F|1986-01-02|
| 10190|1964-12-11|      Arve|Fairtlough|     F|1986-06-23|
| 10220|1958-05-25|      Kish| Fasbender|     F|1992-06-25|
| 10260|1961-07-14|     Alper|     Suomi|     F|1991-04-13|
| 10270|1963-01-30|     Bedir|Hartvigsen|     F|1990-04-26|
| 10300|1960-07-12|  Tadahiko|  Ulupinar|     F|1991-05-17|
| 10320|1956-06-22|     Uinam| Stasinski

In [9]:
# Same employees table, read through the generic "jdbc" data source with all
# connection settings supplied as reader options.
jdbc_read_options = {
    "driver": driver,
    "url": url,
    "user": user,
    "password": password,
    "dbtable": "public.employees",
}
employees_df = spark.read.format("jdbc").options(**jdbc_read_options).load()


In [10]:
# department_df = spark.read (dept_no, dept_name) // 200
#
# employees_df. \
#     groupBy("dept_no"). \
#     count(). \
#     join(department_df, col("dept_no") == col("dept_no"),  "inner")

# Solution1 UDF
#

# Report how many partitions the JDBC read produced.
print("GET NUM PARTITIONS")
partition_count = employees_df.rdd.getNumPartitions()
print(partition_count)


employees_df.show()

# Persist the employees as a table bucketed into 10 buckets by emp_no, sorted
# by emp_no within each bucket, replacing any existing table of that name.
bucketed_writer = (
    employees_df.write
    .bucketBy(10, "emp_no")
    .sortBy("emp_no")
    .mode("overwrite")
)
bucketed_writer.saveAsTable("employee_bucketed")
# employees_df.write.mode("overwrite").save() Parquet



GET NUM PARTITIONS
1
+------+----------+----------+-------------+------+----------+
|emp_no|birth_date|first_name|    last_name|gender| hire_date|
+------+----------+----------+-------------+------+----------+
| 10010|1963-06-01| Duangkaew|     Piveteau|     F|1989-08-24|
| 10020|1952-12-24|    Mayuko|      Warwick|     M|1991-01-26|
| 10030|1958-07-14|     Elvis|      Demeyer|     M|1994-02-17|
| 10040|1959-09-13|     Weiyi|      Meriste|     F|1993-02-14|
| 10050|1958-05-21|   Yinghua|       Dredge|     M|1990-12-25|
| 10060|1961-10-15|  Breannda|  Billingsley|     M|1987-11-02|
| 10070|1955-08-20|    Reuven|   Garigliano|     M|1985-10-14|
| 10080|1957-12-03|    Premal|         Baek|     M|1985-11-19|
| 10090|1961-05-30|    Kendra|      Hofting|     M|1986-03-14|
| 10100|1953-04-21|  Hironobu|    Haraldson|     F|1987-09-21|
| 10110|1957-03-07|    Xuejia|       Ullian|     F|1986-08-22|
| 10120|1960-03-26|    Armond|   Fairtlough|     F|1996-07-06|
| 10130|1955-04-27|    Nishit|    

# queue_kafka

In [None]:
# Schema of the JSON payload carried in the Kafka message value.
schema = StructType([
    StructField("timestamp", StringType()),
    StructField("page", StringType())
])

# TODO NOT WORKING HERE TOO
# NOTE(review): presumably the Kafka connector (spark-sql-kafka-0-10) is not
# on the session classpath - confirm and add it next to the JDBC jar.


# source_batch_df = spark.read\
#     .format("kafka")\
#     .option("kafka.bootstrap.servers", "localhost:29092")\
#     .option("subscribe", "input")\
#     .load()
#
# print(source_batch_df.isStreaming)
#
# source_batch_df.show()


# Subscribe to the "input" topic as an unbounded streaming DataFrame.
source_streaming_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:29092")
    .option("subscribe", "input")
    .load()
)

print(source_streaming_df.isStreaming)

# Decode the message value to a string, parse it as JSON with `schema`, and
# reformat the event timestamp down to its time-of-day part.
typed_source_streaming_df = (
    source_streaming_df
    .select(expr("cast(value as string) as actualValue"))
    .select(from_json(col("actualValue"), schema).alias("page"))
    .selectExpr("page.timestamp as timestamp", "page.page as page")
    .select(
        date_format(
            to_timestamp(col("timestamp"), "dd-MM-yyyy HH:mm:ss:SSS"),
            "HH:mm:ss:SSS",
        ).alias("time"),
        col("page"),
    )
)

# Print every micro-batch of the parsed stream. show() has to be CALLED inside
# the callback; a bare `batch_df.show` only references the method and prints
# nothing. awaitTermination() blocks this cell until the query is stopped.
(
    typed_source_streaming_df
    .writeStream
    .outputMode("append")
    .foreachBatch(lambda batch_df, batch_id: batch_df.show())
    .trigger(processingTime='3 seconds')
    .start()
    .awaitTermination()
)


Exercise: read the movies DF, then write it as
- tab-separated "CSV"
- parquet
- table "public.movies" in the Postgres DB

Exercise #2: find a way to read the people-1m DataFrame, then write it as JSON.