In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

from input_output_manager import IOManager

# Dataset identifiers; each is used as the last path component under
# "data_sample/" by transform_text_data_sql().
IMAGE_T = "image"
OBJECT_T = "object"

# Session attached to the Spark master at 192.168.1.2. The PostgreSQL JDBC jar
# is added to the driver classpath; the notebook reads from PostgreSQL over
# JDBC further down.
# NOTE(review): the jar path is relative — presumably resolved against the
# directory the notebook is started from; confirm.
spark = (
    SparkSession.builder
    .master("spark://192.168.1.2:7077")
    .config("spark.driver.extraClassPath", "jdbc_driver/postgresql-42.2.14.jar")
    .getOrCreate()
)

def transform_text_data_sql(data_type, base_dir="data_sample", session=None):
    """Load the JSON records of one dataset into a Spark DataFrame.

    Args:
        data_type: Name of the dataset; used as the last path component
            (for example ``IMAGE_T`` or ``OBJECT_T``).
        base_dir: Directory that holds the per-dataset JSON data. Defaults to
            ``"data_sample"``, the location that used to be hard-coded.
        session: Object exposing ``read.json(path)``, normally a
            ``SparkSession``. When ``None`` the notebook-level ``spark``
            session is used, which keeps existing one-argument calls working.

    Returns:
        Whatever ``session.read.json`` returns for
        ``"<base_dir>/<data_type>"`` (a Spark DataFrame when a real
        ``SparkSession`` is used).
    """
    # Only fall back to the global session when the caller did not supply one,
    # so the function can be exercised without a live cluster.
    active_session = spark if session is None else session
    # The target can be a directory of JSON files or a single file.
    path_object = f"{base_dir}/{data_type}"
    return active_session.read.json(path_object)

In [2]:
# Load the image dataset from JSON, then preview it: schema first, followed by
# the first two rows.
images_df = transform_text_data_sql(IMAGE_T)
images_df.printSchema()
images_df.show(n=2)

root
 |-- Id: string (nullable = true)
 |-- Release date: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- description: string (nullable = true)

+---------+---------------+-----------+--------------------+
|       Id|   Release date|       Type|         description|
+---------+---------------+-----------+--------------------+
|potw1319a|    13 May 2013|Observation|This Hubble image...|
| ann1801a|9 February 2018|    Artwork|The CAPjournal is...|
+---------+---------------+-----------+--------------------+



In [7]:
# Load the object dataset from JSON, then preview it: schema first, followed by
# the first two rows.
obj_df = transform_text_data_sql(OBJECT_T)
obj_df.printSchema()
obj_df.show(n=2)

root
 |-- Category: string (nullable = true)
 |-- Constellation: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Type: string (nullable = true)

+-------------+----------------+------------------+-----------+--------------------+
|     Category|   Constellation|          Distance|       Name|                Type|
+-------------+----------------+------------------+-----------+--------------------+
|    Cosmology|Piscis Austrinus|z=0.312 (redshift)|Abell S1077|Early Universe : ...|
|Illustrations|            null|              null| CAPjournal|         Unspecified|
+-------------+----------------+------------------+-----------+--------------------+



In [4]:
# Read the "images" table from the PostgreSQL database over JDBC.
# NOTE(review): no password option is set — presumably the server accepts this
# user without one; confirm. Host, database and user are hard-coded here and
# would be better supplied through configuration.
images_df_sql = (
    spark.read.format("jdbc")
    .option("url", "jdbc:postgresql://192.168.1.7:5432/spark-rw")
    .option("dbtable", "images")
    .option("user", "spark-rw")
    .load()
)

# Print the table using show()'s default row limit and truncation.
images_df_sql.show()


+---------+-----------------+------------+--------------------+
|       Id|     Release date|        Type|         description|
+---------+-----------------+------------+--------------------+
|potw1110a|     7 March 2011| Observation|The strange and i...|
|heic0207h|      5 June 2002| Observation|Two powerful came...|
| opo0647a|28 September 2006|     Collage|Just as we near t...|
|  ann0819|  24 October 2008|Photographic|Following a week ...|
|  ann0814|  15 October 2008|Photographic|At a teleconferen...|
| opo0928h| 10 November 2009|     Collage|In celebration of...|
|heic0506a|    25 April 2005| Observation|The graceful, win...|
|heic0206c|    30 April 2002| Observation|Resembling a nigh...|
|heic1206b|    17 April 2012|     Collage|From massive star...|
| opo1733b|  6 November 2017| Observation|As if this Hubble...|
|potw1237a|10 September 2012| Observation|This image portra...|
|potw1418a|       5 May 2014| Observation|This bundle of br...|
|heic0516a| 13 December 2005| Observatio

In [10]:
# Keep only the rows whose Type column equals "Observation" and print them.
images_df_sql.filter(images_df_sql["Type"] == "Observation").show()

+---------+-----------------+-----------+--------------------+
|       Id|     Release date|       Type|         description|
+---------+-----------------+-----------+--------------------+
|potw1110a|     7 March 2011|Observation|The strange and i...|
|heic0207h|      5 June 2002|Observation|Two powerful came...|
|heic0506a|    25 April 2005|Observation|The graceful, win...|
|heic0206c|    30 April 2002|Observation|Resembling a nigh...|
| opo1733b|  6 November 2017|Observation|As if this Hubble...|
|potw1237a|10 September 2012|Observation|This image portra...|
|potw1418a|       5 May 2014|Observation|This bundle of br...|
|heic0516a| 13 December 2005|Observation|This Hubble Space...|
| opo0635a|     31 July 2006|Observation|In the wake of In...|
| opo1602a|   8 January 2016|Observation|Astronomers have ...|
| opo1733a|     21 June 2019|Observation|Some asteroids fr...|
|potw1241a|   8 October 2012|Observation|The  Universe is ...|
|heic0301a| 20 February 2003|Observation|The Boomerang 