# Spark sample showing read/write methods
In this sample notebook, we will read CSV file from HDFS, write it as parquet file and save a Hive table definition. We will also run some Spark SQL commands using the Hive table.


In [1]:
%%configure -f
{"executorMemory": "12g", "executorCores": 4, "numExecutors":11}


In [1]:
for item in sorted(sc._conf.getAll()): print(item)
# vanilla:
#('spark.livy.spark_major_version', '2')
#('spark.master', 'yarn')

In [1]:
# jvv: based on Meet's example

from pyspark.sql.types import *

# Source:
customSchema= StructType([ 
      StructField("id",LongType(), True),
      StructField("chunkid",LongType(), True),
      StructField("coord_ra",DoubleType(), True),
      StructField("coord_decl",DoubleType(), True),
      StructField("coord_htmId20",LongType(), True),
      StructField("parent",LongType(), True),
      StructField("flags_badcentroid",IntegerType(), True),
      StructField("centroid_sdss_x",DoubleType(), True),
      StructField("centroid_sdss_y",DoubleType(), True),
      StructField("centroid_sdss_xVar",DoubleType(), True),
      StructField("centroid_sdss_xyCov",DoubleType(), True),
      StructField("centroid_sdss_yVar",DoubleType(), True),
      StructField("centroid_sdss_flags",IntegerType(), True),
      StructField("flags_pixel_edge",IntegerType(), True),
      StructField("flags_pixel_interpolated_any",IntegerType(), True),
      StructField("flags_pixel_interpolated_center",IntegerType(), True),
      StructField("flags_pixel_saturated_any",IntegerType(), True),
      StructField("flags_pixel_saturated_center",IntegerType(), True),
      StructField("flags_pixel_cr_any",IntegerType(), True),
      StructField("flags_pixel_cr_center",IntegerType(), True),
      StructField("centroid_naive_x",DoubleType(), True),
      StructField("centroid_naive_y",DoubleType(), True),
      StructField("centroid_naive_xVar",DoubleType(), True),
      StructField("centroid_naive_xyCov",DoubleType(), True),
      StructField("centroid_naive_yVar",DoubleType(), True),
      StructField("centroid_naive_flags",IntegerType(), True),
      StructField("centroid_gaussian_x",DoubleType(), True),
      StructField("centroid_gaussian_y",DoubleType(), True),
      StructField("centroid_gaussian_xVar",DoubleType(), True),
      StructField("centroid_gaussian_xyCov",DoubleType(), True),
      StructField("centroid_gaussian_yVar",DoubleType(), True),
      StructField("centroid_gaussian_flags",IntegerType(), True),
      StructField("shape_sdss_Ixx",DoubleType(), True),
      StructField("shape_sdss_Iyy",DoubleType(), True),
      StructField("shape_sdss_Ixy",DoubleType(), True),
      StructField("shape_sdss_IxxVar",DoubleType(), True),
      StructField("shape_sdss_IxxIyyCov",DoubleType(), True),
      StructField("shape_sdss_IxxIxyCov",DoubleType(), True),
      StructField("shape_sdss_IyyVar",DoubleType(), True),
      StructField("shape_sdss_IyyIxyCov",DoubleType(), True),
      StructField("shape_sdss_IxyVar",DoubleType(), True),
      StructField("shape_sdss_flags",IntegerType(), True),
      StructField("shape_sdss_centroid_x",DoubleType(), True),
      StructField("shape_sdss_centroid_y",DoubleType(), True),
      StructField("shape_sdss_centroid_xVar",DoubleType(), True),
      StructField("shape_sdss_centroid_xyCov",DoubleType(), True),
      StructField("shape_sdss_centroid_yVar",DoubleType(), True),
      StructField("shape_sdss_centroid_flags",IntegerType(), True),
      StructField("shape_sdss_flags_unweightedbad",IntegerType(), True),
      StructField("shape_sdss_flags_unweighted",IntegerType(), True),
      StructField("shape_sdss_flags_shift",IntegerType(), True),
      StructField("shape_sdss_flags_maxiter",IntegerType(), True),
      StructField("flux_psf",DoubleType(), True),
      StructField("flux_psf_err",DoubleType(), True),
      StructField("flux_psf_flags",IntegerType(), True),
      StructField("flux_psf_psffactor",DoubleType(), True),
      StructField("flux_psf_flags_psffactor",IntegerType(), True),
      StructField("flux_psf_flags_badcorr",IntegerType(), True),
      StructField("flux_naive",DoubleType(), True),
      StructField("flux_naive_err",DoubleType(), True),
      StructField("flux_naive_flags",IntegerType(), True),
      StructField("flux_gaussian",DoubleType(), True),
      StructField("flux_gaussian_err",DoubleType(), True),
      StructField("flux_gaussian_flags",IntegerType(), True),
      StructField("flux_gaussian_psffactor",DoubleType(), True),
      StructField("flux_gaussian_flags_psffactor",IntegerType(), True),
      StructField("flux_gaussian_flags_badcorr",IntegerType(), True),
      StructField("flux_sinc",DoubleType(), True),
      StructField("flux_sinc_err",DoubleType(), True),
      StructField("flux_sinc_flags",IntegerType(), True),
      StructField("centroid_record_x",DoubleType(), True),
      StructField("centroid_record_y",DoubleType(), True),
      StructField("classification_extendedness",DoubleType(), True),
      StructField("aperturecorrection",DoubleType(), True),
      StructField("aperturecorrection_err",DoubleType(), True),
      StructField("refFlux",DoubleType(), True),
      StructField("refFlux_err",DoubleType(), True),
      StructField("objectId",LongType(), True),
      StructField("coord_raVar",DoubleType(), True),
      StructField("coord_radeclCov",DoubleType(), True),
      StructField("coord_declVar",DoubleType(), True),
      StructField("exposure_id",LongType(), True),
      StructField("exposure_filter_id",LongType(), True),
      StructField("exposure_time",DoubleType(), True),
      StructField("exposure_time_mid",DoubleType(), True),
      StructField("cluster_id",LongType(), True),
      StructField("cluster_coord_ra",DoubleType(), True),
      StructField("cluster_coord_decl",DoubleType(), True)


]) 


In [1]:
# jvv: Meet's schema, tweaked for INTs:

from pyspark.sql.types import *

# Source:
customSchema= StructType([ 
      StructField("id",LongType(), True),
      StructField("chunkid",IntegerType(), True),
      StructField("coord_ra",DoubleType(), True),
      StructField("coord_decl",DoubleType(), True),
      StructField("coord_htmId20",LongType(), True),
      StructField("parent",LongType(), True),
      StructField("flags_badcentroid",IntegerType(), True),
      StructField("centroid_sdss_x",DoubleType(), True),
      StructField("centroid_sdss_y",DoubleType(), True),
      StructField("centroid_sdss_xVar",DoubleType(), True),
      StructField("centroid_sdss_xyCov",DoubleType(), True),
      StructField("centroid_sdss_yVar",DoubleType(), True),
      StructField("centroid_sdss_flags",IntegerType(), True),
      StructField("flags_pixel_edge",IntegerType(), True),
      StructField("flags_pixel_interpolated_any",IntegerType(), True),
      StructField("flags_pixel_interpolated_center",IntegerType(), True),
      StructField("flags_pixel_saturated_any",IntegerType(), True),
      StructField("flags_pixel_saturated_center",IntegerType(), True),
      StructField("flags_pixel_cr_any",IntegerType(), True),
      StructField("flags_pixel_cr_center",IntegerType(), True),
      StructField("centroid_naive_x",DoubleType(), True),
      StructField("centroid_naive_y",DoubleType(), True),
      StructField("centroid_naive_xVar",DoubleType(), True),
      StructField("centroid_naive_xyCov",DoubleType(), True),
      StructField("centroid_naive_yVar",DoubleType(), True),
      StructField("centroid_naive_flags",IntegerType(), True),
      StructField("centroid_gaussian_x",DoubleType(), True),
      StructField("centroid_gaussian_y",DoubleType(), True),
      StructField("centroid_gaussian_xVar",DoubleType(), True),
      StructField("centroid_gaussian_xyCov",DoubleType(), True),
      StructField("centroid_gaussian_yVar",DoubleType(), True),
      StructField("centroid_gaussian_flags",IntegerType(), True),
      StructField("shape_sdss_Ixx",DoubleType(), True),
      StructField("shape_sdss_Iyy",DoubleType(), True),
      StructField("shape_sdss_Ixy",DoubleType(), True),
      StructField("shape_sdss_IxxVar",DoubleType(), True),
      StructField("shape_sdss_IxxIyyCov",DoubleType(), True),
      StructField("shape_sdss_IxxIxyCov",DoubleType(), True),
      StructField("shape_sdss_IyyVar",DoubleType(), True),
      StructField("shape_sdss_IyyIxyCov",DoubleType(), True),
      StructField("shape_sdss_IxyVar",DoubleType(), True),
      StructField("shape_sdss_flags",IntegerType(), True),
      StructField("shape_sdss_centroid_x",DoubleType(), True),
      StructField("shape_sdss_centroid_y",DoubleType(), True),
      StructField("shape_sdss_centroid_xVar",DoubleType(), True),
      StructField("shape_sdss_centroid_xyCov",DoubleType(), True),
      StructField("shape_sdss_centroid_yVar",DoubleType(), True),
      StructField("shape_sdss_centroid_flags",IntegerType(), True),
      StructField("shape_sdss_flags_unweightedbad",IntegerType(), True),
      StructField("shape_sdss_flags_unweighted",IntegerType(), True),
      StructField("shape_sdss_flags_shift",IntegerType(), True),
      StructField("shape_sdss_flags_maxiter",IntegerType(), True),
      StructField("flux_psf",DoubleType(), True),
      StructField("flux_psf_err",DoubleType(), True),
      StructField("flux_psf_flags",IntegerType(), True),
      StructField("flux_psf_psffactor",DoubleType(), True),
      StructField("flux_psf_flags_psffactor",IntegerType(), True),
      StructField("flux_psf_flags_badcorr",IntegerType(), True),
      StructField("flux_naive",DoubleType(), True),
      StructField("flux_naive_err",DoubleType(), True),
      StructField("flux_naive_flags",IntegerType(), True),
      StructField("flux_gaussian",DoubleType(), True),
      StructField("flux_gaussian_err",DoubleType(), True),
      StructField("flux_gaussian_flags",IntegerType(), True),
      StructField("flux_gaussian_psffactor",DoubleType(), True),
      StructField("flux_gaussian_flags_psffactor",IntegerType(), True),
      StructField("flux_gaussian_flags_badcorr",IntegerType(), True),
      StructField("flux_sinc",DoubleType(), True),
      StructField("flux_sinc_err",DoubleType(), True),
      StructField("flux_sinc_flags",IntegerType(), True),
      StructField("centroid_record_x",DoubleType(), True),
      StructField("centroid_record_y",DoubleType(), True),
      StructField("classification_extendedness",DoubleType(), True),
      StructField("aperturecorrection",DoubleType(), True),
      StructField("aperturecorrection_err",DoubleType(), True),
      StructField("refFlux",DoubleType(), True),
      StructField("refFlux_err",DoubleType(), True),
      StructField("objectId",LongType(), True),
      StructField("coord_raVar",DoubleType(), True),
      StructField("coord_radeclCov",DoubleType(), True),
      StructField("coord_declVar",DoubleType(), True),
      StructField("exposure_id",LongType(), True),
      StructField("exposure_filter_id",IntegerType(), True),
      StructField("exposure_time",DoubleType(), True),
      StructField("exposure_time_mid",DoubleType(), True),
      StructField("cluster_id",LongType(), True),
      StructField("cluster_coord_ra",DoubleType(), True),
      StructField("cluster_coord_decl",DoubleType(), True)


]) 


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
2,application_1548715828284_0041,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.


In [1]:
# jvv: Meet's schema, tweaked for INTs:

from pyspark.sql.types import *

# Source:
customSchema= StructType([ 
      StructField("id",LongType(), True),
      StructField("coord_ra",DoubleType(), True)
]) 


In [1]:
#df1 = spark.read.load('/LSST/Source/csv/', format="csv", sep=';', schema=customSchema, header="true")
#df1 = spark.read.load('/LSST/Source/csv/Source_8945.csv', format="csv", sep=';' , schema=customSchema)
#df1 = spark.read.load('/LSST/Source/csv/Source_8945.csv', format="csv", sep=';', inferSchema="true", header="true")
#df1 = spark.read.load('/LSST/Source/csv/Source_8945.csv', format="csv", sep=';', schema=customSchema, header="true")
df1 = spark.read.load('/LSST/sue/csv/Source', format="csv", sep=',', schema=customSchema)
#df1 = spark.read.load('/LSST/sue/csv/Source/Source_9659.csv', format="csv", sep=',', inferSchema="true")
#df1 = spark.read.load('/LSST/sue/csv/Source/Source_9659.csv', format="csv", sep=',', schema=customSchema)
#df1 = spark.read.load('/LSST/jvv/Source_9659_id-chunkid-coord_ra.csv', format="csv", sep=',', schema=customSchema)
#df1 = spark.read.load('/LSST/jvv/csv/Source', format="csv", sep=',', schema=customSchema)
#df1 = spark.read.load('/LSST/jvv/test', format="csv", sep=',', schema=customSchema)

# Aris sample, not identical to Meet's:
'''
results = spark.read.option("inferSchema", "true").csv('/clickstream_data').toDF(
            "wcs_click_date_sk", "wcs_click_time_sk", "wcs_sales_sk", "wcs_item_sk", "wcs_web_page_sk", "wcs_user_sk"
            )
'''

#df1.printSchema()
df1.show()

+----------------+-------+------------------+------------------+--------------+------+-----------------+------------------+------------------+--------------------+-------------------+--------------------+-------------------+----------------+----------------------------+-------------------------------+-------------------------+----------------------------+------------------+---------------------+------------------+------------------+-------------------+--------------------+-------------------+--------------------+-------------------+-------------------+----------------------+-----------------------+----------------------+-----------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+---------------------+---------------------+------------------------+-------------------------+------------------------+------------------------

In [1]:
#df1.coalesce(350).write.parquet("/LSST/Source/parquet/", mode='overwrite')

# Meet above v. Aris sample:
# results.write.format("parquet").mode("overwrite").saveAsTable("web_clickstreams")
df1.write.format("parquet").mode("overwrite").saveAsTable("Sourcehiveint")

In [1]:
df1.printSchema()

root
 |-- id: long (nullable = true)
 |-- chunkid: integer (nullable = true)
 |-- coord_ra: double (nullable = true)
 |-- coord_decl: double (nullable = true)
 |-- coord_htmId20: long (nullable = true)
 |-- parent: long (nullable = true)
 |-- flags_badcentroid: integer (nullable = true)
 |-- centroid_sdss_x: double (nullable = true)
 |-- centroid_sdss_y: double (nullable = true)
 |-- centroid_sdss_xVar: double (nullable = true)
 |-- centroid_sdss_xyCov: double (nullable = true)
 |-- centroid_sdss_yVar: double (nullable = true)
 |-- centroid_sdss_flags: integer (nullable = true)
 |-- flags_pixel_edge: integer (nullable = true)
 |-- flags_pixel_interpolated_any: integer (nullable = true)
 |-- flags_pixel_interpolated_center: integer (nullable = true)
 |-- flags_pixel_saturated_any: integer (nullable = true)
 |-- flags_pixel_saturated_center: integer (nullable = true)
 |-- flags_pixel_cr_any: integer (nullable = true)
 |-- flags_pixel_cr_center: integer (nullable = true)
 |-- centroid_nai

In [1]:
# Disable saving SUCCESS file
sc._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") 

# Print the current warehouse directory where the parquet files will be stored
print(spark.conf.get("spark.sql.warehouse.dir"))

# Save results as parquet file and create hive table
results.write.format("parquet").mode("overwrite").saveAsTable("web_clickstreams")


hdfs:///user/hive/warehouse

In [1]:
# Execute Spark SQL commands
#sqlDF = spark.sql("SELECT * FROM Source LIMIT 100")
sqlDF = spark.sql("SELECT distinct(flags_badcentroid) FROM Sourcehive")
sqlDF.show()

#sqlDF = spark.sql("SELECT wcs_user_sk, COUNT(*)\
#                     FROM web_clickstreams\
#                    WHERE wcs_user_sk IS NOT NULL\
#                   GROUP BY wcs_user_sk\
#                   ORDER BY COUNT(*) DESC LIMIT 100")
#sqlDF.show()

+-----------------+
|flags_badcentroid|
+-----------------+
|                0|
+-----------------+

In [1]:
import pyspark

# start
sc = pyspark.SparkContext()

#stop
sc.stop()

Cannot run multiple SparkContexts at once; existing SparkContext(app=livy-session-1, master=yarn) created by __init__ at /tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1548715828284_0011/container_1548715828284_0011_01_000001/tmp/2953620281318378199:595 
Traceback (most recent call last):
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1548715828284_0011/container_1548715828284_0011_01_000001/pyspark.zip/pyspark/context.py", line 115, in __init__
    SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1548715828284_0011/container_1548715828284_0011_01_000001/pyspark.zip/pyspark/context.py", line 308, in _ensure_initialized
    callsite.function, callsite.file, callsite.linenum))
ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=livy-session-1, master=yarn) created by __init__ at /tmp/hadoop-root/nm-local-dir/usercache/root/

In [1]:
# Read the product reviews CSV files into a spark data frame, print schema & top rows
results = spark.read.option("inferSchema", "true").csv('/product_review_data').toDF(
            "pr_review_sk", "pr_review_content"
            )
results.printSchema()
results.show()

root
 |-- pr_review_sk: integer (nullable = true)
 |-- pr_review_content: string (nullable = true)

+------------+--------------------+
|pr_review_sk|   pr_review_content|
+------------+--------------------+
|       72621|Works fine. Easy ...|
|       89334|great product to ...|
|       89335|Next time will go...|
|       84259|Great Gift Great ...|
|       84398|After trip to Par...|
|       66434|Simply the best t...|
|       66501|This is the exact...|
|       66587|Not super magnet;...|
|       66680|Installed as bath...|
|       66694|Our home was buil...|
|       84489|Hi ;We are runnin...|
|       79052|Terra cotta is th...|
|       73034|One of my fingern...|
|       73298|We installed thes...|
|       66810|needed silicone c...|
|       66912|Great Gift Great ...|
|       67028|Laguiole knives a...|
|       89770|Good sound timers...|
|       84679|AWESOME FEEDBACK ...|
|       84953|love the retro gl...|
+------------+--------------------+
only showing top 20 rows

In [1]:
# Save results as parquet file and create hive table
results.write.format("parquet").mode("overwrite").saveAsTable("product_reviews")


In [1]:
# Execute Spark SQL commands
sqlDF = spark.sql("SELECT pr_review_sk, CHAR_LENGTH(pr_review_content) as len FROM product_reviews LIMIT 100")
sqlDF.show()

+------------+----+
|pr_review_sk| len|
+------------+----+
|       14868| 985|
|       14869|1601|
|       14875|1221|
|       14880| 665|
|       14886|  91|
|       14894| 697|
|       14899| 356|
|       14903|2361|
|       14908| 872|
|       14909|  74|
|       14917| 908|
|       14918|  50|
|       14919| 256|
|       14921| 723|
|       14925| 313|
|       14931|1304|
|       14939|1023|
|       14949| 552|
|       14954|2144|
|       14955| 123|
+------------+----+
only showing top 20 rows