# Spark sample showing read/write methods
In this sample notebook, we will read a CSV file from HDFS, write it out as a Parquet file, and save a Hive table definition. We will also run some Spark SQL commands against the Hive table.


In [4]:
%%configure -f
{"executorMemory": "58g", "executorCores": 30, "numExecutors":4}


In [5]:
# Print every (key, value) pair of this session's Spark configuration, sorted.
# The public SparkContext.getConf() accessor is used instead of reaching into
# the private `sc._conf` attribute; it returns a copy holding the same pairs.
for item in sorted(sc.getConf().getAll()):
    print(item)
# vanilla (author's note; entries seen in the original run):
#('spark.livy.spark_major_version', '2')
#('spark.master', 'yarn')

In [8]:
# WARNING (original author): this variant doesn't work -- don't use it.
# The read cells in this notebook pass `customSchemaint` instead.
# jvv: Meet's schema, adjusted to UC's sql<->parquet data type mappings for
# things like INTs and BITs. In this variant SQL BIT columns are declared as
# booleans (BooleanType).
# NOTE(review): presumably the failure is that the CSV stores BIT values as
# 0/1, which the CSV reader does not parse as booleans -- TODO confirm.
from pyspark.sql.types import *

# Ordered (column name, Spark type class) pairs, one per CSV column.
# The order is significant: the CSV is read without a header row.
_SOURCE_COLUMNS_BOOL = [
    ("id", LongType),
    ("chunkid", IntegerType),
    ("coord_ra", DoubleType),
    ("coord_decl", DoubleType),
    ("coord_htmId20", LongType),
    ("parent", LongType),
    ("flags_badcentroid", BooleanType),
    ("centroid_sdss_x", DoubleType),
    ("centroid_sdss_y", DoubleType),
    ("centroid_sdss_xVar", DoubleType),
    ("centroid_sdss_xyCov", DoubleType),
    ("centroid_sdss_yVar", DoubleType),
    ("centroid_sdss_flags", BooleanType),
    ("flags_pixel_edge", BooleanType),
    ("flags_pixel_interpolated_any", BooleanType),
    ("flags_pixel_interpolated_center", BooleanType),
    ("flags_pixel_saturated_any", BooleanType),
    ("flags_pixel_saturated_center", BooleanType),
    ("flags_pixel_cr_any", BooleanType),
    ("flags_pixel_cr_center", BooleanType),
    ("centroid_naive_x", DoubleType),
    ("centroid_naive_y", DoubleType),
    ("centroid_naive_xVar", DoubleType),
    ("centroid_naive_xyCov", DoubleType),
    ("centroid_naive_yVar", DoubleType),
    ("centroid_naive_flags", BooleanType),
    ("centroid_gaussian_x", DoubleType),
    ("centroid_gaussian_y", DoubleType),
    ("centroid_gaussian_xVar", DoubleType),
    ("centroid_gaussian_xyCov", DoubleType),
    ("centroid_gaussian_yVar", DoubleType),
    ("centroid_gaussian_flags", BooleanType),
    ("shape_sdss_Ixx", DoubleType),
    ("shape_sdss_Iyy", DoubleType),
    ("shape_sdss_Ixy", DoubleType),
    ("shape_sdss_IxxVar", DoubleType),
    ("shape_sdss_IxxIyyCov", DoubleType),
    ("shape_sdss_IxxIxyCov", DoubleType),
    ("shape_sdss_IyyVar", DoubleType),
    ("shape_sdss_IyyIxyCov", DoubleType),
    ("shape_sdss_IxyVar", DoubleType),
    ("shape_sdss_flags", BooleanType),
    ("shape_sdss_centroid_x", DoubleType),
    ("shape_sdss_centroid_y", DoubleType),
    ("shape_sdss_centroid_xVar", DoubleType),
    ("shape_sdss_centroid_xyCov", DoubleType),
    ("shape_sdss_centroid_yVar", DoubleType),
    ("shape_sdss_centroid_flags", BooleanType),
    ("shape_sdss_flags_unweightedbad", BooleanType),
    ("shape_sdss_flags_unweighted", BooleanType),
    ("shape_sdss_flags_shift", BooleanType),
    ("shape_sdss_flags_maxiter", BooleanType),
    ("flux_psf", DoubleType),
    ("flux_psf_err", DoubleType),
    ("flux_psf_flags", BooleanType),
    ("flux_psf_psffactor", DoubleType),
    ("flux_psf_flags_psffactor", BooleanType),
    ("flux_psf_flags_badcorr", BooleanType),
    ("flux_naive", DoubleType),
    ("flux_naive_err", DoubleType),
    ("flux_naive_flags", BooleanType),
    ("flux_gaussian", DoubleType),
    ("flux_gaussian_err", DoubleType),
    ("flux_gaussian_flags", BooleanType),
    ("flux_gaussian_psffactor", DoubleType),
    ("flux_gaussian_flags_psffactor", BooleanType),
    ("flux_gaussian_flags_badcorr", BooleanType),
    ("flux_sinc", DoubleType),
    ("flux_sinc_err", DoubleType),
    ("flux_sinc_flags", BooleanType),
    ("centroid_record_x", DoubleType),
    ("centroid_record_y", DoubleType),
    ("classification_extendedness", DoubleType),
    ("aperturecorrection", DoubleType),
    ("aperturecorrection_err", DoubleType),
    ("refFlux", DoubleType),
    ("refFlux_err", DoubleType),
    ("objectId", LongType),
    ("coord_raVar", DoubleType),
    ("coord_radeclCov", DoubleType),
    ("coord_declVar", DoubleType),
    ("exposure_id", LongType),
    ("exposure_filter_id", IntegerType),
    ("exposure_time", DoubleType),
    ("exposure_time_mid", DoubleType),
    ("cluster_id", LongType),
    ("cluster_coord_ra", DoubleType),
    ("cluster_coord_decl", DoubleType),
]

# Source: every column is declared nullable.
customSchemabool = StructType(
    [
        StructField(column_name, spark_type(), True)
        for column_name, spark_type in _SOURCE_COLUMNS_BOOL
    ]
)


In [3]:
# sue: THIS IS THE ONE THAT WORKS
# jvv: Meet's schema, adjusted to UC's sql<->parquet data type mappings for
# things like INTs and BITs. In this variant SQL BIT columns are declared as
# 32-bit integers (IntegerType) rather than booleans.
from pyspark.sql.types import *

# Ordered (column name, Spark type class) pairs, one per CSV column.
# The order is significant: the CSV is read without a header row.
_SOURCE_COLUMNS_INT = [
    ("id", LongType),
    ("chunkid", IntegerType),
    ("coord_ra", DoubleType),
    ("coord_decl", DoubleType),
    ("coord_htmId20", LongType),
    ("parent", LongType),
    ("flags_badcentroid", IntegerType),
    ("centroid_sdss_x", DoubleType),
    ("centroid_sdss_y", DoubleType),
    ("centroid_sdss_xVar", DoubleType),
    ("centroid_sdss_xyCov", DoubleType),
    ("centroid_sdss_yVar", DoubleType),
    ("centroid_sdss_flags", IntegerType),
    ("flags_pixel_edge", IntegerType),
    ("flags_pixel_interpolated_any", IntegerType),
    ("flags_pixel_interpolated_center", IntegerType),
    ("flags_pixel_saturated_any", IntegerType),
    ("flags_pixel_saturated_center", IntegerType),
    ("flags_pixel_cr_any", IntegerType),
    ("flags_pixel_cr_center", IntegerType),
    ("centroid_naive_x", DoubleType),
    ("centroid_naive_y", DoubleType),
    ("centroid_naive_xVar", DoubleType),
    ("centroid_naive_xyCov", DoubleType),
    ("centroid_naive_yVar", DoubleType),
    ("centroid_naive_flags", IntegerType),
    ("centroid_gaussian_x", DoubleType),
    ("centroid_gaussian_y", DoubleType),
    ("centroid_gaussian_xVar", DoubleType),
    ("centroid_gaussian_xyCov", DoubleType),
    ("centroid_gaussian_yVar", DoubleType),
    ("centroid_gaussian_flags", IntegerType),
    ("shape_sdss_Ixx", DoubleType),
    ("shape_sdss_Iyy", DoubleType),
    ("shape_sdss_Ixy", DoubleType),
    ("shape_sdss_IxxVar", DoubleType),
    ("shape_sdss_IxxIyyCov", DoubleType),
    ("shape_sdss_IxxIxyCov", DoubleType),
    ("shape_sdss_IyyVar", DoubleType),
    ("shape_sdss_IyyIxyCov", DoubleType),
    ("shape_sdss_IxyVar", DoubleType),
    ("shape_sdss_flags", IntegerType),
    ("shape_sdss_centroid_x", DoubleType),
    ("shape_sdss_centroid_y", DoubleType),
    ("shape_sdss_centroid_xVar", DoubleType),
    ("shape_sdss_centroid_xyCov", DoubleType),
    ("shape_sdss_centroid_yVar", DoubleType),
    ("shape_sdss_centroid_flags", IntegerType),
    ("shape_sdss_flags_unweightedbad", IntegerType),
    ("shape_sdss_flags_unweighted", IntegerType),
    ("shape_sdss_flags_shift", IntegerType),
    ("shape_sdss_flags_maxiter", IntegerType),
    ("flux_psf", DoubleType),
    ("flux_psf_err", DoubleType),
    ("flux_psf_flags", IntegerType),
    ("flux_psf_psffactor", DoubleType),
    ("flux_psf_flags_psffactor", IntegerType),
    ("flux_psf_flags_badcorr", IntegerType),
    ("flux_naive", DoubleType),
    ("flux_naive_err", DoubleType),
    ("flux_naive_flags", IntegerType),
    ("flux_gaussian", DoubleType),
    ("flux_gaussian_err", DoubleType),
    ("flux_gaussian_flags", IntegerType),
    ("flux_gaussian_psffactor", DoubleType),
    ("flux_gaussian_flags_psffactor", IntegerType),
    ("flux_gaussian_flags_badcorr", IntegerType),
    ("flux_sinc", DoubleType),
    ("flux_sinc_err", DoubleType),
    ("flux_sinc_flags", IntegerType),
    ("centroid_record_x", DoubleType),
    ("centroid_record_y", DoubleType),
    ("classification_extendedness", DoubleType),
    ("aperturecorrection", DoubleType),
    ("aperturecorrection_err", DoubleType),
    ("refFlux", DoubleType),
    ("refFlux_err", DoubleType),
    ("objectId", LongType),
    ("coord_raVar", DoubleType),
    ("coord_radeclCov", DoubleType),
    ("coord_declVar", DoubleType),
    ("exposure_id", LongType),
    ("exposure_filter_id", IntegerType),
    ("exposure_time", DoubleType),
    ("exposure_time_mid", DoubleType),
    ("cluster_id", LongType),
    ("cluster_coord_ra", DoubleType),
    ("cluster_coord_decl", DoubleType),
]

# Source: every column is declared nullable.
customSchemaint = StructType(
    [
        StructField(column_name, spark_type(), True)
        for column_name, spark_type in _SOURCE_COLUMNS_INT
    ]
)


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
35,application_1580142637008_0036,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# sue's test, aimed at producing parquet files close to 1GB in size:
# load the comma-separated test CSV directory using the integer-flag schema.
dfsue = (
    spark.read
    .format("csv")
    .option("sep", ",")
    .schema(customSchemaint)
    .load('/LSST/csv/test/Source')
)
# Display the first two rows only.
dfsue.show(2)

In [3]:
# Earlier load attempts, kept for reference. Several of them pass a
# `customSchema` name that is not defined in the visible cells of this notebook.
#df1 = spark.read.load('/LSST/Source/csv/', format="csv", sep=';', schema=customSchema, header="true")
#df1 = spark.read.load('/LSST/Source/csv/Source_8945.csv', format="csv", sep=';' , schema=customSchema)
#df1 = spark.read.load('/LSST/Source/csv/Source_8945.csv', format="csv", sep=';', inferSchema="true", header="true")
#df1 = spark.read.load('/LSST/Source/csv/Source_8945.csv', format="csv", sep=';', schema=customSchema, header="true")
#df1 = spark.read.load('/LSST/csv/Source', format="csv", sep=',', schema=customSchemaint)
#df1 = spark.read.load('/LSST/sue/csv/Source/Source_9659.csv', format="csv", sep=',', inferSchema="true")
#df1 = spark.read.load('/LSST/jvv/Source_9659_id-chunkid-coord_ra.csv', format="csv", sep=',', schema=customSchema)
#df1 = spark.read.load('/LSST/jvv/csv/Source', format="csv", sep=',', schema=customSchema)
#df1 = spark.read.load('/LSST/jvv/test', format="csv", sep=',', schema=customSchema)

# Active load: one comma-separated Source CSV file, parsed with the
# integer-flag schema (no header option is set).
df1 = (
    spark.read
    .format("csv")
    .option("sep", ",")
    .schema(customSchemaint)
    .load('/LSST/csv/Source/Source_9659.csv')
)

# Aris sample, not identical to Meet's:
# results = spark.read.option("inferSchema", "true").csv('/clickstream_data').toDF(
#             "wcs_click_date_sk", "wcs_click_time_sk", "wcs_sales_sk", "wcs_item_sk", "wcs_web_page_sk", "wcs_user_sk"
#             )

#df1.printSchema()
# Display the first two rows only.
df1.show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Save dfsue as the table "source_test_parquet" in parquet format, replacing
# any existing table. coalesce(12) is applied to the DataFrame before writing.
(
    dfsue.coalesce(12)
    .write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("source_test_parquet")
)

In [5]:
# Alternatives kept for reference.
# Meet's version, writing parquet files straight to a path:
#df1.coalesce(350).write.parquet("/user/hive/warehouse/source/", mode='overwrite')
# Aris sample:
# results.write.format("parquet").mode("overwrite").saveAsTable("web_clickstreams")

# Active version: no coalesce, so this writes the default number of files
# (lots of small files). An existing "Source" table is overwritten.
(
    df1.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("Source")
)


In [4]:
# Timed attempt to write 6000 parquet files from the CSV-backed df1.
import datetime

before = datetime.datetime.now()

# coalesce(6000), then write parquet directly to the path below, replacing any
# existing output there.
# NOTE(review): coalesce does not increase the partition count; if df1 has
# fewer than 6000 partitions this will not yield 6000 files -- confirm.
(
    df1.coalesce(6000)
    .write
    .parquet("/user/hive/warehouse/source_new", mode='overwrite')
)
## 15h  (author's note; presumably the observed run time -- confirm)

after = datetime.datetime.now()
elapsed = after - before
print(elapsed)

In [11]:
# Print the schema of df1 (the DataFrame loaded from the Source CSV).
df1.printSchema()

In [1]:
# Disable saving SUCCESS file
# (there is no public PySpark accessor for the Hadoop configuration, hence `sc._jsc`)
sc._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")

# Print the current warehouse directory where the parquet files will be stored
print(spark.conf.get("spark.sql.warehouse.dir"))

# Load the clickstream CSV data that this cell saves. No earlier cell defines
# `results` (the load only appears inside a quoted sample), so without this the
# write below raises NameError on a fresh kernel.
# NOTE(review): assumes '/clickstream_data' exists on this cluster -- confirm.
results = spark.read.option("inferSchema", "true").csv('/clickstream_data').toDF(
    "wcs_click_date_sk", "wcs_click_time_sk", "wcs_sales_sk",
    "wcs_item_sk", "wcs_web_page_sk", "wcs_user_sk"
)

# Save results as parquet file and create hive table
results.write.format("parquet").mode("overwrite").saveAsTable("web_clickstreams")


In [3]:
import datetime

# Queries tried earlier, kept for reference:
#sqlDF = spark.sql("SELECT * FROM Source LIMIT 100")
#sqlDF = spark.sql("select min(coord_ra) from Source")
#sqlDF = spark.sql("SELECT count(*)  FROM Source where flux_sinc between 1 and 1.1")
#sqlDF = spark.sql("select * from sourcesue limit 100")

# Query currently being timed.
source_query = "select id from source limit 10"

# Time both the query definition and the show() call.
before = datetime.datetime.now()
sqlDF = spark.sql(source_query)
sqlDF.show()
after = datetime.datetime.now()
print(after - before)

# Query from the clickstream sample, kept for reference:
#sqlDF = spark.sql("SELECT wcs_user_sk, COUNT(*)\
#                     FROM web_clickstreams\
#                    WHERE wcs_user_sk IS NOT NULL\
#                   GROUP BY wcs_user_sk\
#                   ORDER BY COUNT(*) DESC LIMIT 100")
#sqlDF.show()

In [6]:
# sue's attempt: load a whole directory of parquet files into one DataFrame.
# No format is given, so the reader falls back to the session's default source.
df = spark.read.load(path="/user/hive/warehouse/source_new")
# Alternative directory tried earlier:
#df = spark.read.load("/user/hive/warehouse/sourcesue")

In [8]:
# Register df as the table "source". Format and save mode are spelled out to
# match the other saveAsTable cells in this notebook. Without an explicit mode
# the write fails when the table already exists, which is the case after the
# earlier cell that saves df1 as "Source".
df.write.format("parquet").mode("overwrite").saveAsTable("source")

In [4]:
import datetime

# Time rewriting df as the parquet table "source_new" after coalesce(6).
# NOTE(review): df is loaded from /user/hive/warehouse/source_new; if the
# warehouse directory is /user/hive/warehouse, this overwrite targets the same
# location that is being read -- confirm this is safe before running.
before = datetime.datetime.now()

compacted = df.coalesce(6)
compacted.write.format("parquet").mode("overwrite").saveAsTable("source_new")

after = datetime.datetime.now()
print(after - before)


In [16]:
import datetime
import time  # was missing: time.sleep() below raised NameError

# Sanity check of the before/after timing pattern used in this notebook:
# sleep for one second and print the measured elapsed time.
before = datetime.datetime.now()
time.sleep(1)
after = datetime.datetime.now()
print(after - before)

In [1]:
import pyspark

# start
# Creates a SparkContext with default arguments and rebinds `sc`, the same name
# other cells use for the session's context.
# NOTE(review): presumably a start/stop experiment; if the session already has
# an active context, constructing a second one may fail -- confirm.
sc = pyspark.SparkContext()

#stop
# After this call `sc` refers to a stopped context.
sc.stop()

In [1]:
# Load the product reviews CSV files into a Spark DataFrame with schema
# inference enabled, name the two columns, then print the schema and top rows.
review_columns = ("pr_review_sk", "pr_review_content")
review_reader = spark.read.option("inferSchema", "true")
results = review_reader.csv('/product_review_data').toDF(*review_columns)

results.printSchema()
results.show()

In [1]:
# Persist `results` in parquet format as the Hive table "product_reviews",
# overwriting the table if it already exists.
(
    results.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("product_reviews")
)


In [1]:
# Run a Spark SQL query against the product_reviews table: review key plus the
# character length of the review text, limited to 100 rows.
review_length_query = "SELECT pr_review_sk, CHAR_LENGTH(pr_review_content) as len FROM product_reviews LIMIT 100"
sqlDF = spark.sql(review_length_query)
sqlDF.show()

In [10]:
# Clean up: drop the Source table. IF EXISTS keeps the cell re-runnable; a
# plain DROP TABLE raises an error when the table has already been dropped.
spark.sql('DROP TABLE IF EXISTS Source')