# Spark sample showing read/write methods
In this sample notebook, we will read a CSV file from HDFS, write it out as a parquet file, and save a Hive table definition. We will also run some Spark SQL commands against the Hive table.


In [1]:
%%configure -f
{"executorMemory": "12g", "executorCores": 4, "numExecutors":11}


In [1]:
# Print every Spark configuration entry as a (key, value) tuple, sorted by key.
# The public SparkContext.getConf() accessor is used rather than the private
# `_conf` attribute; both expose the same getAll() list of pairs.
for item in sorted(sc.getConf().getAll()):
    print(item)
# vanilla:
#('spark.livy.spark_major_version', '2')
#('spark.master', 'yarn')

In [3]:
# jvv: Meet's schema, tweaked for UC's sql<->parquet data type mappings for types like INT and BIT
# SQL BIT as parquet booleans here
from pyspark.sql.types import *

# Source:
# Explicit schema for the CSV load: one nullable StructField per column, in file order.
# The two identifier columns are longs, the two flux columns are doubles, and the
# remaining flag / chunk columns are integers.
# NOTE(review): the flag* columns are IntegerType in this variant, not BooleanType.
customSchemaint = StructType(
    [
        StructField(column_name, column_type(), True)
        for column_name, column_type in [
            ("deepSourceId", LongType),
            ("scienceCcdExposureId", LongType),
            ("psfFlux", DoubleType),
            ("psfFluxSigma", DoubleType),
            ("flagBadMeasCentroid", IntegerType),
            ("flagPixEdge", IntegerType),
            ("flagPixInterpAny", IntegerType),
            ("flagPixInterpCen", IntegerType),
            ("flagPixSaturAny", IntegerType),
            ("flagPixSaturCen", IntegerType),
            ("flagBadPsfFlux", IntegerType),
            ("chunkId", IntegerType),
            ("subChunkId", IntegerType),
        ]
    ]
)


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1560370594170_0001,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.


In [6]:
# Earlier load attempts, kept for reference (different paths, separators and schema options):
#df1 = spark.read.load('/LSST/Source/csv/', format="csv", sep=';', schema=customSchema, header="true")
#df1 = spark.read.load('/LSST/Source/csv/Source_8945.csv', format="csv", sep=';' , schema=customSchema)
#df1 = spark.read.load('/LSST/Source/csv/Source_8945.csv', format="csv", sep=';', inferSchema="true", header="true")
#df1 = spark.read.load('/LSST/Source/csv/Source_8945.csv', format="csv", sep=';', schema=customSchema, header="true")
#df1 = spark.read.load('/LSST/sue/csv/Source/Source_9659.csv', format="csv", sep=',', inferSchema="true")
#df1 = spark.read.load('/LSST/sue/csv/Source/Source_9659.csv', format="csv", sep=',', schema=customSchemaint)
#df1 = spark.read.load('/LSST/jvv/Source_9659_id-chunkid-coord_ra.csv', format="csv", sep=',', schema=customSchema)
#df1 = spark.read.load('/LSST/jvv/csv/Source', format="csv", sep=',', schema=customSchema)
#df1 = spark.read.load('/LSST/jvv/test', format="csv", sep=',', schema=customSchema)

# Aris sample, not identical to Meet's:
#results = spark.read.option("inferSchema", "true").csv('/clickstream_data').toDF(
#            "wcs_click_date_sk", "wcs_click_time_sk", "wcs_sales_sk", "wcs_item_sk", "wcs_web_page_sk", "wcs_user_sk"
#            )

# Active load: ';'-separated CSV with header="true", read with the explicit
# customSchemaint schema instead of schema inference.
df1 = (
    spark.read
    .format("csv")
    .schema(customSchemaint)
    .options(sep=';', header="true")
    .load('/LSST/sue/csv/forcedsource')
)

#df1.printSchema()
# Print the first rows of the loaded frame as a text table.
df1.show()

+----------------+--------------------+------------+------------+-------------------+-----------+----------------+----------------+---------------+---------------+--------------+-------+----------+
|    deepSourceId|scienceCcdExposureId|     psfFlux|psfFluxSigma|flagBadMeasCentroid|flagPixEdge|flagPixInterpAny|flagPixInterpCen|flagPixSaturAny|flagPixSaturCen|flagBadPsfFlux|chunkId|subChunkId|
+----------------+--------------------+------------+------------+-------------------+-----------+----------------+----------------+---------------+---------------+--------------+-------+----------+
|3166692272243625|          6471250146| 19.31360054|38.270599365|                  0|          0|               0|               0|              0|              0|             0|      0|       746|
|3166692272243472|          6471250146|258.83999634|38.578201294|                  0|          0|               0|               0|              0|              0|             0|      0|       746|
|316669227

In [7]:
# Alternative kept for reference: write parquet files straight to a warehouse path.
#df1.coalesce(350).write.parquet("/user/hive/warehouse/forcedsource/", mode='overwrite')

# Meet above v. Aris sample:
# results.write.format("parquet").mode("overwrite").saveAsTable("web_clickstreams")

# Coalesce df1 to at most 350 partitions, then save it as the parquet table
# "forcedsource" in overwrite mode.
df1.coalesce(350).write.saveAsTable("forcedsource", format="parquet", mode="overwrite")

In [11]:
# Print the schema of df1 (the frame loaded from the forcedsource CSV path).
df1.printSchema()

In [1]:
# Disable saving SUCCESS file by turning off the committer's "mark successful jobs" flag
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")

# Print the current warehouse directory where the parquet files will be stored
warehouse_dir = spark.conf.get("spark.sql.warehouse.dir")
print(warehouse_dir)

# Save results as parquet file and create hive table
# NOTE(review): no earlier cell visible in this notebook assigns `results` (the
# clickstream read exists only as a quoted sample), so this line presumably raises
# NameError on a fresh top-to-bottom run -- confirm where `results` should come from.
results.write.saveAsTable("web_clickstreams", format="parquet", mode="overwrite")


In [19]:
import datetime

# Timestamp taken before the query is issued
before = datetime.datetime.now()

# Execute Spark SQL commands
#sqlDF = spark.sql("SELECT * FROM Source LIMIT 100")
flux_count_query = "SELECT count(*)  FROM Source where flux_sinc between 1 and 1.1"
sqlDF = spark.sql(flux_count_query)
sqlDF.show()

# Timestamp taken after show() returns, so the printed duration covers both
# spark.sql() and show().
after = datetime.datetime.now()
elapsed = after - before
print(elapsed)

# Clickstream aggregation from the original sample, kept for reference:
#sqlDF = spark.sql("SELECT wcs_user_sk, COUNT(*)\
#                     FROM web_clickstreams\
#                    WHERE wcs_user_sk IS NOT NULL\
#                   GROUP BY wcs_user_sk\
#                   ORDER BY COUNT(*) DESC LIMIT 100")
#sqlDF.show()

In [16]:
import datetime
import time  # needed for time.sleep below; the cell previously raised NameError on a fresh kernel

# Sanity check of the datetime-based timing pattern: sleep for one second and
# print the measured elapsed time as a timedelta.
before = datetime.datetime.now()
time.sleep(1)
after = datetime.datetime.now()
print(after - before)

In [1]:
import pyspark

# start
# Creates a SparkContext with default arguments and binds it to `sc`.
# NOTE(review): earlier cells use `sc` without defining it, so it is presumably
# provided by the session; this assignment rebinds that name -- confirm this is intended.
sc = pyspark.SparkContext()

#stop
# Stops the context that was just bound to `sc`.
# NOTE(review): any later cell that uses `sc` would then see a stopped context -- confirm.
sc.stop()

In [1]:
# Load the product review CSV data with schema inference enabled, rename the two
# columns, then print the schema and the top rows.
review_columns = ("pr_review_sk", "pr_review_content")
review_reader = spark.read.option("inferSchema", "true")
results = review_reader.csv('/product_review_data').toDF(*review_columns)

results.printSchema()
results.show()

In [1]:
# Write `results` out in parquet format as the table "product_reviews",
# using overwrite mode.
results.write.saveAsTable("product_reviews", format="parquet", mode="overwrite")


In [1]:
# Execute Spark SQL commands: select each review key with the character length
# of its review text, limited to 100 rows, and print the result.
review_length_query = "SELECT pr_review_sk, CHAR_LENGTH(pr_review_content) as len FROM product_reviews LIMIT 100"
sqlDF = spark.sql(review_length_query)
sqlDF.show()