In [1]:
%%configure
{ "conf": {
            "spark.jars":"hdfs:///apps/hudi/lib/hudi-spark-bundle.jar,hdfs:///apps/hudi/lib/spark-avro.jar",
            "spark.serializer":"org.apache.spark.serializer.KryoSerializer",
            "spark.sql.hive.convertMetastoreParquet":"false"
          }}

In [2]:
%%html 
<img src="/ProductReviewsProcessingRepo/images/hudi_demo_diagram.png">

In [None]:
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions._
import org.apache.hudi.DataSourceWriteOptions
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.hive.MultiPartKeysValueExtractor

In [13]:
// Table-level settings shared by every Hudi write in this notebook.
val hudiTableName = "amazon_product_reviews"
val hudiTableRecordKey = "review_id"
val hudiTablePrecombineKey = "timestamp"
val hudiTablePath = "s3://hocanint-reinvent-2019-demo-outputs/createdatasets/" + hudiTableName
val hudiTablePartitionColumn = "review_date"

// Common DataSourceWriteOptions, collected once so each write cell can pass them with .options(hudiOptions).
val hudiOptions = Map[String,String](
  HoodieWriteConfig.TABLE_NAME -> hudiTableName,
  //For this data set, we will configure it to use the Merge on Read storage strategy.
  DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY -> "MERGE_ON_READ",
  // Use the variable declared above instead of repeating the literal, so the record key is defined in one place.
  DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY -> hudiTableRecordKey,
  DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY -> hudiTablePrecombineKey,
  DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY -> hudiTablePartitionColumn,
  // Hive sync is switched off here; the two Hive-related options below are intentionally left commented out.
  DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY -> "false"
  //DataSourceWriteOptions.HIVE_ASSUME_DATE_PARTITION_OPT_KEY -> "false",
  //DataSourceWriteOptions.HIVE_TABLE_OPT_KEY -> hudiTableName
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

hudiTableName: String = amazon_product_reviews
hudiTableRecordKey: String = review_id
hudiTablePrecombineKey: String = timestamp
hudiTablePath: String = s3://hocanint-reinvent-2019-demo-outputs/createdatasets/amazon_product_reviews
hudiTablePartitionColumn: String = review_date
hudiOptions: scala.collection.immutable.Map[String,String] = Map(hoodie.datasource.write.precombine.field -> timestamp, hoodie.datasource.hive_sync.enable -> false, hoodie.datasource.write.recordkey.field -> review_id, hoodie.table.name -> amazon_product_reviews, hoodie.datasource.write.storage.type -> MERGE_ON_READ, hoodie.datasource.write.partitionpath.field -> review_date)


In [5]:
// Load the raw product reviews: a gzipped, tab-separated file whose first row is the header.
val df = (spark.read
  .options(Map("sep" -> "\t", "header" -> "true"))
  .csv("s3://amazon-reviews-pds/tsv/amazon_reviews_us_Home_Improvement_v1_00.tsv.gz"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

df: org.apache.spark.sql.DataFrame = [marketplace: string, customer_id: string ... 13 more fields]


In [6]:
// Prepare the frame for Hudi:
//  - stamp every row with the current transaction time (stored in the precombine column), and
//  - rewrite the partition column so its "-" separators become "/".
val inputdf = {
  val slashSeparatedDate = regexp_replace(col(hudiTablePartitionColumn), "-", "/")
  df
    .withColumn(hudiTablePrecombineKey, current_timestamp())
    .withColumn(hudiTablePartitionColumn, slashSeparatedDate)
}

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

inputdf: org.apache.spark.sql.DataFrame = [marketplace: string, customer_id: string ... 14 more fields]


In [None]:
// Initial load: bulk-insert the prepared input dataset into the Hudi table,
// overwriting whatever is currently stored at hudiTablePath.
{
  // Shared table options plus the operation for this particular write.
  val bulkInsertOptions = hudiOptions +
    (DataSourceWriteOptions.OPERATION_OPT_KEY -> DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL)

  val bulkInsertWriter = inputdf.write
    .format("org.apache.hudi")
    .options(bulkInsertOptions)
    .mode(SaveMode.Overwrite)

  bulkInsertWriter.save(hudiTablePath)
}

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
/** **********************************
Let's look at the distribution of star ratings, to spot a product that some of my
consumers may be having an issue with.
**************************************/
// createOrReplaceTempView replaces the deprecated registerTempTable and can be re-run safely in the same session.
inputdf.createOrReplaceTempView("amazon_product_reviews_raw_ro_table")
// spark.sql only builds the query; .show() is what actually runs it and prints the counts.
spark.sql("select star_rating, count(*) from amazon_product_reviews_raw_ro_table group by star_rating").show()

In [10]:
/** *********************************
Select the rows we want to update and make the update.
************************************/
// The reviews were read from CSV without schema inference, so every column (including
// customer_id and star_rating) is a string. Compare against a string literal and write the
// new rating as the string "0" (lit) rather than the integer that expr("0") would produce,
// so the upsert frame keeps the same column types as the data already written to the table.
// NOTE(review): the int-vs-string mismatch is a plausible cause of the HoodieUpsertException
// seen when this frame was upserted -- confirm by re-running the upsert cell.
val upsertdf = inputdf.filter($"customer_id" === "17767084").withColumn("star_rating", lit("0"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

upsertdf: org.apache.spark.sql.DataFrame = [marketplace: string, customer_id: string ... 14 more fields]


In [12]:
// Previously, updating data in S3 meant reading the old data, merging in the new data and
// overwriting the old data. With Hudi the changed rows are upserted into the existing table.
{
  // The operation is listed first and the shared options are layered on top of it,
  // which is the same order in which the writer received them before.
  val upsertOptions =
    Map(DataSourceWriteOptions.OPERATION_OPT_KEY -> DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL) ++ hudiOptions

  val upsertWriter = upsertdf.write
    .format("org.apache.hudi")
    .options(upsertOptions)
    .mode(SaveMode.Append)

  upsertWriter.save(hudiTablePath)
}

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 31.0 failed 4 times, most recent failure: Lost task 0.3 in stage 31.0 (TID 24833, ip-172-31-22-141.ec2.internal, executor 11): org.apache.hudi.exception.HoodieUpsertException: Error upserting bucketType UPDATE for partition :0
	at org.apache.hudi.table.HoodieCopyOnWriteTable.handleUpsertPartition(HoodieCopyOnWriteTable.java:264)
	at org.apache.hudi.HoodieWriteClient.lambda$upsertRecordsInternal$507693af$1(HoodieWriteClient.java:428)
	at org.apache.spark.api.java.JavaRDDLike$$anonfun$mapPartitionsWithIndex$1.apply(JavaRDDLike.scala:102)
	at org.apache.spark.api.java.JavaRDDLike$$anonfun$mapPartitionsWithIndex$1.apply(JavaRDDLike.scala:102)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$25.apply(RDD.scala:853)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$25.apply(RDD.scala:853)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD

In [None]:
/** ********************************
Suppose that we wanted to know what a review looked like from a certain point in time onwards.
Hudi allows that: specify a begin instant and the incremental view will read only the
records written after it.
************************************/
// Imported here so this cell is self-contained.
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import org.apache.hudi.DataSourceReadOptions

// The begin instant is passed as a plain string option, so build it on the driver instead of
// as a Spark Column. Here: 24 hours ago, formatted as yyyyMMddHHmmss.
// NOTE(review): assumes Hudi commit instants use this format and the driver's default time zone -- confirm.
val readFromTime = LocalDateTime.now().minusDays(1).format(DateTimeFormatter.ofPattern("yyyyMMddHHmmss"))

// Scala syntax: `read` and the DataSourceReadOptions constants are referenced without "()",
// and the val definition is not wrapped in parentheses (only the chained expression is).
// NOTE(review): check that the Hudi version on the cluster supports the incremental view for
// MERGE_ON_READ tables through the Spark datasource.
val amazon_product_reviews_table = (spark.read
     .format("org.apache.hudi")
     .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY,
             DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL)
     .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, readFromTime)
     .options(hudiOptions)
     .load(hudiTablePath))

// customer_id is a string column, so compare against a string literal.
amazon_product_reviews_table.filter($"customer_id" === "17767084").show()

In [None]:
/*******************************
Hudi also provides a Read Optimized table.
********************************/
// The partition column was rewritten with "/" separators before the table was written, hence
// the multi-level glob below the table path when loading.
val roViewDF = spark.read.format("org.apache.hudi").load(hudiTablePath + "/*/*/*/*")
// createOrReplaceTempView replaces the deprecated registerTempTable and can be re-run safely in the same session.
roViewDF.createOrReplaceTempView("amazon_product_reviews_ro_table")

spark.sql("select review_id, product_title, star_rating from amazon_product_reviews_ro_table where customer_id = '17767084'").show()

In [None]:
/** *******************************
Now, suppose a customer asks us to delete their information under GDPR.
***********************************/
// Re-read the customer's rows from the raw file and give them the same shape as the rows that
// were written to the table:
//  - add the precombine column named in hudiOptions, which the raw TSV does not contain, and
//  - rewrite the partition column with "/" separators, as was done before the original write.
// NOTE(review): presumably Hudi needs the partition path to match the stored one in order to
// locate the records to delete -- confirm against the index type in use.
val deleteRowsDf = (spark.read.option("sep", "\t").option("header", "true")
        .csv("s3://amazon-reviews-pds/tsv/amazon_reviews_us_Home_Improvement_v1_00.tsv.gz")
        .filter($"customer_id" === "17767084")
        .withColumn(hudiTablePrecombineKey, current_timestamp())
        .withColumn(hudiTablePartitionColumn, regexp_replace(col(hudiTablePartitionColumn), "-", "/")))

//Deletion: an upsert whose payload class is EmptyHoodieRecordPayload.
// Wrapped in parentheses like the other write cells so the chained calls form one expression.
(deleteRowsDf.write
  .format("org.apache.hudi")
  .option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
  .option(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY, "org.apache.hudi.EmptyHoodieRecordPayload")
  .options(hudiOptions)
  .mode(SaveMode.Append)
  .save(hudiTablePath))

In [None]:
/***********************************
At this point, I am going to switch to SQL Developer and run Hive queries to show that the data has been changed.
************************************/