In [None]:
%%configure
{
  "conf": {
    "spark.jars": "hdfs:///apps/hudi/lib/hudi-spark-bundle.jar,hdfs:///apps/hudi/lib/spark-avro.jar",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.sql.hive.convertMetastoreParquet": "false"
  }
}

In [None]:
// TODO: add a diagram here showing what this notebook sets out to achieve.

In [None]:
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions._
import org.apache.hudi.DataSourceWriteOptions
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.hive.MultiPartKeysValueExtractor

In [None]:
// Settings shared by every Hudi write in this notebook. They are gathered in
// the single hudiOptions map so the insert, upsert and delete cells all target
// the same table with the same key, precombine and partition configuration.
val hudiTableName = "amazon_product_reviews"
val hudiTableRecordKey = "review_id"
val hudiTablePrecombineKey = "timestamp"
val hudiTablePath = "s3://hocanint-reinvent-demo-outputs/createdatasets/" + hudiTableName
val hudiTablePartitionColumn = "review_date"

val hudiOptions = Map[String,String](
  HoodieWriteConfig.TABLE_NAME -> hudiTableName,
  DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY -> "COPY_ON_WRITE",
  // Use the named constant instead of repeating the "review_id" literal, so the
  // record key is defined in exactly one place.
  DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY -> hudiTableRecordKey,
  DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY -> hudiTablePrecombineKey,
  DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY -> hudiTablePartitionColumn,
  // Hive sync is switched off; the commented-out entries below are Hive sync
  // settings kept for reference.
  DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY -> "false"
  //DataSourceWriteOptions.HIVE_ASSUME_DATE_PARTITION_OPT_KEY -> "false",
  //DataSourceWriteOptions.HIVE_TABLE_OPT_KEY -> hudiTableName
)

In [None]:
/*
 * Load the raw product reviews (the Home Improvement file) from S3.
 * The file is gzipped and tab-separated, and its first row is read as the header.
 */
val df = (spark.read
  .options(Map("sep" -> "\t", "header" -> "true"))
  .csv("s3://amazon-reviews-pds/tsv/amazon_reviews_us_Home_Improvement_v1_00.tsv.gz"))

In [None]:
/*
 * Prepare the raw reviews for Hudi:
 *   1. add the precombine column, filled with the current timestamp;
 *   2. rewrite the partition column, replacing every "-" with "/".
 */
val inputdf = {
  val withTransactionTime = df.withColumn(hudiTablePrecombineKey, current_timestamp())
  val slashSeparatedDate = regexp_replace(col(hudiTablePartitionColumn), "-", "/")
  withTransactionTime.withColumn(hudiTablePartitionColumn, slashSeparatedDate)
}
inputdf.show()

In [None]:
/*
 * Initial load: write the prepared reviews to the Hudi table with an insert
 * operation. SaveMode.Overwrite is used, so re-running this cell rewrites the
 * table at hudiTablePath.
 */
(inputdf.write
  .format("org.apache.hudi")
  .option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
  .options(hudiOptions)
  .mode(SaveMode.Overwrite)
  .save(hudiTablePath))

// Expose the in-memory input DataFrame (not the Hudi table) to SQL as a baseline.
// createOrReplaceTempView replaces registerTempTable, which is deprecated since Spark 2.0.
inputdf.createOrReplaceTempView("amazon_product_reviews_raw_ro_table")
// spark.sql only builds a lazy DataFrame; .show() actually runs the query and
// prints the rows, matching the read-optimized query later in the notebook.
spark.sql("select * from amazon_product_reviews_raw_ro_table where customer_id = '17767084'").show()

In [None]:
/** **********************************
Let's look at a product that some of my consumers may be having an issue with.
**************************************/

// Graph a product's ratings by bucket. There will be a spike, i.e. some customer_id is somehow submitting star ratings of 100.


In [None]:
/*
 * Build the rows to upsert: every review written by customer 17767084, with
 * star_rating replaced by the value of the SQL expression "0".
 */
val upsertdf = (inputdf
  .where(col("customer_id") === 17767084)
  .withColumn("star_rating", expr("0")))

In [None]:
/*
 * Before, if you wanted to update data in S3, you had to read the old data,
 * merge it with the new data, and then overwrite the old data. With Hudi the
 * changed rows are simply written with the upsert operation in Append mode.
 */
// Upsert. The DataFrame is named `upsertdf` (all lower case) where it is
// defined; Scala identifiers are case-sensitive, so `upsertDf` does not resolve.
(upsertdf.write
  .format("org.apache.hudi")
  .option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
  .options(hudiOptions)
  .mode(SaveMode.Append)
  .save(hudiTablePath))

In [None]:
/*
 * Suppose we want to know which reviews changed after a certain point in time.
 * Hudi's incremental view takes a begin instant and returns only the records
 * written by commits made after it.
 */
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import org.apache.hudi.DataSourceReadOptions

// The begin instant must be a String in Hudi's commit-time format
// (yyyyMMddHHmmss), not a Spark Column expression. Start one day back.
val readFromTime = LocalDateTime.now().minusDays(1).format(DateTimeFormatter.ofPattern("yyyyMMddHHmmss"))

// In Scala, `spark.read` and the DataSourceReadOptions constants are
// parameterless members, so they are referenced without "()".
val amazon_product_reviews_table = (spark.read
  .format("org.apache.hudi")
  .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY, DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL)
  .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, readFromTime)
  .options(hudiOptions)
  .load(hudiTablePath))

amazon_product_reviews_table.filter($"customer_id" === 17767084).show()

In [None]:
/*
 * Hudi also provides a read-optimized view of the table.
 * The glob below descends four path levels under the table path; presumably
 * three partition directories (review_date rewritten with "/" separators)
 * plus the data files. Confirm against the actual table layout.
 */
val roViewDF = spark.read.format("org.apache.hudi").load(hudiTablePath + "/*/*/*/*")
// createOrReplaceTempView replaces registerTempTable, which is deprecated since Spark 2.0.
roViewDF.createOrReplaceTempView("amazon_product_reviews_ro_table")

spark.sql("select review_id, product_title, star_rating from amazon_product_reviews_ro_table where customer_id = '17767084'").show()

In [None]:
/*
 * Now suppose we need to delete a customer's information because the customer
 * made a GDPR request.
 *
 * The rows to delete are re-read from the raw file, so they get the same
 * preparation as the rows that were originally written:
 *   - the precombine column named in hudiOptions must exist on the rows;
 *   - the partition column must use "/" separators so each delete is routed to
 *     the partition path where the record was stored.
 */
val deleteRowsDf = (spark.read.option("sep", "\t").option("header", "true")
  .csv("s3://amazon-reviews-pds/tsv/amazon_reviews_us_Home_Improvement_v1_00.tsv.gz")
  .filter($"customer_id" === 17767084)
  .withColumn(hudiTablePrecombineKey, current_timestamp())
  .withColumn(hudiTablePartitionColumn, regexp_replace(col(hudiTablePartitionColumn), "-", "/")))

// Deletion: an upsert whose payload class is EmptyHoodieRecordPayload.
(deleteRowsDf.write
  .format("org.apache.hudi")
  .option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
  .option(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY, "org.apache.hudi.EmptyHoodieRecordPayload")
  .options(hudiOptions)
  .mode(SaveMode.Append)
  .save(hudiTablePath))

In [None]:
/***********************************
At this point, I am going to switch to SQL Developer and run Hive queries to show that the data has been changed.
************************************/