In [None]:
import sys
!conda install --yes --prefix {sys.prefix} boto3
!{sys.executable} -m pip install git+git://github.com/moj-analytical-services/etl_manager.git#egg=etl_manager

In [1]:
# Remove metastore and data used in this example (Want to start a blank slate)
import os
import shutil

if os.path.isdir('metastore_db') :
    shutil.rmtree('metastore_db')

if os.path.isfile('derby.log') :
    os.remove('derby.log')
    
if os.path.isdir('data/hive_df') :
    shutil.rmtree('hive_df')

if os.path.isdir('data/non_hive_df') :
    shutil.rmtree('non_hive_df')

In [2]:
# Attempt to combine the two

import boto3
import os
# from dataengineeringutils.utils import read_json

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, functions as F
from pyspark.storagelevel import StorageLevel

# Import own function library
from pyspark.sql.types import *

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.amazonaws:aws-java-sdk:1.10.34,org.apache.hadoop:hadoop-aws:2.6.0 pyspark-shell'

session = boto3.Session()
credentials = session.get_credentials()


spark = SparkSession \
    .builder \
    .master("local[*]") \
    .config("hive.exec.dynamic.partition", "true") \
    .config("hive.exec.dynamic.partition.mode", "nonstrict") \
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem") \
    .config("spark.hadoop.fs.s3.awsAccessKeyId", credentials.access_key) \
    .config("spark.hadoop.fs.s3.awsSecretAccessKey", credentials.secret_key) \
    .enableHiveSupport() \
    .getOrCreate()
    
os.getcwd()

'/home/jovyan/Documents/projects/spark_testing'

## Simple example of overwrite differences

Differences between HIVE and normal table in spark

Some useful resources:

- https://docs.databricks.com/spark/latest/spark-sql/language-manual/insert.html
- https://medium.com/@anuvrat/writing-into-dynamic-partitions-using-spark-2e2b818a007a

In [3]:
### CREATE HIVE TABLE (with one row)
spark.sql("""
CREATE TABLE IF NOT EXISTS hive_df (col1 INT, col2 STRING, partition_bin INT)
USING HIVE OPTIONS(fileFormat 'PARQUET')
PARTITIONED BY (partition_bin)
LOCATION 'data/hive_df'
""")
spark.sql("""
INSERT INTO hive_df PARTITION (partition_bin = 0)
VALUES (0, 'init_record')
""")
###

### CREATE NON HIVE TABLE (with one row)
spark.sql("""
CREATE TABLE IF NOT EXISTS non_hive_df (col1 INT, col2 STRING, partition_bin INT)
USING PARQUET
PARTITIONED BY (partition_bin)
LOCATION 'data/non_hive_df'
""")
spark.sql("""
INSERT INTO non_hive_df PARTITION (partition_bin = 0)
VALUES (0, 'init_record')
""")
###

### ATTEMPT DYNAMIC OVERWRITE WITH EACH TABLE
spark.sql("""
INSERT OVERWRITE TABLE hive_df PARTITION (partition_bin)
VALUES (0, 'new_record', 1)
""")
spark.sql("""
INSERT OVERWRITE TABLE non_hive_df PARTITION (partition_bin)
VALUES (0, 'new_record', 1)
""")

spark.sql("SELECT * FROM hive_df").show() # 2 row dynamic overwrite
spark.sql("SELECT * FROM non_hive_df").show() # 1 row full table overwrite

+----+-----------+-------------+
|col1|       col2|partition_bin|
+----+-----------+-------------+
|   0|init_record|            0|
|   0|init_record|            0|
|   0|init_record|            0|
|   0| new_record|            1|
|   0| new_record|            1|
+----+-----------+-------------+

+----+----------+-------------+
|col1|      col2|partition_bin|
+----+----------+-------------+
|   0|new_record|            1|
+----+----------+-------------+



## More messing about 

In [4]:
# Test new update method
df_partitions = ['partition_bin']
in_schema = StructType([StructField('id', IntegerType(), True), StructField('dummy_contents', StringType())])
db_schema = StructType([StructField('partition_bin', IntegerType(), True), StructField('id', IntegerType(), True), StructField('dummy_contents', StringType(), True), StructField('dea_record_start_date', IntegerType(), True), StructField('dea_record_end_date', IntegerType(), True)])

# Day one we get our first set of data
data_day_1 = [
    (1, 'a'),
    (2, 'a'),
    (3, 'a'),
    (4, 'a'),
    (5, 'a'),
    (6, 'a'),
    (7, 'a'),
    (8, 'a'),
    (9, 'a')]
df1 = spark.createDataFrame(data_day_1, in_schema)
df1 = df1.withColumn('partition_bin', F.floor(df1['id']/5))
df1.createOrReplaceTempView('df1')

# Data 2 we mostly get new data with some old data and updated records
data_day_2 = [
    (9, 'a'),
    (9, 'b'),
    (10, 'a'),
    (11, 'a'),
    (12, 'a'),
    (13, 'a')]
df2 = spark.createDataFrame(data_day_2, in_schema)
df2 = df2.withColumn('partition_bin', F.floor(df2['id']/5))
df2.createOrReplaceTempView('df2')

In [5]:
print(spark.catalog.listDatabases())
print(spark.catalog.listTables())

[Database(name='default', description='Default Hive database', locationUri='file:/home/jovyan/Documents/projects/spark_testing/spark-warehouse')]
[Table(name='hive_df', database='default', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='non_hive_df', database='default', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='df1', database=None, description=None, tableType='TEMPORARY', isTemporary=True), Table(name='df2', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]


In [7]:
# Create a table that enables HIVE


spark.sql("""
CREATE TABLE IF NOT EXISTS df_test
USING HIVE OPTIONS(fileFormat 'PARQUET')
PARTITIONED BY (partition_bin)
LOCATION 'data/df_test'
AS SELECT * FROM df1
""")

# Insert df into parition static partition insert
spark.sql("""
INSERT INTO df_test PARTITION (partition_bin = 0)
SELECT id, dummy_contents FROM df1 where partition_bin = 0
""")

DataFrame[]

In [8]:
spark.sql("SELECT * FROM df_test").show()

+---+--------------+-------------+
| id|dummy_contents|partition_bin|
+---+--------------+-------------+
|  7|             a|            1|
|  8|             a|            1|
|  9|             a|            1|
|  1|             a|            0|
|  2|             a|            0|
|  3|             a|            0|
|  4|             a|            0|
|  1|             a|            0|
|  2|             a|            0|
|  3|             a|            0|
|  4|             a|            0|
|  3|             a|            0|
|  4|             a|            0|
|  1|             a|            0|
|  2|             a|            0|
|  5|             a|            1|
|  6|             a|            1|
|  9|             a|            1|
|  9|             b|            1|
+---+--------------+-------------+



In [9]:
# Insert overwrite to static partion 
## Basically we have just inserted df1 into df_test
spark.sql("""
INSERT OVERWRITE TABLE df_test PARTITION (partition_bin = 1)
SELECT id, dummy_contents FROM df1 where partition_bin = 1
""")

DataFrame[]

In [10]:
spark.sql("SELECT * FROM df_test").show()

+---+--------------+-------------+
| id|dummy_contents|partition_bin|
+---+--------------+-------------+
|  7|             a|            1|
|  8|             a|            1|
|  9|             a|            1|
|  1|             a|            0|
|  2|             a|            0|
|  3|             a|            0|
|  4|             a|            0|
|  1|             a|            0|
|  2|             a|            0|
|  3|             a|            0|
|  4|             a|            0|
|  3|             a|            0|
|  4|             a|            0|
|  1|             a|            0|
|  2|             a|            0|
|  5|             a|            1|
|  6|             a|            1|
+---+--------------+-------------+



In [11]:
## Dynamic partition overwrite (as table was defined as HIVE)
# Only overwrites partition_bin = 1 or 2 (as these are the only values in df2)
# partition with partition_bin = 0 remains unchanged
spark.sql("""
INSERT OVERWRITE TABLE df_test PARTITION (partition_bin)
SELECT id, dummy_contents, partition_bin FROM df2
""")

DataFrame[]

In [12]:
spark.sql("SELECT * FROM df_test").show()

+---+--------------+-------------+
| id|dummy_contents|partition_bin|
+---+--------------+-------------+
|  1|             a|            0|
|  2|             a|            0|
|  3|             a|            0|
|  4|             a|            0|
|  1|             a|            0|
|  2|             a|            0|
|  3|             a|            0|
|  4|             a|            0|
|  3|             a|            0|
|  4|             a|            0|
|  1|             a|            0|
|  2|             a|            0|
| 12|             a|            2|
| 13|             a|            2|
| 12|             a|            2|
| 13|             a|            2|
|  9|             a|            1|
|  9|             b|            1|
| 11|             a|            2|
| 11|             a|            2|
+---+--------------+-------------+
only showing top 20 rows

