# Utility Functions

In [None]:
import glob
import itertools
import os
import posixpath

import boto3

# Shared S3 client used by upload(); created once when this cell runs, with
# no explicit region or credentials passed here.
CLIENT = boto3.client('s3')

def upload(table, bucket, path):
    """Recursively upload the local directory ``table`` to an S3 bucket.

    Every entry directly inside ``table`` (hidden entries included) is
    visited in sorted order. Regular files are sent with
    ``CLIENT.upload_file`` under the key ``<path>/<file name>``; any other
    entry is treated as a sub-directory and uploaded recursively under the
    prefix ``<path>/<entry name>``.

    Args:
        table: Path of an existing local directory.
        bucket: Name of the destination S3 bucket.
        path: Key prefix under which the directory contents are stored.

    Raises:
        NotADirectoryError: If ``table`` is not a directory. This also
            triggers for nested entries that are neither regular files nor
            directories (e.g. broken symlinks).
    """
    # Explicit check instead of ``assert`` so validation survives ``python -O``.
    if not os.path.isdir(table):
        raise NotADirectoryError(f'not a directory: {table!r}')
    # os.listdir returns hidden entries too and, unlike glob, does not
    # interpret characters such as ``[`` or ``*`` in the directory name.
    for name in sorted(os.listdir(table)):
        item = os.path.join(table, name)
        # S3 keys always use forward slashes, regardless of the local OS.
        key = posixpath.join(path, name)
        if os.path.isfile(item):
            CLIENT.upload_file(item, bucket, key)
        else:
            upload(item, bucket, key)

# Setup Spark Session

In [None]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# Build a session named 'CreateDeltaTables' that pulls in the Delta Lake
# package and registers Delta's SQL extension and catalog implementation.
builder = (
    SparkSession.builder
    .appName('CreateDeltaTables')
    .config('spark.jars.packages', 'io.delta:delta-core_2.12:2.2.0')
    .config('spark.sql.extensions', 'io.delta.sql.DeltaSparkSessionExtension')
    .config(
        'spark.sql.catalog.spark_catalog',
        'org.apache.spark.sql.delta.catalog.DeltaCatalog',
    )
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Load CSVs

In [None]:
# Read the avocado CSV, taking column names from the header row and letting
# Spark infer each column's type.
avocado = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('../../data/avocado.csv')
)

In [None]:
# Preview the first 5 rows in vertical layout without truncating values.
avocado.show(n=5, truncate=False, vertical=True)

# Create Versions

In [None]:
# One DataFrame per year; each becomes a separate write (and thus a separate
# table version) in the Delta table created below.
avocado_2015, avocado_2016, avocado_2017, avocado_2018 = (
    avocado.filter(avocado.year == year)
    for year in ('2015', '2016', '2017', '2018')
)

# Print a single sample row from each yearly slice as a sanity check.
for yearly in (avocado_2015, avocado_2016, avocado_2017, avocado_2018):
    yearly.show(n=1, truncate=False, vertical=True)

# Create Delta Tables

In [None]:
# The first write creates the Delta table (no explicit save mode is set).
avocado_2015.write.format('delta').save('../../data/avocado-table')

# Each later year is appended to the same table path, in chronological order.
for later_year in (avocado_2016, avocado_2017, avocado_2018):
    (
        later_year.write
        .mode('append')
        .format('delta')
        .save('../../data/avocado-table')
    )

# Upload Tables to S3

In [None]:
# Push the local Delta table directory to the 'kotosiro-sharing-example'
# bucket under the 'avocado' key prefix.
upload('../../data/avocado-table/', 'kotosiro-sharing-example', 'avocado')