#### Importing commonly used functions...
* ezView()
* add_ingestion_date()
* re_arrange_partition_column()
* overwrite_partition()
* df_column_to_list()
* merge_delta_data()

In [0]:
from pyspark.sql import DataFrame
# Because display isn't github friendly
def ezView(df, n=5, m=8):
    """
    Print the first n rows and the first m columns of a DataFrame.

    Parameters:
    df (DataFrame): The DataFrame to display.
    n (int): Number of rows to display. Default is 5.
    m (int): Number of columns to display. Default is 8.

    Returns:
    None. The preview is printed through DataFrame.show().
    """
    # No clamping against df.count() / len(df.columns) is needed: slicing a
    # list past its end and limiting past the row count are both safe, and
    # skipping count() avoids an extra pass over the whole DataFrame just to
    # print a small preview.
    preview = df.select(df.columns[:m]).limit(n)

    # Pass n explicitly: show() without an argument prints at most 20 rows,
    # which would silently cut off previews requested with n > 20.
    preview.show(n)

In [0]:
from pyspark.sql.functions import current_timestamp
def add_ingestion_date(input_df):
  """
  Return a copy of input_df with an added "ingestion_date" column.

  The column is populated with current_timestamp().
  """
  return input_df.withColumn("ingestion_date", current_timestamp())

##### Makes sure a certain column is the last column

If you type:
```
re_arrange_partition_column(df, "race_id")
```
Then this function will rearrange the dataframe `df` so that `race_id` becomes the last column.

In [0]:
def re_arrange_partition_column(input_df, partition_column):
  """
  Return input_df with partition_column moved to the last position.

  Every other column keeps its original relative order.
  """
  leading_columns = [name for name in input_df.schema.names if name != partition_column]
  return input_df.select(leading_columns + [partition_column])

##### Incremental Update Partition

Incremental load logic
* works with cutover file (migration friendly)
* works with delta files
* will overwrite that specific partition if it already exists
* does not produce duplicates of data


In [0]:
def overwrite_partition(input_df, db_name, table_name, partition_column):
  """
  Write input_df into the table db_name.table_name with overwrite mode.

  If the table already exists the data is inserted into it; otherwise a new
  parquet table partitioned by partition_column is created.
  Sets spark.sql.sources.partitionOverwriteMode to "dynamic" on the session.
  """
  target_table = f"{db_name}.{table_name}"

  # The partition column is moved to the end before writing
  # (insertInto is position-based per the PySpark docs).
  reordered_df = re_arrange_partition_column(input_df, partition_column)
  spark.conf.set("spark.sql.sources.partitionOverwriteMode","dynamic")

  table_exists = spark._jsparkSession.catalog().tableExists(target_table)
  overwrite_writer = reordered_df.write.mode("overwrite")

  if not table_exists:
    # First load: create the partitioned table
    overwrite_writer.partitionBy(partition_column).format("parquet").saveAsTable(target_table)
    return

  overwrite_writer.insertInto(target_table)

##### Incremental Update Based on Distinct List of Values
* Pulls distinct values from a specified column
* Return all distinct values as a python list
* Used for incrementally updating a database

Not as readable as append method 

In [0]:
def df_column_to_list(input_df, column_name):
  """
  Return the distinct values of column_name in input_df as a Python list.

  The distinct rows are collected, so the result is built from everything
  collect() returns for that single column.
  """
  distinct_rows = input_df.select(column_name).distinct().collect()
  return [record[column_name] for record in distinct_rows]

#### Delta Table Merge
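The first time this runs there is no table yet, so the incoming data is simply written out as a new Delta table and nothing else needs to be done. On later runs the table already exists, so the incoming data is merged into it: matching rows are updated and rows that do not match are inserted.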

Source: S22:v146

In [0]:
def merge_delta_data(input_df, db_name, table_name, folder_path, merge_condition, partition_column):
    """
    Merge input_df into the Delta table db_name.table_name, creating it if needed.

    Parameters:
    input_df (DataFrame): The incoming data; aliased as "src" in the merge.
    db_name (str): Database that holds the target table.
    table_name (str): Name of the target table.
    folder_path (str): Folder that contains the table's data; the table is
        loaded from f"{folder_path}/{table_name}".
    merge_condition (str): Join condition for the merge. The existing table
        is aliased as "tgt" and input_df as "src".
    partition_column (str): Column used to partition the table when it is
        created for the first time.

    Returns:
    None.
    """
    # Enable dynamic partition pruning for optimization
    spark.conf.set("spark.databricks.optimizer.dynamicPartitionPruning", "true")

    from delta.tables import DeltaTable

    # Check if the table exists in the specified database
    if spark._jsparkSession.catalog().tableExists(f"{db_name}.{table_name}"):
        # Access the Delta table from the specified folder path
        deltaTable = DeltaTable.forPath(spark, f"{folder_path}/{table_name}")

        # Perform merge operation. The chain is wrapped in parentheses rather
        # than joined with backslashes: a backslash continuation cannot be
        # followed by a comment, so the per-step comments need this form.
        (
            deltaTable.alias("tgt")
            .merge(input_df.alias("src"), merge_condition)
            .whenMatchedUpdateAll()  # Update all matching records
            .whenNotMatchedInsertAll()  # Insert all records that do not match
            .execute()
        )
    else:
        # If the table doesn't exist, write the DataFrame as a new Delta table
        # with partitioning
        input_df.write.mode("overwrite").partitionBy(partition_column).format("delta").saveAsTable(f"{db_name}.{table_name}")