In [13]:
from collections import namedtuple
from datetime import datetime, timedelta
from functools import reduce
from typing import List

from google.cloud import storage, bigquery
from google.cloud.exceptions import NotFound
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StringType

In [14]:
# Optional override for the run date. When it stays None the configuration
# cell falls back to datetime.today(); otherwise it must be a string in the
# '%Y-%m-%dT%H:%M:%S' format that the configuration cell parses.
# NOTE(review): presumably injected by a scheduler / parameterized run — confirm.
CURRENT_DATE_ARG = None

In [15]:
# Project passed as `parentProject` to the Spark session and to every BigQuery write.
PARENT_PROJECT = "cloud-macro-blip"
# Bucket holding the refined-zone parquet data; also used as the temporary GCS bucket for BigQuery writes.
BUCKET_NAME = "vivo-api-manager-gcs-sp"
# First path segment of every refined-zone object belonging to this pipeline.
ORIGIN = "vivo-b2b-movel"
# Qualifier placed before the "." in every BigQuery table name built by this notebook.
TABLE_PREFIX = "vivo_b2b_movel"

# Reference date of the run: the explicit override when one was supplied,
# otherwise "now" as reported by the machine running the notebook.
if CURRENT_DATE_ARG is None:
  CURRENT_DATE = datetime.today()
else:
  CURRENT_DATE = datetime.strptime(CURRENT_DATE_ARG, '%Y-%m-%dT%H:%M:%S')


In [16]:
# Fixed three-hour offset subtracted from the reference date.
# NOTE(review): this presumably converts a UTC timestamp to Brazilian local time (UTC-3) —
# confirm that CURRENT_DATE is really expressed in UTC at this point.
BRAZILIAN_TIMEDIFF = timedelta(hours=3)

# NOTE: this cell is not idempotent — re-running it without re-running the
# configuration cell subtracts the offset a second time.
CURRENT_DATE -= BRAZILIAN_TIMEDIFF

In [17]:
def is_midnight_hour(hour):
  """Return True when `hour` lies in the inclusive range 0..4, False otherwise."""
  return 0 <= hour <= 4

# A run whose (shifted) reference hour is between 00:00 and 04:59 is attributed
# to the previous day: move back one day and pin the time to 23:59:59
# (microseconds are left as they were).
if is_midnight_hour(CURRENT_DATE.hour):
  CURRENT_DATE = (CURRENT_DATE - timedelta(days=1)).replace(hour=23, minute=59, second=59)

In [18]:
# Spark session configured with the BigQuery connector jar, case-sensitive SQL
# and the America/Sao_Paulo session time zone.
spark = (
  SparkSession.builder
  .appName("refined_to_bigquery")
  .config('parentProject', PARENT_PROJECT)
  .config("spark.sql.caseSensitive", "True")
  .config('spark.sql.session.timeZone', 'America/Sao_Paulo')
  .config("spark.jars", "gs://spark-lib/bigquery/spark-3.3-bigquery-0.34.0.jar")
  .getOrCreate()
)

# Google Cloud clients, created with default constructor arguments.
gcsClient = storage.Client()
bqClient = bigquery.Client()

In [19]:
def get_data_path_in_refined_zone(datetime: datetime, origin: str, project_type: str, table_name: str):
  """Build the gs:// URI of one refined-zone data table.

  The URI has the shape
  gs://<BUCKET_NAME>/<origin>/refined-zone/data/<project_type>/<YYYYMM>/<table_name>.parquet
  where <YYYYMM> is the year and month of `datetime`.
  """
  year_month = datetime.strftime('%Y%m')
  data_root = f"gs://{BUCKET_NAME}/{origin}/refined-zone/data"
  return f"{data_root}/{project_type}/{year_month}/{table_name}.parquet"

# def get_funnel_path_in_refined_zone(datetime: datetime, origin: str, project_type: str, table_name: str):
#   return f"gs://{BUCKET_NAME}/{origin}/refined-zone/funnel/{project_type}/{datetime.strftime('%Y%m')}/{table_name}.parquet"

# def get_funnel_prefix_in_refined_zone(datetime: datetime, origin: str, project_type: str):
#   return f"{origin}/refined-zone/funnel/{project_type}/{datetime.strftime('%Y%m')}"

def get_data_prefix_in_refined_zone(datetime: datetime, origin: str, project_type: str):
  """Build the bucket-relative object prefix of one month of refined-zone data.

  The prefix has the shape <origin>/refined-zone/data/<project_type>/<YYYYMM>,
  where <YYYYMM> is the year and month of `datetime`. It carries no scheme,
  no bucket name and no trailing slash.
  """
  year_month = datetime.strftime('%Y%m')
  return f"{origin}/refined-zone/data/{project_type}/{year_month}"

def reduce_dataframe(dfs: list):
  """Union a list of DataFrames into a single DataFrame.

  An empty list yields an empty DataFrame created with a StringType schema.
  Otherwise the frames are combined left to right with `unionByName`,
  tolerating missing columns; a single-element list comes back as that
  element itself, because `reduce` never calls the combiner in that case.
  """
  if not dfs:
    return spark.createDataFrame([], StringType())

  def _union(left, right):
    return left.unionByName(right, allowMissingColumns=True)

  return reduce(_union, dfs)
  
def get_tmp_table_name(table_name: str):
  """Return the name of the staging table for `table_name`: <TABLE_PREFIX>.<table_name>_tmp."""
  return '{}.{}_tmp'.format(TABLE_PREFIX, table_name)

def get_full_table_name(table_name: str):
  """Return the qualified name of the target table: <TABLE_PREFIX>.<table_name>."""
  return '{}.{}'.format(TABLE_PREFIX, table_name)

In [20]:
bucket = gcsClient.get_bucket(BUCKET_NAME)

# chatbot_funnel_blobs = []
chatbot_data_blobs = []
checkout_data_blobs = []

# blobs = bucket.list_blobs(prefix=get_funnel_prefix_in_refined_zone(CURRENT_DATE, ORIGIN, "chatbot"))
# chatbot_funnel_blobs = [f"gs://{BUCKET_NAME}/{blob.name}" for blob in blobs if blob.name.endswith(".parquet/")]

# For each project type, keep only the objects whose name ends with ".parquet/"
# under the current month's prefix and turn them into gs:// URIs.
# NOTE(review): this presumably relies on directory placeholder objects existing
# for every parquet dataset — confirm against how the refined zone is written.
chatbot_prefix = get_data_prefix_in_refined_zone(CURRENT_DATE, ORIGIN, "chatbot")
chatbot_data_blobs = [
  f"gs://{BUCKET_NAME}/{entry.name}"
  for entry in bucket.list_blobs(prefix=chatbot_prefix)
  if entry.name.endswith(".parquet/")
]

checkout_prefix = get_data_prefix_in_refined_zone(CURRENT_DATE, ORIGIN, "checkout")
checkout_data_blobs = [
  f"gs://{BUCKET_NAME}/{entry.name}"
  for entry in bucket.list_blobs(prefix=checkout_prefix)
  if entry.name.endswith(".parquet/")
]

In [21]:
# Describes one column of a MERGE join key: the column name and whether the
# column may hold NULLs (nullable keys are compared through COALESCE).
JoinOnField = namedtuple('JoinOnField', 'field_name isNullable')

In [22]:
def _build_merge_query(full_table_name: str, tmp_table_name: str, fields: list, joinOn: list) -> str:
  """Build the MERGE statement that upserts the staging table into the target table."""

  def _key_predicate(key):
    # Nullable keys are compared through COALESCE so that two NULLs match each other.
    if key.isNullable:
      return f"(COALESCE(target.{key.field_name}, \"\") = COALESCE(origin.{key.field_name}, \"\"))"
    return f"target.{key.field_name} = origin.{key.field_name}"

  select = ", ".join(fields)
  joinCondition = " AND ".join(_key_predicate(key) for key in joinOn)
  update = ", ".join(f"target.{field} = origin.{field}" for field in fields)
  insertFields = ", ".join(fields)
  insertValues = ", ".join(f"origin.{field}" for field in fields)

  return f"""
        MERGE INTO {full_table_name} AS target
        USING (SELECT {select} FROM {tmp_table_name}) AS origin
        ON {joinCondition}
        WHEN MATCHED THEN 
          UPDATE SET {update}
        WHEN NOT MATCHED THEN
          INSERT ({insertFields})
          VALUES ({insertValues})
      """


def save_on_bigquery(df: DataFrame, table_name: str, fields: list, joinOn: List[JoinOnField]):
  """Write `df` to the BigQuery table `table_name`, creating it or upserting into it.

  If the target table does not exist yet, `df` is written straight to it.
  Otherwise `df` is written to a staging table (`<table>_tmp`), merged into the
  target with a MERGE statement, and the staging table is dropped afterwards.

  Args:
    df: DataFrame to persist.
    table_name: Unqualified table name; TABLE_PREFIX is added by the name helpers.
    fields: Column names to select, update and insert in the MERGE.
    joinOn: Columns forming the MERGE join key.

  Raises:
    ValueError: if the target table exists and `fields` or `joinOn` is empty,
      since the generated MERGE would not be valid SQL.
    Exception: any error other than "table not found" raised while looking up
      the target table, and any BigQuery job failure, is propagated.
  """
  tmp_table_name = get_tmp_table_name(table_name)
  full_table_name = get_full_table_name(table_name)

  # Only a genuine "not found" means the table has to be created. Any other
  # failure (permissions, network, quota) must not fall through to the
  # overwrite branch below, which would replace an existing table.
  try:
    bqClient.get_table(full_table_name)
    table_exists = True
  except NotFound:
    table_exists = False

  if not table_exists:
    df.write.format('bigquery')\
      .option("temporaryGcsBucket", BUCKET_NAME)\
      .option("table", full_table_name)\
      .option("parentProject", PARENT_PROJECT)\
      .mode("overwrite")\
      .save()
    return

  # Fail before touching BigQuery rather than after the staging table is written.
  if not fields or not joinOn:
    raise ValueError("save_on_bigquery needs at least one field and one join key to merge into an existing table")

  # Wait for the drop to finish: the staging write below appends, so a stale or
  # concurrently dropped staging table would corrupt the merge input.
  bqClient.query(f"DROP TABLE IF EXISTS {tmp_table_name}").result()

  df.write.format("bigquery")\
    .option("temporaryGcsBucket", BUCKET_NAME)\
    .option("table", tmp_table_name)\
    .option("parentProject", PARENT_PROJECT)\
    .mode("append")\
    .save()

  query = _build_merge_query(full_table_name, tmp_table_name, fields, joinOn)
  bqClient.query(query).result()
  print("Saved on BigQuery: ", table_name)

  # Also awaited, so the next call that reuses this staging name cannot race with the drop.
  bqClient.query(f"DROP TABLE {tmp_table_name}").result()

In [23]:
# Columns that identify one history row: used to de-duplicate each parquet
# dataset and, as non-nullable keys, to join in the MERGE.
history_key_columns = ["order_id", "product_type", "project_id"]

# Chatbot datasets are loaded first, then checkout datasets.
for history_table, parquet_uris in (
  ("chatbot_history", chatbot_data_blobs),
  ("checkout_history", checkout_data_blobs),
):
  for parquet_uri in parquet_uris:
    history_df = spark.read.parquet(parquet_uri)
    save_on_bigquery(
      history_df.dropDuplicates(history_key_columns),
      history_table,
      list(history_df.columns),
      [JoinOnField(column, False) for column in history_key_columns],
    )

# for funnel_blob in chatbot_funnel_blobs:
#   funnel_data = spark.read.parquet(funnel_blob)
#   save_on_bigquery(funnel_data.dropDuplicates(["order_id", "product_type", "project_id", "step"]), "chatbot_funnel", list(funnel_data.columns), [
#     JoinOnField("order_id", False),
#     JoinOnField("product_type", False),
#     JoinOnField("project_id", False),
#     JoinOnField("step", False),
#   ])

