In [1]:
from datetime import datetime, timedelta
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import *
from pyspark.sql import functions as F
from google.cloud import storage, bigquery
from collections import namedtuple
from typing import List

In [2]:
# Optional override for the run timestamp, given as a '%Y-%m-%dT%H:%M:%S' string.
# When left as None the notebook falls back to the current time.
# NOTE(review): presumably injected by whatever schedules/parameterizes this notebook — confirm.
CURRENT_DATE_ARG = None

In [3]:
# Storage bucket, billing/parent project and the list of data origins to process.
BUCKET_NAME = "tim-ultrafibra-gcs-sp"
PARENT_PROJECT = "cloud-macro-tim"
ORIGINS = ["bot_tim"]

# Reference timestamp for the run: the explicit argument when one was given,
# otherwise "now".
# NOTE(review): datetime.today() is naive local time; the fixed offset applied
# in the next cell looks like it expects UTC — confirm the host time zone.
if CURRENT_DATE_ARG is None:
  CURRENT_DATE = datetime.today()
else:
  CURRENT_DATE = datetime.strptime(CURRENT_DATE_ARG, '%Y-%m-%dT%H:%M:%S')

In [4]:
# Fixed three-hour offset subtracted from the reference timestamp.
# NOTE(review): presumably converts UTC to Brazilian local time — confirm.
BRAZILIAN_TIMEDIFF = timedelta(hours=3)

# Caution: this rebinds CURRENT_DATE from its own previous value, so running
# this cell more than once shifts the timestamp by another three hours each time.
CURRENT_DATE -= BRAZILIAN_TIMEDIFF

In [5]:
def is_midnight_hour(hour):
  """Return True when `hour` lies in the inclusive range 0..4, False otherwise."""
  return 0 <= hour <= 4

# A run whose reference hour is between 00:00 and 04:59 is attributed to the
# previous day: the timestamp is moved back one day and pinned to 23:59:59
# (microseconds are left as they were).
if is_midnight_hour(CURRENT_DATE.hour):
  CURRENT_DATE = (CURRENT_DATE - timedelta(days=1)).replace(hour=23, minute=59, second=59)

In [6]:
# Spark session for this job: case-sensitive SQL, session time zone set to
# America/Sao_Paulo, and the BigQuery connector jar loaded from GCS.
spark = (
  SparkSession.builder
  .appName("tim_ultrafibra_curated_to_refined")
  .config('parentProject', PARENT_PROJECT)
  .config("spark.sql.caseSensitive", "True")
  .config('spark.sql.session.timeZone', 'America/Sao_Paulo')
  .config("spark.jars", "gs://spark-lib/bigquery/spark-3.3-bigquery-0.34.0.jar")
  .getOrCreate()
)

# Google Cloud clients used further down for blob listing and BigQuery DDL/DML.
gcsClient = storage.Client()
bqClient = bigquery.Client()

In [7]:
def get_path_in_refined_zone(datetime: datetime, origin: str, table_name: str):
  """Build the gs:// URI of a table's parquet location in the refined zone.

  The location is partitioned by origin and by the year-month (YYYYMM) of
  the given timestamp.
  """
  year_month = datetime.strftime('%Y%m')
  return f"gs://{BUCKET_NAME}/refined-zone/{origin}/{year_month}/{table_name}.parquet"

def get_prefix_in_refined(datetime: datetime, origin: str):
  """Build the bucket-relative prefix of an origin's refined-zone folder.

  The prefix ends with the year-month (YYYYMM) of the given timestamp and
  carries no trailing slash.
  """
  year_month = datetime.strftime('%Y%m')
  return f"refined-zone/{origin}/{year_month}"

def get_tmp_table_name(table_name: str):
  """Return the dataset-qualified name of the staging table for `table_name`."""
  staging_name = f'tim_ultrafibra.{table_name}_tmp'
  return staging_name

def get_full_table_name(table_name: str):
  """Return the dataset-qualified name of the target table for `table_name`."""
  qualified_name = f'tim_ultrafibra.{table_name}'
  return qualified_name

In [8]:
# Maps each origin to the gs:// URIs of the objects under its refined-zone
# prefix whose name ends with ".parquet/".
blobs_per_origin = {}

for origin_name in ORIGINS:
  blobs = gcsClient.list_blobs(
    BUCKET_NAME,
    prefix=get_prefix_in_refined(CURRENT_DATE, origin_name),
  )
  blobs_per_origin[origin_name] = [
    f"gs://{BUCKET_NAME}/{blob.name}"
    for blob in blobs
    if blob.name.endswith(".parquet/")
  ]

In [9]:
# Describes one column of a MERGE join key:
#   field_name - name of the column to join on
#   isNullable - whether the column may hold NULLs and needs a NULL-safe comparison
JoinOnField = namedtuple(
  'JoinOnField',
  [
    'field_name',
    'isNullable',
  ],
)

In [10]:
def _build_merge_query(full_table_name: str, tmp_table_name: str, fields: list, joinOn: List[JoinOnField]) -> str:
  """Render the MERGE statement that upserts the staging table into the target table."""
  select = ", ".join(fields)
  # Nullable key columns are compared through COALESCE(..., "") so that two
  # NULLs match each other (this also makes NULL compare equal to "").
  joinCondition = " AND ".join([
    f"(COALESCE(target.{field.field_name}, \"\") = COALESCE(origin.{field.field_name}, \"\"))" if field.isNullable
    else f"target.{field.field_name} = origin.{field.field_name}"
    for field in joinOn
  ])
  update = ", ".join([f"target.{field} = origin.{field}" for field in fields])
  insertValues = ", ".join([f"origin.{field}" for field in fields])

  return f"""
        MERGE INTO {full_table_name} AS target
        USING (SELECT {select} FROM {tmp_table_name}) AS origin
        ON {joinCondition}
        WHEN MATCHED THEN 
          UPDATE SET {update}
        WHEN NOT MATCHED THEN
          INSERT ({select})
          VALUES ({insertValues})
      """


def save_on_bigquery(df: DataFrame, table_name: str, fields: list, joinOn: List[JoinOnField]):
  """Write `df` to the BigQuery table `table_name`, creating or upserting it.

  If the target table does not exist yet, `df` is written directly to it.
  Otherwise `df` is loaded into a staging table, merged into the target
  (rows matching on `joinOn` are updated, the rest inserted) and the
  staging table is dropped again, even when the load or the merge fails.

  Args:
    df: rows to persist.
    table_name: unqualified table name; the dataset prefix is added by
      get_full_table_name / get_tmp_table_name.
    fields: column names to select, update and insert during the merge.
    joinOn: columns forming the merge key.

  Raises:
    ValueError: if the target table exists and `fields` or `joinOn` is empty
      (the MERGE statement could not be built).
    Any error raised by the BigQuery client or the Spark writer. Lookup
    failures other than "table not found" are no longer swallowed, so a
    transient error can never be mistaken for a missing table and trigger
    an overwrite.
  """
  # Local import keeps this cell self-contained; NotFound is the exception the
  # BigQuery client raises from get_table when the table does not exist.
  from google.cloud.exceptions import NotFound

  tmp_table_name = get_tmp_table_name(table_name)
  full_table_name = get_full_table_name(table_name)

  try:
    bqClient.get_table(full_table_name)
    table_exists = True
  except NotFound:
    table_exists = False

  if not table_exists:
    # First load: no existing rows to reconcile, write straight to the target.
    df.write.format('bigquery')\
      .option("temporaryGcsBucket", BUCKET_NAME)\
      .option("table", full_table_name)\
      .option("parentProject", PARENT_PROJECT)\
      .mode("overwrite")\
      .save()
    return

  if not fields or not joinOn:
    raise ValueError("save_on_bigquery needs at least one field and one join key to merge")

  # Wait for the drop to finish so it cannot race with the append below.
  bqClient.query(f"DROP TABLE IF EXISTS {tmp_table_name}").result()

  try:
    df.write.format("bigquery")\
      .option("temporaryGcsBucket", BUCKET_NAME)\
      .option("table", tmp_table_name)\
      .option("parentProject", PARENT_PROJECT)\
      .mode("append")\
      .save()

    query = _build_merge_query(full_table_name, tmp_table_name, fields, joinOn)
    bqClient.query(query).result()
    print("Saved on BigQuery: ", table_name)
  finally:
    # IF EXISTS: the staging table may never have been created if the write failed.
    bqClient.query(f"DROP TABLE IF EXISTS {tmp_table_name}").result()

In [11]:
# Load each refined table of every origin and upsert it into BigQuery.
for origin in ORIGINS:
  # leads: blank CPFs are normalised to NULL, one row kept per (identity, cpf).
  df_leads = spark.read.parquet(get_path_in_refined_zone(CURRENT_DATE, origin, "leads"))
  df_leads = df_leads.withColumn(
    "cpf",
    F.when(F.trim("cpf") == "", F.lit(None).cast(StringType())).otherwise(F.col("cpf")),
  )
  save_on_bigquery(
    df=df_leads.dropDuplicates(["identity", "cpf"]),
    table_name="leads",
    fields=list(df_leads.columns),
    joinOn=[
      JoinOnField(field_name="identity", isNullable=False),
      JoinOnField(field_name="cpf", isNullable=True),
    ],
  )

  # history_leads: one row kept per (identity, last_message_date).
  df_history_leads = spark.read.parquet(get_path_in_refined_zone(CURRENT_DATE, origin, "history_leads"))
  save_on_bigquery(
    df=df_history_leads.dropDuplicates(["identity", "last_message_date"]),
    table_name="history_leads",
    fields=df_history_leads.columns,
    joinOn=[
      JoinOnField(field_name="identity", isNullable=False),
      JoinOnField(field_name="last_message_date", isNullable=False),
    ],
  )

  # orders: one row kept per (identity, last_message_date).
  df_orders = spark.read.parquet(get_path_in_refined_zone(CURRENT_DATE, origin, "orders"))
  save_on_bigquery(
    df=df_orders.dropDuplicates(["identity", "last_message_date"]),
    table_name="orders",
    fields=df_orders.columns,
    joinOn=[
      JoinOnField(field_name="identity", isNullable=False),
      JoinOnField(field_name="last_message_date", isNullable=False),
    ],
  )

Saved on BigQuery:  leads


Saved on BigQuery:  history_leads


Saved on BigQuery:  orders
