In [1]:
from pyspark.sql import SparkSession, DataFrame
from google.cloud import storage, bigquery
from datetime import datetime, timedelta
from pyspark.sql.types import StringType
from pyspark.sql import functions as F
from functools import reduce

In [2]:
# Optional override for the run's reference timestamp, as a string in the
# '%Y-%m-%dT%H:%M:%S' format. When left as None, the current time is used.
# NOTE(review): presumably injected by the scheduler / notebook parameters — confirm.
CURRENT_DATE_ARG = None

In [3]:
# Static job configuration: the GCS bucket holding the data zones, the project
# handed to Spark as `parentProject`, and the origins to process.
BUCKET_NAME = "tim-ultrafibra-gcs-sp"
PARENT_PROJECT = "cloud-macro-tim"
ORIGINS = ["bot_tim"]

# Reference timestamp for this run: the explicit override when one was given,
# otherwise the host's current local time (naive datetime).
if CURRENT_DATE_ARG is None:
  CURRENT_DATE = datetime.today()
else:
  CURRENT_DATE = datetime.strptime(CURRENT_DATE_ARG, '%Y-%m-%dT%H:%M:%S')

In [4]:
# Shift the reference timestamp back by a fixed three hours.
# NOTE(review): looks like a UTC -> Brasília conversion, which assumes the
# incoming timestamp is in UTC — confirm.
# Caution: re-running this cell alone shifts CURRENT_DATE by another 3 hours.
BRAZILIAN_TIMEDIFF = timedelta(hours=3)
CURRENT_DATE -= BRAZILIAN_TIMEDIFF

In [5]:
def is_midnight_hour(hour):
  """Tell whether `hour` lies in the 0-4 window (both bounds inclusive)."""
  return 0 <= hour <= 4

# Runs that land in the 00-04 window are attributed to the previous day:
# step back one day and pin the clock to 23:59:59 (microseconds are left as-is).
if is_midnight_hour(CURRENT_DATE.hour):
  CURRENT_DATE = (CURRENT_DATE - timedelta(days=1)).replace(hour=23, minute=59, second=59)

In [6]:
# Spark session for the curated -> refined job: case-sensitive column names,
# session time zone set to America/Sao_Paulo, BigQuery connector jar attached.
spark = (
  SparkSession.builder.appName("tim_ultrafibra_curated_to_refined")
  .config('parentProject', PARENT_PROJECT)
  .config("spark.sql.caseSensitive", "True")
  .config('spark.sql.session.timeZone', 'America/Sao_Paulo')
  .config("spark.jars", "gs://spark-lib/bigquery/spark-3.3-bigquery-0.34.0.jar")
  .getOrCreate()
)

# Google Cloud clients built with their default arguments.
gcsClient = storage.Client()
bqClient = bigquery.Client()

In [7]:
def get_path_in_curated_zone(datetime: datetime, origin: str):
  """Build the gs:// URI of an origin's curated-zone parquet for a given day.

  Layout: gs://<BUCKET_NAME>/curated-zone/<origin>/<YYYYMM>/<YYYYMMDD>.parquet
  """
  month_folder = datetime.strftime('%Y%m')
  day_file = datetime.strftime('%Y%m%d')
  return f"gs://{BUCKET_NAME}/curated-zone/{origin}/{month_folder}/{day_file}.parquet"

def get_path_in_refined_zone(datetime: datetime, origin: str, table_name: str):
  """Build the gs:// URI of a refined-zone table for the month of `datetime`.

  Layout: gs://<BUCKET_NAME>/refined-zone/<origin>/<YYYYMM>/<table_name>.parquet
  """
  month_folder = datetime.strftime('%Y%m')
  return f"gs://{BUCKET_NAME}/refined-zone/{origin}/{month_folder}/{table_name}.parquet"

def get_prefix_in_curated(datetime: datetime, origin: str):
  """Return the bucket-relative object prefix of an origin's curated month folder.

  Layout: curated-zone/<origin>/<YYYYMM> (no scheme, bucket or trailing slash).
  """
  month_folder = datetime.strftime('%Y%m')
  return f"curated-zone/{origin}/{month_folder}"

In [8]:
def reduce_dataframe(dfs: list):
  """Collapse a list of DataFrames into one by successive `unionByName`.

  - empty list: an empty DataFrame created from `StringType()` is returned
    (never None);
  - single element: that element is returned unchanged;
  - otherwise: the left-to-right union by column name of all elements.
  """
  if not dfs:
    return spark.createDataFrame([], StringType())

  if len(dfs) == 1:
    return dfs[0]

  return reduce(DataFrame.unionByName, dfs)

def save_dataframe(df: DataFrame, origin: str, table_name: str):
  """Write `df` as parquet to the refined-zone path of `table_name`.

  The destination is derived from CURRENT_DATE, `origin` and `table_name`;
  the write uses "overwrite" mode, so existing output at that path is replaced.
  """
  parquet_writer = df.write.format("parquet").mode("overwrite")
  target_path = get_path_in_refined_zone(CURRENT_DATE, origin, table_name)
  parquet_writer.save(target_path)

In [9]:
# Map each origin to the gs:// URIs of the objects found under its curated
# month prefix whose names end with ".parquet/".
blobs_per_origin = {}

for origin_name in ORIGINS:
  curated_prefix = get_prefix_in_curated(CURRENT_DATE, origin_name)
  blobs = gcsClient.list_blobs(BUCKET_NAME, prefix=curated_prefix)
  blobs_per_origin[origin_name] = [
    f"gs://{BUCKET_NAME}/{blob.name}"
    for blob in blobs
    if blob.name.endswith(".parquet/")
  ]

In [10]:
# For every origin: read each curated parquet listed in `blobs_per_origin`,
# derive the leads / orders / history frames, union them across files and
# overwrite the corresponding refined-zone tables.
for origin in ORIGINS:
  dfs_leads = []
  # Nothing below appends to this list, so `lead_addresses` is always built
  # from an empty list.
  dfs_lead_addresses = []
  dfs_orders = []
  dfs_histories = []

  for path in blobs_per_origin[origin]:
    df = spark.read.parquet(path)

    # Step flags are discovered per file: every column prefixed with "etapa_".
    step_columns = [col for col in df.columns if col.startswith("etapa_")]

    # Keep only the first 19 characters of the timestamp's string form.
    # NOTE(review): presumably trims to 'yyyy-MM-dd HH:mm:ss'; Spark appears to
    # treat a start position of 0 like 1 — confirm.
    df = df.withColumn("last_message_date",
      F.col("last_message_date").cast(StringType()).substr(0, 19)
    )

    # One row per (identity, cpf). Both key columns survive the select, so the
    # rows are already unique and no extra distinct() is needed.
    df_leads = df.dropDuplicates(["identity", "cpf"])\
      .select("identity", "name", "cpf", "phone_number")

    # Filter on the finalisation flag BEFORE de-duplicating: dropDuplicates
    # keeps an arbitrary row per key, so de-duplicating first could keep a
    # non-finalised row and silently lose the order.
    df_order = df.where(F.col("etapa_finalizacao") == True)\
      .dropDuplicates(["identity", "last_message_date"])\
      .select([
        "identity",
        "last_message_date",
        "identificador_pedido",
        "plan_name",
        "plan_description",
        "price"
      ])

    # One row per (identity, date, hour), ordered by last message timestamp.
    df_history = df.withColumn("date", F.to_date(F.col("last_message_date")))\
      .withColumn("hour", F.date_format(F.col("last_message_date"), 'HH'))\
      .withColumn("identity_date", F.concat(F.col("identity"), F.lit("_"), F.col("date")))\
      .select("identity", "date", "hour", "tipo", "identificador_pedido", "num_bot",
      'utm_term', 'utm_campaign', "identity_date", "primeira_mensagem", "last_message_date", "step_abandono", *step_columns)\
      .dropDuplicates(["identity", "date", "hour"])\
      .sort(F.col("last_message_date").asc())

    dfs_leads.append(df_leads)
    dfs_orders.append(df_order)
    dfs_histories.append(df_history)

  df_final_leads = reduce_dataframe(dfs_leads)
  df_final_lead_addresses = reduce_dataframe(dfs_lead_addresses)
  df_final_order = reduce_dataframe(dfs_orders)
  df_final_history = reduce_dataframe(dfs_histories)

  # NOTE(review): reduce_dataframe returns an empty DataFrame (not None) for an
  # empty list, so these guards always pass and an empty placeholder table is
  # written when there is no input. Confirm whether downstream consumers rely
  # on that before turning the guards into real "skip when empty" checks.
  if df_final_leads is not None:
    save_dataframe(df=df_final_leads, origin=origin, table_name="leads")

  if df_final_lead_addresses is not None:
    save_dataframe(df=df_final_lead_addresses, origin=origin, table_name="lead_addresses")

  if df_final_order is not None:
    save_dataframe(df=df_final_order, origin=origin, table_name="orders")

  if df_final_history is not None:
    save_dataframe(df=df_final_history, origin=origin, table_name="history_leads")