In [3]:
from datetime import datetime, timedelta, timezone
from functools import reduce

from google.cloud import storage
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

In [4]:
# Optional override for the reference timestamp of this run.
# When not None it is parsed with the format '%Y-%m-%dT%H:%M:%S'
# (e.g. "2024-10-06T20:59:59"); when None the current time is used instead.
# NOTE(review): presumably injected by the job scheduler as a notebook parameter - confirm.
CURRENT_DATE_ARG = None

In [5]:
# GCS bucket that holds both the landing zone and the raw zone.
BUCKET_NAME = "tim-ultrafibra-gcs-sp"
# Sub-folders (one per source) processed on every run.
ORIGINS = ["bot_social", "bot_midia", "bot_crm", "bot_ativo", "bot_alpha", "bot_comparador", "bot_crm_2", "bot_social_2"]
# Reference timestamp of the run: the explicit argument when given, otherwise "now".
# The fallback is taken in UTC (kept naive so later arithmetic/strftime are unchanged)
# because a fixed 3-hour offset is subtracted from this value afterwards;
# datetime.today() returned host-local time, which is only correct on UTC hosts.
CURRENT_DATE = (
  datetime.strptime(CURRENT_DATE_ARG, '%Y-%m-%dT%H:%M:%S')
  if CURRENT_DATE_ARG is not None
  else datetime.now(timezone.utc).replace(tzinfo=None)
)

In [7]:
# Fixed offset subtracted from the reference timestamp.
# NOTE(review): presumably converts a UTC timestamp to Brasilia time (UTC-3) - confirm.
# Re-running this cell shifts CURRENT_DATE by another three hours.
BRAZILIAN_TIMEDIFF = timedelta(hours=3)
CURRENT_DATE -= BRAZILIAN_TIMEDIFF
print(CURRENT_DATE)

2024-10-06 17:59:59


In [5]:
def is_midnight_hour(hour):
  """Return True when `hour` falls in the 0..4 range (both ends inclusive)."""
  return 0 <= hour <= 4

# Runs that happen between 00:00 and 04:59 are attributed to the previous day:
# the reference timestamp is moved to 23:59:59 of the day before
# (microseconds, if any, are left as they were).
if is_midnight_hour(CURRENT_DATE.hour):
  CURRENT_DATE = (CURRENT_DATE - timedelta(days=1)).replace(hour=23, minute=59, second=59)

In [6]:
# Spark session for this job: case-sensitive column resolution and the
# session time zone set to America/Sao_Paulo.
spark = (
  SparkSession.builder
  .appName("tim_ultrafibra_landing_to_raw")
  .config("spark.sql.caseSensitive", "True")
  .config("spark.sql.session.timeZone", "America/Sao_Paulo")
  .getOrCreate()
)

# Google Cloud Storage client used to list and delete landing-zone blobs.
gcsClient = storage.Client()

In [7]:
def get_path_in_landing_zone(datetime: datetime, origin: str):
  """Build the gs:// URI `landing-zone/<origin>/<YYYYMMDD>.parquet` inside BUCKET_NAME.

  `datetime` supplies the day stamp; `origin` is the source sub-folder.
  """
  day_stamp = datetime.strftime('%Y%m%d')
  return f"gs://{BUCKET_NAME}/landing-zone/{origin}/{day_stamp}.parquet"

def get_path_in_raw_zone(datetime: datetime, origin: str):
  """Build the gs:// URI `raw-zone/<origin>/<YYYYMM>/<YYYYMMDD-HHMMSS>.parquet` inside BUCKET_NAME.

  `datetime` supplies both the month folder and the full timestamp used as
  the file name; `origin` is the source sub-folder.
  """
  month_folder = datetime.strftime('%Y%m')
  run_stamp = datetime.strftime('%Y%m%d-%H%M%S')
  return f"gs://{BUCKET_NAME}/raw-zone/{origin}/{month_folder}/{run_stamp}.parquet"

def get_prefix_in_landing_zone(datetime: datetime, origin: str):
  """Return the bucket-relative prefix `landing-zone/<origin>/<YYYYMMDD>`.

  The result has no `gs://` scheme or bucket name, so it can be passed as a
  blob-name prefix when listing objects.
  """
  day_stamp = datetime.strftime('%Y%m%d')
  return f"landing-zone/{origin}/{day_stamp}"

def reduce_dataframe(dfs: list):
  """Collapse a list of DataFrames into a single DataFrame.

  - Empty list: returns an empty DataFrame created with a StringType schema.
  - Otherwise: folds the list left-to-right with `unionByName`, allowing
    columns that are missing on either side. A single-element list comes
    back unchanged, since there is nothing to union it with.
  """
  if not dfs:
    return spark.createDataFrame([], StringType())

  def _union(left, right):
    return left.unionByName(right, allowMissingColumns=True)

  return reduce(_union, dfs)

In [8]:
# Handle to the job's bucket; reused later for deleting processed blobs.
bucket = gcsClient.get_bucket(BUCKET_NAME)

# origin -> list of gs:// URIs of every landing-zone blob whose name starts
# with that origin's prefix for CURRENT_DATE.
blobs_per_origin = {}

for origin in ORIGINS:
  landing_prefix = get_prefix_in_landing_zone(CURRENT_DATE, origin)
  blob_uris = []
  for blob in bucket.list_blobs(prefix=landing_prefix):
    blob_uris.append(f"gs://{BUCKET_NAME}/{blob.name}")
  blobs_per_origin[origin] = blob_uris

In [9]:
# For every origin: read each landing-zone JSON blob, flatten `resource.items`
# (plus the nested `extras` struct when there is one) into one row per item,
# keep only the rows whose lastMessageDate falls on CURRENT_DATE, and write the
# union of all blobs to the raw zone as gzip-compressed parquet.
for origin in ORIGINS:
  dataframes_list = []
  for blob_path in blobs_per_origin[origin]:
    df = spark.read.option("multiline","true").json(blob_path)
    # Payloads without a top-level "resource" column have nothing to flatten.
    if "resource" not in df.columns:
      continue

    # Only the first row is inspected. Both an empty list and a null `items`
    # are skipped; the null case used to fall through to explode().
    first_items = df.select("resource.items").collect()[0][0]
    if not first_items:
      continue

    # One row per element of resource.items, kept as a struct column "items".
    flattened_df = df.select(F.explode(df['resource.items']).alias('items'))
    messageFieldNames = flattened_df.select("items.*").schema.fieldNames()

    # `extras` is optional. Items without it used to be dropped silently because
    # the whole flattening lived inside the `if`; now they are flattened from
    # their own fields only.
    extraFieldNames = []
    if "extras" in messageFieldNames:
      messageFieldNames.remove("extras")
      flattened_df = flattened_df.select("*", F.col("items.extras"))
      # On a name clash the item-level field wins over the one inside extras.
      extraFieldNames = [
        fieldName
        for fieldName in flattened_df.select("extras.*").schema.fieldNames()
        if fieldName not in messageFieldNames
      ]

    # Backticks keep field names containing dots or other special characters intact.
    flattened_df = flattened_df.select(
        [F.col(f"extras.`{fieldName}`").alias(fieldName) for fieldName in extraFieldNames] +
        [F.col(f"items.`{fieldName}`").alias(fieldName) for fieldName in messageFieldNames]
      )
    # Keep only the messages of the reference day (date compared as 'YYYY-MM-DD').
    flattened_df = flattened_df\
      .withColumn("lastMessageDate", F.from_utc_timestamp(F.col("lastMessageDate"), "UTC"))\
      .filter(F.col("lastMessageDate").cast("date") == CURRENT_DATE.strftime("%Y-%m-%d"))

    dataframes_list.append(flattened_df)

  # NOTE(review): when nothing was collected, reduce_dataframe returns an empty
  # placeholder DataFrame and it is still written here - confirm that is intended.
  final_dataframe = reduce_dataframe(dataframes_list)
  final_dataframe.write.parquet(get_path_in_raw_zone(CURRENT_DATE, origin), mode="append", compression="gzip")

In [10]:
# Remove every landing-zone blob that was listed for this run.
# delete_blobs receives bucket-relative names, so the gs://<bucket>/ part is stripped.
for origin in ORIGINS:
  blob_names = []
  for blob_path in blobs_per_origin[origin]:
    blob_names.append(blob_path.replace(f"gs://{BUCKET_NAME}/", ""))
  bucket.delete_blobs(blob_names)