In [1]:
from pyspark.sql.types import StringType
from datetime import datetime, timedelta
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from google.cloud import storage
from functools import reduce

import os
#os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "cloud-macro-tim-2623594ed8c4.json"


In [2]:
CURRENT_DATE_ARG = None

In [3]:
BUCKET_NAME = "tim-ultrafibra-gcs-sp"
ORIGINS = ["bot_social", "bot_midia", "bot_crm", "bot_ativo", "bot_alpha", "bot_comparador", "bot_crm_2", "bot_social_2"]
CURRENT_DATE = datetime.strptime(CURRENT_DATE_ARG, '%Y-%m-%dT%H:%M:%S') if CURRENT_DATE_ARG is not None else datetime.today() 

In [4]:
BRAZILIAN_TIMEDIFF = timedelta(hours=3)
CURRENT_DATE = CURRENT_DATE - BRAZILIAN_TIMEDIFF
print(CURRENT_DATE)

2024-10-22 15:35:41.320554


In [5]:
def is_midnight_hour(hour):
  return hour >= 0 and hour <= 4

if is_midnight_hour(CURRENT_DATE.hour):
  CURRENT_DATE = CURRENT_DATE - timedelta(days=1)
  CURRENT_DATE = CURRENT_DATE.replace(hour=23, minute=59, second=59)

In [6]:
# spark = SparkSession.builder.appName("tim_ultrafibra_landing_to_raw")\
#   .config("spark.sql.caseSensitive", "True")\
#   .config("spark.sql.session.timeZone", "America/Sao_Paulo")\
#   .getOrCreate()

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/jovyan/work/cloud-macro-tim-2623594ed8c4.json"

# Criar a sessÃ£o Spark
spark = SparkSession.builder \
    .appName("Ler Parquet do GCS") \
    .config("spark.jars", 
            "/usr/local/spark/jars/gcs-connector-hadoop3-latest.jar,"
            "/usr/local/spark/jars/hadoop-common-3.3.6.jar,"
            "/usr/local/spark/jars/hadoop-client-api-3.3.4.jar,"
            "/usr/local/spark/jars/hadoop-client-runtime-3.3.4.jar") \
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true") \
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"]) \
    .config("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") \
    .config("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS") \
    .getOrCreate()

gcsClient = storage.Client()
print(gcsClient)

<google.cloud.storage.client.Client object at 0x7f54bf69d6d0>


In [7]:
def get_path_in_landing_zone(datetime: datetime, origin: str):
  return f"gs://{BUCKET_NAME}/landing-zone/{origin}/{datetime.strftime('%Y%m%d')}.parquet"

def get_path_in_raw_zone(datetime: datetime, origin: str):
  return f"gs://{BUCKET_NAME}/raw-zone/{origin}/{datetime.strftime('%Y%m')}/{datetime.strftime('%Y%m%d-%H%M%S')}.parquet"

def get_prefix_in_landing_zone(datetime: datetime, origin: str):
  return f"landing-zone/{origin}/{datetime.strftime('%Y%m%d')}"

def reduce_dataframe(dfs: list):
  if len(dfs) == 0:
    return spark.createDataFrame([], StringType())
  elif len(dfs) == 1:
    return dfs[0]
  else:
    return reduce(lambda df1, df2: df1.unionByName(df2, allowMissingColumns=True), dfs)

In [12]:
bucket = gcsClient.get_bucket(BUCKET_NAME)

blobs_per_origin = {}

for origin in ORIGINS:
    blobs = bucket.list_blobs(prefix=get_prefix_in_landing_zone(CURRENT_DATE, origin))
    blobs_per_origin[origin] = [f"gs://{BUCKET_NAME}/{blob.name}" for blob in blobs]
    print(blobs_per_origin[origin])

['gs://tim-ultrafibra-gcs-sp/landing-zone/bot_social/20241022-143007.json']
['gs://tim-ultrafibra-gcs-sp/landing-zone/bot_midia/20241022-143007.json']
['gs://tim-ultrafibra-gcs-sp/landing-zone/bot_crm/20241022-143007.json']
['gs://tim-ultrafibra-gcs-sp/landing-zone/bot_ativo/20241022-143007.json']
['gs://tim-ultrafibra-gcs-sp/landing-zone/bot_alpha/20241022-143007.json']
['gs://tim-ultrafibra-gcs-sp/landing-zone/bot_comparador/20241022-143006.json']
['gs://tim-ultrafibra-gcs-sp/landing-zone/bot_crm_2/20241022-143006.json']
['gs://tim-ultrafibra-gcs-sp/landing-zone/bot_social_2/20241022-143118.json']


In [13]:
for origin in ORIGINS:
  dataframes_list = []
  for blob_path in blobs_per_origin[origin]:
    print(blob_path)
    df = spark.read.option("multiline","true").json(blob_path)
    if "resource" not in df.columns:
      continue

    is_empty = df.select("resource.items").collect()[0][0] == []
    if not is_empty:
      messageFieldNames = df\
          .withColumn("items", F.explode(F.col("resource.items")))\
          .select("items.*")\
          .schema.fieldNames()
      
      if "extras" in messageFieldNames:
        messageFieldNames.remove("extras")

        extraFieldNames = df.select("resource.items")\
          .withColumn("extras", F.explode(F.col("items.extras")))\
          .select("extras.*")\
          .schema.fieldNames()

        extraFieldNames = [fieldName for fieldName in extraFieldNames if fieldName not in messageFieldNames]

        flattened_df = df.select(F.explode(df['resource.items']).alias('items'))
        flattened_df = flattened_df.select(["*"] + [F.col("items.extras")])

        flattened_df = flattened_df.select(
            [F.col(f"extras.`{fieldName}`").alias(f"{fieldName}") for fieldName in extraFieldNames] +
            [F.col(f"items.`{fieldName}`").alias(f"{fieldName}") for fieldName in messageFieldNames]
          )
        flattened_df = flattened_df\
          .withColumn("lastMessageDate", F.from_utc_timestamp(F.col("lastMessageDate"), "UTC"))\
          .filter(F.col("lastMessageDate").cast("date") == CURRENT_DATE.strftime("%Y-%m-%d"))

        dataframes_list.append(flattened_df)
  
  final_dataframe = reduce_dataframe(dataframes_list)
  final_dataframe.write.parquet(get_path_in_raw_zone(CURRENT_DATE, origin), mode="append", compression="gzip")

gs://tim-ultrafibra-gcs-sp/landing-zone/bot_social/20241022-143007.json


AnalysisException: [COLUMN_ALREADY_EXISTS] The column `utmmedium` already exists. Consider to choose another name or rename the existing column.

In [None]:
for origin in ORIGINS:
  bucket.delete_blobs([blob_path.replace(f"gs://{BUCKET_NAME}/", "") for blob_path in blobs_per_origin[origin]])