In [1]:
from pyspark.sql.types import StringType
from datetime import datetime, timedelta
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from google.cloud import storage
from functools import reduce

In [2]:
CURRENT_DATE_ARG = None

In [3]:
BUCKET_NAME = "vivo-api-manager-gcs-sp"
ORIGINS = ["vivo-b2b-movel", "vivo-b2b-cross"]
CURRENT_DATE = datetime.strptime(CURRENT_DATE_ARG, '%Y-%m-%dT%H:%M:%S') if CURRENT_DATE_ARG is not None else datetime.today()

In [4]:
BRAZILIAN_TIMEDIFF = timedelta(hours=3)
CURRENT_DATE = CURRENT_DATE - BRAZILIAN_TIMEDIFF
print(CURRENT_DATE)

2024-10-22 16:08:29.007205


In [5]:
def is_midnight_hour(hour):
  return hour >= 0 and hour <= 4

if is_midnight_hour(CURRENT_DATE.hour):
  CURRENT_DATE = CURRENT_DATE - timedelta(days=1)
  CURRENT_DATE = CURRENT_DATE.replace(hour=23, minute=59, second=59)


In [6]:
# spark = SparkSession.builder.appName("landing_to_raw")\
#   .config("spark.sql.caseSensitive", "True")\
#   .config("spark.sql.session.timeZone", "America/Sao_Paulo")\
#   .getOrCreate()
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/jovyan/work/macro_blip_b2b_movel/cloud-macro-blip-cb70a4da44e6.json"

# Criar a sessão Spark
spark = SparkSession.builder \
    .appName("Ler Parquet do GCS") \
    .config("spark.jars", 
            "/usr/local/spark/jars/gcs-connector-hadoop3-latest.jar,"
            "/usr/local/spark/jars/hadoop-common-3.3.6.jar,"
            "/usr/local/spark/jars/hadoop-client-api-3.3.4.jar,"
            "/usr/local/spark/jars/hadoop-client-runtime-3.3.4.jar") \
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true") \
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"]) \
    .config("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") \
    .config("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS") \
    .getOrCreate()

gcsClient = storage.Client()
print(gcsClient)

gcsClient = storage.Client()

<google.cloud.storage.client.Client object at 0x7f2ec96726d0>


In [7]:
def get_path_in_landing_zone(datetime: datetime, origin: str):
  return f"gs://{BUCKET_NAME}/{origin}/landing-zone/{datetime.strftime('%Y%m%d')}.parquet"

def get_data_path_in_raw_zone(datetime: datetime, origin: str):
  return f"gs://{BUCKET_NAME}/{origin}/raw-zone/data/{datetime.strftime('%Y%m')}/{datetime.strftime('%Y%m%d-%H%M%S')}.parquet"

# def get_funnel_path_in_raw_zone(datetime: datetime, origin: str):
#   return f"gs://{BUCKET_NAME}/{origin}/raw-zone/funnel/{datetime.strftime('%Y%m')}/{datetime.strftime('%Y%m%d-%H%M%S')}.parquet"

def get_prefix_in_landing_zone(datetime: datetime, origin: str):
  return f"{origin}/landing-zone/{datetime.strftime('%Y%m%d')}"

def get_prefix_in_raw_zone(datetime: datetime, origin: str):
  return f"{origin}/raw-zone/{datetime.strftime('%Y%m%d')}"

def reduce_dataframe(dfs: list):
  if len(dfs) == 0:
    return spark.createDataFrame([], StringType())
  elif len(dfs) == 1:
    return dfs[0]
  else:
    return reduce(lambda df1, df2: df1.unionByName(df2, allowMissingColumns=True), dfs)

In [8]:
bucket = gcsClient.get_bucket(BUCKET_NAME)

blobs_per_origin = {}

for origin in ORIGINS:
    blobs = bucket.list_blobs(prefix=get_prefix_in_landing_zone(CURRENT_DATE, origin))

    blobs_per_origin[origin] = [f"gs://{BUCKET_NAME}/{blob.name}" for blob in blobs]



In [12]:
dataframes_list = []
for origin, blob_paths in blobs_per_origin.items():
    for blob_path in blob_paths:
        print(blob_path)
        df = spark.read.option("multiline","true").json(blob_path)

        base_columns = [
        "Attempts",
        "Automation",
        "CrmOrderId",
        "EndDateProcess",
        "InitDateProcess",
        "OrderId",
        "OrderRef",
        "ProductType",
        "ProjectId",
        "ProjectType",
        "Retry",
        "SaleType",
        "SendCRM",
        "Status",
      ]

        base_df = df.select(base_columns).withColumn("ProcessedAt", F.lit(CURRENT_DATE))

      # funnel_df = df.select("Statuses", "OrderId")\
      #   .withColumn("Statuses", F.explode(F.col("Statuses")))\
      #   .select(
      #     "OrderId",
      #     F.col("Statuses.CreatedIn").alias("CreatedIn"),
      #     F.col("Statuses.Name").alias("Step")
      #   ).join(base_df, "OrderId")\

      # funnel_df =  funnel_df.withColumn("date", F.to_date(F.col("InitDateProcess")))

        FILE_DATE = datetime.today()
      # funnel_dates = funnel_df.select("date").distinct().collect()
      # funnel_dates = [row.date for row in funnel_dates]
      # funnel_dates = [datetime.strptime(str(date), "%Y-%m-%d").replace(hour=FILE_DATE.hour, minute=FILE_DATE.minute, second=FILE_DATE.second) for date in funnel_dates]

      # for date in funnel_dates:
      #   funnel_df.filter(F.col("date") == F.lit(date.date())).write.parquet(get_funnel_path_in_raw_zone(date, origin), mode="append", compression="gzip")


        data_df = df.select("Datas", "OrderId")\
        .withColumn("Datas", F.explode(F.col("Datas")))\
        .select(
          "OrderId",
          F.col("Datas.Name").alias("Name"),
          F.col("Datas.Value").alias("Value")
        )\
        .groupBy("OrderId")\
        .pivot("Name")\
        .agg({"Value": "first"})\
        .join(base_df, "OrderId")

        data_df =  data_df.withColumn("date", F.to_date(F.col("InitDateProcess")))

        data_dates = data_df.select("date").distinct().collect()
        data_dates = [row.date for row in data_dates]
        data_dates = [datetime.strptime(str(date), "%Y-%m-%d").replace(hour=FILE_DATE.hour, minute=FILE_DATE.minute, second=FILE_DATE.second) for date in data_dates]

        for date in data_dates:
            data_df.filter(F.col("date") == F.lit(date.date())).write.parquet(get_data_path_in_raw_zone(date, origin), mode="append", compression="gzip")

gs://vivo-api-manager-gcs-sp/vivo-b2b-movel/landing-zone/20241022-002009.json


AnalysisException: [COLUMN_ALREADY_EXISTS] The column `empresa`.`razaosocial` already exists. Consider to choose another name or rename the existing column.

In [None]:
# for origin in ORIGINS:
#   bucket.delete_blobs([blob_path.replace(f"gs://{BUCKET_NAME}/", "") for blob_path in blobs_per_origin])