In [3]:
import os
from datetime import datetime, timedelta, timezone
from functools import reduce

from google.cloud import storage, bigquery
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, BooleanType

In [4]:
# Optional run-date override, given as a '%Y-%m-%dT%H:%M:%S' string.
# None means "use the current time" when CURRENT_DATE is computed.
# NOTE(review): presumably injected by a scheduler / parameterised run and expressed in UTC — confirm.
CURRENT_DATE_ARG = None

In [5]:
# Job-level configuration.
PARENT_PROJECT = "cloud-macro-blip"      # GCP project id
BUCKET_NAME = "vivo-api-manager-gcs-sp"  # GCS bucket holding the curated and refined zones
ORIGIN = "vivo-b2b-movel"                # top-level folder (data origin) inside the bucket

# Reference instant for this run, as a naive datetime.
# The notebook later subtracts a fixed 3-hour offset from this value, which only
# yields the intended local time when the value is in UTC. `datetime.today()`
# returns host-local time, so the current time is taken explicitly in UTC.
# NOTE(review): CURRENT_DATE_ARG is assumed to be a UTC timestamp as well — confirm.
if CURRENT_DATE_ARG is not None:
  CURRENT_DATE = datetime.strptime(CURRENT_DATE_ARG, '%Y-%m-%dT%H:%M:%S')
else:
  CURRENT_DATE = datetime.now(timezone.utc).replace(tzinfo=None)

In [6]:
# Fixed 3-hour offset applied to the run's reference instant.
# NOTE(review): presumably converts a UTC instant to Brasília time (UTC-3) — confirm.
BRAZILIAN_TIMEDIFF = timedelta(hours=3)

# Shift the reference instant back by that offset.
# Caution: this cell is not idempotent — re-running it without re-running the
# cell that defines CURRENT_DATE subtracts the offset a second time.
CURRENT_DATE -= BRAZILIAN_TIMEDIFF

In [7]:
def is_midnight_hour(hour):
  """Return True when ``hour`` lies in the inclusive range 0..4, False otherwise."""
  return 0 <= hour <= 4

# A run whose reference hour is between 00 and 04 is attributed to the previous
# day: the date moves back one day and the clock is pinned to 23:59:59
# (microseconds, if any, are left as they were).
if is_midnight_hour(CURRENT_DATE.hour):
  CURRENT_DATE = (CURRENT_DATE - timedelta(days=1)).replace(hour=23, minute=59, second=59)


In [8]:
# spark = SparkSession.builder.appName("curated_to_refined")\
#   .config('parentProject', PARENT_PROJECT)\
#   .config("spark.sql.caseSensitive", "True")\
#   .config('spark.sql.session.timeZone', 'America/Sao_Paulo')\
#   .config("spark.jars", "gs://spark-lib/bigquery/spark-3.3-bigquery-0.34.0.jar")\
#   .getOrCreate()

# gcsClient = storage.Client()

# Service-account key used for local runs. `setdefault` keeps a value that was
# already exported in the environment instead of overwriting it, so the
# notebook also runs where the key file lives somewhere else.
# NOTE(review): the fallback is an absolute, machine-specific path — consider
# moving it to configuration.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/home/jovyan/work/macro_blip_b2b_movel/cloud-macro-blip-cb70a4da44e6.json",
)

# Create the Spark session.
# Every Hadoop-level property is passed with the `spark.hadoop.` prefix, which
# is how Spark forwards properties into the Hadoop configuration; the original
# cell set the two `fs.*gs.impl` keys without it, unlike the auth keys.
# NOTE(review): spark.jars mixes hadoop-common 3.3.6 with hadoop-client 3.3.4 —
# confirm this combination is intentional.
spark = (
    SparkSession.builder
    .appName("Ler Parquet do GCS")
    .config("spark.jars",
            "/usr/local/spark/jars/gcs-connector-hadoop3-latest.jar,"
            "/usr/local/spark/jars/hadoop-common-3.3.6.jar,"
            "/usr/local/spark/jars/hadoop-client-api-3.3.4.jar,"
            "/usr/local/spark/jars/hadoop-client-runtime-3.3.4.jar")
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    .getOrCreate()
)

# Google Cloud clients; they pick up credentials from GOOGLE_APPLICATION_CREDENTIALS.
gcsClient = storage.Client()
print(gcsClient)

bqClient = bigquery.Client()

<google.cloud.storage.client.Client object at 0x7fb3b05df090>


In [9]:
def get_data_path_in_refined_zone(datetime: datetime, origin: str, project_type: str, table_name: str):
  """Build the gs:// URI of a refined-zone data table.

  The path is
  ``gs://<BUCKET_NAME>/<origin>/refined-zone/data/<project_type>/<YYYYMM>/<table_name>.parquet``,
  where ``YYYYMM`` is taken from ``datetime``.
  """
  year_month = datetime.strftime('%Y%m')
  return f"gs://{BUCKET_NAME}/{origin}/refined-zone/data/{project_type}/{year_month}/{table_name}.parquet"

# def get_funnel_path_in_refined_zone(datetime: datetime, origin: str, project_type: str, table_name: str):
#   return f"gs://{BUCKET_NAME}/{origin}/refined-zone/funnel/{project_type}/{datetime.strftime('%Y%m')}/{table_name}.parquet"

# def get_funnel_prefix_in_curated_zone(datetime: datetime, origin: str):
#   return f"{origin}/curated-zone/funnel/{datetime.strftime('%Y%m')}"

def get_data_prefix_in_curated_zone(datetime: datetime, origin: str):
  """Return the bucket-relative prefix ``<origin>/curated-zone/data/<YYYYMM>``.

  ``YYYYMM`` is taken from ``datetime``; the result carries no ``gs://`` scheme
  or bucket name.
  """
  year_month = datetime.strftime('%Y%m')
  return f"{origin}/curated-zone/data/{year_month}"

def reduce_dataframe(dfs: list):
  """Stack a list of DataFrames into one.

  Frames are combined left to right with ``unionByName(...,
  allowMissingColumns=True)``. A one-element list yields that element
  unchanged. An empty list yields a placeholder empty DataFrame created from
  a bare ``StringType`` schema.
  """
  if len(dfs) == 0:
    return spark.createDataFrame([], StringType())

  merged = dfs[0]
  for extra in dfs[1:]:
    merged = merged.unionByName(extra, allowMissingColumns=True)
  return merged

In [10]:
# Handle to the bucket that holds the curated and refined zones.
bucket = gcsClient.get_bucket(BUCKET_NAME)

# The funnel listing is disabled (see the commented lines), so this stays empty.
funnel_blobs = []
data_blobs = []

# blobs = bucket.list_blobs(prefix=get_funnel_prefix_in_curated_zone(CURRENT_DATE, ORIGIN))
# funnel_blobs = [f"gs://{BUCKET_NAME}/{blob.name}" for blob in blobs if blob.name.endswith(".parquet/")]

# List every object under this month's curated data prefix and keep only the
# names ending in ".parquet/", turned into full gs:// URIs.
curated_prefix = get_data_prefix_in_curated_zone(CURRENT_DATE, ORIGIN)
blobs = bucket.list_blobs(prefix=curated_prefix)
data_blobs = [
  f"gs://{BUCKET_NAME}/{entry.name}"
  for entry in blobs
  if entry.name.endswith(".parquet/")
]

In [11]:
# funnel_columns = [
#   "order_id",
#   "product_type",
#   "project_id",
#   "processed_at",
#   "created_in",
#   "step",
#   "end_date_process",
#   "init_date_process",
#   "order_ref",
#   "project_type",
#   "sale_type",
#   "send_crm",
#   "status"
# ]

In [12]:
# funnel_dfs = []
# for funnel_blob in funnel_blobs:
#   funnel_df = spark.read.parquet(funnel_blob)
#   funnel_dfs.append(funnel_df)

# final_funnel_df = reduce_dataframe(funnel_dfs)
# funnel_df_grouped = final_funnel_df\
#   .groupBy("order_id", "product_type", "project_id")\
#   .agg(F.max("processed_at").alias("processed_at"))\
#   .orderBy("processed_at")

# final_funnel_df = funnel_df_grouped.join(final_funnel_df, funnel_df_grouped.columns, "inner")
# final_funnel_df = final_funnel_df\
#   .withColumn("processed_at", F.date_format(F.from_utc_timestamp(F.col("processed_at"), "UTC"), "yyyy-MM-dd HH:mm:ss"))\
#   .withColumn("created_in", F.date_format(F.from_utc_timestamp(F.col("created_in"), "America/Sao_Paulo"), "yyyy-MM-dd HH:mm:ss"))\
#   .withColumn("end_date_process", F.date_format(F.from_utc_timestamp(F.col("end_date_process"), "America/Sao_Paulo"), "yyyy-MM-dd HH:mm:ss"))\
#   .withColumn("init_date_process", F.date_format(F.from_utc_timestamp(F.col("init_date_process"), "America/Sao_Paulo"), "yyyy-MM-dd HH:mm:ss"))\
#   .select(funnel_columns)

# final_funnel_df.where(F.col("project_type") == "Chatbot").write.mode("overwrite").parquet(get_funnel_path_in_refined_zone(CURRENT_DATE, ORIGIN, "chatbot", "chatbot_funnel"))
# # final_funnel_df.where(F.col("project_type") == "Chatbot").write.mode("overwrite").parquet(get_funnel_path_in_refined_zone(CURRENT_DATE, ORIGIN, "chatbot", "chatbot_funnel"))
# final_funnel_df.where(F.col("project_type") == "Checkout").write.mode("overwrite").parquet(get_funnel_path_in_refined_zone(CURRENT_DATE, ORIGIN, "checkout", "checkout_funnel"))

In [13]:
# Columns selected for both the Chatbot and the Checkout history outputs.
# The list order is the column order of the selected DataFrames.
base_history_columns = [
  'order_id',
  'crm_order_id',
  'end_date_process',
  'init_date_process',
  'order_ref',
  'product_type',
  'project_id',
  'project_type',
  'send_crm',
  'status',
  'processed_at',
  'sku',
  'telefone',
  'telefone_secundario',
  'utm_dados_adicionais',
  'utm_medium',
  'utm_source',
  'utm_campaign',
  'utm_referrer',
  'utm_term',
  'sale_type',
]

# Extra columns appended to base_history_columns for the Chatbot history output only.
chatbot_history_columns = [
  'email',
  'cep',
  'logradouro',
  'num_imovel',
  'bairro',
  'estado',
  'cidade',
  'complemento',
  'ponto_referencia',
  'cpf',
  'empresa_bairro',
  'empresa_cep',
  'empresa_cidade',
  'empresa_complemento',
  'empresa_estado',
  'empresa_logradouro',
  'empresa_num_imovel',
  'cnpj',
  'razao_social',
  'nome_fantasia',
  'socios',
  'empresa_telefone',
  'nome',
  'data_nascimento',
  'nome_mae',
  'contrato_checked',
  'termo',
  'linhas_moveis',
  'cliente_base',
  'codinome',
  'licencas_office',
  'projeto',
]

# NOTE(review): not referenced anywhere in the visible notebook — the Checkout
# output selects base_history_columns only. Confirm whether that is intentional.
checkout_history_columns = ['autenticacao_data', 'autenticacao_id', 'autenticacao_token']


In [14]:
# Fail early with a clear message when nothing was listed for the month;
# otherwise the placeholder frame returned by reduce_dataframe([]) would make
# the groupBy below fail with an obscure column-resolution error.
if not data_blobs:
  raise ValueError(
    f"No curated parquet datasets found under "
    f"gs://{BUCKET_NAME}/{get_data_prefix_in_curated_zone(CURRENT_DATE, ORIGIN)}"
  )

# Read every curated dataset and stack them into a single frame.
data_dfs = [spark.read.parquet(data_blob) for data_blob in data_blobs]
data_df = reduce_dataframe(data_dfs)

# Latest processed_at per (order_id, product_type, project_id).
# No ordering is applied here: the result only feeds the join below, which
# does not preserve row order.
data_df_grouped = (
  data_df
  .groupBy("order_id", "product_type", "project_id")
  .agg(F.max("processed_at").alias("processed_at"))
)

# Keep only the rows carrying that latest processed_at, by joining back on all
# four key columns.
# NOTE(review): an equi-join never matches NULL keys, so rows with a NULL in
# any of the four columns are dropped — confirm that is acceptable.
final_data_df = data_df_grouped.join(data_df, data_df_grouped.columns, "inner")

# Render the timestamp columns as 'yyyy-MM-dd HH:mm:ss' strings: processed_at
# stays in UTC, the two process dates are shifted from UTC to America/Sao_Paulo.
final_data_df = (
  final_data_df
  .withColumn("processed_at", F.date_format(F.from_utc_timestamp(F.col("processed_at"), "UTC"), "yyyy-MM-dd HH:mm:ss"))
  .withColumn("end_date_process", F.date_format(F.from_utc_timestamp(F.col("end_date_process"), "America/Sao_Paulo"), "yyyy-MM-dd HH:mm:ss"))
  .withColumn("init_date_process", F.date_format(F.from_utc_timestamp(F.col("init_date_process"), "America/Sao_Paulo"), "yyyy-MM-dd HH:mm:ss"))
)

# Split by project type and project onto the output schemas.
# NOTE(review): checkout_history_columns is defined in the notebook but not
# selected here — confirm the Checkout output should carry the base columns only.
chatbot_history = final_data_df.where(F.col("project_type") == "Chatbot").select(base_history_columns + chatbot_history_columns)
checkout_history = final_data_df.where(F.col("project_type") == "Checkout").select(base_history_columns)

# Overwrite this month's refined-zone tables.
chatbot_history.write.mode("overwrite").parquet(get_data_path_in_refined_zone(CURRENT_DATE, ORIGIN, "chatbot", "chatbot_history"))
checkout_history.write.mode("overwrite").parquet(get_data_path_in_refined_zone(CURRENT_DATE, ORIGIN, "checkout", "checkout_history"))