In [1]:
from pyspark.sql.types import *
from datetime import datetime, timedelta
from pyspark.sql import functions as F
from pyspark.sql import SparkSession, DataFrame
from google.cloud import storage
from functools import reduce
import re

In [2]:
# Run-date argument as a '%Y-%m-%dT%H:%M:%S' string; the configuration cell
# falls back to datetime.today() when this is None.
CURRENT_DATE_ARG = "2024-09-30T23:59:59"

In [3]:
# GCS bucket that holds the raw-zone and curated-zone folders of every origin.
BUCKET_NAME = "vivo-api-manager-gcs-sp"
# Top-level folders (one per data origin) scanned inside the bucket.
ORIGINS = ["vivo-b2b-movel", "vivo-b2b-cross"]
# Reference date of the run: parsed from the argument when given, otherwise "now".
if CURRENT_DATE_ARG is None:
    CURRENT_DATE = datetime.today()
else:
    CURRENT_DATE = datetime.strptime(CURRENT_DATE_ARG, '%Y-%m-%dT%H:%M:%S')

In [4]:
# Fixed offset subtracted from the run date to obtain Brazilian (UTC-3) time.
# NOTE(review): a fixed 3 h shift presumes the input is UTC; datetime.today()
# returns the host's local time — confirm the host clock is UTC.
BRAZILIAN_TIMEDIFF = timedelta(hours=3)
# Recompute from the run argument instead of from CURRENT_DATE itself, so that
# re-running this cell does not shift the date by another 3 hours each time.
_run_date = datetime.strptime(CURRENT_DATE_ARG, '%Y-%m-%dT%H:%M:%S') if CURRENT_DATE_ARG is not None else datetime.today()
CURRENT_DATE = _run_date - BRAZILIAN_TIMEDIFF
print(CURRENT_DATE)

2024-09-30 20:59:59


In [5]:
def is_midnight_hour(hour):
  """Return True when `hour` lies in the inclusive range 0..4."""
  return 0 <= hour <= 4

# A run in the early hours (00h-04h) is attributed to the previous day:
# step back one day and pin the clock to 23:59:59 (microseconds are left as-is).
if is_midnight_hour(CURRENT_DATE.hour):
  previous_day = CURRENT_DATE - timedelta(days=1)
  CURRENT_DATE = previous_day.replace(hour=23, minute=59, second=59)


In [6]:
# Minimal session definition kept for reference (not executed):
# spark = SparkSession.builder.appName("raw_to_curated")\
#   .config("spark.sql.caseSensitive", "True")\
#   .config("spark.sql.session.timeZone", "America/Sao_Paulo")\
#   .getOrCreate()

import os

# Service-account key used by both the GCS Hadoop connector and storage.Client().
# The hardcoded path is only a fallback: a GOOGLE_APPLICATION_CREDENTIALS value
# already present in the environment is respected instead of being overwritten.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/home/jovyan/work/macro_blip_b2b_movel/cloud-macro-blip-cb70a4da44e6.json",
)

# Create the Spark session, wiring in the GCS connector jars and credentials.
spark = (
    SparkSession.builder
    .appName("Ler Parquet do GCS")
    # Column names that differ only by case are treated as distinct columns.
    .config("spark.sql.caseSensitive", "True")
    .config("spark.sql.session.timeZone", "America/Sao_Paulo")
    .config("spark.network.timeout", "600s")
    .config("spark.jars",
            "/usr/local/spark/jars/gcs-connector-hadoop3-latest.jar,"
            "/usr/local/spark/jars/hadoop-common-3.3.6.jar,"
            "/usr/local/spark/jars/hadoop-client-api-3.3.4.jar,"
            "/usr/local/spark/jars/hadoop-client-runtime-3.3.4.jar")
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .config("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    .config("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    .getOrCreate()
)

# Client used to list blobs in the bucket.
gcsClient = storage.Client()
print(gcsClient)

<google.cloud.storage.client.Client object at 0x7f18e7acff50>


In [7]:
def get_data_path_in_curated_zone(datetime: datetime, origin: str):
    """Build the gs:// URI of the curated-zone parquet output for one day.

    Args:
        datetime: Day the output belongs to; only year, month and day are used.
        origin: Top-level origin folder inside the bucket.

    Returns:
        "gs://<BUCKET_NAME>/<origin>/curated-zone/data/<YYYYMM>/<YYYYMMDD>.parquet"
    """
    month_folder = datetime.strftime('%Y%m')
    day_file = datetime.strftime('%Y%m%d')
    return f"gs://{BUCKET_NAME}/{origin}/curated-zone/data/{month_folder}/{day_file}.parquet"

def get_data_prefix_in_raw_zone(datetime: datetime, origin: str):
    """Build the bucket-relative blob prefix of the raw-zone data for one day.

    Args:
        datetime: Day to look up; only year, month and day are used.
        origin: Top-level origin folder inside the bucket.

    Returns:
        "<origin>/raw-zone/data/<YYYYMM>/<YYYYMMDD>" (no gs:// scheme, no bucket).
    """
    month_folder = datetime.strftime('%Y%m')
    day_prefix = datetime.strftime('%Y%m%d')
    return f"{origin}/raw-zone/data/{month_folder}/{day_prefix}"

In [8]:
def to_snake_case(input_string):
    """Convert a name to lower snake_case.

    Spaces and hyphens become underscores, an underscore is inserted wherever a
    lowercase letter or digit is directly followed by an uppercase letter, and
    the result is lowercased. Runs of consecutive capitals are not split.
    """
    separators_to_underscore = str.maketrans({" ": "_", "-": "_"})
    underscored = input_string.translate(separators_to_underscore)
    with_word_breaks = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', underscored)
    return with_word_breaks.lower()

def sanitize_column_name(text):
    """Normalize a raw column name.

    The name is converted with to_snake_case, slashes become underscores, and
    the first "_<digits>[.<digit>...]_" segment (if any) is collapsed into a
    single underscore.
    """
    normalized = to_snake_case(text).replace("/", "_")
    # NOTE(review): `(\.\d?)*` accepts at most one digit after each dot, so
    # "_3.1_" is removed but "_3.12_" is not — confirm this is intended.
    return re.sub(r"_\d+(\.\d?)*_", "_", normalized, count=1)

def rename_columns(df: DataFrame) -> DataFrame:
    """Return `df` with every column name passed through sanitize_column_name.

    Columns are renamed positionally in a single projection. Renaming one by
    one with withColumnRenamed matches by *name*, so a column could be renamed
    twice when its sanitized name equals the original name of a column not yet
    processed; it also adds one projection per column to the plan.

    Sanitizing can produce duplicate names; the result may therefore contain
    repeated column names, which deduplicate_columns resolves.
    """
    return df.toDF(*[sanitize_column_name(name) for name in df.columns])

In [9]:
def deduplicate_columns(df: DataFrame) -> DataFrame:
    """Merge columns that share the same name into a single column.

    For each distinct name, the output column holds the first non-null value
    among the columns carrying that name (in their original left-to-right
    order). Output columns are ordered by first occurrence of each name.

    Args:
        df: DataFrame that may contain repeated column names.

    Returns:
        DataFrame with one column per distinct name.
    """
    original_names = list(df.columns)

    # Append the position to every name so that duplicated columns become
    # individually addressable.
    indexed_names = [f"{name}_{position}" for position, name in enumerate(original_names)]
    df = df.toDF(*indexed_names)

    # Group the temporary names by the name they came from; a plain dict keeps
    # insertion order, which fixes the output column order.
    groups = {}
    for name, indexed_name in zip(original_names, indexed_names):
        groups.setdefault(name, []).append(indexed_name)

    # Build the result in one projection that reads only the temporary names,
    # so an output name can never clash with a not-yet-consumed input column.
    projections = []
    for name, members in groups.items():
        # Backticks keep names containing dots from being parsed as struct access.
        sources = [F.col(f"`{member}`") for member in members]
        merged = F.coalesce(*sources) if len(sources) > 1 else sources[0]
        projections.append(merged.alias(name))

    return df.select(*projections)

In [10]:
def convert_empty_to_null(df: DataFrame, columns: list):
    """Replace empty strings with NULL in the given string columns.

    Only columns whose type is StringType are rewritten; every other column
    (including listed columns of other types) passes through unchanged. Column
    order is preserved.

    Args:
        df: Input DataFrame.
        columns: Names of the columns to inspect.

    Returns:
        DataFrame with the same columns as `df`.

    Raises:
        KeyError: If a name in `columns` is not a column of `df`.
    """
    targets = set(columns)
    missing = targets.difference(df.columns)
    if missing:
        raise KeyError(f"No StructField named {sorted(missing)}")

    # One select instead of a withColumn per column keeps the plan flat.
    projections = []
    for field in df.schema.fields:
        # Backticks keep names containing dots from being parsed as struct access.
        column = F.col(f"`{field.name}`")
        if field.name in targets and field.dataType == StringType():
            null_string = F.lit(None).cast(field.dataType)
            column = F.when(column == "", null_string).otherwise(column).alias(field.name)
        projections.append(column)
    return df.select(*projections)

def reduce_dataframe(dfs: list):
    """Union a list of DataFrames by column name.

    An empty list yields an empty DataFrame created with a StringType schema.
    Otherwise the DataFrames are folded left to right with unionByName,
    allowing columns missing on either side; a single-element list is returned
    as that element itself.
    """
    if not dfs:
        return spark.createDataFrame([], StringType())

    def _union(left, right):
        return left.unionByName(right, allowMissingColumns=True)

    # With a single element, reduce returns it without calling _union.
    return reduce(_union, dfs)


In [11]:
bucket = gcsClient.get_bucket(BUCKET_NAME)
# The run covers CURRENT_DATE and the two days before it.
processing_dates = [CURRENT_DATE - timedelta(days=offset) for offset in range(3)]
print(processing_dates)
# One list of gs:// paths per processing date. funnel_blobs stays empty:
# the funnel listing is currently disabled.
funnel_blobs = {processing_date: [] for processing_date in processing_dates}
data_blobs = {processing_date: [] for processing_date in processing_dates}

for processing_date in processing_dates:
    print(processing_date)
    paths_for_date = data_blobs[processing_date]

    for origin in ORIGINS:
        print(origin)
        raw_prefix = get_data_prefix_in_raw_zone(processing_date, origin)
        blobs = bucket.list_blobs(prefix=raw_prefix)
        # Keep only blobs whose name ends with ".parquet/".
        paths_for_date.extend(
            f"gs://{BUCKET_NAME}/{blob.name}"
            for blob in blobs
            if blob.name.endswith(".parquet/")
        )
        print(paths_for_date)

[datetime.datetime(2024, 9, 30, 20, 59, 59), datetime.datetime(2024, 9, 29, 20, 59, 59), datetime.datetime(2024, 9, 28, 20, 59, 59)]
2024-09-30 20:59:59
vivo-b2b-movel
['gs://vivo-api-manager-gcs-sp/vivo-b2b-movel/raw-zone/data/202409/20240930-002857.parquet/', 'gs://vivo-api-manager-gcs-sp/vivo-b2b-movel/raw-zone/data/202409/20240930-002929.parquet/', 'gs://vivo-api-manager-gcs-sp/vivo-b2b-movel/raw-zone/data/202409/20240930-002942.parquet/', 'gs://vivo-api-manager-gcs-sp/vivo-b2b-movel/raw-zone/data/202409/20240930-150702.parquet/', 'gs://vivo-api-manager-gcs-sp/vivo-b2b-movel/raw-zone/data/202409/20240930-150746.parquet/', 'gs://vivo-api-manager-gcs-sp/vivo-b2b-movel/raw-zone/data/202409/20240930-150803.parquet/', 'gs://vivo-api-manager-gcs-sp/vivo-b2b-movel/raw-zone/data/202409/20240930-152818.parquet/', 'gs://vivo-api-manager-gcs-sp/vivo-b2b-movel/raw-zone/data/202409/20240930-152901.parquet/', 'gs://vivo-api-manager-gcs-sp/vivo-b2b-movel/raw-zone/data/202409/20240930-152917.parqu

In [12]:
# # Slice the list to sample only the first blob(s) (the slice below takes 1)
# all_blobs_t = [blob for processing_date in processing_dates for blob in data_blobs[processing_date]]


# first_five_blobs = all_blobs_t[:1]
# print(first_five_blobs)

# # Read the sampled Parquet file(s) into the DataFrame
# df = spark.read.parquet(*first_five_blobs)

# # # Display the DataFrame
# # df.show()  # Exibe as primeiras linhas do DataFrame


In [13]:
# df.show()

In [None]:
# Flatten the per-date lists into a single list of gs:// paths, newest date first.
all_blobs = [blob for processing_date in processing_dates for blob in data_blobs[processing_date]]

# Fail fast with a clear message: without this guard an empty listing left `df`
# undefined (NameError) or silently reused a `df` from an earlier execution.
if not all_blobs:
    raise RuntimeError(f"No raw-zone parquet blobs found for processing dates {processing_dates}")

# NOTE(review): all paths are read in one call; if the raw files differ in
# columns, confirm whether the Parquet `mergeSchema` option is required.
df = spark.read.parquet(*all_blobs)

# Apply transformations: normalize column names, then merge columns whose
# normalized names collide.
df = rename_columns(df)
df = deduplicate_columns(df)
print('rodou')

In [None]:
# Maps a column name of `df` (as it is after rename_columns/deduplicate_columns)
# to the alias used in the curated output. Keys are matched literally against
# df.columns; keys that are absent produce a NULL string column downstream.
# NOTE(review): some keys are fully lowercase without separators (e.g.
# "telefonesecundario", "empresa.endereco.numimovel") while a similar one uses
# underscores ("endereco.num_imovel") — confirm they match the real column names.
data_columns = {
  'order_id': 'order_id',
  'attempts': 'attempts',
  'automation': 'automation',
  'crm_order_id': 'crm_order_id',
  'end_date_process': 'end_date_process',
  'init_date_process': 'init_date_process',
  'order_ref': 'order_ref',
  'product_type': 'product_type',
  'project_id': 'project_id',
  'project_type': 'project_type',
  'retry': 'retry',
  'sale_type': 'sale_type',
  'send_crm': 'send_crm',
  'status': 'status',
  'processed_at': 'processed_at',
  "utm.dados_adicionais": "utm_dados_adicionais",
  "utm.medium": "utm_medium",
  "utm.source": "utm_source",
  'utm.campaign': 'utm_campaign',
  'utm.referrer': 'utm_referrer',
  "utm.term": "utm_term",
  "sku": "sku",
  "telefone": "telefone",
  "telefonesecundario": "telefone_secundario",
  "email": "email",
  "endereco.cep": "cep",
  "endereco.logradouro": "logradouro",
  "endereco.num_imovel": "num_imovel",
  "endereco.bairro": "bairro",
  "endereco.estado": "estado",
  "endereco.cidade": "cidade",
  "endereco.complemento": "complemento",
  # The key previously ended with a space; sanitized column names never contain
  # spaces, so it could not match and the output column was always NULL.
  "endereco.ponto_referencia": "ponto_referencia",
  "cpf": "cpf",
  "empresa.endereco.bairro": "empresa_bairro",
  "empresa.endereco.cep": "empresa_cep",
  "empresa.endereco.cidade": "empresa_cidade",
  "empresa.endereco.complemento": "empresa_complemento",
  "empresa.endereco.estado": "empresa_estado",
  "empresa.endereco.logradouro": "empresa_logradouro",
  "empresa.endereco.numimovel": "empresa_num_imovel",
  "empresa.cnpj": "cnpj",
  "empresa.razao_social": "razao_social",
  "empresa.nomefantasia": "nome_fantasia",
  "empresa.socios": "socios",
  "empresa.telefone": "empresa_telefone",
  "nome": "nome",
  "datanascimento": "data_nascimento",
  "nomemae": "nome_mae",
  "contrato.checked": "contrato_checked",
  "termo": "termo",
  "linhas_moveis": "linhas_moveis",
  "autenticacao.data": "autenticacao_data",
  "autenticacao.id": "autenticacao_id",
  "autenticacao.token": "autenticacao_token",
  "cliente_base": "cliente_base",
  "codinome": "codinome",
  "licencas_office": "licencas_office",
  "projeto": "projeto",
}

In [None]:
# Columns actually present in the transformed input.
available_columns = set(df.columns)

# One SQL expression per expected column: alias it when present, otherwise
# emit a NULL string placeholder so the output schema is stable.
select_expressions = []
for column, alias in data_columns.items():
    if column in available_columns:
        select_expressions.append(f"`{column}` as {alias}")
    else:
        select_expressions.append(f"CAST(NULL as string) as {alias}")

is_supported_product = F.col("product_type").isin("Controle", "ChatMovelB2B", "ChatMovelB2BOficial", "MDM")

final_data_df = (
    df
    .where(is_supported_product)
    .selectExpr(*select_expressions)
    .drop("attempts", "automation", "retry")
    # Timestamps are interpreted as America/Sao_Paulo wall-clock time and converted to UTC.
    .withColumn("init_date_process", F.to_utc_timestamp("init_date_process", "America/Sao_Paulo"))
    .withColumn("end_date_process", F.to_utc_timestamp("end_date_process", "America/Sao_Paulo"))
    # Anything other than "1" (including NULL) becomes False.
    .withColumn("send_crm", F.when(F.col("send_crm") == "1", True).otherwise(False))
    .withColumn("processed_at", F.to_utc_timestamp("processed_at", "America/Sao_Paulo"))
)
print('ok final_data_df')

# Turn empty strings into NULL
final_data_df = convert_empty_to_null(final_data_df, final_data_df.columns)

print('final_data_df')
# Write the final file





In [None]:
# Reset CURRENT_DATE to the raw run date, discarding the UTC-3 shift and the
# midnight roll-back applied earlier; this value selects the curated output path.
# The None fallback mirrors the configuration cell so a missing argument does
# not raise TypeError here.
# NOTE(review): for run times between 00:00 and 07:59 the output date ends up
# one day later than the newest processing date — confirm this is intended.
CURRENT_DATE = datetime.strptime(CURRENT_DATE_ARG, '%Y-%m-%dT%H:%M:%S') if CURRENT_DATE_ARG is not None else datetime.today()

In [None]:
# Destination of the curated daily file; an existing file for that day is replaced.
# NOTE(review): blobs from every entry in ORIGINS are read, but the result is
# written only under "vivo-b2b-movel" — confirm this is intended.
curated_path = get_data_path_in_curated_zone(CURRENT_DATE, "vivo-b2b-movel")
curated_writer = final_data_df.write.mode("overwrite")
curated_writer.parquet(curated_path)
print('overwrite')


In [None]:
# Prints the DataFrame object's text representation, not its rows
# (presumably the column names and types — use .show() to inspect data).
print(final_data_df)