## Transformação camada prata: Person

In [0]:
%run ../Config/DeltaFunctions

In [0]:
%run ../Config/LogProcessamento 

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql import functions as F
from pyspark.sql import DataFrame, Window

In [0]:
# Enable automatic schema evolution
spark.sql("SET spark.databricks.delta.schema.autoMerge.enabled = true")


# Source table information
source_table =  "person_person"
source_database = "adventure_works_bronze"
bronze_source_table = spark.read.table(f"{source_database}.{source_table}")

# Target table information
target_table_name =  "person_person"
target_database = "adventure_works_silver"
target_table = f"{target_database}.{target_table_name}"

# Key used to match rows during the upsert. The Person table has no
# EmailAddressID column; BusinessEntityID is the identifier the transformation
# deduplicates on and the only ID column in the Person schema.
primary_keys = ["BusinessEntityID"]

In [0]:
# Expected schema of the transformed Person DataFrame, expressed as
# (column name, Spark type, nullable) rows. Trailing comments give the
# type noted for each column in the source table definition.
_person_columns = [
    ("BusinessEntityID", IntegerType(), False),        # int NOT NULL
    ("PersonType", StringType(), False),               # nchar(2) NOT NULL
    ("NameStyle", IntegerType(), False),               # dbo.NameStyle NOT NULL (assumed IntegerType)
    ("Title", StringType(), True),                     # nvarchar(8) NULL
    ("FirstName", StringType(), False),                # dbo.Name NOT NULL
    ("MiddleName", StringType(), True),                # dbo.Name NULL
    ("LastName", StringType(), False),                 # dbo.Name NOT NULL
    ("Suffix", StringType(), True),                    # nvarchar(10) NULL
    ("EmailPromotion", IntegerType(), False),          # int NOT NULL
    ("AdditionalContactInfo", StringType(), True),     # xml NULL
    ("Demographics", StringType(), True),              # xml NULL
    ("rowguid", StringType(), False),                  # uniqueidentifier NOT NULL
    ("ModifiedDate", TimestampType(), False),          # datetime NOT NULL
]

expected_schema = StructType(
    [StructField(col_name, col_type, is_nullable)
     for col_name, col_type, is_nullable in _person_columns]
)


In [0]:
def transform_Person(Person: DataFrame) -> DataFrame:
    '''
    Transform the bronze Person table into its silver-layer shape.

    Steps, in order:
      1. Fill null ``rowguid`` values with a generated UUID and null
         ``ModifiedDate`` values with the current timestamp.
      2. Deduplicate on ``BusinessEntityID``, keeping the row with the most
         recent ``ModifiedDate``.
      3. Keep only rows whose ``EmailPromotion`` is between 0 and 2 and whose
         ``PersonType`` is one of GC, SP, EM, IN, VC, SC.
      4. Select the final columns, casting each one to its target type.

    Parameters:
        Person (DataFrame): DataFrame holding the Person table data.

    Returns:
        DataFrame: The transformed, deduplicated and filtered DataFrame.
    '''

    # Backfill mandatory columns that arrived as NULL.
    person_df = Person.withColumn(
        'rowguid',
        F.when(F.col('rowguid').isNull(), F.expr('uuid()')).otherwise(F.col('rowguid'))
    )
    person_df = person_df.withColumn(
        'ModifiedDate',
        F.when(F.col('ModifiedDate').isNull(), F.current_timestamp()).otherwise(F.col('ModifiedDate'))
    )

    # Deduplicate: rank rows per BusinessEntityID by recency and keep the newest.
    window_spec = Window.partitionBy('BusinessEntityID').orderBy(F.col('ModifiedDate').desc())
    person_df = (
        person_df
        .withColumn('row_number', F.row_number().over(window_spec))
        .filter(F.col('row_number') == 1)
        .drop('row_number')
    )

    # Domain filters; both must be applied to the same DataFrame chain.
    person_df = person_df.filter((F.col('EmailPromotion') >= 0) & (F.col('EmailPromotion') <= 2))
    person_df = person_df.filter(F.col('PersonType').isin(['GC', 'SP', 'EM', 'IN', 'VC', 'SC']))

    # Final projection with explicit casts.
    person_df = person_df.select(
        F.col('BusinessEntityID').cast(IntegerType()).alias('BusinessEntityID'),
        F.col('PersonType').cast(StringType()).alias('PersonType'),
        F.col('NameStyle').cast(IntegerType()).alias('NameStyle'),
        F.col('Title').cast(StringType()).alias('Title'),
        F.col('FirstName').cast(StringType()).alias('FirstName'),
        F.col('MiddleName').cast(StringType()).alias('MiddleName'),
        F.col('LastName').cast(StringType()).alias('LastName'),
        F.col('Suffix').cast(StringType()).alias('Suffix'),
        F.col('EmailPromotion').cast(IntegerType()).alias('EmailPromotion'),
        F.col('AdditionalContactInfo').cast(StringType()).alias('AdditionalContactInfo'),  # XML handled as StringType
        F.col('Demographics').cast(StringType()).alias('Demographics'),                    # XML handled as StringType
        F.col('rowguid').cast(StringType()).alias('rowguid'),
        F.col('ModifiedDate').cast(TimestampType()).alias('ModifiedDate')
    )

    return person_df



In [0]:
# Load the bronze Person table into `person` for step-by-step exploration.
person = spark.read.table(
    'hive_metastore.adventure_works_bronze.person_person'
)

In [0]:
    person = person.withColumn(
        'rowguid',
        F.when(F.col('rowguid').isNull(), F.expr('uuid()')).otherwise(F.col('rowguid'))
    )
    person = person.withColumn(
        'ModifiedDate',
        F.when(F.col('ModifiedDate').isNull(), current_timestemp()).otherwise(F.col('ModifiedDate'))
    )

    window_spec = windonw.partitionrBy('person_person','BusinessEntityID').orderBy(F.col('ModifiedDate').desc())
    Person = Person.withColumn('row_number', F.row_number().over(window_spec))
    Person = Person.filter(F.col('row_number') == 1).drop(F.col('row_number'))

    Person = Person.filter((F.col('EmailPromotion')>=0) & (F.col('EmailPromotion') <=2))
    person = Person.filter(F.col('PersonType').isin(['GC', 'SP', 'EM', 'IN', 'VC', 'SC']))

    Person = Person.select(
        F.col('BusinessEntityID').cast(IntegerType()).alias('BusinessEntityID'),
        F.col('PersonType').cast(StringType()).alias('PersonType'),
        F.col('NameStyle').cast(IntegerType()).alias('NameStyle'),
        F.col('Title').cast(StringType()).alias('Title'),
        F.col('FirstName').cast(StringType()).alias('FirstName'),
        F.col('MiddleName').cast(StringType()).alias('MiddleName'),
        F.col('LastName').cast(StringType()).alias('LastName'),
        F.col('Suffix').cast(StringType()).alias('Suffix'),
        F.col('EmailPromotion').cast(IntegerType()).alias('EmailPromotion'),
        F.col('AdditionalContactInfo').cast(StringType()).alias('AdditionalContactInfo'),  # XML handled as StringType
        F.col('Demographics').cast(StringType()).alias('Demographics'),                    # XML handled as StringType
        F.col('rowguid').cast(StringType()).alias('rowguid'),
        F.col('ModifiedDate').cast(TimestampType()).alias('ModifiedDate')
    )

    return Person



In [0]:
# Replace NULL ModifiedDate values with the current timestamp.
person = person.withColumn(
    'ModifiedDate',
    F.when(F.col('ModifiedDate').isNull(), F.current_timestamp()).otherwise(F.col('ModifiedDate'))
)

In [0]:
# Deduplicate `person`: rank rows per BusinessEntityID by ModifiedDate
# (newest first) and keep only the top-ranked row.
window_spec = Window.partitionBy('BusinessEntityID').orderBy(F.col('ModifiedDate').desc())
person = person.withColumn('row_number', F.row_number().over(window_spec))
person = person.filter(F.col('row_number') == 1).drop('row_number')

In [0]:
# Keep only rows with EmailPromotion in [0, 2] and a known PersonType code.
# Both filters are chained on `person`, the DataFrame defined at notebook scope.
person = person.filter((F.col('EmailPromotion') >= 0) & (F.col('EmailPromotion') <= 2))
person = person.filter(F.col('PersonType').isin(['GC', 'SP', 'EM', 'IN', 'VC', 'SC']))

In [0]:
# Expected schema of the transformed Person DataFrame, used for validation.
# It lists exactly the columns that transform_Person selects, in the same
# order (the previous definition here described the EmailAddress table).
expected_schema = StructType([
    StructField("BusinessEntityID", IntegerType(), False),
    StructField("PersonType", StringType(), False),
    StructField("NameStyle", IntegerType(), False),
    StructField("Title", StringType(), True),
    StructField("FirstName", StringType(), False),
    StructField("MiddleName", StringType(), True),
    StructField("LastName", StringType(), False),
    StructField("Suffix", StringType(), True),
    StructField("EmailPromotion", IntegerType(), False),
    StructField("AdditionalContactInfo", StringType(), True),
    StructField("Demographics", StringType(), True),
    StructField("rowguid", StringType(), False),
    StructField("ModifiedDate", TimestampType(), False)
])

In [0]:
# Log payload shared by every log entry written for this run
log_data = {
    "log_tabela": source_table,
    "log_camada": "Silver",
    "log_origem": "adventure_works_bronze",
    "log_destino": "adventure_works_silver",
}

# Record the start of the process
addlog(**log_data, log_status='Início', atualizacao=0)

try:
    # Run the transformation defined in this notebook
    transformed_df = transform_Person(Person=bronze_source_table)

    # Quick check of the row count and the DataFrame schema
    row_count = transformed_df.count()
    transformed_df.printSchema()

    # Schema validation
    is_schema_valid = _validate_schema(transformed_df, expected_schema)
    if is_schema_valid:
        # NOTE(review): success is logged before the upsert below runs, so a
        # failing upsert is not reflected in the log — confirm this is intended.
        addlog(**log_data, log_status='Sucesso', atualizacao=1)
        print("O schema do DataFrame está correto.")
    else:
        raise ValueError("Schema validation failed.")
    
except Exception as e:
    # Record the failure, then re-raise so the notebook run stops here
    addlog(**log_data, log_status='Falha', atualizacao=1)
    print(f"Erro ao processar a tabela: {str(e)}")
    raise  

# Reached only when transformation and validation succeeded: upsert into silver
_upsert_silver_table(transformed_df, target_table, primary_keys, not_matched_by_source_action="DELETE")
