In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, when, lower
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType


In [0]:
%run ./001_common_utils

In [0]:
# --- Transformer for the user-verified dataset ---
class UserVerifiedTransformer:
    """Chainable helper that derives extra columns on a Spark DataFrame.

    Each ``add_*`` / ``clean_*`` method replaces ``self.df`` with the
    transformed frame and returns ``self`` so calls can be chained.
    ``get_transformed_df`` ends the chain and returns the current frame.

    Annotations that mention ``DataFrame`` are written as strings so that
    defining the class never requires that name to be resolvable at
    definition time.
    """

    def __init__(self, df: "DataFrame"):
        """Keep a reference to the frame the following steps will transform.

        Args:
            df: Frame to transform. The methods below read the ``grupo`` and
                ``email`` columns from it.
        """
        self.df = df

    def add_solucion_column(self) -> "UserVerifiedTransformer":
        """Add a ``solucion`` column derived from ``grupo``.

        ``grupo`` is lower-cased and checked for these substrings in order;
        the first match wins:

        * ``"sl"`` -> ``"Storelive"``
        * ``"sv"`` -> ``"StoreView"``
        * ``"sc"`` -> ``"StoreConnect"``
        * ``"ml"`` -> ``"Marketlink"``

        Rows matching none of them get ``"Otros"``.

        Returns:
            This transformer, to allow chaining.
        """
        # Build the lower-cased column expression once and reuse it.
        grupo = lower(col("grupo"))
        self.df = self.df.withColumn(
            "solucion",
            when(grupo.contains("sl"), "Storelive")
            .when(grupo.contains("sv"), "StoreView")
            .when(grupo.contains("sc"), "StoreConnect")
            .when(grupo.contains("ml"), "Marketlink")
            .otherwise("Otros"),
        )
        return self

    def add_user_type_column(self) -> "UserVerifiedTransformer":
        """Add a ``tipo_de_usuario`` column derived from ``email``.

        Rows whose e-mail contains ``"dichter"`` are labelled ``"Interno"``;
        every other row is labelled ``"Externo"``.

        Returns:
            This transformer, to allow chaining.
        """
        # Lower-case before matching, consistent with add_solucion_column, so
        # that mixed-case addresses (e.g. "User@Dichter.com") are not
        # misclassified as external.
        email = lower(col("email"))
        self.df = self.df.withColumn(
            "tipo_de_usuario",
            when(email.contains("dichter"), "Interno").otherwise("Externo"),
        )
        return self

    def clean_column_names(self) -> "UserVerifiedTransformer":
        """Replace every space in every column name with an underscore.

        Returns:
            This transformer, to allow chaining.
        """
        renamed = [name.replace(" ", "_") for name in self.df.columns]
        self.df = self.df.toDF(*renamed)
        return self

    def get_transformed_df(self) -> "DataFrame":
        """Return the frame with all transformations applied so far."""
        return self.df

In [0]:
# Expected schema of the source table: five nullable string columns followed
# by one nullable boolean flag.
schema_user_verified = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("parent_group", "grupo", "nombre", "apellido", "email")
    ]
    + [StructField("verified", BooleanType(), True)]
)

# Database reader. PostgresReader is not defined in this notebook; presumably
# it comes from ./001_common_utils -- TODO confirm.
reader = PostgresReader(
    scope="secret-storeview",
    username_key="username-keycloak-db",
    password_key="password-keycloak-db",
    hostname="psql-dn-keycloak-restore.postgres.database.azure.com",
    port=5432,
    database="keycloak",
)

# Read the source table using the schema declared above.
df = reader.read_table("dn_user_verified", schema=schema_user_verified)

# Apply the transformations one step at a time; each step updates the
# transformer's frame and returns the transformer itself.
transformer = UserVerifiedTransformer(df)
transformer.add_solucion_column()
transformer.add_user_type_column()
transformer.clean_column_names()
access_df = transformer.get_transformed_df()

# Write the result. DeltaWriter is not defined in this notebook either;
# presumably it also comes from ./001_common_utils -- TODO confirm.
writer = DeltaWriter(base_path="/mnt/bronze")
writer.write(df=access_df, table_name="dn_user_verified")

# Alternative kept for reference: write with partitions
# writer.write(df=access_df, table_name="dn_user_verified", partition_cols=["solucion", "tipo_de_usuario"])

In [0]:
# Remove this notebook's working variables so the objects they referenced can
# be released.
# Every name is bound to None first, so the `del` below cannot raise NameError
# even when an earlier cell did not run and the name was never defined.
writer = reader = transformer = df = access_df = None
del writer, reader, transformer, df, access_df