#Imports

In [0]:
from pyspark.sql.functions import col, lit, to_date, year, udf, concat_ws, when
from pyspark.sql.types import StringType
import pyspark.sql.functions as F

In [0]:
%run ../common/nb_init

In [0]:
def add_percent_dynamic(rows):
    """Express each value row as a percentage of its weighted basis row.

    ``rows`` is scanned in order. Every run of consecutive rows whose
    ``"Kategorie"`` equals ``"BASIS gewichtet"`` is a basis block. The rows
    directly after it (at most as many as the basis block holds, and never
    reaching into the next basis block) form the matching value block. Basis
    and value rows are paired by position. Rows that belong to neither block
    are skipped.

    The length of every basis block is measured individually, so blocks of
    different sizes within the same input are paired correctly.

    Args:
        rows: Sequence of dict-like rows providing the keys ``"Datum"``,
            ``"Kategorie"`` and ``"Wert"``. The order of the rows determines
            the pairing.

    Returns:
        A list of dicts with the keys ``"Datum"`` and ``"Kategorie"`` (taken
        from the value row) and ``"Bekanntheit_in_Prozent"``
        (``value / basis * 100`` rounded to two decimals). The percentage is
        ``None`` when the basis is zero or missing, or the value is missing.
    """
    basis_label = "BASIS gewichtet"
    total = len(rows)
    result = []
    i = 0

    while i < total:
        if rows[i]["Kategorie"] != basis_label:
            i += 1
            continue

        # Measure this basis run; its length may differ from earlier runs.
        basis_end = i
        while basis_end < total and rows[basis_end]["Kategorie"] == basis_label:
            basis_end += 1
        block_len = basis_end - i

        # Take up to block_len value rows, but stop at the next basis row so
        # a short value block never swallows the following basis block.
        value_end = basis_end
        while (
            value_end < total
            and value_end - basis_end < block_len
            and rows[value_end]["Kategorie"] != basis_label
        ):
            value_end += 1

        for basis_row, value_row in zip(rows[i:basis_end], rows[basis_end:value_end]):
            basis = basis_row["Wert"]
            value = value_row["Wert"]
            # A zero or missing basis has no defined percentage.
            if basis and value is not None:
                percent = round(value / basis * 100, 2)
            else:
                percent = None
            result.append({
                "Datum": value_row["Datum"],
                "Kategorie": value_row["Kategorie"],
                "Bekanntheit_in_Prozent": percent,
            })

        i = value_end

    return result

#Define Target Schema

In [0]:
# Schema that receives every table and CSV written by this notebook.
target_schema_name = "04_to_ai_assistant"
# Path passed to fn_overwrite_table as ``target_path``.
target_path = "funnel"

#Retrieve Tables

In [0]:
# Load every column of the consolidated socials table for rows whose Date is
# 2025-01-01 or later.
df_consolidated = spark.sql("""
    SELECT 
        *
    FROM 03_transformed.consolidated_socials
    WHERE Date >= '2025-01-01'
""")

# Merge the three numbered topic columns into one " | "-separated column.
df_consolidated = df_consolidated.withColumn(
    "Themenbereich",
    F.concat_ws(
        " | ",
        *[F.col(name) for name in ("Themenbereich1", "Themenbereich2", "Themenbereich3")],
    ),
)

# Same merge for the three numbered strategic-topic columns.
df_consolidated = df_consolidated.withColumn(
    "StrategischesThema",
    F.concat_ws(
        " | ",
        *[
            F.col(name)
            for name in ("StrategischesThema1", "StrategischesThema2", "StrategischesThema3")
        ],
    ),
)

# The numbered source columns are no longer needed once merged.
df_consolidated = df_consolidated.drop(
    "Themenbereich1", "Themenbereich2", "Themenbereich3",
    "StrategischesThema1", "StrategischesThema2", "StrategischesThema3",
)


In [0]:
# Select the listed metric, topic and abstract columns from
# 03_transformed.ga4_eco_journal_users_sessions_total_view, keeping only rows
# with Created_Date >= '2025-01-01'.
# NOTE(review): the filter compares against a string literal; this presumably
# relies on Created_Date being a date or ISO-formatted string column — confirm.
df_eco = spark.sql("""
                              SELECT 
                              Created_Date,
                              URL,
                              Engaged_sessions,
                              Sessions,
                              Views,
                              User_engagement,
                              Total_session_duration,
                              Bounce_rate,
                              Average_session_duration,
                              Views_per_session,
                              Active_Users_per_View,
                              Samples_read_rate,
                              Active_Users,
                              Total_Users,
                              Themenbereich1,
                              Themenbereich2,
                              Themenbereich3,
                              Strategische_Themen,
                              Abstract             
                              FROM  03_transformed.ga4_eco_journal_users_sessions_total_view
                              WHERE Created_Date >= '2025-01-01'
                              """)

In [0]:
# Merge the three numbered topic columns into a single " | "-separated
# "Themenbereich" column, then drop the numbered originals.
# The column names are fixed string literals, so no None filtering of the
# name list is needed.
df_eco = df_eco.withColumn(
    "Themenbereich",
    F.concat_ws(
        " | ",
        F.col("Themenbereich1"),
        F.col("Themenbereich2"),
        F.col("Themenbereich3"),
    ),
).drop("Themenbereich1", "Themenbereich2", "Themenbereich3")

In [0]:
# Load every column of 03_transformed.mkm_reputationsindex for rows with
# Datum >= '2025-01-01'.
# NOTE(review): the variable name is misspelled ("reputatiosn"); it is kept
# because the following cell references it under this exact name.
df_reputatiosnindex = spark.sql("""
    SELECT 
        *
    FROM 03_transformed.mkm_reputationsindex
    WHERE Datum >= '2025-01-01'
""")

In [0]:
# Discard the unweighted basis rows; only the weighted basis is used below.
df_no_basis_ungewichtet = df_reputatiosnindex.where(
    F.col("Kategorie") != "BASIS ungewichtet"
)

# Pull the remaining rows to the driver as plain dicts, because
# add_percent_dynamic works on an ordered Python list.
# NOTE(review): add_percent_dynamic pairs rows by position, but no explicit
# ordering is applied before collect() — confirm the source order is stable.
rows = df_no_basis_ungewichtet.collect()
rows_dict = [record.asDict() for record in rows]

result_rows = add_percent_dynamic(rows_dict)

# Build the result DataFrame with a fixed column order, sorted by date.
df_repu = (
    spark.createDataFrame(result_rows)
    .select("Datum", "Kategorie", "Bekanntheit_in_Prozent")
    .orderBy("Datum")
)

#Write Tables and CSV Files

In [0]:
# Write the consolidated socials data to the "consolidated_socials" table.
# fn_overwrite_table is not defined in this notebook; presumably it is
# provided by ../common/nb_init — confirm.
fn_overwrite_table(
    df_source=df_consolidated,
    target_schema_name=target_schema_name,
    target_table_name="consolidated_socials",
    target_path=target_path,
)

In [0]:
# Write the eco-journal data to the "consolidated_eco_journal" table.
# fn_overwrite_table is not defined in this notebook; presumably it is
# provided by ../common/nb_init — confirm.
fn_overwrite_table(
    df_source=df_eco,
    target_schema_name=target_schema_name,
    target_table_name="consolidated_eco_journal",
    target_path=target_path,
)

In [0]:
# Write the computed percentages to the "reputationsindex" table.
# fn_overwrite_table is not defined in this notebook; presumably it is
# provided by ../common/nb_init — confirm.
fn_overwrite_table(
    df_source=df_repu,
    target_schema_name=target_schema_name,
    target_table_name="reputationsindex",
    target_path=target_path,
)

In [0]:
# Export the consolidated socials data as "socials.csv" into the
# "assistant_files" folder.
# fn_overwrite_csv is not defined in this notebook; presumably it is
# provided by ../common/nb_init — confirm.
fn_overwrite_csv(
    df_source=df_consolidated,
    target_schema_name=target_schema_name,
    target_folder_name="assistant_files",
    filename="socials.csv",
)

In [0]:
# Export the computed percentages as "reputationsindex.csv" into the
# "assistant_files" folder.
# fn_overwrite_csv is not defined in this notebook; presumably it is
# provided by ../common/nb_init — confirm.
fn_overwrite_csv(
    df_source=df_repu,
    target_schema_name=target_schema_name,
    target_folder_name="assistant_files",
    filename="reputationsindex.csv",
)

In [0]:
# Export the eco-journal data as "eco_journal.csv" into the
# "assistant_files" folder.
# fn_overwrite_csv is not defined in this notebook; presumably it is
# provided by ../common/nb_init — confirm.
fn_overwrite_csv(
    df_source=df_eco,
    target_schema_name=target_schema_name,
    target_folder_name="assistant_files",
    filename="eco_journal.csv",
)