# Imports

In [0]:
from pyspark.sql.functions import col, lit, to_date, year, udf, concat_ws, when
from pyspark.sql.types import StringType, StructType, StructField, IntegerType, DoubleType
import pyspark.sql.functions as F

In [0]:
%run ../../common/nb_init

# Functions

In [0]:
# Names of the per-topic score columns. The order matters: scores are passed
# to the topic helpers positionally and paired with these names via zip().
strategische_spalten = [
    'Strategie2030', 'FinanzierungEnergiewende', 'EMobilitaet',
    'VernetzeEnergiewelt', 'TransformationGasnetzeWasserstoff', 'ErneuerbareEnergien',
    'DisponibleErzeugung', 'IntelligenteStromnetze', 'EnBWAlsArbeitgeberIn',
    'NachhaltigkeitCSRESG', 'MarkeEnBW'
]

def get_top_topic(*args):
    """Return the name of the best-scoring strategic topic, or None.

    Args:
        *args: One score per entry of ``strategische_spalten``, in the same
            order. ``None`` entries are ignored. Surplus scores or surplus
            column names are dropped by ``zip``.

    Returns:
        The column name with the highest score among those that reach the
        0.8 threshold, or ``None`` when no score qualifies (including the
        case of no arguments at all). Ties go to the topic listed first.

    Notes:
        The threshold is inclusive (``>= 0.8``) so that this helper agrees
        with ``get_top_3_topics``. NaN scores never qualify because every
        comparison against NaN is false, so a NaN can no longer mask a
        valid topic.
    """
    # Filter first, then take the maximum: this keeps None/NaN out of the
    # comparison and makes the empty case explicit instead of letting max()
    # raise ValueError.
    kandidaten = [
        (thema, wert)
        for thema, wert in zip(strategische_spalten, args)
        if wert is not None and wert >= 0.8
    ]
    if not kandidaten:
        return None
    # max() returns the first maximal element, so ties resolve by list order.
    return max(kandidaten, key=lambda paar: paar[1])[0]

# Spark UDF wrapper around get_top_topic; the result is a string column.
get_top_thema_udf = F.udf(get_top_topic, returnType=StringType())

In [0]:
def get_top_3_topics(*args):
    """Return the names of up to three best-scoring strategic topics.

    Args:
        *args: One score per entry of ``strategische_spalten``, in the same
            order; ``None`` scores are skipped.

    Returns:
        A list of exactly three strings: the topic names whose score is
        at least 0.8, ordered by descending score, padded on the right with
        the placeholder "Kein strategisches Thema" when fewer than three
        topics qualify.
    """
    kandidaten = []
    for thema, wert in dict(zip(strategische_spalten, args)).items():
        # Skip missing scores and anything that does not reach the threshold.
        if wert is None or not (wert >= 0.8):
            continue
        kandidaten.append((thema, wert))

    # Highest score first; the sort is stable, so ties keep column order.
    kandidaten.sort(key=lambda paar: paar[1], reverse=True)

    namen = [thema for thema, _ in kandidaten[:3]]
    # Pad with the placeholder label (not None) so the result always has 3 slots.
    namen.extend(["Kein strategisches Thema"] * (3 - len(namen)))
    return namen

# Struct returned by the top-3 UDF: three nullable string fields named
# StrategischesThema1 .. StrategischesThema3.
top3_schema = StructType([
    StructField(f"StrategischesThema{rang}", StringType(), True)
    for rang in (1, 2, 3)
])

# Spark UDF wrapper around get_top_3_topics; its three-element list result
# fills the struct fields positionally.
get_top3_thema_udf = udf(get_top_3_topics, returnType=top3_schema)

# Target Schema

In [0]:
# Schema name and path handed to fn_overwrite_table when the consolidated
# table is written at the end of the notebook.
target_schema_name, target_path = '03_transformed', 'funnel'

# Configure Tables

In [0]:
# One entry per source table. Apart from 'table_name' and 'channel', each value
# is the name of the source column that feeds the corresponding unified column.
relevant_tables = [
    dict(
        table_name='03_transformed.facebook_organic_total',
        id_column='PostID',
        date='CreatedDate',
        channel='Facebook',
        url='PostURL',
        post_message='PostMessage',
        post_type='PostType',
        impressions='PostTotalImpressionsLifetime',
        engagement_rate='EngagementRateInPercent',
    ),
    dict(
        table_name='03_transformed.instagram_organic_total',
        id_column='PostID',
        date='CreatedDate',
        channel='Instagram',
        url='PostURL',
        post_message='PostMessage',
        post_type='PostType',
        impressions='TotalViews',
        engagement_rate='EngagementRateInPercent',
    ),
    dict(
        table_name='03_transformed.linkedin_organic_total',
        id_column='PostID',
        date='CreatedDate',
        channel='LinkedIn',
        url='PostURL',
        post_message='PostContent',
        post_type='ContentType',
        impressions='TotalImpressions',
        engagement_rate='EngagementRateInPercent',
    ),
    dict(
        table_name='03_transformed.x_organic_total',
        id_column='PostID',
        date='CreatedDate',
        channel='X',
        url='PostURL',
        post_message='PostMessage',
        post_type='PostType',
        impressions='TotalImpressions',
        engagement_rate='EngagementRateInPercent',
    ),
    # TODO: include YouTube, if descriptions are available
    # NOTE(review): for YouTube, 'post_type' holds the literal value "Video"
    # rather than a column name; the combine step hardcodes that literal for
    # this channel - confirm this is intended.
    dict(
        table_name='03_transformed.youtube_organic_post_total',
        id_column='VideoID',
        date='CreatedDate',
        channel='YouTube',
        url='VideoURL',
        post_type='Video',
        post_message='VideoDescription',
        impressions='TotalViews',
        engagement_rate='EngagementRateInPercent',
    ),
    # TODO: include genios print media, if available
    # TODO: include genios online media, if available
]

# Combine relevant Tables

In [0]:
# Unified column layout of the consolidated frame. It is built from the type
# classes imported at the top of this notebook rather than through a `T`
# alias, which this notebook never imports itself.
schema = StructType([
    StructField("ID", StringType(), True),
    StructField("Date", StringType(), True),
    StructField("Channel", StringType(), True),
    StructField("URL", StringType(), True),
    StructField("PostMessage", StringType(), True),
    StructField("PostType", StringType(), True),
    StructField("Impressions", IntegerType(), True),
    StructField("EngagementRateInPercent", DoubleType(), True),
    StructField("StrategischesThema1", StringType(), True),
    StructField("StrategischesThema2", StringType(), True),
    StructField("StrategischesThema3", StringType(), True),
    StructField("Themenbereich1", StringType(), True),
    StructField("Themenbereich2", StringType(), True),
    StructField("Themenbereich3", StringType(), True),
])


def _to_unified_columns(df, t):
    """Project one channel table onto the unified column layout.

    Args:
        df: DataFrame read from the table named in ``t['table_name']``. It
            must contain every column listed in ``strategische_spalten``,
            the ``Themenbereich{1,2,3}`` / ``Themenbereich{1,2,3}_Conf``
            columns, and the source columns named in ``t``.
        t: One entry of ``relevant_tables``.

    Returns:
        A DataFrame with exactly the columns named in ``schema``, in that
        order.
    """
    # Derive the top-3 strategic topics as a struct, then flatten the struct
    # into three top-level columns and drop the intermediate.
    themen = get_top3_thema_udf(*[F.col(spalte) for spalte in strategische_spalten])
    df = (
        df.withColumn("StrategischeThemen", themen)
        .selectExpr(
            "*",
            "StrategischeThemen.StrategischesThema1",
            "StrategischeThemen.StrategischesThema2",
            "StrategischeThemen.StrategischesThema3",
        )
        .drop("StrategischeThemen")
    )

    # Keep a Themenbereich only when its confidence column is >= 80; when()
    # without otherwise() leaves every other row as NULL.
    for rang in (1, 2, 3):
        bereich = f"Themenbereich{rang}"
        df = df.withColumn(
            bereich,
            F.when(F.col(f"{bereich}_Conf") >= 80, F.col(bereich)),
        )

    # YouTube gets a constant post type instead of reading a source column.
    if t['channel'] == "YouTube":
        post_type_column = F.lit("Video")
    else:
        post_type_column = F.col(t['post_type'])

    # For X, rows without a post type are dropped.
    if t['channel'] == "X":
        df = df.filter(F.col(t['post_type']).isNotNull())

    return df.select(
        F.col(t['id_column']).alias("ID"),
        F.col(t['date']).alias("Date"),
        F.lit(t['channel']).alias("Channel"),
        F.col(t['url']).alias("URL"),
        F.col(t['post_message']).alias("PostMessage"),
        post_type_column.alias("PostType"),
        F.col(t['impressions']).alias("Impressions"),
        F.col(t['engagement_rate']).alias("EngagementRateInPercent"),
        F.col("StrategischesThema1"),
        F.col("StrategischesThema2"),
        F.col("StrategischesThema3"),
        F.col("Themenbereich1"),
        F.col("Themenbereich2"),
        F.col("Themenbereich3"),
    )


# Start from an empty frame carrying the unified schema and append each
# channel table; unionByName matches columns by name, not by position.
df_consolidated = spark.createDataFrame([], schema=schema)

for t in relevant_tables:
    df = spark.read.table(f"datif_pz_uk_{env}.{t['table_name']}")
    df_consolidated = df_consolidated.unionByName(_to_unified_columns(df, t))

df_consolidated.display()


### Einheitliches PostType Mapping

In [0]:
# Channel-specific raw post types grouped per channel, each mapped to the
# unified post type used in the consolidated table.
_POST_TYPES_BY_CHANNEL = {
    "YouTube": {
        "Video": "Video",
    },
    "Facebook": {
        "video_direct_response": "Video",
        "video_inline": "Video",
        "Ohne Post Type": "Other",
        "album": "Image",
        "share": "Other",
        "photo": "Image",
        "profile_media": "Other",
        "cover_photo": "Other",
        "multi_share": "Carousel / Document",
        "multi_share_no_end_card": "Other",
    },
    "Instagram": {
        "VIDEO": "Video",
        "IMAGE": "Image",
        "CAROUSEL_ALBUM": "Carousel / Document",
        "Storie": "Story",
    },
    "X": {
        "mixed": "Image",
        "gif": "Other",
        "image": "Image",
        "video": "Video",
    },
    "LinkedIn": {
        "Article": "Other",
        "Repost": "Other",
        "Image": "Image",
        "Poll": "Poll",
        "Document": "Carousel / Document",
        "Video": "Video",
    },
}

# Flat lookup keyed by (channel, raw post type).
post_type_mapping = {
    (kanal, roh): einheitlich
    for kanal, typen in _POST_TYPES_BY_CHANNEL.items()
    for roh, einheitlich in typen.items()
}


def map_post_type(channel, post_type):
    """Translate a channel-specific post type into the unified post type.

    Args:
        channel: Channel name as used in the keys of ``post_type_mapping``.
        post_type: Raw post type of that channel; the match is exact and
            case-sensitive.

    Returns:
        The unified post type, or "Keine Zuordnung möglich" when the
        (channel, post_type) pair has no mapping entry.
    """
    try:
        return post_type_mapping[(channel, post_type)]
    except KeyError:
        return "Keine Zuordnung möglich"

# Spark UDF wrapper around map_post_type; the result is a string column.
map_post_type_udf = udf(map_post_type, returnType=StringType())

# Swap the raw PostType for its unified value. The value goes through a
# temporary column that is renamed afterwards, so PostType ends up as the
# last column of the frame.
# NOTE: this overwrites df_consolidated, so re-running this cell on its own
# would feed the already-unified values through the mapping a second time.
df_consolidated = (
    df_consolidated
    .withColumn(
        "PostTypeGeneralized",
        map_post_type_udf(F.col("Channel"), F.col("PostType")),
    )
    .drop("PostType")
    .withColumnRenamed("PostTypeGeneralized", "PostType")
)

# Write Table

In [0]:
# Hand the consolidated frame to fn_overwrite_table for writing as
# "consolidated_socials". The helper is not defined in this notebook
# (presumably it comes from nb_init - TODO confirm).
fn_overwrite_table(
    df_source=df_consolidated,
    target_schema_name=target_schema_name,
    target_table_name="consolidated_socials",
    target_path=target_path,
)