# UK - Anbindung Paid Organic Themen 

* Um was handelt es sich hier  (Kurzbeschreibung Inhalt):
- Alle Paid- und Organic-SoMe-Total-Tabellen in einer konsolidierten Tabelle


---
* QUELLEN:  
- datif_pz_uk_{}.03_transformed.linkedin_paid_organic_daily
- datif_pz_uk_{}.03_transformed.youtube_paid_organic_daily
- datif_pz_uk_{}.03_transformed.meta_paid_organic_daily
- datif_pz_uk_{}.03_transformed.outbrain_paid_daily
- datif_pz_uk_{}.03_transformed.instagram_organic_stories_daily


* ZIEL:  
- datif_pz_uk_{}.03_transformed.consolidated_socials_paid_organic_daily


---
* Versionen (aktuelle immer oben):
- 08.10.2025 Max Mustermann: init

#Imports

In [0]:
from pyspark.sql.functions import col, lit, to_date, year, udf, concat_ws, when, regexp_replace, trim
from pyspark.sql.types import StringType, StructType, StructField
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [0]:
%run ../../common/nb_init

# Target Schema

In [0]:
# Destination of the consolidated result: both values are handed to
# fn_overwrite_table in the final write cell.
target_path = "funnel"
target_schema_name = "03_transformed"

#Configure Tables

In [0]:
# Per-source mapping used by the consolidation loop in this notebook.
# One dict per source table:
#   - 'table_name' is appended to "datif_pz_uk_{env}." when the table is read.
#   - 'channel' is emitted as a literal, except for the value 'Plattform',
#     which is resolved like a column name.
#   - Every other value is interpreted by _safe_col_from:
#       'Existiert nicht'      -> attribute not available for this source
#       'KPI existiert nicht'  -> KPI not available for this source (zero)
#       'Organic' / 'Paid' / 'Story' / 'Video' -> constant value
#       anything else          -> name of a column in the source table
relevant_tables = [
    {
        'table_name': '03_transformed.youtube_paid_organic_daily',
        # --- General ---
        'organic_id': 'OrganicID',
        'paid_id': 'AdID',
        'source': 'Source',
        'channel': 'YouTube',
        'created_date': 'CreatedDate',
        'campaign_name': 'CampaignName',
        'ad_name': 'AdName',
        'url': 'URL',
        'post_type': 'Video',
        'title': 'VideoDescription',
        'owner': 'Owner',
        'spend': 'Amount_Spend',

        # --- All_ ---
        'all_impressions': 'All_Views',
        'all_engagement': 'All_Engagements',
        'all_weighted_engagement': 'All_WeightedEngagements',
        'all_total_likes': 'All_Likes',
        'all_total_dislikes': 'All_Dislikes',
        'all_total_comments': 'All_Comments',
        'all_total_shares': 'All_Shares',
        'all_total_clicks': 'KPI existiert nicht',
        'all_total_reactions': 'KPI existiert nicht',
        'all_average_view_duration': 'All_AverageViewDuration',

        # --- Organic_ ---
        'organic_impressions': 'Organic_Views',
        'organic_engagement': 'Organic_Engagements',
        'organic_weighted_engagement': 'KPI existiert nicht',
        'organic_total_likes': 'KPI existiert nicht',
        'organic_total_comments': 'KPI existiert nicht',
        'organic_total_shares': 'KPI existiert nicht',
        'organic_total_clicks': 'KPI existiert nicht',
        'organic_total_reactions': 'KPI existiert nicht',

        # --- Paid_ ---
        'paid_impressions': 'Paid_Impressions',
        'paid_engagement': 'Paid_Engagements',
        'paid_weighted_engagement': 'KPI existiert nicht',
        'paid_total_likes': 'KPI existiert nicht',
        'paid_total_comments': 'KPI existiert nicht',
        'paid_total_shares': 'KPI existiert nicht',
        'paid_total_clicks': 'KPI existiert nicht',
        'paid_total_reactions': 'KPI existiert nicht',
    },

    {
        'table_name': '03_transformed.linkedIn_paid_organic_daily',
        # --- General ---
        'organic_id': 'OrganicID',
        'paid_id': 'AdID',
        'source': 'Source',
        'channel': 'LinkedIn',
        'created_date': 'CreatedDate',
        'campaign_name': 'CampaignName',
        'ad_name': 'AdName',
        'url': 'URL',
        'post_type': 'PostType',
        'title': 'Title',
        'owner': 'Existiert nicht',
        'spend': 'Amount_Spend',

        # --- All_ ---
        'all_impressions': 'All_Impressions',
        'all_engagement': 'All_Engagements',
        'all_weighted_engagement': 'All_WeightedEngagements',
        'all_total_likes': 'All_Likes',
        'all_total_comments': 'All_Comments',
        'all_total_shares': 'All_Shares',
        'all_total_clicks': 'All_Clicks',
        'all_total_reactions': 'KPI existiert nicht',
        'all_average_view_duration': 'KPI existiert nicht',
        'all_total_dislikes': 'KPI existiert nicht',

        # --- Organic_ ---
        'organic_impressions': 'Organic_Impressions',
        'organic_engagement': 'Organic_Engagements',
        'organic_weighted_engagement': 'Organic_WeightedEngagements',
        'organic_total_likes': 'Organic_Likes',
        'organic_total_comments': 'Organic_Comments',
        'organic_total_shares': 'Organic_Shares',
        'organic_total_clicks': 'Organic_Clicks',
        'organic_total_reactions': 'KPI existiert nicht',

        # --- Paid_ ---
        'paid_impressions': 'Paid_Impressions',
        'paid_engagement': 'Paid_Engagements',
        'paid_weighted_engagement': 'Paid_WeightedEngagements',
        'paid_total_likes': 'Paid_Likes',
        'paid_total_comments': 'Paid_Comments',
        'paid_total_shares': 'Paid_Shares',
        'paid_total_clicks': 'Paid_Clicks',
        'paid_total_reactions': 'KPI existiert nicht',
    },

    {
        'table_name': '03_transformed.meta_paid_organic_daily',
        # --- General ---
        'organic_id': 'OrganicID',
        'paid_id': 'AdID',
        'source': 'Source',
        # 'Plattform' is the one channel value that the consolidation loop
        # resolves as a column instead of emitting it as a literal.
        'channel': 'Plattform',
        'created_date': 'CreatedDate',
        'campaign_name': 'CampaignName',
        'ad_name': 'AdName',
        'url': 'URL',
        'post_type': 'PostType',
        'title': 'Existiert nicht',
        'owner': 'Existiert nicht',
        'spend': 'Amount_Spend',

        # --- All_ ---
        'all_impressions': 'All_Impressions',
        'all_engagement': 'All_Engagements',
        'all_weighted_engagement': 'All_WeightedEngagements',
        'all_total_shares': 'All_Shares',
        'all_total_clicks': 'All_Clicks',
        'all_total_comments': 'All_Comments',
        'all_total_reactions': 'All_Reactions',
        'all_total_likes': 'KPI existiert nicht',
        'all_total_dislikes': 'KPI existiert nicht',
        'all_average_view_duration': 'KPI existiert nicht',

        # --- Organic_ ---
        'organic_impressions': 'Organic_Impressions',
        'organic_engagement': 'Organic_Engagements',
        'organic_weighted_engagement': 'Organic_WeightedEngagements',
        'organic_total_shares': 'Organic_Shares',
        'organic_total_clicks': 'Organic_Clicks',
        'organic_total_comments': 'Organic_Comments',
        'organic_total_reactions': 'Organic_Reactions',
        # NOTE(review): 'organic_total_saved' is not read by the consolidation
        # select in this notebook, so this entry has no effect there - confirm
        # whether a "Saved" output column is intended.
        'organic_total_saved': 'Organic_Saved',
        'organic_total_likes': 'KPI existiert nicht',

        # --- Paid_ ---
        'paid_impressions': 'Paid_Impressions',
        'paid_engagement': 'Paid_Engagements',
        'paid_weighted_engagement': 'Paid_WeightedEngagements',
        'paid_total_shares': 'Paid_Shares',
        'paid_total_clicks': 'Paid_Clicks',
        'paid_total_comments': 'Paid_Comments',
        'paid_total_reactions': 'Paid_Reactions',
        'paid_total_likes': 'KPI existiert nicht',
    },

    {
        'table_name': '03_transformed.outbrain_paid_daily',
        # --- General ---
        'organic_id': 'Existiert nicht',
        'paid_id': 'AdID',
        'source': 'Paid',
        'channel': 'Outbrain',
        'created_date': 'Existiert nicht',
        'campaign_name': 'CampaignName',
        'ad_name': 'AdName',
        'url': 'Content_URL',
        'post_type': 'Existiert nicht',
        'title': 'Title',
        'owner': 'Existiert nicht',
        'spend': 'Amount_Spend',

        # --- All_ ---
        'all_impressions': 'Impressions',
        'all_engagement': 'KPI existiert nicht',
        'all_weighted_engagement': 'KPI existiert nicht',
        'all_total_clicks': 'Clicks',
        'all_total_likes': 'KPI existiert nicht',
        'all_total_comments': 'KPI existiert nicht',
        'all_total_shares': 'KPI existiert nicht',
        'all_total_reactions': 'KPI existiert nicht',
        'all_total_dislikes': 'KPI existiert nicht',
        'all_average_view_duration': 'Video_Average_View',

        # --- Organic_ ---
        'organic_impressions': 'KPI existiert nicht',
        'organic_engagement': 'KPI existiert nicht',
        'organic_weighted_engagement': 'KPI existiert nicht',
        'organic_total_clicks': 'KPI existiert nicht',
        'organic_total_likes': 'KPI existiert nicht',
        'organic_total_comments': 'KPI existiert nicht',
        'organic_total_shares': 'KPI existiert nicht',
        'organic_total_reactions': 'KPI existiert nicht',

        # --- Paid_ ---
        # The paid fields reuse the same source columns as the All_ fields.
        'paid_impressions': 'Impressions',
        'paid_engagement': 'KPI existiert nicht',
        'paid_weighted_engagement': 'KPI existiert nicht',
        'paid_total_clicks': 'Clicks',
        'paid_total_likes': 'KPI existiert nicht',
        'paid_total_comments': 'KPI existiert nicht',
        'paid_total_shares': 'KPI existiert nicht',
        'paid_total_reactions': 'KPI existiert nicht',
    },

    {
        'table_name': '03_transformed.instagram_organic_stories_daily',
        # --- General ---
        'organic_id': 'StoryID',
        'paid_id': 'Existiert nicht',
        'source': 'Organic',
        'channel': 'Instagram',
        'created_date': 'CreatedDate',
        'campaign_name': 'Existiert nicht',
        'ad_name': 'Existiert nicht',
        'url': 'Existiert nicht',
        'post_type': 'Story',
        'title': 'Caption',
        'owner': 'Existiert nicht',
        'spend': 'KPI existiert nicht',

        # --- All_ ---
        'all_impressions': 'TotalImpressionsSum',
        'all_engagement': 'Engagement',
        'all_weighted_engagement': 'WeightedEngagements',
        'all_total_shares': 'TotalSharesSum',
        # Story replies are mapped onto the comments fields.
        'all_total_comments': 'TotalRepliesSum',
        'all_total_likes': 'KPI existiert nicht',
        'all_total_clicks': 'KPI existiert nicht',
        'all_total_reactions': 'KPI existiert nicht',
        'all_average_view_duration': 'KPI existiert nicht',
        'all_total_dislikes': 'KPI existiert nicht',

        # --- Organic_ ---
        # NOTE(review): the organic fields use the columns without the "Sum"
        # suffix while the All_ fields use the "...Sum" columns - confirm this
        # difference is intended.
        'organic_impressions': 'TotalImpressions',
        'organic_engagement': 'Engagement',
        'organic_weighted_engagement': 'WeightedEngagements',
        'organic_total_shares': 'TotalShares',
        'organic_total_comments': 'TotalReplies',
        'organic_total_likes': 'KPI existiert nicht',
        'organic_total_clicks': 'KPI existiert nicht',
        'organic_total_reactions': 'KPI existiert nicht',

        # --- Paid_ ---
        'paid_impressions': 'KPI existiert nicht',
        'paid_engagement': 'KPI existiert nicht',
        'paid_weighted_engagement': 'KPI existiert nicht',
        'paid_total_shares': 'KPI existiert nicht',
        'paid_total_comments': 'KPI existiert nicht',
        'paid_total_likes': 'KPI existiert nicht',
        'paid_total_clicks': 'KPI existiert nicht',
        'paid_total_reactions': 'KPI existiert nicht',
    }
]


# Combine relevant Tables

In [0]:
# Column names of the unified layout, in output order. Every column is
# declared as a nullable string.
_consolidated_column_names = [
    # --- IDs & general attributes ---
    "OrganicID", "PaidID", "Source", "Channel", "CreatedDate", "CampaignName",
    "AdName", "URL", "PostType", "Title", "Owner", "Spend",
    # --- Impressions ---
    "All_Impressions", "Organic_Impressions", "Paid_Impressions",
    # --- Engagements ---
    "All_Engagements", "Organic_Engagements", "Paid_Engagements",
    # --- Weighted engagements ---
    "All_WeightedEngagements", "Organic_WeightedEngagements", "Paid_WeightedEngagements",
    # --- Likes ---
    "All_Likes", "Organic_Likes", "Paid_Likes",
    # --- Dislikes ---
    "All_Dislikes",
    # --- Comments ---
    "All_Comments", "Organic_Comments", "Paid_Comments",
    # --- Shares ---
    "All_Shares", "Organic_Shares", "Paid_Shares",
    # --- Replies ---
    "All_Replies", "Organic_Replies", "Paid_Replies",
    # --- Clicks ---
    "All_Clicks", "Organic_Clicks", "Paid_Clicks",
    # --- Reactions ---
    "All_Reactions", "Organic_Reactions", "Paid_Reactions",
    # --- View duration ---
    "All_AverageViewDuration",
    # --- Topics ---
    "StrategischesThema1", "StrategischesThema2", "StrategischesThema3",
    "Themenbereich1", "Themenbereich2", "Themenbereich3",
]

schema = T.StructType(
    [
        T.StructField(column_name, T.StringType(), True)
        for column_name in _consolidated_column_names
    ]
)

# Empty DataFrame carrying the unified schema.
# NOTE(review): df_consolidated is reassigned to None right before the
# consolidation loop further down, so this empty frame looks unused - confirm.
df_consolidated = spark.createDataFrame([], schema=schema)


In [0]:
# ---------------------------------------------------------------------
# Helper for column access
# ---------------------------------------------------------------------
def _safe_col_from(df, name: str):
    """Resolve one mapping value from ``relevant_tables`` to a string column.

    Args:
        df: Source DataFrame; only ``df.columns`` is inspected.
        name: Mapping value - a source column name, one of the sentinel
            strings, one of the constant terms, or None/empty.

    Returns:
        A Column expression that is always of string type:
          - None / empty / 'Fehlt noch' / 'Existiert nicht'
                -> literal "Existiert nicht"
          - 'KPI existiert nicht' -> literal "0"
          - 'Organic' / 'Paid' / 'Story' / 'Video' -> that term as a literal
          - a name that is not a column of ``df`` -> literal "Existiert nicht"
          - otherwise the source column cast to string
    """
    if not name:
        return F.lit("Existiert nicht")

    key = str(name).strip()
    # `key` is always a str here, so only string sentinels need checking;
    # "" covers values that consisted of whitespace only.
    if key in {"", "Fehlt noch", "Existiert nicht"}:
        return F.lit("Existiert nicht")
    if key == "KPI existiert nicht":
        # String "0" instead of integer 0: every other branch yields a string,
        # so the per-table selects share one type and the later union does
        # not have to rely on implicit int-to-string coercion.
        return F.lit("0")
    if key in {"Organic", "Paid", "Story", "Video"}:
        return F.lit(key)
    if key not in df.columns:
        return F.lit("Existiert nicht")

    return F.col(key).cast("string")


# ---------------------------------------------------------------------
# Main logic: bring every source table into the unified column layout
# ---------------------------------------------------------------------
# (mapping key in relevant_tables, output column name), in output order.
# The 'channel' entry is resolved separately inside the loop.
# The '*_replies' keys are not defined by any entry of relevant_tables in
# this notebook, so those outputs fall back to what _safe_col_from returns
# for a missing mapping value.
_FIELD_TO_OUTPUT = [
    # --- IDs & general attributes ---
    ('organic_id', "OrganicID"),
    ('paid_id', "PaidID"),
    ('source', "Source"),
    ('channel', "Channel"),
    ('created_date', "CreatedDate"),
    ('campaign_name', "CampaignName"),
    ('ad_name', "AdName"),
    ('url', "URL"),
    ('post_type', "PostType"),
    ('title', "Title"),
    ('owner', "Owner"),
    ('spend', "Spend"),
    # --- Impressions ---
    ('all_impressions', "All_Impressions"),
    ('organic_impressions', "Organic_Impressions"),
    ('paid_impressions', "Paid_Impressions"),
    # --- Engagements ---
    ('all_engagement', "All_Engagements"),
    ('organic_engagement', "Organic_Engagements"),
    ('paid_engagement', "Paid_Engagements"),
    # --- Weighted engagements ---
    ('all_weighted_engagement', "All_WeightedEngagements"),
    ('organic_weighted_engagement', "Organic_WeightedEngagements"),
    ('paid_weighted_engagement', "Paid_WeightedEngagements"),
    # --- Likes ---
    ('all_total_likes', "All_Likes"),
    ('organic_total_likes', "Organic_Likes"),
    ('paid_total_likes', "Paid_Likes"),
    # --- Dislikes ---
    ('all_total_dislikes', "All_Dislikes"),
    # --- Comments ---
    ('all_total_comments', "All_Comments"),
    ('organic_total_comments', "Organic_Comments"),
    ('paid_total_comments', "Paid_Comments"),
    # --- Shares ---
    ('all_total_shares', "All_Shares"),
    ('organic_total_shares', "Organic_Shares"),
    ('paid_total_shares', "Paid_Shares"),
    # --- Replies ---
    ('all_replies', "All_Replies"),
    ('organic_replies', "Organic_Replies"),
    ('paid_replies', "Paid_Replies"),
    # --- Clicks ---
    ('all_total_clicks', "All_Clicks"),
    ('organic_total_clicks', "Organic_Clicks"),
    ('paid_total_clicks', "Paid_Clicks"),
    # --- Reactions ---
    ('all_total_reactions', "All_Reactions"),
    ('organic_total_reactions', "Organic_Reactions"),
    ('paid_total_reactions', "Paid_Reactions"),
    # --- View duration ---
    ('all_average_view_duration', "All_AverageViewDuration"),
]

# Topic columns start out with their placeholder text.
_TOPIC_DEFAULTS = [
    ("StrategischesThema1", "Kein strategisches Thema"),
    ("StrategischesThema2", "Kein strategisches Thema"),
    ("StrategischesThema3", "Kein strategisches Thema"),
    ("Themenbereich1", "Kein Themenbereich"),
    ("Themenbereich2", "Kein Themenbereich"),
    ("Themenbereich3", "Kein Themenbereich"),
]

df_consolidated = None

for t in relevant_tables:
    print(f"Verarbeite Tabelle: {t['table_name']}")
    df = spark.read.table(f"datif_pz_uk_{env}.{t['table_name']}")

    # Channel is a fixed literal per table; only the value "Plattform" is
    # looked up as a column. The decision is known per table, so it is made
    # in Python rather than as a per-row Spark CASE WHEN on a literal.
    channel_value = t.get('channel', 'Existiert nicht')
    if channel_value == "Plattform":
        channel_col = _safe_col_from(df, channel_value)
    else:
        channel_col = F.lit(channel_value)

    # Uniform select in the unified column order
    select_exprs = []
    for config_key, output_name in _FIELD_TO_OUTPUT:
        if config_key == 'channel':
            expr = channel_col
        else:
            expr = _safe_col_from(df, t.get(config_key))
        select_exprs.append(expr.alias(output_name))
    select_exprs.extend(
        F.lit(default_text).alias(topic_name)
        for topic_name, default_text in _TOPIC_DEFAULTS
    )
    df_sel = df.select(*select_exprs)

    # Union of the per-table frames, matched by column name
    df_consolidated = (
        df_sel if df_consolidated is None
        else df_consolidated.unionByName(df_sel, allowMissingColumns=True)
    )

# Show the result
df_consolidated.display()


### Bug: Bei Facebook ist der PostType manchmal Null, obwohl einer existiert

In [0]:
# 1. Build a lookup (ID -> PostType) from consolidated_socials. Both columns
#    are renamed so they cannot clash with the target columns after the join,
#    and the lookup is reduced to one row per ID.
_socials_for_posttype = spark.read.table(
    f"datif_pz_uk_{env}.03_transformed.consolidated_socials"
)
df_posttype_source = _socials_for_posttype.select(
    F.col("ID").alias("JoinID"),
    F.col("PostType").alias("PostType_source"),
).dropDuplicates(["JoinID"])

# 2. Left join on OrganicID, keep the existing PostType and fall back to the
#    looked-up value only where the existing one is null.
_posttype_join_condition = df_consolidated["OrganicID"] == df_posttype_source["JoinID"]
_with_posttype_lookup = df_consolidated.join(
    df_posttype_source, _posttype_join_condition, how="left"
)
df_consolidated = _with_posttype_lookup.withColumn(
    "PostType",
    F.coalesce(F.col("PostType"), F.col("PostType_source")),
).drop("PostType_source", "JoinID")

### Alle https: in der Postmessage löschen https://A.visualstudio.com/DaTIF/_workitems/edit/4281898/

In [0]:
# Remove every "https://..." link (up to the next whitespace) from Title.
_https_link_pattern = r"https:\/\/\S+"
df_consolidated = df_consolidated.withColumn(
    "Title", regexp_replace(F.col("Title"), _https_link_pattern, "")
)

### Einheitliches PostType Mapping

In [0]:
# (Channel, source PostType) -> unified PostType.
post_type_mapping = {
    ("YouTube", "Video"): "Video",
    ("Facebook", "video_direct_response"): "Video",
    ("Facebook", "video_inline"): "Video",
    ("Facebook", "Ohne Post Type"): "Other",
    ("Facebook", "album"): "Image",
    ("Facebook", "share"): "Other",
    ("Facebook", "photo"): "Image",
    ("Facebook", "profile_media"): "Other",
    ("Facebook", "cover_photo"): "Other",
    ("Facebook", "multi_share"): "Carousel / Document",
    ("Facebook", "multi_share_no_end_card"): "Other",
    ("Instagram", "VIDEO"): "Video",
    ("Instagram", "IMAGE"): "Image",
    ("Instagram", "CAROUSEL_ALBUM"): "Carousel / Document",
    ("Instagram", "Story"): "Story",
    ("LinkedIn", "Article"): "Other",
    ("LinkedIn", "Repost"): "Other",
    ("LinkedIn", "Image"): "Image",
    ("LinkedIn", "Poll"): "Poll",
    ("LinkedIn", "Document"): "Carousel / Document",
    ("LinkedIn", "Video"): "Video",
}


def map_post_type(channel, post_type):
    """Return the unified post type for a (channel, post_type) pair.

    Pairs that are not in ``post_type_mapping`` (including None values)
    yield "Keine Zuordnung möglich".
    """
    return post_type_mapping.get((channel, post_type), "Keine Zuordnung möglich")


# Retained so the name stays available; the transformation below no longer
# uses it.
map_post_type_udf = udf(map_post_type, StringType())


def _post_type_expr(channel_col, post_type_col):
    """Build a native CASE WHEN expression equivalent to ``map_post_type``.

    A static lookup does not need a Python UDF: the chained ``when``
    expression is evaluated by Spark itself. A null channel or post type
    makes every comparison null, which matches no branch, so such rows get
    the same fallback text as in ``map_post_type``.
    """
    expr = None
    for (channel, post_type), unified in post_type_mapping.items():
        condition = (channel_col == channel) & (post_type_col == post_type)
        if expr is None:
            expr = F.when(condition, F.lit(unified))
        else:
            expr = expr.when(condition, F.lit(unified))
    return expr.otherwise(F.lit("Keine Zuordnung möglich"))


df_consolidated = df_consolidated.withColumn(
    "PostTypeGeneralized",
    _post_type_expr(F.col("Channel"), F.col("PostType"))
)

# Drop + rename (instead of overwriting in place) keeps the original column
# order: the unified PostType ends up as the last column.
df_consolidated = df_consolidated.drop("PostType")
df_consolidated = df_consolidated.withColumnRenamed("PostTypeGeneralized", "PostType")

### Themen Mapping und null Werte ersetzen


In [0]:
# Topic lookup: ID (exposed as OrganicID) plus the six topic columns from
# consolidated_socials.
df_consolidated_total = spark.sql(f"""
SELECT
    ID as OrganicID,
    StrategischesThema1,
    StrategischesThema2,
    StrategischesThema3,
    Themenbereich1,
    Themenbereich2,
    Themenbereich3
    from datif_pz_uk_{env}.03_transformed.consolidated_socials""")

# One lookup row per join key, as the PostType back-fill cell already does
# for the same table: a duplicated ID would otherwise multiply the matching
# KPI rows in the left join below.
# NOTE(review): if one ID carries differing topics, which row survives is
# arbitrary - confirm that topics are constant per ID.
df_consolidated_total = df_consolidated_total.dropDuplicates(["OrganicID"])

# Replace the placeholder topic columns with the looked-up ones.
df_consolidated = df_consolidated.drop(
    "StrategischesThema1", "StrategischesThema2", "StrategischesThema3",
    "Themenbereich1", "Themenbereich2", "Themenbereich3"
).join(df_consolidated_total.alias("total"), on="OrganicID", how="left")


def replace_empty_and_null(df, column_name, replacement):
    """Return ``df`` with null/blank values of ``column_name`` replaced.

    Args:
        df: DataFrame containing ``column_name``.
        column_name: Column to clean; it is overwritten in place.
        replacement: Literal written where the value is null or, after
            trimming, an empty string.

    The column is referenced by its plain name rather than through the
    "total" join alias, so the helper does not depend on how ``df`` was
    built. In this notebook the plain name is unambiguous because the
    placeholder topic columns are dropped before the join.
    """
    value = col(column_name)
    return df.withColumn(
        column_name,
        when(value.isNull() | (trim(value) == ""), replacement).otherwise(value)
    )


# Columns and their fallback texts
replacements = {
    "StrategischesThema1": "Kein strategisches Thema",
    "StrategischesThema2": "Kein strategisches Thema",
    "StrategischesThema3": "Kein strategisches Thema",
    "Themenbereich1": "Kein Themenbereich",
    "Themenbereich2": "Kein Themenbereich",
    "Themenbereich3": "Kein Themenbereich",
}

# Apply to the DataFrame
for col_name, replacement_text in replacements.items():
    df_consolidated = replace_empty_and_null(df_consolidated, col_name, replacement_text)

# display(df_consolidated)

### Bug in Themenbereiche

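Alle Schreibweisen des Trenners "/" (mit oder ohne Leerzeichen davor/danach) werden auf " / " vereinheitlicht; doppelte Leerzeichen sowie Leerzeichen am Anfang und Ende werden entfernt. Beispiele:

- "Geothermie/Pumpspeicher" → "Geothermie / Pumpspeicher"
- " / Pumpspeicherkraftwerke" → " / Pumpspeicherkraftwerke"
- "Geothermie / Pumpspeicher" → "Geothermie / Pumpspeicher"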

In [0]:

# Columns the separator rule is applied to
columns_to_fix = ["Themenbereich1", "Themenbereich2", "Themenbereich3"]

for col_name in columns_to_fix:
    # Step 1: every "/" with optional whitespace around it becomes " / "
    spaced_slash = regexp_replace(col(col_name), r"\s*/\s*", " / ")
    # Step 2: runs of two or more whitespace characters collapse to one space
    single_spaced = regexp_replace(spaced_slash, r"\s{2,}", " ")
    # Step 3: strip leading/trailing spaces and write the column back
    df_consolidated = df_consolidated.withColumn(col_name, trim(single_spaced))

#Write Table

In [0]:
# Write the consolidated frame to the target table via fn_overwrite_table.
fn_overwrite_table(
    df_source=df_consolidated,
    target_schema_name=target_schema_name,
    target_table_name="consolidated_socials_paid_organic_daily",
    target_path=target_path,
)