%md
# 01-raw: Auslesen der Volltexte aus Talkwalker

* Um was handelt es sich hier  (Kurzbeschreibung Inhalt):  
Anbinden der Talkwalker-API, um Social-Listening-Daten in 01_Raw abzuspeichern.


---
## QUELLEN:  
- Talkwalker API 

## ZIEL  
- Unity-Catalog: 
  - datif_pz_uk_dev.01-toRaw.talkwalker

  
---
* Versionen (aktuelle immer oben):
- 10.07.2025 Max Mustermann: Init

In [0]:
%run ../../common/nb_init

In [0]:
import requests
import pandas as pd
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, DoubleType,
    BooleanType, TimestampType, MapType, ArrayType, DateType  
)
import datetime
import numpy as np


In [0]:
# Define schema name and path to store tables
# NOTE(review): the notebook header names `datif_pz_uk_dev.01-toRaw.talkwalker` as the
# target, but the values below point at schema "02_cleaned" and table
# "talkwalker_beispiel_30d" — confirm which target is intended.
target_schema_name = "02_cleaned"
target_path = "talkwalker_raw"
target_table_name="talkwalker_beispiel_30d"
# Set source and trg path
# `sta_endpoint_pz_uk` is not defined in this notebook (presumably provided by the
# `%run ../../common/nb_init` cell — verify). `source_path` is not referenced again
# in the visible cells.
source_path = sta_endpoint_pz_uk["01_raw"] + "/talkwalker"

In [0]:
# Talkwalker API credential; get() sends it as the Bearer token of every request.
# `get_secret` is not defined in this notebook (presumably provided by nb_init — verify).
secret= get_secret("Talkwalker-API")


In [0]:
def get(URL: str, timeout: float = 30.0) -> "dict | None":
    """
    Send a GET request to a Talkwalker API endpoint and return the decoded JSON body.

    The module-level `secret` is sent as a Bearer token in the Authorization header.

    Args:
        URL: Full endpoint URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` may block indefinitely on a stalled connection.

    Returns:
        The parsed JSON payload when the server answers with HTTP 200,
        otherwise None (status code and response text are printed).

    Raises:
        requests.exceptions.RequestException: On connection problems or when the
            timeout elapses.
    """
    response = requests.get(
        URL,
        headers={"Authorization": f"Bearer {secret}"},
        timeout=timeout,
    )
    if response.status_code == 200:
        print("✅ API funktioniert!")
        return response.json()

    print("❌ Fehler beim API-Aufruf:", response.status_code)
    print(response.text)
    # Explicit failure marker: callers must check for None before subscripting.
    return None

## Get Status

In [0]:
# Query the credit status endpoint and report the remaining monthly credits.
status_url = "https://api.talkwalker.com/api/v1/status/credits"
data = get(status_url)
if data is None:
    # get() returns None for every non-200 response; fail with a clear message
    # instead of a TypeError from subscripting None below.
    raise RuntimeError("Talkwalker credit status request failed; see the API error output above.")
print(data)
print("Verbleibende monatliche Credits:", data["result_creditinfo"]["remaining_credits_monthly"])

## Get Search API

In [0]:
# List the projects visible to this account and pick the first one as `project_id`.
projekt_url = "https://api.talkwalker.com/api/v1/search/info"
data = get(projekt_url)
if data is None:
    # get() returns None for every non-200 response.
    raise RuntimeError("Talkwalker project info request failed; see the API error output above.")
print(data)
projects = data["result_accinfo"]["projects"]
for project in projects:
    print("Projekt:", project["name"], "| ID:", project["id"])
if not projects:
    # Avoid an opaque IndexError on projects[0] when the account has no projects.
    raise RuntimeError("Talkwalker account returned no projects; cannot determine project_id.")
# The first listed project is used for all following requests.
project_id = projects[0]["id"]

## Get Topics

In [0]:
topic_result_url = f"https://api.talkwalker.com/api/v2/talkwalker/p/{project_id}/topics/list"

data = get(topic_result_url)
if data is None:
    # get() returns None for every non-200 response.
    raise RuntimeError("Talkwalker topic list request failed; see the API error output above.")

# print(data)
# Map subtopic title -> subtopic id for the "ICC UK" topic category only.
sub_topic_dict = {}
topic_id = None
for topic in data["result_topics"]["topic_categories"]:
    # The category filter does not depend on the subtopic, so test it once per category.
    if topic["title"] != "ICC UK":
        continue
    topic_id = topic["id"]
    for sub_topic in topic["query_topics"]:
        # Single quotes inside the f-string: reusing the outer double quotes is a
        # SyntaxError on Python versions before 3.12.
        print(f"Topic: {topic['title']}, ID: {topic['id']}, Subtopic: {sub_topic['title']}, ID: {sub_topic['id']}")
        sub_topic_dict[sub_topic["title"]] = sub_topic["id"]

if topic_id is None:
    # Without this guard the print below would fail with a NameError.
    raise RuntimeError('No topic category titled "ICC UK" was returned by the topic list endpoint.')

print(topic_id)
print(sub_topic_dict)

### Get Summary

In [0]:
# summary_result_url = f"https://api.talkwalker.com/api/v1/search/p/{project_id}/summary?time_range=1d&topic={topic_id}&q=A%20AND%20A"

# data = get(summary_result_url)
# print(data)

## Get Project Results 

In [0]:

# Collected result records (one dict per hit) across all subtopics
df_list = []

# Union of all field names seen in the collected records
all_columns = set()

# NOTE(review): exactly one request per subtopic is sent (hpp=500, presumably
# "hits per page"); there is no pagination — confirm that one page is sufficient.
for subtopic_title, subtopic_id in sub_topic_dict.items():
    url = f"https://api.talkwalker.com/api/v1/search/p/{project_id}/results?time_range=30d&topic={subtopic_id}&timezone=Europe/Berlin&hpp=500"
    data = get(url)

    if data is None:
        # get() already printed the HTTP error; skip this subtopic explicitly
        # rather than relying on a TypeError from subscripting None.
        print(f"Error processing item: no data returned for subtopic {subtopic_title!r}")
        continue

    try:
        for item in data["result_content"]["data"]:
            flat = item.get("data")
            # Keep only non-empty dict payloads
            if isinstance(flat, dict) and flat:
                flat["subtopic"] = subtopic_title
                # Track the record's own keys (not the keys of the response envelope)
                all_columns.update(flat)
                df_list.append(flat)
    except (KeyError, TypeError, AttributeError) as e:
        # Unexpected response shape: report it and continue with the next subtopic
        print(f"Error processing item: {e}")

# Build one DataFrame from all records; pandas aligns differing key sets into columns
df = pd.DataFrame(df_list)
if df.empty:
    # The following cells require a "published" column; stop here with a clear message.
    raise RuntimeError("No Talkwalker results were collected for any subtopic.")


### Convert Published Date

In [0]:
# Convert the epoch-millisecond `published` field into datetime columns.
df["published"] = pd.to_numeric(df["published"], errors="coerce")
# Parse once; values that cannot be converted become NaT, matching the
# errors="coerce" policy of the numeric conversion above.
df["published_dt"] = pd.to_datetime(df["published"], unit="ms", utc=True, errors="coerce")
df["published_dt_local"] = df["published_dt"].dt.tz_convert("Europe/Berlin")
# NOTE(review): this is the UTC calendar date, not the Europe/Berlin one — confirm that is intended.
df["published_date"] = df["published_dt"].dt.date

In [0]:
# Display (rows, columns) of the collected results as a quick sanity check.
df.shape

In [0]:
# Preview the first 10 records.
df.head(10)

### Create Schema

In [0]:
def infer_type(val: object) -> object:
    """
    Map a single Python / NumPy / pandas value to a PySpark data type.

    Args:
        val (object): One value taken from a pandas DataFrame column: a primitive
                      (bool, int, float, str), a datetime/date, a list, a dict,
                      a pandas/NumPy scalar, or None.

    Returns:
        object: A PySpark data type instance suitable for a StructField.
                Integers outside the signed 32-bit range, None and every
                unrecognised value map to StringType.
    """
    # bool must be tested before int: Python's bool is a subclass of int.
    if isinstance(val, (bool, np.bool_)):
        return BooleanType()
    if isinstance(val, (int, np.integer)):
        # Spark IntegerType is a signed 32-bit integer; values outside that range
        # (in either direction) are stored as strings.
        if -2_147_483_648 <= val <= 2_147_483_647:
            return IntegerType()
        return StringType()
    if isinstance(val, (float, np.floating)):
        return DoubleType()
    # pd.Timestamp is a datetime.datetime subclass, so one check covers both.
    # This must precede the date check because datetime is itself a date subclass.
    if isinstance(val, datetime.datetime):
        return TimestampType()
    if isinstance(val, datetime.date):
        return DateType()
    if isinstance(val, dict):
        return MapType(StringType(), StringType(), True)
    if isinstance(val, list):
        return ArrayType(StringType(), True)
    return StringType()


def infer_spark_schema_from_pandas(df: pd.DataFrame) -> StructType:
    """
    Build a PySpark StructType for a pandas DataFrame.

    Each column's Spark type is derived by passing its first non-null value
    (or None when the column has no non-null values) to `infer_type`.
    All fields are declared nullable.

    Args:
        df (pd.DataFrame): The pandas DataFrame to describe.

    Returns:
        StructType: One StructField per column, in column order.
    """
    def _first_present(column_name):
        # First non-null entry of the column, or None when there is none
        present = df[column_name].dropna()
        return present.iloc[0] if present.size > 0 else None

    return StructType([
        StructField(column_name, infer_type(_first_present(column_name)), True)
        for column_name in df.columns
    ])

In [0]:
def normalize_pandas_types(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `df` whose columns have consistent, conversion-friendly types.

    The handling of each column is chosen from its first non-null value:

    - bool / str / pd.Timestamp: left unchanged
    - int: converted with `pd.to_numeric(errors="coerce")`, then cast to nullable "Int64"
    - float: converted with `pd.to_numeric(errors="coerce")`, then cast to float
    - date / datetime: null-like entries replaced by None
    - list: non-list entries replaced by an empty list
    - dict: non-dict entries replaced by an empty dict
    - anything else: cast to str

    Columns without any non-null value are left untouched.

    Args:
        df (pd.DataFrame): Input frame with possibly inconsistent types. It is not modified.

    Returns:
        pd.DataFrame: New frame with normalized column types.
    """
    df_clean = df.copy()

    for col in df_clean.columns:
        non_na = df_clean[col].dropna()
        if non_na.empty:
            continue  # no values, nothing to normalize

        sample = non_na.iloc[0]

        # Booleans are tested before integers: Python's bool is a subclass of int,
        # so the int branch would otherwise turn True/False into 1/0. NumPy
        # booleans are included so bool-dtype columns are not stringified below.
        if isinstance(sample, (bool, np.bool_, str, pd.Timestamp)):
            pass  # already in a usable form
        elif isinstance(sample, (int, np.integer)):
            df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce").astype("Int64")
        elif isinstance(sample, (float, np.floating)):
            df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce").astype(float)
        elif isinstance(sample, datetime.date):
            df_clean[col] = df_clean[col].apply(lambda x: x if pd.notnull(x) else None)
        elif isinstance(sample, list):
            df_clean[col] = df_clean[col].apply(lambda x: x if isinstance(x, list) else [])
        elif isinstance(sample, dict):
            df_clean[col] = df_clean[col].apply(lambda x: x if isinstance(x, dict) else {})
        else:
            df_clean[col] = df_clean[col].astype(str)

    return df_clean


In [0]:
# Normalize column types on a copy of `df` before handing the data to Spark.
df_norm = normalize_pandas_types(df)


In [0]:
# Derive the Spark schema from the normalized pandas frame and build the Spark DataFrame with it.
Schema = infer_spark_schema_from_pandas(df_norm)
# print(Schema)
# `spark` is not defined in this notebook (presumably the session provided by the runtime — verify).
df_final = spark.createDataFrame(df_norm, schema=Schema)



df_final.printSchema()

In [0]:
# Hand the Spark DataFrame to fn_overwrite_table together with the configured target
# schema, table name and path. The helper is not defined in this notebook
# (presumably provided by nb_init — verify its overwrite semantics).
fn_overwrite_table(
    df_source=df_final,
    target_schema_name=target_schema_name,
    target_table_name=target_table_name,
    target_path=target_path
)
# Reached only when the call above returned without raising.
print("✅ Tabelle erfolgreich aktualisiert.")