%md
# 01-raw: Auslesen der Volltexte aus Talkwalker

* Um was handelt es sich hier (Kurzbeschreibung Inhalt):  
Anbinden der Talkwalker-API, um Social-Listening-Daten in 01_Raw abzuspeichern.


---
## QUELLEN:  
- Talkwalker API 

## ZIEL  
- Unity-Catalog: 
  - datif_pz_uk_dev.01-toRaw.talkwalker

  
---
* Versionen (aktuelle immer oben):
- 10.07.2025 Minh Hieu Le: Init

In [0]:
%run ../../common/nb_init

In [0]:
pip install azure-storage-blob


In [0]:
import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType, LongType
from pyspark.sql.functions import current_timestamp, from_unixtime, col, regexp_replace, when, lit
from azure.storage.blob import BlobServiceClient
import json
import pandas as pd

In [0]:
# Talkwalker API key, resolved through get_secret(); sent as Bearer token by get().
secret = get_secret("Talkwalker-API")


In [0]:
def get(URL: str, timeout: float = 60) -> "dict | None":
    """
    Send an authenticated GET request to the Talkwalker API and return the parsed JSON body.

    The module-level ``secret`` is sent as a Bearer token in the ``Authorization`` header.

    Args:
        URL: Full URL of the endpoint to call.
        timeout: Seconds to wait for the server before giving up. Without a timeout
            ``requests`` waits indefinitely, which would hang the notebook.

    Returns:
        The decoded JSON response when the server answers with HTTP 200, otherwise
        ``None`` (the status code and the response body are printed in that case).

    Raises:
        requests.exceptions.RequestException: On connection problems or when the
            timeout expires.
    """
    response = requests.get(
        URL,
        headers={"Authorization": f"Bearer {secret}"},
        timeout=timeout,
    )
    if response.status_code == 200:
        print("✅ API funktioniert!")
        return response.json()

    # Non-200: report the problem and signal the failure explicitly to the caller.
    print("❌ Fehler beim API-Aufruf:", response.status_code)
    print(response.text)
    return None

## Get Status

In [0]:
# Query the credit status endpoint and show the remaining monthly credits.
status_url = "https://api.talkwalker.com/api/v1/status/credits"
data = get(status_url)
print(data)

remaining_credits_monthly = data["result_creditinfo"]["remaining_credits_monthly"]
print("Verbleibende monatliche Credits:", remaining_credits_monthly)

## Get Search API

In [0]:
# List the projects returned by the search/info endpoint; the first one is used from here on.
projekt_url = "https://api.talkwalker.com/api/v1/search/info"
data = get(projekt_url)
print(data)

projects = data["result_accinfo"]["projects"]
for project in projects:
    print("Projekt:", project["name"], "| ID:", project["id"])

project_id = projects[0]["id"]

## Get Topics

In [0]:
# Fetch the topic list of the project and collect the sub-topics of the "ICC UK" category.
topic_result_url = f"https://api.talkwalker.com/api/v2/talkwalker/p/{project_id}/topics/list"

data = get(topic_result_url)

# Maps sub-topic title -> sub-topic id for the "ICC UK" category.
sub_topic_dict = {}
# Initialised up front so the print below cannot hit an unbound (or stale) name
# when no "ICC UK" category exists.
topic_id = None

for topic in data["result_topics"]["topic_categories"]:
    # Only one category is of interest; skip the others before touching their sub-topics.
    if topic["title"] != "ICC UK":
        continue
    topic_id = topic["id"]
    for sub_topic in topic["query_topics"]:
        # Single quotes inside the f-string keep this valid on Python versions before 3.12.
        print(f"Topic: {topic['title']}, ID: {topic['id']}, Subtopic: {sub_topic['title']}, ID: {sub_topic['id']}")
        sub_topic_dict[sub_topic["title"]] = sub_topic["id"]

print(topic_id)
print(sub_topic_dict)

In [0]:
# Result records (one dict per post) collected across all sub-topics
df_list = []

# Union of all record keys, i.e. the column names of the resulting DataFrame
all_columns = set()

for subtopic_title, subtopic_id in sub_topic_dict.items():
    url = f"https://api.talkwalker.com/api/v1/search/p/{project_id}/results?time_range=30d&topic={subtopic_id}&timezone=Europe/Berlin&hpp=500"
    data = get(url)

    # Best effort per sub-topic: a failed call (data is None) or an unexpected
    # response shape is reported and the loop moves on to the next sub-topic.
    try:
        for item in data["result_content"]["data"]:
            flat = item.get("data")
            # Only non-empty dict payloads can become rows.
            if not isinstance(flat, dict) or not flat:
                continue
            flat["subtopic"] = subtopic_title
            # Track the keys of the record itself (not of the API response envelope).
            all_columns.update(flat.keys())
            df_list.append(flat)
    except Exception as e:
        print(f"Error processing item: {e}")

# Combine all records into one pandas DataFrame; missing keys become NaN columns.
df = pd.DataFrame(df_list)

## Get Project Results 

In [0]:
# Fetch the project results (time_range=7d), reduce every post to a fixed set of
# fields, keep the `ingested_at` of URLs that are already known, and write the result
# both as JSON to the raw container and to the target table.
project_results_url = f"https://api.talkwalker.com/api/v1/search/p/{project_id}/results?time_range=7d"
data = get(project_results_url)

print(data)

# get() yields None for any non-200 response. Abort here with a clear message so that
# neither the JSON export nor the target table below is overwritten after a failed call.
if data is None:
    raise RuntimeError("Talkwalker project results could not be fetched; see the API error printed above.")

results = []

for entry in data['result_content']['data']:
    post = entry['data']
    # `or {}` also covers keys that are present but null; .get(key, {}) would return
    # None in that case and the chained .get() would raise AttributeError.
    author_attributes = post.get('extra_author_attributes') or {}
    article_attributes = post.get('article_extended_attributes') or {}
    results.append({
        'title': post.get('title'),
        'content': post.get('content_snippet'),
        'url': post.get('url'),
        'root_url': post.get('root_url'),
        'domain_url': post.get('domain_url'),
        'published_at': post.get('published'),
        #'language': post.get('lang'),
        'source_type': post.get('source_type'),
        'post_type': post.get('post_type'),
        'images': post.get('images'),
        'author': author_attributes.get('name'),
        #'country': post.get('extra_author_attributes', {}).get('world_data', {}).get('country'),
        'sentiment': post.get('sentiment'),
        'youtube_views': article_attributes.get('youtube_views'),
        'youtube_likes': article_attributes.get('youtube_likes'),
        'num_comments': article_attributes.get('num_comments'),
        #'alexa_pageviews': post.get('source_extended_attributes', {}).get('alexa_pageviews'),
        #'alexa_unique_visitors': post.get('source_extended_attributes', {}).get('alexa_unique_visitors'),
        'reach': post.get('reach'),
        'engagement': post.get('engagement')
    })

# Explicit schema for the extracted fields (all nullable)
schema = StructType([
    StructField("title", StringType(), True),
    StructField("content", StringType(), True),
    StructField("url", StringType(), True),
    StructField("root_url", StringType(), True),
    StructField("domain_url", StringType(), True),
    StructField("published_at", LongType(), True),
    #StructField("language", StringType(), True),
    StructField("source_type", StringType(), True),
    StructField("post_type", StringType(), True),
    StructField("images", StringType(), True),
    StructField("author", StringType(), True),
    #StructField("country", StringType(), True),
    StructField("sentiment", IntegerType(), True),
    StructField("youtube_views", IntegerType(), True),
    StructField("youtube_likes", IntegerType(), True),
    StructField("num_comments", IntegerType(), True),
    #StructField("alexa_pageviews", LongType(), True),
    #StructField("alexa_unique_visitors", LongType(), True),
    StructField("reach", IntegerType(), True),
    StructField("engagement", IntegerType(), True)
])

# Create the PySpark DataFrame
df_api = spark.createDataFrame(results, schema=schema)

# The value is divided by 1000 before from_unixtime (which takes seconds), i.e. it is
# treated as epoch milliseconds.
df_api = df_api.withColumn("published_at", from_unixtime(col("published_at")/1000))

# Strip the list/dict punctuation from the string form of these two fields.
df_api = df_api.withColumn("post_type", regexp_replace("post_type", r"[\[\]]", ""))

df_api = df_api.withColumn("images", regexp_replace("images", r"[\[\]\{\}]", ""))

df_existing = spark.read.table("datif_pz_uk_dev.02_cleaned.talkwalker")

# Left join on url so every API row is kept and known URLs bring along their ingested_at.
df_joined = df_api.join(df_existing.select("url", "ingested_at"), on="url", how="left")

# URLs seen for the first time get the current timestamp as ingested_at.
df = df_joined.withColumn(
    "ingested_at",
    when(col("ingested_at").isNull(), current_timestamp()).otherwise(col("ingested_at"))
)

display(df)

df.coalesce(1).write.mode("overwrite").json("abfss://01-raw@datifpzukdevstdfs.dfs.core.windows.net/talkwalker")

fn_overwrite_table(df, target_schema_name=target_schema_name, target_table_name="talkwalker", target_path=target_path)

# NOTE(review): only a single response page is processed above; pagination is not
# followed. Sample of a `pagination` object (presumably taken from an API response):
#   "pagination": {
#     "next": "GET /api/v1/search/p/a1ab1abc-a123-123a-123a-abcd12ab1a12/results?q=cats%20AND%20dogs&offset=2&hpp=2&sort_by=engagement&sort_order=desc&hl=true&pretty=false&topic&filter&channel&panel&dataset&access_token=abcd_1234_abcd",
#     "total": 1529

## GET Project Results and Save as JSON

### GET Project Results and save as JSON

### Read existing data

In [0]:
# Load the records stored by a previous run from the raw container. A later cell joins
# against this frame to carry over the `ingested_at` values of already known URLs.
df_existing = spark.read.json("abfss://01-raw@datifpzukdevstdfs.dfs.core.windows.net/talkwalker/data.json")

display(df_existing)


### Read all data from API 

In [0]:
# Fetch results from the API, upload them as NDJSON to blob storage, read them back
# with Spark and attach `ingested_at` (existing value for known URLs, now for new ones).

# NOTE(review): `subtopic_id` is not set in this cell; it holds whatever value the
# sub-topic loop of an earlier cell left behind, so only that one sub-topic is queried.
# Confirm this is intended.
project_results_url = f"https://api.talkwalker.com/api/v1/search/p/{project_id}/results?time_range=30d&topic={subtopic_id}&timezone=Europe/Berlin&hpp=500"
api_data = get(project_results_url)

# get() yields None on a non-200 response; stop before anything in storage is overwritten.
if api_data is None:
    raise RuntimeError("Talkwalker results could not be fetched; see the API error printed above.")

raw_results = api_data.get("result_content", {}).get("data", [])

# Flatten the list (pull out each inner `data` object)
records = [item["data"] for item in raw_results if "data" in item]

# An empty upload would overwrite the blob and leave Spark with a frame that has no
# `url` column, so the join below would fail with a far less helpful error.
if not records:
    raise RuntimeError("Talkwalker API returned no records; nothing to upload.")

# Newline-delimited JSON (NDJSON): one record per line
json_lines = "\n".join(json.dumps(record) for record in records)

# Upload target in Azure Blob Storage
# NOTE(review): the connection string is blank here, which is not a usable value for
# from_connection_string(); presumably it has to be filled in at run time. It should
# come from the secret store rather than being hard-coded in the notebook.
connection_string = ""
container = "01-raw"
blob = "talkwalker/api_data.json"

# Save API data to storage
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
blob_client = blob_service_client.get_blob_client(container=container, blob=blob)
blob_client.upload_blob(json_lines, overwrite=True)

# Read the uploaded API data back as a Spark DataFrame
df_api = spark.read.json("abfss://01-raw@datifpzukdevstdfs.dfs.core.windows.net/talkwalker/api_data.json")

# Left join on url: every API row is kept, known URLs bring along their ingested_at
df_joined = df_api.join(df_existing.select("url", "ingested_at"), on="url", how="left")

# URLs seen for the first time get the current timestamp as ingested_at
df = df_joined.withColumn(
    "ingested_at",
    when(col("ingested_at").isNull(), current_timestamp()).otherwise(col("ingested_at"))
)

display(df)


### Save all data to Blob Storage

In [0]:
# Persist `df` as one JSON file at `final_path`.
tmp_path = "abfss://01-raw@datifpzukdevstdfs.dfs.core.windows.net/talkwalker/tmp_data_json"
final_path = "abfss://01-raw@datifpzukdevstdfs.dfs.core.windows.net/talkwalker/data.json"

# Spark writes a folder of part files; coalesce(1) reduces the output to one partition.
# `df` is built from `df_existing`, which an earlier cell reads from `final_path`, so
# the data is written to the temporary folder first while the old file still exists.
df.coalesce(1).write.mode("overwrite").json(tmp_path)

# Locate the JSON part file inside the temporary folder
files = dbutils.fs.ls(tmp_path)
json_files = [f.path for f in files if f.name.endswith(".json")]
if not json_files:
    # Fail before the existing file is removed; a bare [0] would raise an opaque IndexError.
    raise FileNotFoundError(f"No JSON part file found in {tmp_path}")
json_file = json_files[0]

# Replace the existing file with the freshly written one
dbutils.fs.rm(final_path, True)
dbutils.fs.mv(json_file, final_path)


In [0]:
# Request the project results with time_range=7d; the parsed response is kept in
# `data` (None when get() received a non-200 status).
project_results_url = f"https://api.talkwalker.com/api/v1/search/p/{project_id}/results?time_range=7d"
data = get(project_results_url) 

In [0]:
# Call the search results endpoint without a project, using the query q=mobility,
# and print the raw response.
results_url = "https://api.talkwalker.com/api/v1/search/results?q=mobility"
data = get(results_url)
print(data)

## GET Summary API

In [0]:
# Call the search summary endpoint without a project, using the query q=A,
# and print the raw response.
summary_url = "https://api.talkwalker.com/api/v1/search/summary?q=A"
data = get(summary_url)
print(data)

In [0]:
# Call the summary endpoint of the selected project and print the raw response.
project_summary_url = f"https://api.talkwalker.com/api/v1/search/p/{project_id}/summary"
data = get(project_summary_url)
print(data)

## GET Histogram API

In [0]:
# Call the `histogram/published` endpoint of the selected project and print the raw response.
histo_api = f"https://api.talkwalker.com/api/v1/search/p/{project_id}/histogram/published"
data = get(histo_api)
print(data)

In [0]:
# Call the `histogram/top_influencers` endpoint of the selected project and print the raw response.
histo_infl_api = f"https://api.talkwalker.com/api/v1/search/p/{project_id}/histogram/top_influencers"
data = get(histo_infl_api)
print(data)