In [0]:
import sys
sys.path.append('/Workspace/Users/laugur1508@gmail.com/twitch_data_pipeline/notebooks')
import dlt
import requests
import pandas as pd
from pyspark.sql.functions import current_timestamp, to_timestamp, concat, col, lit
import logging
import json
from utils.auth import get_access_token, twitch_api_request
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, TimestampType

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Fallback schema for an empty streamers DataFrame; every column is nullable.
# NOTE(review): these column names (id / login / display_name) are the raw
# names, whereas clean_profile_data renames them (user_id / username / ...) —
# confirm which set the empty result is supposed to expose.
empty_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", StringType()),
        ("login", StringType()),
        ("display_name", StringType()),
        ("data_collection_time", TimestampType()),
        ("collection_date", StringType()),
    )
])

def get_streamers_profile_data(streamers, client_id, access_token):
    """Fetch Twitch user records for the given logins, one request per batch.

    The logins are sent to the Helix ``/users`` endpoint in slices of up to
    100, each slice encoded as repeated ``login`` query parameters. A batch
    that raises while being requested or unpacked is logged and skipped, so
    the result may cover only part of ``streamers``.

    Args:
        streamers: Sliceable sequence of login names to look up.
        client_id: Forwarded unchanged to ``twitch_api_request``.
        access_token: Forwarded unchanged to ``twitch_api_request``.

    Returns:
        A list holding the ``"data"`` entries of every successful batch, in
        request order. Empty when ``streamers`` is empty or all batches fail.
    """
    endpoint = "https://api.twitch.tv/helix/users"
    chunk_len = 100
    collected = []

    chunk_starts = range(0, len(streamers), chunk_len)
    for batch_no, start in enumerate(chunk_starts, start=1):
        query = [("login", name) for name in streamers[start:start + chunk_len]]
        try:
            response = twitch_api_request(endpoint, client_id, access_token, params=query)
            found = response.get("data", [])
            collected.extend(found)
            logger.info(f"User batch {batch_no}: Found {len(found)} users")
        except Exception as exc:
            # Best effort: one failing batch must not discard the others.
            logger.error(f"User batch {batch_no} failed: {exc}")

    return collected

def get_streamers_followers_data(user_ids, client_id, access_token):
    """Look up the follower total of each broadcaster, one request per id.

    Every id in ``user_ids`` yields exactly one entry in the result. When the
    request (or reading its ``"total"`` field) raises, the error is logged
    and the entry is recorded with a follower count of 0.

    Args:
        user_ids: Iterable of broadcaster ids.
        client_id: Forwarded unchanged to ``twitch_api_request``.
        access_token: Forwarded unchanged to ``twitch_api_request``.

    Returns:
        A list of ``{"user_id": ..., "followers_count": ...}`` dicts in the
        same order as ``user_ids``.
    """
    endpoint = "https://api.twitch.tv/helix/channels/followers"
    results = []

    for broadcaster in user_ids:
        try:
            payload = twitch_api_request(
                endpoint,
                client_id,
                access_token,
                params={"broadcaster_id": broadcaster},
            )
            total = payload.get("total", 0)
            results.append({"user_id": broadcaster, "followers_count": total})
            logger.info(f"Got followers for {broadcaster}: {total}")
        except Exception as exc:
            logger.error(f"Failed to get followers for {broadcaster}: {exc}")
            # Keep a placeholder row so the broadcaster is still represented.
            results.append({"user_id": broadcaster, "followers_count": 0})

    return results

def get_streamers_channel_info(user_ids, client_id, access_token):
    """Fetch channel information for the given broadcasters in batches of 20.

    Each batch is sent to the Helix ``/channels`` endpoint as repeated
    ``broadcaster_id`` query parameters. A batch that raises is logged and
    skipped, so the result may be partial or empty.

    Args:
        user_ids: Sliceable sequence of broadcaster ids.
        client_id: Forwarded unchanged to ``twitch_api_request``.
        access_token: Forwarded unchanged to ``twitch_api_request``.

    Returns:
        A list holding the ``"data"`` entries of every successful batch, in
        request order.
    """
    endpoint = "https://api.twitch.tv/helix/channels"
    chunk_len = 20
    collected = []

    chunk_starts = range(0, len(user_ids), chunk_len)
    for batch_no, start in enumerate(chunk_starts, start=1):
        query = [("broadcaster_id", ident) for ident in user_ids[start:start + chunk_len]]
        try:
            response = twitch_api_request(endpoint, client_id, access_token, params=query)
            collected.extend(response.get("data", []))
        except Exception as exc:
            # Best effort: one failing batch must not discard the others.
            logger.error(f"Failed to get channels batch {batch_no}: {exc}")

    return collected

def clean_profile_data(users_data, followers_data, channels_data):
    """Merge user, follower and channel records into one renamed pandas frame.

    User records are left-joined with follower counts (on ``id`` ==
    ``user_id``) and with channel records (on ``id`` == ``broadcaster_id``).
    Only the columns listed in the rename map below are kept, under their new
    names; columns absent from the inputs are simply omitted.

    A side input that is empty, or that lacks its join-key column, is skipped
    instead of raising ``KeyError`` from ``merge``. This happens, for example,
    when every channel request failed and ``channels_data`` is ``[]``.

    Args:
        users_data: List of user dicts; each is expected to carry ``id``.
        followers_data: List of dicts with ``user_id`` and ``followers_count``.
        channels_data: List of channel dicts keyed by ``broadcaster_id``.

    Returns:
        ``None`` when ``users_data`` is empty; otherwise a DataFrame with one
        row per user record. Missing values are filled for
        ``followers_count`` (0), ``description``, ``current_game_name`` and
        ``current_title`` (empty string) and ``broadcaster_language``
        (``'en'``).
    """
    if not users_data:
        return None

    merged_df = pd.DataFrame(users_data)

    followers_df = pd.DataFrame(followers_data)
    if 'user_id' in followers_df.columns:
        merged_df = merged_df.merge(followers_df, left_on='id', right_on='user_id', how='left')
        # The join key duplicates 'id'; 'errors=ignore' covers the case where
        # pandas suffixed it because the user records had their own 'user_id'.
        merged_df = merged_df.drop(columns='user_id', errors='ignore')

    channels_df = pd.DataFrame(channels_data)
    if 'broadcaster_id' in channels_df.columns:
        merged_df = merged_df.merge(channels_df, left_on='id', right_on='broadcaster_id', how='left')

    final_columns = {
        'id': 'user_id',
        'login': 'username',
        'display_name': 'display_name',
        'type': 'user_type',
        'broadcaster_type': 'broadcaster_type',
        'description': 'description',
        'profile_image_url': 'profile_image_url',
        'offline_image_url': 'offline_image_url',
        'created_at': 'account_created_at',
        'followers_count': 'followers_count',
        'broadcaster_language': 'broadcaster_language',
        'game_id': 'current_game_id',
        'game_name': 'current_game_name',
        'title': 'current_title',
        'delay': 'stream_delay',
    }

    available_columns = {k: v for k, v in final_columns.items() if k in merged_df.columns}
    result_df = merged_df[list(available_columns)].rename(columns=available_columns)

    fill_defaults = {
        'followers_count': 0,
        'description': '',
        'current_game_name': '',
        'current_title': '',
        'broadcaster_language': 'en',
    }
    present_defaults = {k: v for k, v in fill_defaults.items() if k in result_df.columns}
    if present_defaults:
        result_df = result_df.fillna(present_defaults)

    if 'followers_count' in result_df.columns:
        # A left join with gaps turns the count into float (NaN); restore an
        # integer dtype so the column type does not vary from run to run.
        result_df['followers_count'] = result_df['followers_count'].astype('int64')

    return result_df

# NOTE(review): `table_type` does not appear among the documented `@dlt.table`
# arguments — confirm the runtime in use accepts it.
@dlt.table(
    name="bronze_streamers",
    comment="Bronze layer: Twitch streamers profile data with append-only mode",
    table_properties={
        "delta.autoOptimize.optimizeWrite": "true",
        "delta.autoOptimize.autoCompact": "true",
        "pipelines.autoOptimize.managed": "true"
    },
    table_type="live"
)
def bronze_streamers():
    """Build the bronze-layer table of Twitch streamer profiles.

    Steps:
      1. Read the Twitch client id/secret from the ``my-secret`` secret scope
         and exchange them for an access token.
      2. Load the list of streamer logins from the JSON config on DBFS.
      3. Fetch user, follower and channel data and merge them with
         ``clean_profile_data``.
      4. Convert the result to a Spark DataFrame and add the
         ``data_collection_time``, ``collection_date`` and ``record_id``
         (``<user_id>_<collection_date>``) columns.

    Relies on the ``dbutils`` and ``spark`` globals provided by the notebook
    runtime.

    Returns:
        The enriched Spark DataFrame, or an empty DataFrame built from the
        module-level ``empty_schema`` when no access token, no user data or
        no cleaned rows are available.

    NOTE(review): the table comment advertises append-only behaviour, but this
    function returns a freshly built batch DataFrame on every run — confirm
    the pipeline actually appends rather than recomputes.
    """
    # Get API credentials
    client_id = dbutils.secrets.get(scope="my-secret", key="CLIENT-ID")
    client_secret = dbutils.secrets.get(scope="my-secret", key="CLIENT-SECRET")
    access_token = get_access_token(client_id, client_secret)

    if not access_token:
        logger.error("Failed to get Twitch access token")
        # The original referenced an undefined `streamers_schema` here, which
        # raised NameError; `empty_schema` is the schema this module defines.
        return spark.createDataFrame([], empty_schema)

    # Load configuration
    config_path = "dbfs:/FileStore/config/top50FrenchStreamer.json"
    config = json.loads(dbutils.fs.head(config_path))
    top50FrenchStreamers = config["top50FrenchStreamers"]

    # Fetch data from Twitch API
    users_data = get_streamers_profile_data(top50FrenchStreamers, client_id, access_token)

    if not users_data:
        logger.warning("No user data fetched from Twitch API")
        return spark.createDataFrame([], empty_schema)

    # Get additional data
    user_ids = [user['id'] for user in users_data]
    followers_data = get_streamers_followers_data(user_ids, client_id, access_token)
    channels_data = get_streamers_channel_info(user_ids, client_id, access_token)

    # Clean and process data
    cleaned_df = clean_profile_data(users_data, followers_data, channels_data)
    if cleaned_df is None or cleaned_df.empty:
        logger.warning("No cleaned data available after processing")
        return spark.createDataFrame([], empty_schema)

    # Row count taken from the pandas frame: avoids launching a Spark job
    # (`spark_df.count()`) whose only purpose would be the log line below.
    record_count = len(cleaned_df)

    # Convert to Spark DataFrame
    spark_df = spark.createDataFrame(cleaned_df)

    # Add metadata columns
    spark_df = spark_df.withColumn("data_collection_time", current_timestamp()) \
                       .withColumn("collection_date", current_timestamp().cast("date").cast("string"))

    # Add a per-day identifier: user_id (the renamed Twitch 'id') + "_" +
    # collection_date, giving one key per streamer per daily snapshot.
    spark_df = spark_df.withColumn(
        "record_id",
        concat(col("user_id"), lit("_"), col("collection_date")),
    )

    logger.info(f"Successfully processed {record_count} streamer records")
    return spark_df