In [22]:
import json
import pandas as pd
import boto3

In [42]:
# S3 configuration
# Name of the S3 bucket that the S3 calls in this notebook pass as Bucket=.
bucket = "twitch-emotes-analytics-project"
# Twitch channel identifier for this exploratory section.
channel_name = "esl_dota2"
# Year of chat logs to inspect, kept as a string.
chat_year = "2023"

In [24]:
# Shared low-level S3 client used by the cells below; no credentials or region
# are passed explicitly here.
s3 = boto3.client("s3")

In [37]:
def list_raw_json_files(chat_year, channel="esl_dota2"):
    """List the S3 keys of raw chat JSON files for one channel and year.

    Parameters
    ----------
    chat_year : int or str
        Year component of the key prefix (formatted into the prefix as-is).
    channel : str, optional
        Channel component of the key prefix. Defaults to ``"esl_dota2"``,
        the value that was previously hard-coded.

    Returns
    -------
    list of str
        Every key under ``data/raw/<channel>/<chat_year>/`` ending in ``.json``.

    Notes
    -----
    Reads the notebook-level ``s3`` client and ``bucket`` name.
    """
    prefix = f"data/raw/{channel}/{chat_year}/"
    # A single list_objects_v2 call returns at most 1,000 keys, so walk every
    # page instead of silently truncating large prefixes.
    paginator = s3.get_paginator("list_objects_v2")
    files = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        files.extend(
            item["Key"]
            for item in page.get("Contents", [])
            if item["Key"].endswith(".json")
        )
    return files

In [39]:
# Collect the raw JSON keys for 2023 so a sample file can be inspected below.
s3_keys = list_raw_json_files(2023)

In [43]:
# Take the first listed key as a sample; raises IndexError if the listing is empty.
s3_key = s3_keys[0]

In [44]:
# Display the sample key to confirm the prefix layout.
s3_key

'data/raw/esl_dota2/2023/esl_dota2_2023_0.json'

In [45]:
# Display the bucket name currently in scope.
bucket

'twitch-emotes-analytics-project'

In [46]:
# Fetch the sample object so the shape of the get_object response can be inspected.
response = s3.get_object(Bucket = bucket, Key = s3_key)

In [47]:
# Check what kind of object get_object returned.
type(response)

dict

In [None]:
import boto3
import pandas as pd
import json
import io

# --- Config ---
# Bucket that holds both the raw chat JSON inputs and the processed CSV outputs.
bucket = 'twitch-emotes-analytics-project'
# Channel whose chat logs are processed; also used in the input prefix and output key.
channel = 'esl_dota2'
# Years to process; one output CSV is written per year.
years = ['2020', '2021', '2022', '2023', '2024', '2025']

# Shared S3 client used by every helper below; no credentials are passed explicitly.
s3 = boto3.client('s3')

def list_raw_json_keys(channel, year):
    """List the S3 keys of raw chat JSON files for one channel and year.

    Parameters
    ----------
    channel : str
        Channel component of the key prefix.
    year : str or int
        Year component of the key prefix.

    Returns
    -------
    list of str
        Every key under ``raw/<channel>/<year>/chats/`` ending in ``.json``.

    Notes
    -----
    Reads the notebook-level ``s3`` client and ``bucket`` name.
    """
    prefix = f'raw/{channel}/{year}/chats/'
    # A single list_objects_v2 call returns at most 1,000 keys, so walk every
    # page; otherwise files past the first page would be silently dropped.
    paginator = s3.get_paginator('list_objects_v2')
    keys = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        keys.extend(
            item['Key']
            for item in page.get('Contents', [])
            if item['Key'].endswith('.json')
        )
    return keys

def load_chat_data_from_s3(s3_key):
    """Download one object from S3 and parse its body as JSON.

    Parameters
    ----------
    s3_key : str
        Key of the object inside the notebook-level ``bucket``.

    Returns
    -------
    object
        The decoded JSON document, or an empty list when the body cannot be
        decoded (best effort: one bad file must not abort a whole run).

    Notes
    -----
    Reads the notebook-level ``s3`` client and ``bucket`` name. Errors raised
    by the S3 call itself are not caught here.
    """
    response = s3.get_object(Bucket=bucket, Key=s3_key)
    body = response['Body'].read()
    try:
        return json.loads(body)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # json.loads on bytes decodes the payload first, so a badly encoded
        # file raises UnicodeDecodeError rather than JSONDecodeError.
        print(f"⚠️ {type(e).__name__} in {s3_key}: {e}")
        return []

def parse_emote_records(json_data, streamer="esl_dota2"):
    """Flatten chat messages into one record per emote occurrence.

    Parameters
    ----------
    json_data : iterable of dict, or None
        Decoded chat messages. Entries that are not dicts are skipped, so a
        ``None`` payload or a top-level JSON object yields no records instead
        of raising.
    streamer : str, optional
        Value stored in the ``j_streamer`` field of every record.

    Returns
    -------
    list of dict
        One dict per emote of every message that has a non-empty ``emotes``
        list. Keys: ``i_user_id``, ``i_username``, ``i_display_color``,
        ``i_badges``, ``j_streamer``, ``k_emote_name``, ``t_timestamp``,
        ``t_time_text``, ``t_seconds``, ``message``.
    """
    emote_records = []
    for msg in json_data or []:
        if not isinstance(msg, dict):
            continue

        # `or` also covers an explicit JSON null, which a .get() default does not.
        emotes = msg.get("emotes") or []
        if not emotes:
            continue

        author = msg.get("author") or {}
        # Message-level fields are shared by every emote in the message, so
        # build them once; the timestamp is interpreted as microseconds.
        shared = {
            "i_user_id": author.get("id", ""),
            "i_username": author.get("name", ""),
            "i_display_color": author.get("colour", None),
            "i_badges": author.get("badges", []),
            "j_streamer": streamer,
            "t_timestamp": pd.to_datetime(msg.get("timestamp"), unit="us", errors="coerce"),
            "t_time_text": msg.get("time_text", ""),
            "t_seconds": msg.get("time_in_seconds"),
            "message": msg.get("message", ""),
        }

        for emote in emotes:
            if not isinstance(emote, dict):
                continue
            emote_records.append({
                "i_user_id": shared["i_user_id"],
                "i_username": shared["i_username"],
                "i_display_color": shared["i_display_color"],
                "i_badges": shared["i_badges"],
                "j_streamer": shared["j_streamer"],
                "k_emote_name": emote.get("name", ""),
                "t_timestamp": shared["t_timestamp"],
                "t_time_text": shared["t_time_text"],
                "t_seconds": shared["t_seconds"],
                "message": shared["message"],
            })
    return emote_records

# Build one emote-level CSV per year: list that year's raw chat files, parse
# each into emote records, and upload the combined table to the processed prefix.
for year in years:
    print(f"\n📅 Year: {year}")
    all_records = []
    json_keys = list_raw_json_keys(channel, year)

    for i, key in enumerate(json_keys, start=1):
        print(f"🔄 Reading {key} ({i}/{len(json_keys)})")
        chat_data = load_chat_data_from_s3(key)
        records = parse_emote_records(chat_data, streamer=channel)
        all_records.extend(records)

    if not all_records:
        # An empty record list would produce a column-less CSV and could
        # overwrite a previously written, valid output for this year.
        print(f"⚠️ No emote records found for {year}; skipping upload")
        continue

    df_panel = pd.DataFrame(all_records)
    output_key = f'processed/{channel}/{year}/emote_panel.csv'

    csv_buffer = io.StringIO()
    df_panel.to_csv(csv_buffer, index=False)
    # Upload explicit UTF-8 bytes so non-ASCII chat text is stored with a known encoding.
    s3.put_object(Bucket=bucket, Key=output_key, Body=csv_buffer.getvalue().encode("utf-8"))

    print(f"✅ Saved {len(df_panel)} records to s3://{bucket}/{output_key}")
