# 1. Producing the data  
In this task, we will implement Apache Kafka producers to simulate real-time data streaming. Spark and parallel data processing should not be used in this section, as we are simulating sensors that often lack processing capabilities.  

1.	Every 5 seconds, load 5 days of weather data from the CSV file. We refer to this as weather5s to explain the tasks; feel free to use your own variable name. You should keep a pointer in the file reading process and advance it per read. The data reading should be in chronological order.
2.	Add the current timestamp (weather_ts) to weather5s and spread the batch evenly over the 5 seconds, one second per day. Because the weather data consists of hourly readings, each day contains 24 records (120 records in total for 5 days).
For example, assume you send the records at 2025-01-26 00:00:00 (ISO format: YYYY-MM-DD HH:MM:SS) -> (ts = 1737810000):  
Day 1 (records 1-24): ts = 1737810000  
Day 2 (records 25-48): ts = 1737810001  
Day 3 (records 49-72): ts = 1737810002  
…
3.	Send your batch of weather data to a Kafka topic with an appropriate name.




In [2]:
import json, pandas as pd, os

STATE_FILE = "state_sites.json"
CSV_PATH   = "weather.csv"

# Discover which site ids are really present in the CSV (ascending order).
df = pd.read_csv(CSV_PATH, usecols=["site_id","timestamp"])
sites = df["site_id"].unique().tolist()
sites.sort()
if len(sites) == 0:
    raise RuntimeError("No site_id values found in weather.csv")

# Brand-new state: streaming starts at the first site, no rounds completed,
# and every site's day cursor sits at the beginning.
state = {
    "_meta": {"current_site": sites[0], "completed_rounds": 0},
    "sites": dict((str(sid), {"cursor_idx": 0}) for sid in sites),
}

# Write to a sibling temp file first and then swap it in, so a reader never
# sees a half-written state file.
tmp = STATE_FILE + ".tmp"
with open(tmp, "w") as f:
    json.dump(state, f)
os.replace(tmp, STATE_FILE)

print(f"✅ Wrote fresh {STATE_FILE} with sites={sites} and current_site={sites[0]}")


✅ Wrote fresh state_sites.json with sites=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] and current_site=0


In [3]:
# Assignment-2B-Task1_producer — Weather 5s Kafka producer (one-site-at-a-time)
# - NO Spark — simulates low-power sensors
# - Every 5 seconds: send 5 days (24*5 = 120 rows) per site, in order
# - Emits ALL days for site A, then moves to site B, etc.
# - weather_ts is the tick's epoch second plus a per-record offset (the offset rule lives in clean_and_payload)

from time import sleep
from json import dumps, load, dump
from kafka3 import KafkaProducer
import pandas as pd
import numpy as np
import time
import os
import signal
import sys
from typing import Dict, Any, List, Tuple

# ---------------- configuration ----------------
# Module-level knobs read by the helpers and the main loop below.
HOST_IP            = "kafka"         # Kafka hostname from your Docker/network; the producer connects to <HOST_IP>:9092
TOPIC_BASE         = "weather"       # topic name used when MULTI_TOPIC=False
MULTI_TOPIC        = True            # True -> one topic per site (weather-site-<id>), False -> single topic TOPIC_BASE
SEND_AS_ARRAY      = True            # True -> one message holding the whole batch per tick; False -> one message per row
CSV_PATH           = "weather.csv"   # path to weather.csv
SITE_IDS           = list(range(16)) # desired site ids; only those also present in the CSV are streamed
BATCH_DAYS         = 5               # assignment requirement: 5 days per tick
TICK_INTERVAL      = 5               # seconds between sends
STATE_FILE         = "state_sites.json"  # JSON file persisting the current site and each site's day cursor
STOP_AFTER_ONE_PASS = False          # set True to exit once the last site finishes and every site was completed in this run

# --------------- globals for graceful shutdown ---------------
# Flag polled by the main loop; flipped by the signal handler below.
_shutdown = False


def _handle_signal(signum, frame):
    """Announce the received signal and request that the main loop stop."""
    global _shutdown
    print(f"\nCaught signal {signum}. Stopping after current tick…")
    _shutdown = True


# Route both Ctrl-C (SIGINT) and SIGTERM through the same handler.
for _sig in (signal.SIGINT, signal.SIGTERM):
    signal.signal(_sig, _handle_signal)

# --------------- kafka helpers -----------------
def connect_kafka_producer():
    """Create a KafkaProducer using configs supported by kafka3/kafka-python.

    Values are serialized as UTF-8 JSON and keys as UTF-8 strings.

    Returns:
        The connected producer, or None when construction fails (the error is
        printed so the caller can decide to exit).
    """
    try:
        producer = KafkaProducer(
            bootstrap_servers=[f"{HOST_IP}:9092"],
            # default=str stringifies anything json cannot encode natively
            # (e.g. timestamps) instead of raising.
            value_serializer=lambda v: dumps(v, default=str).encode("utf-8"),
            key_serializer=lambda k: str(k).encode("utf-8"),
            acks="all",
            linger_ms=15,
            batch_size=16384,
            compression_type="gzip",
            retries=5,
            request_timeout_ms=30000,
            # Retries are enabled, so keep a single in-flight request per
            # connection: with more than one, a retried batch can be written
            # after a later batch and break the chronological order of records.
            max_in_flight_requests_per_connection=1,
            api_version=(0, 10),
            max_request_size=2_000_000,
        )
        print("Kafka Producer connected.")
        return producer
    except Exception as ex:
        # Top-level boundary: report and let the caller handle the None.
        print("Exception while connecting Kafka.")
        print(str(ex))
        return None

def publish_message(producer, topic_name: str, key: str, value: Any, count_hint: int = None):
    """Hand one message to the producer and log how many records it carries.

    Nothing is flushed here; the caller flushes once per tick.

    Args:
        producer: object exposing ``send(topic, key=..., value=...)``.
        topic_name: destination topic.
        key: message key.
        value: message payload (a list of records or a single record).
        count_hint: record count to report; when None it is derived from
            ``value`` (list length, otherwise 1).
    """
    producer.send(topic_name, key=key, value=value)

    if count_hint is not None:
        queued = count_hint
    elif isinstance(value, list):
        queued = len(value)
    else:
        queued = 1

    print(f"Queued {queued:>3} record(s) → topic={topic_name}, key={key}")

# --------------- state (current-site + per-site linear cursor) ---------------
# State layout:
# {
#   "_meta": {"current_site": <int>, "completed_rounds": 0},
#   "sites": { "<sid>": {"cursor_idx": <int>} }
# }
def load_state() -> Dict[str, Any]:
    """Load the persisted streaming state, or build a fresh one.

    The state file is JSON shaped as
    ``{"_meta": {"current_site", "completed_rounds"}, "sites": {"<sid>": {"cursor_idx"}}}``.
    An older flat layout (``{"<sid>": {"cursor_idx": n}, ...}``) is migrated.

    Returns:
        The state dict. When the file is missing, unreadable, not valid JSON,
        or has an unexpected layout, a fresh state with every configured
        site's cursor at 0 is returned; the reason is printed rather than
        silently swallowed.
    """
    if os.path.exists(STATE_FILE):
        state = None
        try:
            with open(STATE_FILE, "r") as f:
                state = load(f)
        except (OSError, ValueError) as ex:
            # ValueError covers json.JSONDecodeError (corrupt / truncated file).
            print(f"Could not read {STATE_FILE} ({ex}); starting from a fresh state.")
        else:
            if isinstance(state, dict):
                # Backfill structure if coming from old format
                if "_meta" not in state or "sites" not in state:
                    # Old per-site flat dict -> migrate
                    migrated = {
                        "_meta": {"current_site": None, "completed_rounds": 0},
                        "sites": {}
                    }
                    for k, v in state.items():
                        if isinstance(v, dict) and "cursor_idx" in v:
                            migrated["sites"][k] = v
                    state = migrated
                if isinstance(state["_meta"], dict) and isinstance(state["sites"], dict):
                    state["_meta"].setdefault("current_site", None)
                    state["_meta"].setdefault("completed_rounds", 0)
                    return state
            print(f"Unexpected layout in {STATE_FILE}; starting from a fresh state.")
    # initialize fresh
    return {
        "_meta": {"current_site": None, "completed_rounds": 0},
        "sites": {str(sid): {"cursor_idx": 0} for sid in SITE_IDS}
    }

def save_state(state: Dict[str, Any]):
    """Persist ``state`` as JSON to STATE_FILE.

    The JSON is written to a ``.tmp`` sibling first and then moved over the
    real file with ``os.replace`` so the state file is swapped in one step.
    """
    scratch_path = STATE_FILE + ".tmp"
    with open(scratch_path, "w") as handle:
        dump(state, handle)
    os.replace(scratch_path, STATE_FILE)

# --------------- data loading ---------------
def load_weather(csv_path: str, wanted_site_ids: List[int]) -> Dict[int, Dict[str, Any]]:
    """
    Read the weather CSV and split it into per-site frames.

    Timestamps are parsed, rows are ordered by (site_id, timestamp), and a
    ``day`` column (timestamp floored to midnight) is added.

    Args:
        csv_path: path of the CSV; it must contain ``site_id`` and ``timestamp``.
        wanted_site_ids: site ids the caller wants; only those that also occur
            in the CSV are kept.

    Returns:
        ``{site_id: {"df": <rows of that site>, "days": <sorted distinct days>}}``.

    Raises:
        ValueError: a required column is missing, or no wanted site is in the CSV.
    """
    frame = pd.read_csv(csv_path)
    if not {"site_id", "timestamp"} <= set(frame.columns):
        raise ValueError("weather.csv must include columns: site_id,timestamp,...")

    frame["timestamp"] = pd.to_datetime(frame["timestamp"])
    frame = frame.sort_values(["site_id", "timestamp"])
    frame["day"] = frame["timestamp"].dt.floor("D")

    csv_sites = sorted(frame["site_id"].unique().tolist())
    wanted = sorted(set(wanted_site_ids).intersection(csv_sites))
    if len(wanted) == 0:
        raise ValueError(f"No overlapping site_ids. CSV has {csv_sites}, configured {wanted_site_ids}")

    def _site_entry(site_id):
        # Private copy of this site's rows plus its ordered list of days.
        rows = frame.loc[frame["site_id"] == site_id].copy()
        ordered_days = sorted(rows["day"].unique())
        return {"df": rows, "days": list(pd.Series(ordered_days))}

    return {site_id: _site_entry(site_id) for site_id in wanted}

# --------------- batch building (linear, no wrap) ---------------
def build_batch_for_site_linear(site_obj: Dict[str, Any], cursor_idx: int, batch_days: int):
    """
    Slice the next run of days for one site, never wrapping past the end.

    Args:
        site_obj: ``{"df": rows with a "day" column, "days": ordered day list}``.
        cursor_idx: index into ``days`` of the first day to emit.
        batch_days: maximum number of days to take.

    Returns:
        ``(batch_df, next_cursor_idx, pick_days, is_done)``. ``batch_df`` gains
        a ``day_index`` column numbering the picked days from 0. When there is
        nothing left, an empty frame, the unchanged cursor, ``[]`` and ``True``
        are returned.
    """
    frame, all_days = site_obj["df"], site_obj["days"]
    n_days = len(all_days)

    # Nothing to emit: site has no days, or the cursor is already past the end.
    if not (n_days and cursor_idx < n_days):
        return pd.DataFrame(), cursor_idx, [], True

    stop = cursor_idx + batch_days
    if stop > n_days:
        stop = n_days
    window = all_days[cursor_idx:stop]

    selected = frame.loc[frame["day"].isin(window)].copy()

    # Position of each picked day inside this tick's window (0-based).
    position_of = dict(zip(window, range(len(window))))
    selected["day_index"] = selected["day"].map(position_of).astype(int)

    return selected, stop, window, stop >= n_days

def assert_hourly_integrity(batch: pd.DataFrame, pick_days: List[pd.Timestamp], expect_per_day: int = 24):
    """Print a warning listing picked days whose row count is not ``expect_per_day``.

    Despite the name nothing is raised; the check is advisory. It is skipped
    when no days were picked or the batch is empty.
    """
    if not pick_days or batch.empty:
        return

    rows_per_day = batch.groupby("day").size()

    mismatched = {}
    for picked in pick_days:
        seen = int(rows_per_day.get(picked, 0))  # a day absent from the batch counts as 0
        if seen != expect_per_day:
            mismatched[str(pd.to_datetime(picked).date())] = seen

    if mismatched:
        print(f"Integrity warning (expect {expect_per_day}/day): {mismatched}")

# --------------- payload building ---------------
def clean_and_payload(batch: pd.DataFrame, tick_epoch: int, cycle: int = 0) -> List[Dict[str, Any]]:
    """Turn one tick's batch into JSON-ready record dicts.

    Args:
        batch: rows for the picked days; needs a datetime ``timestamp`` column
            and an integer ``day_index`` column (0-based day within the batch).
        tick_epoch: epoch second at which this tick is sent.
        cycle: when non-zero, ``timestamp`` (and ``day`` if present) are
            shifted forward by ``365 * cycle`` days; default 0 means no shift.

    Returns:
        One dict per row containing whichever of the output columns exist in
        ``batch``, plus ``weather_ts``. Missing values are emitted as ``None``
        so they serialize to JSON ``null``. An empty batch yields ``[]``.
    """
    if batch.empty:
        return []
    batch = batch.copy()

    # If you want synthetic yearly cycles, set cycle>0; default is 0 (no roll)
    if cycle:
        shift = pd.to_timedelta(365 * cycle, unit="D")
        batch["timestamp"] = batch["timestamp"] + shift
        if "day" in batch.columns:
            batch["day"] = batch["day"] + shift

    batch["timestamp"] = batch["timestamp"].dt.strftime("%Y-%m-%dT%H:%M:%S")

    # weather_ts: one second per DAY of the batch, so all 24 hourly records of
    # day k share tick_epoch + k and a 5-day batch spans exactly 5 seconds.
    batch["weather_ts"] = int(tick_epoch) + batch["day_index"].astype(int)

    cols = [
        "site_id", "timestamp",
        "air_temperature", "cloud_coverage", "dew_temperature",
        "sea_level_pressure", "wind_direction", "wind_speed",
        "weather_ts", "day_index"
    ]
    existing_cols = [c for c in cols if c in batch.columns]
    out = batch[existing_cols]

    # Cast to object before masking: on float columns ``where(..., None)``
    # keeps NaN, which would serialize as the invalid JSON token ``NaN``.
    out = out.astype(object).where(out.notna(), None)
    return out.to_dict(orient="records")

# --------------- aligned sleep helper ---------------
def aligned_sleep(start_time: float, interval: int):
    """Sleep until the next multiple of ``interval`` seconds after ``start_time``.

    The time already spent since ``start_time`` is taken modulo ``interval``
    and only the remainder of the current interval is slept.
    """
    spent = time.time() - start_time
    pause = interval - (spent % interval)
    if pause < 0.0:
        pause = 0.0
    sleep(pause)

# --------------- main loop ---------------
if __name__ == "__main__":
    # Entry point: stream the CSV one site at a time, BATCH_DAYS days per tick,
    # persisting the (site, cursor) position after every tick.
    print("Starting Weather Stream Producer (one-site-at-a-time)…")
    producer = connect_kafka_producer()
    if producer is None:
        sys.exit(1)

    try:
        per_site = load_weather(CSV_PATH, SITE_IDS)
    except Exception as e:
        print("Failed to load weather CSV:", e)
        sys.exit(1)

    available_sites = sorted(per_site.keys())
    print("Available sites from CSV:", available_sites)
    for sid in available_sites:
        print(f"• site_id={sid}: records={len(per_site[sid]['df'])}, days={len(per_site[sid]['days'])}")

    state = load_state()

    # Initialize current_site if needed
    cur = state["_meta"].get("current_site")
    if cur not in available_sites:
        cur = available_sites[0]
        state["_meta"]["current_site"] = cur
    # Ensure per-site cursor exists
    state["sites"].setdefault(str(cur), {"cursor_idx": 0})

    completed_sites_this_round = set()  # used only if STOP_AFTER_ONE_PASS

    # Re-check the flag at the top so a signal caught during the sleep stops
    # the loop without sending one more tick.
    while not _shutdown:
        tick_start = time.time()
        tick_epoch = int(tick_start)

        sid = state["_meta"]["current_site"]
        if sid not in per_site:
            # Move to first available if somehow invalid
            sid = available_sites[0]
            state["_meta"]["current_site"] = sid
            state["sites"].setdefault(str(sid), {"cursor_idx": 0})

        skey = str(sid)
        cursor = int(state["sites"][skey]["cursor_idx"])

        # Build a non-wrapping batch for ONLY the current site
        batch_df, next_cursor, pick_days, is_done = build_batch_for_site_linear(
            per_site[sid], cursor, BATCH_DAYS
        )

        # integrity check: expect 24 rows per picked day
        assert_hourly_integrity(batch_df, pick_days, expect_per_day=24)

        records = clean_and_payload(batch_df, tick_epoch)
        topic   = f"weather-site-{sid}" if MULTI_TOPIC else TOPIC_BASE
        key     = f"site-{sid}"

        sent_rows = 0
        if records:
            if SEND_AS_ARRAY:
                publish_message(producer, topic, key, records, count_hint=len(records))
                sent_rows += len(records)
            else:
                for rec in records:
                    publish_message(producer, topic, key, rec, count_hint=1)
                sent_rows += len(records)

        # Advance cursor (stay on this site until done)
        state["sites"][skey]["cursor_idx"] = next_cursor

        # If this site is done, switch to the next site (and optionally stop after one pass)
        if is_done:
            print(f"✔️ Completed site_id={sid} (days={len(per_site[sid]['days'])}).")
            completed_sites_this_round.add(sid)

            # pick next site in order
            idx = available_sites.index(sid)
            if idx == len(available_sites) - 1:
                # end of list
                if STOP_AFTER_ONE_PASS and completed_sites_this_round == set(available_sites):
                    print("All sites completed once. Exiting.")
                    try:
                        producer.flush()
                        producer.close()
                    except Exception:
                        pass
                    save_state(state)
                    sys.exit(0)
                # start another round
                next_sid = available_sites[0]
                completed_sites_this_round = set()  # reset for next round
                state["_meta"]["completed_rounds"] = state["_meta"].get("completed_rounds", 0) + 1
                # Rewind every site's cursor; otherwise all cursors stay at
                # the end and each later round would emit nothing.
                for done_sid in available_sites:
                    state["sites"][str(done_sid)] = {"cursor_idx": 0}
            else:
                next_sid = available_sites[idx + 1]

            state["_meta"]["current_site"] = next_sid
            state["sites"].setdefault(str(next_sid), {"cursor_idx": 0})

        # flush once per tick and persist state
        try:
            producer.flush()
        except Exception as e:
            print("Flush error:", e)
        try:
            save_state(state)
        except Exception as e:
            print("Failed to save state:", e)

        # Report the cursor reached by THIS tick (the stored one may just
        # have been rewound for the next round).
        print(
            f"⏱️  Tick complete — site={sid} sent ~{sent_rows} rows; "
            f"cursor={next_cursor}; current_site={state['_meta']['current_site']}"
        )

        if _shutdown:
            break
        aligned_sleep(tick_start, TICK_INTERVAL)

    print("Stopping producer…")
    try:
        producer.flush()
        producer.close()
    except Exception:
        pass
    print("Clean shutdown.")


Starting Weather Stream Producer (one-site-at-a-time)…
Kafka Producer connected.
Available sites from CSV: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
• site_id=0: records=8784, days=365
• site_id=1: records=8763, days=365
• site_id=2: records=8783, days=365
• site_id=3: records=8780, days=365
• site_id=4: records=8783, days=365
• site_id=5: records=8755, days=365
• site_id=6: records=8782, days=365
• site_id=7: records=8614, days=365
• site_id=8: records=8784, days=365
• site_id=9: records=8780, days=365
• site_id=10: records=8782, days=365
• site_id=11: records=8614, days=365
• site_id=12: records=8755, days=365
• site_id=13: records=8783, days=365
• site_id=14: records=8777, days=365
• site_id=15: records=8454, days=365
Queued 120 record(s) → topic=weather-site-0, key=site-0
⏱️  Tick complete — site=0 sent ~120 rows; cursor=5; current_site=0
Queued 120 record(s) → topic=weather-site-0, key=site-0
⏱️  Tick complete — site=0 sent ~120 rows; cursor=10; current_site=0
Queued 

Queued 120 record(s) → topic=weather-site-0, key=site-0
⏱️  Tick complete — site=0 sent ~120 rows; cursor=300; current_site=0
Queued 120 record(s) → topic=weather-site-0, key=site-0
⏱️  Tick complete — site=0 sent ~120 rows; cursor=305; current_site=0
Queued 120 record(s) → topic=weather-site-0, key=site-0
⏱️  Tick complete — site=0 sent ~120 rows; cursor=310; current_site=0
Queued 120 record(s) → topic=weather-site-0, key=site-0
⏱️  Tick complete — site=0 sent ~120 rows; cursor=315; current_site=0
Queued 120 record(s) → topic=weather-site-0, key=site-0
⏱️  Tick complete — site=0 sent ~120 rows; cursor=320; current_site=0
Queued 120 record(s) → topic=weather-site-0, key=site-0
⏱️  Tick complete — site=0 sent ~120 rows; cursor=325; current_site=0
Queued 120 record(s) → topic=weather-site-0, key=site-0
⏱️  Tick complete — site=0 sent ~120 rows; cursor=330; current_site=0
Queued 120 record(s) → topic=weather-site-0, key=site-0
⏱️  Tick complete — site=0 sent ~120 rows; cursor=335; current

Queued 120 record(s) → topic=weather-site-1, key=site-1
⏱️  Tick complete — site=1 sent ~120 rows; cursor=250; current_site=1
Queued 120 record(s) → topic=weather-site-1, key=site-1
⏱️  Tick complete — site=1 sent ~120 rows; cursor=255; current_site=1
Queued 120 record(s) → topic=weather-site-1, key=site-1
⏱️  Tick complete — site=1 sent ~120 rows; cursor=260; current_site=1
Queued 120 record(s) → topic=weather-site-1, key=site-1
⏱️  Tick complete — site=1 sent ~120 rows; cursor=265; current_site=1
Queued 120 record(s) → topic=weather-site-1, key=site-1
⏱️  Tick complete — site=1 sent ~120 rows; cursor=270; current_site=1
Queued 119 record(s) → topic=weather-site-1, key=site-1
⏱️  Tick complete — site=1 sent ~119 rows; cursor=275; current_site=1
Queued 120 record(s) → topic=weather-site-1, key=site-1
⏱️  Tick complete — site=1 sent ~120 rows; cursor=280; current_site=1
Queued 120 record(s) → topic=weather-site-1, key=site-1
⏱️  Tick complete — site=1 sent ~120 rows; cursor=285; current

Queued 120 record(s) → topic=weather-site-2, key=site-2
⏱️  Tick complete — site=2 sent ~120 rows; cursor=205; current_site=2
Queued 120 record(s) → topic=weather-site-2, key=site-2
⏱️  Tick complete — site=2 sent ~120 rows; cursor=210; current_site=2
Queued 120 record(s) → topic=weather-site-2, key=site-2
⏱️  Tick complete — site=2 sent ~120 rows; cursor=215; current_site=2
Queued 120 record(s) → topic=weather-site-2, key=site-2
⏱️  Tick complete — site=2 sent ~120 rows; cursor=220; current_site=2
Queued 120 record(s) → topic=weather-site-2, key=site-2
⏱️  Tick complete — site=2 sent ~120 rows; cursor=225; current_site=2
Queued 120 record(s) → topic=weather-site-2, key=site-2
⏱️  Tick complete — site=2 sent ~120 rows; cursor=230; current_site=2
Queued 120 record(s) → topic=weather-site-2, key=site-2
⏱️  Tick complete — site=2 sent ~120 rows; cursor=235; current_site=2
Queued 120 record(s) → topic=weather-site-2, key=site-2
⏱️  Tick complete — site=2 sent ~120 rows; cursor=240; current

Queued 120 record(s) → topic=weather-site-3, key=site-3
⏱️  Tick complete — site=3 sent ~120 rows; cursor=165; current_site=3
Queued 120 record(s) → topic=weather-site-3, key=site-3
⏱️  Tick complete — site=3 sent ~120 rows; cursor=170; current_site=3
Queued 120 record(s) → topic=weather-site-3, key=site-3
⏱️  Tick complete — site=3 sent ~120 rows; cursor=175; current_site=3
Queued 120 record(s) → topic=weather-site-3, key=site-3
⏱️  Tick complete — site=3 sent ~120 rows; cursor=180; current_site=3
Queued 120 record(s) → topic=weather-site-3, key=site-3
⏱️  Tick complete — site=3 sent ~120 rows; cursor=185; current_site=3
Queued 120 record(s) → topic=weather-site-3, key=site-3
⏱️  Tick complete — site=3 sent ~120 rows; cursor=190; current_site=3
Queued 117 record(s) → topic=weather-site-3, key=site-3
⏱️  Tick complete — site=3 sent ~117 rows; cursor=195; current_site=3
Queued 120 record(s) → topic=weather-site-3, key=site-3
⏱️  Tick complete — site=3 sent ~120 rows; cursor=200; current

Queued 120 record(s) → topic=weather-site-4, key=site-4
⏱️  Tick complete — site=4 sent ~120 rows; cursor=120; current_site=4
Queued 120 record(s) → topic=weather-site-4, key=site-4
⏱️  Tick complete — site=4 sent ~120 rows; cursor=125; current_site=4
Queued 120 record(s) → topic=weather-site-4, key=site-4
⏱️  Tick complete — site=4 sent ~120 rows; cursor=130; current_site=4
Queued 120 record(s) → topic=weather-site-4, key=site-4
⏱️  Tick complete — site=4 sent ~120 rows; cursor=135; current_site=4
Queued 120 record(s) → topic=weather-site-4, key=site-4
⏱️  Tick complete — site=4 sent ~120 rows; cursor=140; current_site=4
Queued 120 record(s) → topic=weather-site-4, key=site-4
⏱️  Tick complete — site=4 sent ~120 rows; cursor=145; current_site=4
Queued 120 record(s) → topic=weather-site-4, key=site-4
⏱️  Tick complete — site=4 sent ~120 rows; cursor=150; current_site=4
Queued 120 record(s) → topic=weather-site-4, key=site-4
⏱️  Tick complete — site=4 sent ~120 rows; cursor=155; current

Queued 120 record(s) → topic=weather-site-5, key=site-5
⏱️  Tick complete — site=5 sent ~120 rows; cursor=70; current_site=5
Queued 119 record(s) → topic=weather-site-5, key=site-5
⏱️  Tick complete — site=5 sent ~119 rows; cursor=75; current_site=5
Queued 120 record(s) → topic=weather-site-5, key=site-5
⏱️  Tick complete — site=5 sent ~120 rows; cursor=80; current_site=5
Queued 111 record(s) → topic=weather-site-5, key=site-5
⏱️  Tick complete — site=5 sent ~111 rows; cursor=85; current_site=5
Queued 120 record(s) → topic=weather-site-5, key=site-5
⏱️  Tick complete — site=5 sent ~120 rows; cursor=90; current_site=5
Queued 120 record(s) → topic=weather-site-5, key=site-5
⏱️  Tick complete — site=5 sent ~120 rows; cursor=95; current_site=5
Queued 119 record(s) → topic=weather-site-5, key=site-5
⏱️  Tick complete — site=5 sent ~119 rows; cursor=100; current_site=5
Queued 120 record(s) → topic=weather-site-5, key=site-5
⏱️  Tick complete — site=5 sent ~120 rows; cursor=105; current_site=