In [None]:
#%pip install pyarrow

## Writer

In [1]:
# %% [markdown]
# Notebook A — Writer (Producer): writes 1 parquet file per second to ./landing/

# %%
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, Tuple, Iterator, Optional, List

import numpy as np
import pandas as pd

FEATURES: List[str] = [f"signal_{i}" for i in range(1, 61)]
TIMESTAMP_COL = "Timestamp"

LANDING_DIR = Path(r"C:\Users\Ishaan Tiwari\Desktop\Kafka\landing")
LANDING_DIR.mkdir(parents=True, exist_ok=True)


In [2]:
# %%
def format_ts(dt: datetime) -> str:
    """Return 'MM:DD:YYYY hh:mm:ss:ffffff' (6-digit subsecond)."""
    return dt.strftime("%m:%d:%Y %H:%M:%S") + f":{dt.microsecond:06d}"

def make_default_ranges() -> Dict[str, Tuple[float, float]]:
    bands = [(-1.0, 1.0), (-3.0, 3.0), (0.0, 5.0), (-10.0, -2.0), (5.0, 15.0)]
    ranges = {}
    for i, sig in enumerate(FEATURES, start=1):
        lo, hi = bands[(i - 1) % len(bands)]
        ranges[sig] = (lo, hi)
    return ranges

class RowStream:
    """
    Emits 1000 rows per real second. Each row's Timestamp is a string and
    consecutive rows are +1 second apart in simulated time.
    rows_per_sec=200, seed=999
    """
    def __init__(self, rows_per_sec: int = 1000, start_dt: Optional[datetime] = None,
                 per_signal_ranges: Optional[Dict[str, Tuple[float, float]]] = None, seed: int = 42):
        assert rows_per_sec > 0
        self.rows_per_sec = int(rows_per_sec)
        self.rng = np.random.default_rng(seed)
        self.features = FEATURES
        self.per_signal_ranges = per_signal_ranges or make_default_ranges()
        missing = [f for f in self.features if f not in self.per_signal_ranges]
        if missing:
            raise ValueError(f"Missing ranges for: {missing[:5]}{'...' if len(missing)>5 else ''}")
        lows, highs = [], []
        for f in self.features:
            lo, hi = self.per_signal_ranges[f]
            if not (isinstance(lo, (int, float)) and isinstance(hi, (int, float)) and lo < hi):
                raise ValueError(f"Invalid range for {f}: ({lo}, {hi})")
            lows.append(lo); highs.append(hi)
        self.low = np.array(lows, dtype=np.float64)
        self.high = np.array(highs, dtype=np.float64)
        self.width = self.high - self.low
        self.sim_time = (start_dt or datetime.utcnow()).replace(microsecond=0)

    def _gen_batch(self) -> pd.DataFrame:
        n = self.rows_per_sec
        stamps = [format_ts(self.sim_time + timedelta(seconds=i)) for i in range(n)]
        u = self.rng.random(size=(n, len(self.features)), dtype=np.float64)
        vals = (self.low + u * self.width).astype(np.float32, copy=False)
        df = pd.DataFrame(vals, columns=self.features)
        df.insert(0, TIMESTAMP_COL, stamps)
        self.sim_time += timedelta(seconds=n)
        return df

    def stream(self, seconds: Optional[int] = None, real_time: bool = True) -> Iterator[pd.DataFrame]:
        i = 0
        while seconds is None or i < seconds:
            start_wall = time.time()
            yield self._gen_batch()
            i += 1
            if real_time:
                elapsed = time.time() - start_wall
                to_sleep = 1.0 - elapsed
                if to_sleep > 0:
                    time.sleep(to_sleep)

def write_batch_parquet(pdf: pd.DataFrame, base_dir: Path = LANDING_DIR):
    """landing/YYYY=YYYY/MM=MM/DD=DD/HH=hh/mm=mm/ss=ss/part-00000.parquet"""
    first_ts = pdf[TIMESTAMP_COL].iloc[0]  # "MM:DD:YYYY hh:mm:ss:ffffff"
    MM, DD, YYYY_time = first_ts.split(":")[0], first_ts.split(":")[1], first_ts.split(":")[2]
    YYYY = YYYY_time.split()[0]
    hh = first_ts.split()[1].split(":")[0]
    mm = first_ts.split()[1].split(":")[1]
    ss = first_ts.split()[1].split(":")[2]
    out_dir = base_dir / f"YYYY={YYYY}/MM={MM}/DD={DD}/HH={hh}/mm={mm}/ss={ss}"
    out_dir.mkdir(parents=True, exist_ok=True)
    pdf.to_parquet(out_dir / "part-00000.parquet", index=False)


### Start Writing and Interrupt Writing

In [9]:
# %%
stream = RowStream(rows_per_sec=200, seed=999)
try:
    for batch_pdf in stream.stream(seconds=None, real_time=True):
        write_batch_parquet(batch_pdf)
except KeyboardInterrupt:
    print("Stopped real-time producer.")


[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=20\HH=22\mm=45\ss=01\part-00000.parquet | size=92.9 KB
[monitor]   sample 3 rows from part-00000.parquet:
                    Timestamp  signal_1  signal_2  signal_3  signal_4  \
0  08:20:2025 22:45:01:000000  0.557650 -1.966508  3.567864 -3.997319   
1  08:20:2025 22:45:02:000000  0.435615 -1.497452  4.305065 -2.693646   
2  08:20:2025 22:45:03:000000  0.270561  2.550678  0.309831 -7.853706   

    signal_5  signal_6  signal_7  signal_8  signal_9  ...  signal_51  \
0   6.711091  0.803849 -2.149311  3.056144 -2.712455  ...  -0.603399   
1   5.403578 -0.870554 -0.508553  4.713863 -6.981105  ...   0.247433   
2  12.164513 -0.062574 -2.929401  1.479812 -6.175411  ...  -0.141809   

   signal_52  signal_53  signal_54  signal_55  signal_56  signal_57  \
0   1.312248   3.160212  -3.110476   9.106469  -0.023211   0.653595   
1   1.355940   3.273074  -5.358946  11.318894  -0.766794  -0.283098   
2  -2.143357 

[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=00\mm=28\ss=21\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=00\mm=31\ss=41\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=00\mm=35\ss=01\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=00\mm=38\ss=21\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=00\mm=41\ss=41\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=00\mm=45\ss=01\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=00\mm=48\ss=21\part-00000.parquet | size=92.8 KB
[monitor] + n

[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=02\mm=25\ss=01\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=02\mm=28\ss=21\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=02\mm=31\ss=41\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=02\mm=35\ss=01\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=02\mm=38\ss=21\part-00000.parquet | size=92.9 KB
[monitor]   sample 3 rows from part-00000.parquet:
                    Timestamp  signal_1  signal_2  signal_3  signal_4  \
0  08:21:2025 02:38:21:000000 -0.658741  1.219882  0.907957 -9.563185   
1  08:21:2025 02:38:22:000000  0.432552  0.132668  4.348126 -4.727551   
2  08:21:2025 02:38:23:00

[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=04\mm=21\ss=41\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=04\mm=25\ss=01\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=04\mm=28\ss=21\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=04\mm=31\ss=41\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=04\mm=35\ss=01\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=04\mm=38\ss=21\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=04\mm=41\ss=41\part-00000.parquet | size=92.9 KB
[monitor] + n

[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=06\mm=18\ss=21\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=06\mm=21\ss=41\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=06\mm=25\ss=01\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=06\mm=28\ss=21\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=06\mm=31\ss=41\part-00000.parquet | size=92.9 KB
[monitor]   sample 3 rows from part-00000.parquet:
                    Timestamp  signal_1  signal_2  signal_3  signal_4  \
0  08:21:2025 06:31:41:000000  0.042998 -0.981116  2.774759 -5.387207   
1  08:21:2025 06:31:42:000000  0.175029  1.707431  2.819038 -6.478513   
2  08:21:2025 06:31:43:00

[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=08\mm=15\ss=01\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=08\mm=18\ss=21\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=08\mm=21\ss=41\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=08\mm=25\ss=01\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=08\mm=28\ss=21\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=08\mm=31\ss=41\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=08\mm=35\ss=01\part-00000.parquet | size=92.9 KB
[monitor] + n

[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=10\mm=11\ss=41\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=10\mm=15\ss=01\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=10\mm=18\ss=21\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=10\mm=21\ss=41\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=10\mm=25\ss=01\part-00000.parquet | size=92.9 KB
[monitor]   sample 3 rows from part-00000.parquet:
                    Timestamp  signal_1  signal_2  signal_3  signal_4  \
0  08:21:2025 10:25:01:000000  0.980595  0.987176  0.090515 -2.701555   
1  08:21:2025 10:25:02:000000 -0.738054  0.189475  3.924256 -3.787553   
2  08:21:2025 10:25:03:00

[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=12\mm=11\ss=41\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=12\mm=15\ss=01\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=12\mm=18\ss=21\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=12\mm=21\ss=41\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=12\mm=25\ss=01\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=12\mm=28\ss=21\part-00000.parquet | size=92.9 KB
[monitor] + new file: C:\Users\Ishaan Tiwari\Desktop\Kafka\landing\YYYY=2025\MM=08\DD=21\HH=12\mm=31\ss=41\part-00000.parquet | size=92.9 KB
[monitor] + n

## Write Monitor

In [3]:
# %% 
# Live monitor for landing/ that prints new Parquet files as they appear.
# No extra packages needed; simple polling + a background thread.

import threading, time, os
from pathlib import Path
from typing import Set

_MONITOR_THREAD = None
_MONITOR_STOP = threading.Event()

def _bytes_to_human(n: int) -> str:
    for unit in ['B','KB','MB','GB','TB']:
        if n < 1024.0:
            return f"{n:,.1f} {unit}"
        n /= 1024.0
    return f"{n:.1f} PB"

def _list_parquet_files(base: Path) -> Set[Path]:
    return set(base.rglob("*.parquet"))

def _monitor_loop(base: Path, sample_every: int = 10, read_sample_rows: int = 3):
    seen = _list_parquet_files(base)
    count = len(seen)
    print(f"[monitor] watching {base.resolve()} | existing parquet files: {count}")
    last_sampled = 0
    while not _MONITOR_STOP.is_set():
        time.sleep(1.0)  # poll every second
        current = _list_parquet_files(base)
        new = current - seen
        if new:
            # Sort by mtime for nicer ordering
            new_sorted = sorted(new, key=lambda p: p.stat().st_mtime)
            for p in new_sorted:
                size = _bytes_to_human(p.stat().st_size)
                print(f"[monitor] + new file: {p} | size={size}")
                count += 1
                # Occasionally peek inside to verify schema & a few rows
                if read_sample_rows > 0 and (count - last_sampled) >= sample_every:
                    try:
                        import pandas as pd
                        df = pd.read_parquet(p)
                        print(f"[monitor]   sample {read_sample_rows} rows from {p.name}:")
                        print(df.head(read_sample_rows))
                        last_sampled = count
                    except Exception as e:
                        print(f"[monitor]   (could not read sample) {e}")
        seen = current

def start_landing_monitor(base_dir: Path = LANDING_DIR, sample_every: int = 10, read_sample_rows: int = 3):
    """Start background monitor thread. Call once."""
    global _MONITOR_THREAD, _MONITOR_STOP
    if _MONITOR_THREAD is not None and _MONITOR_THREAD.is_alive():
        print("[monitor] already running")
        return
    _MONITOR_STOP.clear()
    _MONITOR_THREAD = threading.Thread(
        target=_monitor_loop, 
        args=(base_dir, sample_every, read_sample_rows),
        daemon=True
    )
    _MONITOR_THREAD.start()
    print("[monitor] started")

def stop_landing_monitor():
    """Stop background monitor thread."""
    global _MONITOR_THREAD, _MONITOR_STOP
    if _MONITOR_THREAD is None:
        print("[monitor] not running")
        return
    _MONITOR_STOP.set()
    _MONITOR_THREAD.join(timeout=5)
    _MONITOR_THREAD = None
    print("[monitor] stopped")



### Start Monitor : Run monitor before running writer

In [8]:
# %%
start_landing_monitor(base_dir=LANDING_DIR, sample_every=10, read_sample_rows=3)


[monitor] started
[monitor] watching C:\Users\Ishaan Tiwari\Desktop\Kafka\landing | existing parquet files: 195


### Stop Monitor

In [10]:
# %%
stop_landing_monitor()


[monitor] stopped


## Clear Lander Folder : Run Only when Writer in not running.

In [4]:
# %% 
# A0) HARD RESET: remove ALL previous landing data
from pathlib import Path
import shutil

LANDING_DIR = Path(r"C:\Users\Ishaan Tiwari\Desktop\Kafka\landing")


def reset_landing(base: Path = LANDING_DIR):
    if base.exists():
        shutil.rmtree(base)
    base.mkdir(parents=True, exist_ok=True)
    print(f"[reset] Cleaned and recreated {base.resolve()}")

# Stop the writer loop first, then run:
reset_landing()


[reset] Cleaned and recreated C:\Users\Ishaan Tiwari\Desktop\Kafka\landing
