In [16]:
# config & setup 
import os, json, datetime, asyncio
from pathlib import Path
from dotenv import load_dotenv

# Read Telegram credentials from the project-level .env file (one directory up).
load_dotenv("../.env")

# Fail fast with an explicit message instead of the opaque
# "int() argument must be ... not 'NoneType'" raised when a variable is absent.
missing_vars = [name for name in ("TELEGRAM_API_ID", "TELEGRAM_API_HASH") if not os.getenv(name)]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_vars)
        + " (expected in ../.env or the process environment)"
    )

api_id = int(os.environ["TELEGRAM_API_ID"])
api_hash = os.environ["TELEGRAM_API_HASH"]
# Optional: session name passed to TelegramClient; falls back to a default.
session_name = os.getenv("TELEGRAM_SESSION", "telegram_session")

# Channels this notebook scrapes.
CHANNELS = ["@CheMed123", "@lobelia4cosmetics", "@tikvahpharma"]

# Force full backfill for all channels (ignore state) -- an independent copy of CHANNELS.
FORCE_FULL = list(CHANNELS)

# Messages requested per page; smaller = gentler on rate limits and flaky networks.
BACKFILL_PAGE_SIZE = 100

# Output is partitioned by the date of the run (ISO format, e.g. 2025-08-21).
today = datetime.date.today().isoformat()
BASE = Path("..")
_RAW_DIR = BASE / "data" / "raw"
MSG_DIR = _RAW_DIR / "telegram_messages" / today
IMG_DIR = _RAW_DIR / "images" / today
STATE_FILE = BASE / ".state" / "scrape_state.json"

# Make sure every output location exists before anything is written.
for _dir in (MSG_DIR, IMG_DIR, STATE_FILE.parent):
    _dir.mkdir(parents=True, exist_ok=True)

def load_state(path=None):
    """Load the persisted scrape state.

    Args:
        path: Optional location of the state file. Defaults to ``STATE_FILE``.

    Returns:
        The decoded JSON object as a dict. An empty dict is returned when the
        file does not exist, cannot be read or decoded, or does not contain a
        JSON object; the latter cases print a warning so that a discarded
        state file is not silent.
    """
    target = STATE_FILE if path is None else Path(path)
    if not target.exists():
        return {}
    try:
        data = json.loads(target.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"[WARN] could not read state file {target}: {e}; starting with empty state")
        return {}
    if not isinstance(data, dict):
        # Callers index the state by channel name, so anything but an object is unusable.
        print(f"[WARN] state file {target} does not contain a JSON object; starting with empty state")
        return {}
    return data

def save_state(state: dict, path=None):
    """Persist the scrape state as indented JSON.

    The JSON is written to a sibling temporary file which then replaces the
    target, so an interrupted write does not leave a truncated state file.

    Args:
        state: Mapping to serialise; must be JSON-serialisable.
        path: Optional destination. Defaults to ``STATE_FILE``.
    """
    target = STATE_FILE if path is None else Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    tmp = target.with_name(target.name + ".tmp")
    tmp.write_text(json.dumps(state, indent=2), encoding="utf-8")
    # Path.replace overwrites an existing destination.
    tmp.replace(target)

# Per-channel scrape state from the state file; an empty dict when the file
# is missing or cannot be parsed.
state = load_state()
# Echo the output locations so they are visible in the cell output.
print("Will write JSON under:", MSG_DIR)
print("Will write images under:", IMG_DIR)


Will write JSON under: ..\data\raw\telegram_messages\2025-08-21
Will write images under: ..\data\raw\images\2025-08-21


In [17]:
#  FULL backfill (ignores state) 
from telethon import TelegramClient
from telethon.tl.types import MessageMediaPhoto
from telethon.errors.rpcerrorlist import FloodWaitError
from pathlib import Path
import datetime, asyncio

def write_json(channel_name: str, records: list):
    """Dump ``records`` to ``MSG_DIR/<channel_name>.json`` and return that path.

    The file is UTF-8 encoded, indented by two spaces, and keeps non-ASCII
    characters unescaped. The parent directory is created if needed.
    """
    destination = MSG_DIR / f"{channel_name}.json"
    destination.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(records, ensure_ascii=False, indent=2)
    destination.write_text(payload, encoding="utf-8")
    return destination

async def scrape_full_history(client: TelegramClient, channel: str, max_retries: int = 5):
    """Scrape the whole message history of one channel, ignoring saved state.

    Messages are requested in pages of ``BACKFILL_PAGE_SIZE``; each page starts
    at the id of the last message already processed. Photos are downloaded into
    ``IMG_DIR/<channel>/``. When a page comes back empty, all records are
    written with ``write_json`` and the channel's entry in ``state`` is updated
    and saved.

    Args:
        client: Telethon client used for all requests.
        channel: Channel identifier, e.g. ``"@name"``; the ``@`` is stripped
            for file and directory names.
        max_retries: Number of consecutive retries allowed for a page that
            fails without processing any message. Flood waits are not counted.

    Raises:
        Exception: The last non-flood-wait error, once ``max_retries``
            consecutive retries have made no progress. Nothing is written to
            the JSON file or the state in that case.
    """
    ch_name = channel.strip("@")
    ch_img_dir = IMG_DIR / ch_name
    ch_img_dir.mkdir(parents=True, exist_ok=True)

    out = []
    offset_id = 0
    total = 0
    failures = 0  # consecutive failed attempts that processed no message

    print(f" FULL backfill {channel} (ignoring state) ...")
    while True:
        batch = 0
        try:
            async for msg in client.iter_messages(channel, limit=BACKFILL_PAGE_SIZE, offset_id=offset_id):
                rec = {
                    "id": int(msg.id),
                    "channel_name": ch_name,
                    "message_text": getattr(msg, "message", None),
                    "message_date": msg.date.isoformat() if msg.date else None,
                    "has_image": False,
                    "image_path": None,
                }
                if isinstance(msg.media, MessageMediaPhoto):
                    try:
                        file_path = (ch_img_dir / f"{msg.id}").as_posix()
                        saved = await msg.download_media(file=file_path)
                        if saved:
                            # Store the path relative to the project root.
                            rel = Path(saved).resolve().relative_to(BASE.resolve())
                            rec["has_image"] = True
                            rec["image_path"] = rel.as_posix()
                    except FloodWaitError:
                        # Let the page-level handler sleep and retry. offset_id has
                        # not advanced past this message, so it is fetched again
                        # rather than recorded without its image.
                        raise
                    except Exception as e:
                        print(f"[WARN] {channel} msg {msg.id}: image download failed: {e}")

                out.append(rec)
                # Advance only after the record is stored, so a retry resumes
                # right after the last fully processed message.
                offset_id = msg.id
                batch += 1
                total += 1

        except FloodWaitError as e:
            print(f"[RATE LIMIT] {channel}: sleeping {e.seconds}s ...")
            await asyncio.sleep(e.seconds + 5)
            continue
        except Exception as e:
            # A page that processed some messages before failing made progress,
            # so only failures with no progress accumulate towards the limit.
            failures = failures + 1 if batch == 0 else 1
            if failures > max_retries:
                print(f"[ERROR] {channel}: {e}; giving up after {max_retries} retries")
                raise
            print(f"[ERROR] {channel}: {e}; retrying current page ...")
            await asyncio.sleep(3)
            continue

        failures = 0
        print(f"  fetched batch={batch} (offset_id now {offset_id}), total={total}")
        if batch == 0:
            break  # no more messages

    out_json = write_json(ch_name, out)
    print(f"FULL saved {total} → {out_json}")

    if out:
        max_id = max(m["id"] for m in out)
        # Naive UTC timestamp, same format as the deprecated datetime.utcnow().
        last_run = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat()
        state[channel] = {"last_id": max_id, "last_run": last_run}
        save_state(state)

async def run_backfill():
    """Run a full-history scrape for every channel in ``FORCE_FULL``.

    A single Telegram client session is opened for the whole run and the
    channels are processed one after another.
    """
    telegram = TelegramClient(session_name, api_id, api_hash)
    async with telegram as client:
        for channel in FORCE_FULL:
            await scrape_full_history(client, channel)
    print(" Full backfill complete for:", FORCE_FULL)

# Start the backfill. Top-level `await` relies on the notebook kernel's
# support for awaiting in a cell; in a plain script use asyncio.run(run_backfill()).
await run_backfill()


 FULL backfill @CheMed123 (ignoring state) ...
  fetched batch=76 (offset_id now 1), total=76
  fetched batch=0 (offset_id now 1), total=76
FULL saved 76 → ..\data\raw\telegram_messages\2025-08-21\CheMed123.json
 FULL backfill @lobelia4cosmetics (ignoring state) ...
  fetched batch=100 (offset_id now 19178), total=100
  fetched batch=100 (offset_id now 19074), total=200
  fetched batch=100 (offset_id now 18969), total=300
  fetched batch=100 (offset_id now 18859), total=400
  fetched batch=100 (offset_id now 18751), total=500
  fetched batch=100 (offset_id now 18645), total=600
  fetched batch=100 (offset_id now 18544), total=700
  fetched batch=100 (offset_id now 18440), total=800
  fetched batch=100 (offset_id now 18336), total=900
  fetched batch=100 (offset_id now 18228), total=1000
  fetched batch=100 (offset_id now 18121), total=1100
  fetched batch=100 (offset_id now 18015), total=1200
  fetched batch=100 (offset_id now 17914), total=1300
  fetched batch=100 (offset_id now 17812