# FETCH AND APPEND

## CSV Daily Cleaner

### IHSG Emiten Cleaner

In [2]:
import pandas as pd
from pathlib import Path
import yfinance as yf
from tqdm import tqdm
from tabulate import tabulate
from datetime import datetime

# ================== CONFIG ==================
# FIX: Ticker file adalah file master dengan semua kolom
TICKER_FILE_MASTER = Path("ihsg/emiten_ihsg_active.csv")
OUTPUT_FILE_SUFFIX = datetime.now().strftime("%Y%m%d")
OUTPUT_FILE = Path(f"ihsg/emiten_ihsg_active_{OUTPUT_FILE_SUFFIX}.csv")

# -------------- MAIN LOGIC --------------
print(f"Memuat daftar ticker dari file master: {TICKER_FILE_MASTER.resolve()}")
try:
    df_master = pd.read_csv(TICKER_FILE_MASTER)
    print(f"Total {len(df_master)} ticker dimuat.")

    # Menjalankan validasi yfinance
    valid_tickers = []
    delisted_tickers = []
    
    intervals_to_check = [
        ("7d", "1m"),
        ("60d", "5m"),
        ("60d", "15m"),
        ("5y", "1d")
    ]

    print("\nMemulai validasi ticker via yfinance untuk 4 timeframes...")

    for ticker in tqdm(df_master['ticker'].unique(), desc="Validating tickers"):
        is_valid = True
        
        for period, interval in intervals_to_check:
            try:
                data = yf.download(ticker, period=period, interval=interval, progress=False, auto_adjust=False)
                if data.empty:
                    is_valid = False
                    break
            except Exception:
                is_valid = False
                break
    
        if is_valid:
            valid_tickers.append(ticker)
        else:
            delisted_tickers.append(ticker)
            
    print("\nValidasi selesai.")

    # -------------- OUTPUT --------------
    # FIX: Filter DataFrame master (df_master) dengan daftar ticker yang valid
    df_filtered = df_master[df_master['ticker'].isin(valid_tickers)].copy()

    print(f"✅ Ditemukan {len(df_filtered)} ticker yang valid.")
    print(f"❌ Ditemukan {len(delisted_tickers)} ticker yang tidak valid atau delisting.")

    df_filtered.to_csv(OUTPUT_FILE, index=False)
    print(f"\nData yang valid telah disimpan ke file: {OUTPUT_FILE.resolve()}")

    print("\nSampel 20 ticker yang valid:")
    print(tabulate(df_filtered.head(20), headers='keys', tablefmt='pipe'))
    
except FileNotFoundError:
    print(f"❌ ERROR: File '{TICKER_FILE_MASTER}' tidak ditemukan.")
    print("Pastikan file berada di direktori yang sama dengan skrip ini.")
except Exception as e:
    print(f"❌ ERROR: Terjadi kesalahan saat memproses data.")
    print(f"   Detail: {e}")

Memuat daftar ticker dari file master: /home/mkemalw/Projects/SSSAHAM_SERVICE/ihsg/emiten_ihsg_active.csv
Total 773 ticker dimuat.

Memulai validasi ticker via yfinance untuk 4 timeframes...


Validating tickers:   0%|          | 0/773 [00:00<?, ?it/s]

Validating tickers:   7%|▋         | 51/773 [00:44<10:26,  1.15it/s]


KeyboardInterrupt: 

### sanity check yfinance connection

In [8]:
import yfinance as yf
import pandas as pd

ticker = "BBRI.JK"
interval = "1m"
period = "1d"

# Ambil data mentah tanpa auto_adjust
df = yf.download(ticker, interval=interval, period=period, auto_adjust=False)

# Konversi timezone ke Jakarta
df = df.tz_convert("Asia/Jakarta")

# Filter jam perdagangan BEI
df = df.between_time("09:00", "15:30")

print(df.head(10))
print(f"\nTotal rows: {len(df)}")


[*********************100%***********************]  1 of 1 completed

Price                     Adj Close   Close    High     Low    Open   Volume
Ticker                      BBRI.JK BBRI.JK BBRI.JK BBRI.JK BBRI.JK  BBRI.JK
Datetime                                                                    
2025-08-19 09:00:00+07:00    4070.0  4070.0  4100.0  4070.0  4090.0        0
2025-08-19 09:01:00+07:00    4050.0  4050.0  4080.0  4050.0  4070.0  4596600
2025-08-19 09:02:00+07:00    4050.0  4050.0  4070.0  4050.0  4060.0  4691600
2025-08-19 09:03:00+07:00    4070.0  4070.0  4070.0  4050.0  4050.0  1529600
2025-08-19 09:04:00+07:00    4070.0  4070.0  4080.0  4060.0  4070.0  1500400
2025-08-19 09:05:00+07:00    4070.0  4070.0  4080.0  4070.0  4070.0   520000
2025-08-19 09:06:00+07:00    4070.0  4070.0  4080.0  4070.0  4070.0  1013100
2025-08-19 09:07:00+07:00    4070.0  4070.0  4080.0  4070.0  4080.0   700500
2025-08-19 09:08:00+07:00    4070.0  4070.0  4080.0  4070.0  4070.0   378400
2025-08-19 09:09:00+07:00    4060.0  4060.0  4080.0  4060.0  4070.0  1034100




## Fetch and Append Service Using Alphabetical

### Service 1m fetch and append

In [1]:
# ============================================
# 1M APPEND FIX - IN-PLACE (cache_1m overwrite)
# - Scan semua .csv di cache_1m
# - Fetch fresh dari Yahoo, append & overwrite
# ============================================
from pathlib import Path
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import yfinance as yf
import os, tempfile, shutil

# ---------- CONFIG ----------
FOLDER         = Path("emiten/cache_1m")   # overwrite langsung di sini
YF_PERIOD      = "5d"
YF_INTERVAL    = "1m"
SESSION_START  = "09:00"
SESSION_END    = "15:50"
LOOKBACK_MIN   = 5
DRY_RUN        = False
# ----------------------------

STANDARD_COLS = ["Open","High","Low","Close","Adj Close","Volume"]

def _atomic_write_csv(fp: Path, df: pd.DataFrame):
    tmpdir = Path(tempfile.mkdtemp(prefix="tmp_write_"))
    tmpfp = tmpdir / (fp.name + ".tmp")
    df.to_csv(tmpfp, index=False)
    shutil.move(str(tmpfp), str(fp))
    shutil.rmtree(tmpdir, ignore_errors=True)

def _parse_jakarta(x: pd.Series) -> pd.Series:
    dt = pd.to_datetime(x, errors="coerce", utc=False)
    if getattr(dt.dt, "tz", None) is None:
        dt = dt.dt.tz_localize("Asia/Jakarta")
    else:
        dt = dt.dt.tz_convert("Asia/Jakarta")
    return dt

def _fetch_fresh_1m(ticker: str) -> pd.DataFrame:
    df = yf.download(
        ticker, period=YF_PERIOD, interval=YF_INTERVAL,
        auto_adjust=False, threads=False, progress=False
    )
    if df is None or df.empty:
        return pd.DataFrame(columns=STANDARD_COLS)

    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [c[0] for c in df.columns]
    if "Price" in df.columns and "Close" not in df.columns:
        df = df.rename(columns={"Price":"Close"})

    idx = df.index
    if getattr(idx, "tz", None) is None:
        df.index = pd.DatetimeIndex(idx).tz_localize("UTC")
    df = df.tz_convert("Asia/Jakarta")
    df = df.between_time(SESSION_START, SESSION_END)

    for c in STANDARD_COLS:
        if c not in df.columns:
            df[c] = pd.NA
    df = df[STANDARD_COLS]
    for c in STANDARD_COLS:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    df = df.dropna(how="all", subset=["Open","High","Low","Close","Volume"])
    return df

def _read_last_dt(fp: Path):
    if not fp.exists() or fp.stat().st_size == 0:
        return None
    try:
        d = pd.read_csv(fp, usecols=["Datetime"])
        if d.empty:
            return None
        dt = _parse_jakarta(d["Datetime"])
        return dt.max()
    except Exception:
        return None

def _merge_append_write(ticker: str, out_csv: Path) -> dict:
    fresh = _fetch_fresh_1m(ticker)
    if fresh.empty:
        return {"ticker": ticker, "status": "no-fresh", "wrote": False}

    last_dt = _read_last_dt(out_csv)
    sess_today_start = pd.Timestamp(datetime.now().date(), tz="Asia/Jakarta") + pd.Timedelta(hours=9)

    if (last_dt is None) or (last_dt < sess_today_start):
        merge_start = sess_today_start
    else:
        merge_start = last_dt - pd.Timedelta(minutes=LOOKBACK_MIN)

    fresh = fresh.loc[fresh.index >= merge_start].copy()

    if out_csv.exists() and out_csv.stat().st_size > 0:
        base = pd.read_csv(out_csv, low_memory=False)
        if base.empty or "Datetime" not in base.columns:
            base = pd.DataFrame(columns=["Datetime"] + STANDARD_COLS)
        else:
            base["Datetime"] = _parse_jakarta(base["Datetime"])
            for c in STANDARD_COLS:
                if c not in base.columns:
                    base[c] = pd.NA
            base = base[["Datetime"] + STANDARD_COLS]
    else:
        base = pd.DataFrame(columns=["Datetime"] + STANDARD_COLS)

    fresh_out = fresh.reset_index().rename(columns={"index":"Datetime"})
    for c in STANDARD_COLS:
        if c not in fresh_out.columns:
            fresh_out[c] = pd.NA
    fresh_out = fresh_out[["Datetime"] + STANDARD_COLS]

    frames = [x for x in (base, fresh_out) if not x.empty]
    if frames:
        merged = pd.concat(frames, ignore_index=True)
        merged = (
            merged.drop_duplicates(subset=["Datetime"], keep="last")
                  .sort_values("Datetime")
        )
    else:
        merged = pd.DataFrame(columns=["Datetime"] + STANDARD_COLS)

    if not DRY_RUN:
        _atomic_write_csv(out_csv, merged)

    return {
        "ticker": ticker,
        "status": "ok",
        "rows_base": len(base),
        "rows_fresh": len(fresh_out),
        "rows_out": len(merged),
        "min_out": merged["Datetime"].min() if not merged.empty else None,
        "max_out": merged["Datetime"].max() if not merged.empty else None,
        "merge_start": merge_start,
        "last_dt_before": last_dt,
        "wrote": not DRY_RUN,
        "file": str(out_csv),
    }

# --------------- RUN BATCH ---------------
csv_files = sorted(FOLDER.glob("*.csv"))
for fp in csv_files:
    print(f"📄 Running on {fp.name} ...")  # ← pindahkan ke sini
    ticker = fp.stem.upper()
    res = _merge_append_write(ticker, fp)

    if res.get("status") == "ok":
        print(f"✅ {ticker} | base={res['rows_base']} fresh={res['rows_fresh']} out={res['rows_out']} | "
              f"{res['min_out']} … {res['max_out']} | wrote={res['wrote']} | file={res['file']}")
    else:
        print(f"⚠️  {ticker} | {res.get('status')} | wrote={res.get('wrote')} | file={fp}")


📄 Running on AADI.JK.csv ...
✅ AADI.JK | base=3189 fresh=6 out=3189 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/AADI.JK.csv
📄 Running on AALI.JK.csv ...
✅ AALI.JK | base=3179 fresh=6 out=3179 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/AALI.JK.csv
📄 Running on ABDA.JK.csv ...
⚠️  ABDA.JK | no-fresh | wrote=False | file=emiten/cache_1m/ABDA.JK.csv
📄 Running on ABMM.JK.csv ...
✅ ABMM.JK | base=3015 fresh=3 out=3015 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/ABMM.JK.csv
📄 Running on ACES.JK.csv ...
✅ ACES.JK | base=3192 fresh=6 out=3192 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/ACES.JK.csv
📄 Running on ACRO.JK.csv ...
✅ ACRO.JK | base=3046 fresh=4 out=3046 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/ACRO.JK.csv
📄 Running on ACST.JK.csv ...
✅ ACST.JK 


1 Failed download:
['ASLI.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=5d)')


✅ ASLC.JK | base=3197 fresh=6 out=3197 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/ASLC.JK.csv
📄 Running on ASLI.JK.csv ...
⚠️  ASLI.JK | no-fresh | wrote=False | file=emiten/cache_1m/ASLI.JK.csv
📄 Running on ASPI.JK.csv ...
✅ ASPI.JK | base=3107 fresh=5 out=3107 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/ASPI.JK.csv
📄 Running on ASPR.JK.csv ...
✅ ASPR.JK | base=3065 fresh=3 out=3065 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/ASPR.JK.csv
📄 Running on ASRI.JK.csv ...
✅ ASRI.JK | base=3195 fresh=6 out=3195 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/ASRI.JK.csv
📄 Running on ASRM.JK.csv ...
✅ ASRM.JK | base=2886 fresh=2 out=2886 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:25:00+07:00 | wrote=True | file=emiten/cache_1m/ASRM.JK.csv
📄 Running on ASSA.JK.csv ...
✅ ASSA.JK | base=3042 fresh=5 out=3042 


1 Failed download:
['BSWD.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=5d)')


✅ BSSR.JK | base=3016 fresh=2 out=3016 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/BSSR.JK.csv
📄 Running on BSWD.JK.csv ...
⚠️  BSWD.JK | no-fresh | wrote=False | file=emiten/cache_1m/BSWD.JK.csv
📄 Running on BTON.JK.csv ...
✅ BTON.JK | base=2882 fresh=2 out=2882 | 2025-08-11 09:00:00+07:00 … 2025-08-20 13:48:00+07:00 | wrote=True | file=emiten/cache_1m/BTON.JK.csv
📄 Running on BTPN.JK.csv ...
✅ BTPN.JK | base=2901 fresh=1 out=2901 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/BTPN.JK.csv
📄 Running on BTPS.JK.csv ...
✅ BTPS.JK | base=3135 fresh=6 out=3135 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/BTPS.JK.csv
📄 Running on BUAH.JK.csv ...
✅ BUAH.JK | base=2950 fresh=3 out=2950 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/BUAH.JK.csv
📄 Running on BUDI.JK.csv ...
✅ BUDI.JK | base=2891 fresh=1 out=2891 


1 Failed download:
['CBRE.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=5d)')


✅ CBPE.JK | base=2890 fresh=1 out=2890 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:42:00+07:00 | wrote=True | file=emiten/cache_1m/CBPE.JK.csv
📄 Running on CBRE.JK.csv ...
⚠️  CBRE.JK | no-fresh | wrote=False | file=emiten/cache_1m/CBRE.JK.csv
📄 Running on CBUT.JK.csv ...
✅ CBUT.JK | base=2904 fresh=1 out=2904 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:48:00+07:00 | wrote=True | file=emiten/cache_1m/CBUT.JK.csv
📄 Running on CCSI.JK.csv ...
✅ CCSI.JK | base=2917 fresh=2 out=2917 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:47:00+07:00 | wrote=True | file=emiten/cache_1m/CCSI.JK.csv
📄 Running on CDIA.JK.csv ...
✅ CDIA.JK | base=3197 fresh=6 out=3197 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/CDIA.JK.csv
📄 Running on CEKA.JK.csv ...
✅ CEKA.JK | base=2905 fresh=1 out=2905 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:48:00+07:00 | wrote=True | file=emiten/cache_1m/CEKA.JK.csv
📄 Running on CENT.JK.csv ...
✅ CENT.JK | base=3114 fresh=5 out=3114 


1 Failed download:
['COCO.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=5d)')


✅ CNMA.JK | base=2877 fresh=253 out=3130 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/CNMA.JK.csv
📄 Running on COCO.JK.csv ...
⚠️  COCO.JK | no-fresh | wrote=False | file=emiten/cache_1m/COCO.JK.csv
📄 Running on COIN.JK.csv ...
✅ COIN.JK | base=2877 fresh=320 out=3197 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/COIN.JK.csv
📄 Running on CPIN.JK.csv ...
✅ CPIN.JK | base=2877 fresh=319 out=3196 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/CPIN.JK.csv
📄 Running on CPRO.JK.csv ...
✅ CPRO.JK | base=2877 fresh=135 out=3012 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:48:00+07:00 | wrote=True | file=emiten/cache_1m/CPRO.JK.csv
📄 Running on CRAB.JK.csv ...
✅ CRAB.JK | base=2877 fresh=46 out=2923 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/CRAB.JK.csv
📄 Running on CRSN.JK.csv ...
✅ CRSN.JK | base=2877 fresh=10


1 Failed download:
['CSMI.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=5d)')


✅ CSIS.JK | base=2877 fresh=259 out=3136 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:48:00+07:00 | wrote=True | file=emiten/cache_1m/CSIS.JK.csv
📄 Running on CSMI.JK.csv ...
⚠️  CSMI.JK | no-fresh | wrote=False | file=emiten/cache_1m/CSMI.JK.csv
📄 Running on CSRA.JK.csv ...
✅ CSRA.JK | base=2877 fresh=210 out=3087 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:48:00+07:00 | wrote=True | file=emiten/cache_1m/CSRA.JK.csv
📄 Running on CTBN.JK.csv ...
✅ CTBN.JK | base=2877 fresh=2 out=2879 | 2025-08-11 09:00:00+07:00 … 2025-08-20 14:55:00+07:00 | wrote=True | file=emiten/cache_1m/CTBN.JK.csv
📄 Running on CTRA.JK.csv ...
✅ CTRA.JK | base=2877 fresh=317 out=3194 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/CTRA.JK.csv
📄 Running on CUAN.JK.csv ...
✅ CUAN.JK | base=2877 fresh=320 out=3197 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/CUAN.JK.csv
📄 Running on CYBR.JK.csv ...
✅ CYBR.JK | base=2877 fresh=195


1 Failed download:
['FIMP.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=5d)')


✅ FILM.JK | base=2877 fresh=306 out=3183 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/FILM.JK.csv
📄 Running on FIMP.JK.csv ...
⚠️  FIMP.JK | no-fresh | wrote=False | file=emiten/cache_1m/FIMP.JK.csv
📄 Running on FIRE.JK.csv ...
✅ FIRE.JK | base=2877 fresh=148 out=3025 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:48:00+07:00 | wrote=True | file=emiten/cache_1m/FIRE.JK.csv
📄 Running on FITT.JK.csv ...
✅ FITT.JK | base=2877 fresh=114 out=2991 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/FITT.JK.csv
📄 Running on FLMC.JK.csv ...
✅ FLMC.JK | base=2877 fresh=15 out=2892 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/FLMC.JK.csv
📄 Running on FMII.JK.csv ...
✅ FMII.JK | base=2877 fresh=36 out=2913 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:37:00+07:00 | wrote=True | file=emiten/cache_1m/FMII.JK.csv
📄 Running on FOLK.JK.csv ...
✅ FOLK.JK | base=2877 fresh=38 


1 Failed download:
['HOPE.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=5d)')


✅ HOMI.JK | base=2877 fresh=17 out=2894 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:41:00+07:00 | wrote=True | file=emiten/cache_1m/HOMI.JK.csv
📄 Running on HOPE.JK.csv ...
⚠️  HOPE.JK | no-fresh | wrote=False | file=emiten/cache_1m/HOPE.JK.csv
📄 Running on HRTA.JK.csv ...
✅ HRTA.JK | base=2877 fresh=266 out=3143 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/HRTA.JK.csv
📄 Running on HRUM.JK.csv ...
✅ HRUM.JK | base=2877 fresh=259 out=3136 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/HRUM.JK.csv
📄 Running on HUMI.JK.csv ...
✅ HUMI.JK | base=2877 fresh=298 out=3175 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:48:00+07:00 | wrote=True | file=emiten/cache_1m/HUMI.JK.csv
📄 Running on HYGN.JK.csv ...
✅ HYGN.JK | base=2877 fresh=122 out=2999 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:38:00+07:00 | wrote=True | file=emiten/cache_1m/HYGN.JK.csv
📄 Running on IATA.JK.csv ...
✅ IATA.JK | base=2877 fresh=4 


1 Failed download:
['IMPC.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=5d)')


✅ IMJS.JK | base=2877 fresh=304 out=3181 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/IMJS.JK.csv
📄 Running on IMPC.JK.csv ...
⚠️  IMPC.JK | no-fresh | wrote=False | file=emiten/cache_1m/IMPC.JK.csv
📄 Running on INAI.JK.csv ...
✅ INAI.JK | base=2877 fresh=15 out=2892 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:17:00+07:00 | wrote=True | file=emiten/cache_1m/INAI.JK.csv
📄 Running on INCI.JK.csv ...
✅ INCI.JK | base=2877 fresh=88 out=2965 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:47:00+07:00 | wrote=True | file=emiten/cache_1m/INCI.JK.csv
📄 Running on INCO.JK.csv ...
✅ INCO.JK | base=2877 fresh=314 out=3191 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/INCO.JK.csv
📄 Running on INDF.JK.csv ...
✅ INDF.JK | base=2877 fresh=319 out=3196 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/INDF.JK.csv
📄 Running on INDO.JK.csv ...
✅ INDO.JK | base=2877 fresh=115


1 Failed download:
['IRSX.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=5d)')


✅ IRRA.JK | base=2877 fresh=75 out=2952 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:45:00+07:00 | wrote=True | file=emiten/cache_1m/IRRA.JK.csv
📄 Running on IRSX.JK.csv ...
⚠️  IRSX.JK | no-fresh | wrote=False | file=emiten/cache_1m/IRSX.JK.csv
📄 Running on ISAT.JK.csv ...
✅ ISAT.JK | base=2877 fresh=266 out=3143 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/ISAT.JK.csv
📄 Running on ISEA.JK.csv ...
✅ ISEA.JK | base=2877 fresh=187 out=3064 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:48:00+07:00 | wrote=True | file=emiten/cache_1m/ISEA.JK.csv
📄 Running on ISSP.JK.csv ...
✅ ISSP.JK | base=2877 fresh=114 out=2991 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/ISSP.JK.csv
📄 Running on ITIC.JK.csv ...
✅ ITIC.JK | base=2877 fresh=55 out=2932 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:46:00+07:00 | wrote=True | file=emiten/cache_1m/ITIC.JK.csv
📄 Running on ITMA.JK.csv ...
✅ ITMA.JK | base=2877 fresh=96 


1 Failed download:
['PMMP.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=5d)')


✅ PMJS.JK | base=2877 fresh=39 out=2916 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:44:00+07:00 | wrote=True | file=emiten/cache_1m/PMJS.JK.csv
📄 Running on PMMP.JK.csv ...
⚠️  PMMP.JK | no-fresh | wrote=False | file=emiten/cache_1m/PMMP.JK.csv
📄 Running on PMUI.JK.csv ...
✅ PMUI.JK | base=2877 fresh=156 out=3033 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/PMUI.JK.csv
📄 Running on PNBN.JK.csv ...
✅ PNBN.JK | base=2877 fresh=199 out=3076 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/PNBN.JK.csv
📄 Running on PNBS.JK.csv ...
✅ PNBS.JK | base=2877 fresh=157 out=3034 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/PNBS.JK.csv
📄 Running on PNGO.JK.csv ...
✅ PNGO.JK | base=2877 fresh=42 out=2919 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:44:00+07:00 | wrote=True | file=emiten/cache_1m/PNGO.JK.csv
📄 Running on PNIN.JK.csv ...
✅ PNIN.JK | base=2877 fresh=44 


1 Failed download:
['PYFA.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=5d)')


✅ PWON.JK | base=2877 fresh=315 out=3192 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/PWON.JK.csv
📄 Running on PYFA.JK.csv ...
⚠️  PYFA.JK | no-fresh | wrote=False | file=emiten/cache_1m/PYFA.JK.csv
📄 Running on PZZA.JK.csv ...
✅ PZZA.JK | base=2877 fresh=228 out=3105 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/PZZA.JK.csv
📄 Running on RAAM.JK.csv ...
✅ RAAM.JK | base=2877 fresh=144 out=3021 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/RAAM.JK.csv
📄 Running on RAJA.JK.csv ...
✅ RAJA.JK | base=2877 fresh=319 out=3196 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/RAJA.JK.csv
📄 Running on RALS.JK.csv ...
✅ RALS.JK | base=2877 fresh=140 out=3017 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/RALS.JK.csv
📄 Running on RANC.JK.csv ...
✅ RANC.JK | base=2877 fresh=3


1 Failed download:
['ROCK.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=5d)')


✅ RMKO.JK | base=2877 fresh=97 out=2974 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/RMKO.JK.csv
📄 Running on ROCK.JK.csv ...
⚠️  ROCK.JK | no-fresh | wrote=False | file=emiten/cache_1m/ROCK.JK.csv
📄 Running on ROTI.JK.csv ...
✅ ROTI.JK | base=2877 fresh=48 out=2925 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/ROTI.JK.csv
📄 Running on RSCH.JK.csv ...
✅ RSCH.JK | base=2877 fresh=39 out=2916 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:47:00+07:00 | wrote=True | file=emiten/cache_1m/RSCH.JK.csv
📄 Running on RUIS.JK.csv ...
✅ RUIS.JK | base=2877 fresh=224 out=3101 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/RUIS.JK.csv
📄 Running on RUNS.JK.csv ...
✅ RUNS.JK | base=2877 fresh=81 out=2958 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/RUNS.JK.csv
📄 Running on SAFE.JK.csv ...
✅ SAFE.JK | base=2877 fresh=73 ou


1 Failed download:
['TFCO.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=5d)')


✅ TFAS.JK | base=2877 fresh=46 out=2923 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/TFAS.JK.csv
📄 Running on TFCO.JK.csv ...
⚠️  TFCO.JK | no-fresh | wrote=False | file=emiten/cache_1m/TFCO.JK.csv
📄 Running on TGKA.JK.csv ...



1 Failed download:
['TGUK.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=5d)')


✅ TGKA.JK | base=2877 fresh=3 out=2880 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:26:00+07:00 | wrote=True | file=emiten/cache_1m/TGKA.JK.csv
📄 Running on TGUK.JK.csv ...
⚠️  TGUK.JK | no-fresh | wrote=False | file=emiten/cache_1m/TGUK.JK.csv
📄 Running on TIFA.JK.csv ...
✅ TIFA.JK | base=2877 fresh=31 out=2908 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/TIFA.JK.csv
📄 Running on TINS.JK.csv ...
✅ TINS.JK | base=2877 fresh=304 out=3181 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/TINS.JK.csv
📄 Running on TIRA.JK.csv ...
✅ TIRA.JK | base=2877 fresh=58 out=2935 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:47:00+07:00 | wrote=True | file=emiten/cache_1m/TIRA.JK.csv
📄 Running on TKIM.JK.csv ...
✅ TKIM.JK | base=2877 fresh=271 out=3148 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/TKIM.JK.csv
📄 Running on TLDN.JK.csv ...
✅ TLDN.JK | base=2877 fresh=229 o


1 Failed download:
['ZBRA.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=5d)')


✅ YUPI.JK | base=2877 fresh=63 out=2940 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:48:00+07:00 | wrote=True | file=emiten/cache_1m/YUPI.JK.csv
📄 Running on ZBRA.JK.csv ...
⚠️  ZBRA.JK | no-fresh | wrote=False | file=emiten/cache_1m/ZBRA.JK.csv
📄 Running on ZONE.JK.csv ...
✅ ZONE.JK | base=2877 fresh=17 out=2894 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:40:00+07:00 | wrote=True | file=emiten/cache_1m/ZONE.JK.csv
📄 Running on ZYRX.JK.csv ...
✅ ZYRX.JK | base=2877 fresh=233 out=3110 | 2025-08-11 09:00:00+07:00 … 2025-08-20 15:49:00+07:00 | wrote=True | file=emiten/cache_1m/ZYRX.JK.csv


### Service 5m fetch and append

In [7]:
# ============================================
# 5M APPEND + SMART BACKFILL (cache_5m)
# - TZ Asia/Jakarta, sesi 09:00–15:50
# - First run: backfill penuh (coba beberapa period Yahoo)
# - Next runs: efisien (append dgn window lookback)
# - Merge: drop-dup by Datetime (keep last), overwrite atomik
# - Diagnostik ringkas per-ticker
# ============================================

from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np
import yfinance as yf
import tempfile, shutil

# ---------- CONFIG ----------
FOLDER        = Path("emiten/cache_5m")
SESSION_START = "09:00"
SESSION_END   = "15:50"
LOOKBACK_MIN  = 15
DRY_RUN       = False
STANDARD_COLS = ["Open","High","Low","Close","Adj Close","Volume"]

# Smart backfill
MIN_UNIQUE_DATES_TARGET = 5
PERIOD_CANDIDATES_5M    = ["60d", "30d", "1mo", "14d", "7d"]
ALLOW_BACKFILL_IF_BASE_SINGLE_DAY = True
VERBOSE = True
# ----------------------------

def _atomic_write_csv(fp: Path, df: pd.DataFrame):
    fp.parent.mkdir(parents=True, exist_ok=True)
    tmpdir = Path(tempfile.mkdtemp(prefix="tmp_write_"))
    tmpfp = tmpdir / (fp.name + ".tmp")
    df.to_csv(tmpfp, index=False)
    shutil.move(str(tmpfp), str(fp))
    shutil.rmtree(tmpdir, ignore_errors=True)

def _parse_jakarta(x: pd.Series) -> pd.Series:
    dt = pd.to_datetime(x, errors="coerce", utc=False)
    if getattr(dt.dt, "tz", None) is None:
        dt = dt.dt.tz_localize("Asia/Jakarta")
    else:
        dt = dt.dt.tz_convert("Asia/Jakarta")
    return dt

def _unique_dates_from_index(dt_index) -> list:
    idx = pd.DatetimeIndex(dt_index)
    if getattr(idx, "tz", None) is None:
        idx = idx.tz_localize("UTC")
    idx = idx.tz_convert("Asia/Jakarta")
    return sorted(pd.Series(idx.date).unique().tolist())

def _fetch_fresh_5m_try(ticker: str, period: str) -> pd.DataFrame:
    df = yf.download(
        ticker, period=period, interval="5m",
        auto_adjust=False, threads=False, progress=False
    )
    if df is None or df.empty:
        return pd.DataFrame(columns=STANDARD_COLS)

    # ratakan multiindex
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [c[0] for c in df.columns]

    # normalisasi kolom
    if "Price" in df.columns and "Close" in df.columns:
        df = df.drop(columns=["Price"])
    elif "Price" in df.columns and "Close" not in df.columns:
        df = df.rename(columns={"Price": "Close"})

    # pastikan tz → JKT
    if getattr(df.index, "tz", None) is None:
        df.index = pd.DatetimeIndex(df.index).tz_localize("UTC")
    df = df.tz_convert("Asia/Jakarta")

    # filter jam sesi
    df = df.between_time(SESSION_START, SESSION_END)

    # kolom wajib & tipe
    for c in STANDARD_COLS:
        if c not in df.columns:
            df[c] = pd.NA
    df = df[STANDARD_COLS]
    for c in STANDARD_COLS:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    df = df.dropna(how="all", subset=["Open","High","Low","Close","Volume"])
    return df

def _fetch_fresh_5m(ticker: str) -> pd.DataFrame:
    best_df, best_days = pd.DataFrame(columns=STANDARD_COLS), 0
    for per in PERIOD_CANDIDATES_5M:
        df = _fetch_fresh_5m_try(ticker, per)
        days = len(_unique_dates_from_index(df.index)) if not df.empty else 0
        if VERBOSE:
            print(f"    [Yahoo] {ticker} period={per} → rows={len(df)} days={days}")
        if days > best_days:
            best_days, best_df = days, df
        if days >= MIN_UNIQUE_DATES_TARGET:
            break
    return best_df

def _read_last_dt(fp: Path):
    if not fp.exists() or fp.stat().st_size == 0:
        return None
    try:
        d = pd.read_csv(fp, usecols=["Datetime"])
        if d.empty: return None
        dt = _parse_jakarta(d["Datetime"])
        return dt.max()
    except Exception:
        return None

def _read_base(out_csv: Path) -> pd.DataFrame:
    if out_csv.exists() and out_csv.stat().st_size > 0:
        base = pd.read_csv(out_csv, low_memory=False)
        if base.empty or "Datetime" not in base.columns:
            base = pd.DataFrame(columns=["Datetime"] + STANDARD_COLS)
        else:
            base["Datetime"] = _parse_jakarta(base["Datetime"])
            for c in STANDARD_COLS:
                if c not in base.columns:
                    base[c] = pd.NA
            base = base[["Datetime"] + STANDARD_COLS]
    else:
        base = pd.DataFrame(columns=["Datetime"] + STANDARD_COLS)
    return base

def _merge_append_write(ticker: str, out_csv: Path) -> dict:
    fresh = _fetch_fresh_5m(ticker)
    if fresh.empty:
        return {"ticker": ticker, "status": "no-fresh", "wrote": False}

    base = _read_base(out_csv)
    last_dt = _read_last_dt(out_csv)
    today = datetime.now().date()
    sess_today_start = pd.Timestamp(today, tz="Asia/Jakarta") + pd.Timedelta(hours=9)

    # deteksi base: berapa tanggal unik
    if not base.empty:
        base_dates = sorted(pd.Series(base["Datetime"].dt.tz_convert("Asia/Jakarta").dt.date).unique().tolist())
        is_base_single_day = (len(base_dates) == 1)
    else:
        base_dates, is_base_single_day = [], False

    # tentukan merge_start
    if (last_dt is None) or (ALLOW_BACKFILL_IF_BASE_SINGLE_DAY and is_base_single_day):
        merge_start = None                         # backfill penuh (ambil semua dari Yahoo)
    elif last_dt < sess_today_start:
        merge_start = sess_today_start             # refill dari awal sesi hari ini
    else:
        merge_start = last_dt - pd.Timedelta(minutes=LOOKBACK_MIN)  # append dgn window

    if VERBOSE:
        fresh_days = _unique_dates_from_index(fresh.index)
        print(f"    [Diag] base_days={base_dates} | fresh_days={fresh_days} | last_dt={last_dt} | merge_start={merge_start}")

    # filter fresh jika perlu
    if merge_start is not None:
        fresh = fresh.loc[fresh.index >= merge_start].copy()

    # bentuk fresh_out (pastikan kolom 'Datetime' ada)
    fresh_idx = pd.DatetimeIndex(fresh.index)
    fresh_idx.name = "Datetime"
    fresh_out = fresh.copy()
    fresh_out.index = fresh_idx
    fresh_out = fresh_out.reset_index()
    for c in STANDARD_COLS:
        if c not in fresh_out.columns:
            fresh_out[c] = pd.NA
    fresh_out = fresh_out[["Datetime"] + STANDARD_COLS]

    # merge & tulis (overwrite atomik)
    frames = [x for x in (base, fresh_out) if not x.empty]
    if frames:
        merged = (pd.concat(frames, ignore_index=True)
                    .drop_duplicates(subset=["Datetime"], keep="last")
                    .sort_values("Datetime"))
    else:
        merged = pd.DataFrame(columns=["Datetime"] + STANDARD_COLS)

    if not DRY_RUN:
        _atomic_write_csv(out_csv, merged)

    return {
        "ticker": ticker,
        "status": "ok",
        "rows_base": len(base),
        "rows_fresh": len(fresh_out),
        "rows_out": len(merged),
        "min_out": merged["Datetime"].min() if not merged.empty else None,
        "max_out": merged["Datetime"].max() if not merged.empty else None,
        "merge_start": merge_start,
        "last_dt_before": last_dt,
        "wrote": not DRY_RUN,
        "file": str(out_csv),
        "base_days": base_dates if base_dates else [],
        "fresh_days": _unique_dates_from_index(fresh.index),
    }

# --------------- RUN ---------------
for file in sorted(FOLDER.glob("*.csv")):
    tkr = file.stem.upper()
    out_csv = file
    res = _merge_append_write(tkr, out_csv)
    if res.get("status") == "ok":
        print(f"✅ {tkr} | base={res['rows_base']} fresh={res['rows_fresh']} out={res['rows_out']} | "
              f"{res['min_out']} … {res['max_out']} | wrote={res['wrote']} | file={res['file']}")
    else:
        print(f"⚠️  {tkr} | {res.get('status')} | wrote={res.get('wrote')} | file={out_csv}")


    [Yahoo] AADI.JK period=60d → rows=3656 days=59
    [Diag] base_days=[datetime.date(2025, 8, 20)] | fresh_days=[datetime.date(2025, 5, 22), datetime.date(2025, 5, 23), datetime.date(2025, 5, 26), datetime.date(2025, 5, 27), datetime.date(2025, 5, 28), datetime.date(2025, 6, 2), datetime.date(2025, 6, 3), datetime.date(2025, 6, 4), datetime.date(2025, 6, 5), datetime.date(2025, 6, 10), datetime.date(2025, 6, 11), datetime.date(2025, 6, 12), datetime.date(2025, 6, 13), datetime.date(2025, 6, 16), datetime.date(2025, 6, 17), datetime.date(2025, 6, 18), datetime.date(2025, 6, 19), datetime.date(2025, 6, 20), datetime.date(2025, 6, 23), datetime.date(2025, 6, 24), datetime.date(2025, 6, 25), datetime.date(2025, 6, 26), datetime.date(2025, 6, 30), datetime.date(2025, 7, 1), datetime.date(2025, 7, 2), datetime.date(2025, 7, 3), datetime.date(2025, 7, 4), datetime.date(2025, 7, 7), datetime.date(2025, 7, 8), datetime.date(2025, 7, 9), datetime.date(2025, 7, 10), datetime.date(2025, 7, 11), 


1 Failed download:
['TGUK.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=30d)')


    [Yahoo] TGUK.JK period=60d → rows=108 days=2
    [Yahoo] TGUK.JK period=30d → rows=0 days=0



1 Failed download:
['TGUK.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=1mo)')

1 Failed download:
['TGUK.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=14d)')

1 Failed download:
['TGUK.JK']: YFPricesMissingError('possibly delisted; no price data found  (period=7d)')


    [Yahoo] TGUK.JK period=1mo → rows=0 days=0
    [Yahoo] TGUK.JK period=14d → rows=0 days=0
    [Yahoo] TGUK.JK period=7d → rows=0 days=0
    [Diag] base_days=[] | fresh_days=[datetime.date(2025, 5, 22), datetime.date(2025, 5, 23)] | last_dt=None | merge_start=None
✅ TGUK.JK | base=0 fresh=108 out=108 | 2025-05-22 09:00:00+07:00 … 2025-05-23 15:45:00+07:00 | wrote=True | file=emiten/cache_5m/TGUK.JK.csv
    [Yahoo] TIFA.JK period=60d → rows=587 days=32
    [Diag] base_days=[datetime.date(2025, 8, 20)] | fresh_days=[datetime.date(2025, 6, 11), datetime.date(2025, 6, 12), datetime.date(2025, 6, 13), datetime.date(2025, 6, 17), datetime.date(2025, 6, 20), datetime.date(2025, 7, 14), datetime.date(2025, 7, 15), datetime.date(2025, 7, 16), datetime.date(2025, 7, 17), datetime.date(2025, 7, 18), datetime.date(2025, 7, 21), datetime.date(2025, 7, 22), datetime.date(2025, 7, 23), datetime.date(2025, 7, 24), datetime.date(2025, 7, 25), datetime.date(2025, 7, 28), datetime.date(2025, 7, 29), d

### Service 15m fetch and append

In [3]:
# ============================================
# 15M APPEND FIX (semua .csv di cache_15m)
# - Robust tz handling (Asia/Jakarta everywhere)
# - Base UTUH (tidak dipangkas), fresh pakai window lookback
# - Merge: drop-dup by Datetime (keep last)
# - Overwrite atomik (safe write)
# ============================================

from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np
import yfinance as yf
import tempfile, shutil

# ---------- CONFIG ----------
FOLDER       = Path("emiten/cache_15m")
YF_PERIOD    = "60d"
YF_INTERVAL  = "15m"
SESSION_START = "09:00"
SESSION_END   = "15:50"
LOOKBACK_MIN  = 15
DRY_RUN       = False
STANDARD_COLS = ["Open","High","Low","Close","Adj Close","Volume"]
# ----------------------------

def _atomic_write_csv(fp: Path, df: pd.DataFrame):
    fp.parent.mkdir(parents=True, exist_ok=True)
    tmpdir = Path(tempfile.mkdtemp(prefix="tmp_write_"))
    tmpfp = tmpdir / (fp.name + ".tmp")
    df.to_csv(tmpfp, index=False)
    shutil.move(str(tmpfp), str(fp))
    shutil.rmtree(tmpdir, ignore_errors=True)

def _parse_jakarta(x: pd.Series) -> pd.Series:
    dt = pd.to_datetime(x, errors="coerce", utc=False)
    if getattr(dt.dt, "tz", None) is None:
        dt = dt.dt.tz_localize("Asia/Jakarta")
    else:
        dt = dt.dt.tz_convert("Asia/Jakarta")
    return dt

def _fetch_fresh_15m(ticker: str) -> pd.DataFrame:
    df = yf.download(
        ticker, period=YF_PERIOD, interval=YF_INTERVAL,
        auto_adjust=False, threads=False, progress=False
    )
    if df is None or df.empty:
        return pd.DataFrame(columns=STANDARD_COLS)

    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [c[0] for c in df.columns]

    if "Price" in df.columns and "Close" in df.columns:
        df = df.drop(columns=["Price"])
    elif "Price" in df.columns and "Close" not in df.columns:
        df = df.rename(columns={"Price": "Close"})

    idx = df.index
    if getattr(idx, "tz", None) is None:
        df.index = pd.DatetimeIndex(idx).tz_localize("UTC")
    df = df.tz_convert("Asia/Jakarta")

    df = df.between_time(SESSION_START, SESSION_END)

    for c in STANDARD_COLS:
        if c not in df.columns:
            df[c] = pd.NA
    df = df[STANDARD_COLS]

    for c in ["Open", "High", "Low", "Close", "Adj Close", "Volume"]:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    df = df.dropna(how="all", subset=["Open", "High", "Low", "Close", "Volume"])
    return df

def _read_last_dt(fp: Path):
    if not fp.exists() or fp.stat().st_size == 0:
        return None
    try:
        d = pd.read_csv(fp, usecols=["Datetime"])
        if d.empty:
            return None
        dt = _parse_jakarta(d["Datetime"])
        return dt.max()
    except Exception:
        return None

def _merge_append_write(ticker: str, out_csv: Path) -> dict:
    fresh = _fetch_fresh_15m(ticker)
    if fresh.empty:
        return {"ticker": ticker, "status": "no-fresh", "wrote": False}

    last_dt = _read_last_dt(out_csv)
    sess_today_start = pd.Timestamp(datetime.now().date(), tz="Asia/Jakarta") + pd.Timedelta(hours=9)

    if (last_dt is None) or (last_dt < sess_today_start):
        merge_start = sess_today_start
    else:
        merge_start = last_dt - pd.Timedelta(minutes=LOOKBACK_MIN)

    fresh = fresh.loc[fresh.index >= merge_start].copy()

    if out_csv.exists() and out_csv.stat().st_size > 0:
        base = pd.read_csv(out_csv, low_memory=False)
        if base.empty or "Datetime" not in base.columns:
            base = pd.DataFrame(columns=["Datetime"] + STANDARD_COLS)
        else:
            base["Datetime"] = _parse_jakarta(base["Datetime"])
            for c in STANDARD_COLS:
                if c not in base.columns:
                    base[c] = pd.NA
            base = base[["Datetime"] + STANDARD_COLS]
    else:
        base = pd.DataFrame(columns=["Datetime"] + STANDARD_COLS)

    fresh_out = fresh.reset_index().rename(columns={"index": "Datetime"})
    for c in STANDARD_COLS:
        if c not in fresh_out.columns:
            fresh_out[c] = pd.NA
    fresh_out = fresh_out[["Datetime"] + STANDARD_COLS]

    frames = [x for x in (base, fresh_out) if not x.empty]
    if frames:
        merged = pd.concat(frames, ignore_index=True)
        merged = (
            merged.drop_duplicates(subset=["Datetime"], keep="last")
                  .sort_values("Datetime")
        )
    else:
        merged = pd.DataFrame(columns=["Datetime"] + STANDARD_COLS)

    if not DRY_RUN:
        _atomic_write_csv(out_csv, merged)

    return {
        "ticker": ticker,
        "status": "ok",
        "rows_base": len(base),
        "rows_fresh": len(fresh_out),
        "rows_out": len(merged),
        "min_out": merged["Datetime"].min() if not merged.empty else None,
        "max_out": merged["Datetime"].max() if not merged.empty else None,
        "merge_start": merge_start,
        "last_dt_before": last_dt,
        "wrote": not DRY_RUN,
        "file": str(out_csv),
    }

# --------------- RUN ---------------
for file in sorted(FOLDER.glob("*.csv")):
    tkr = file.stem.upper()
    out_csv = file
    res = _merge_append_write(tkr, out_csv)
    if res.get("status") == "ok":
        print(f"✅ {tkr} | base={res['rows_base']} fresh={res['rows_fresh']} out={res['rows_out']} | "
              f"{res['min_out']} … {res['max_out']} | wrote={res['wrote']} | file={res['file']}")
    else:
        print(f"⚠️  {tkr} | {res.get('status')} | wrote={res.get('wrote')} | file={out_csv}")


✅ AADI.JK | base=1252 fresh=8 out=1258 | 2025-05-22 09:00:00+07:00 … 2025-08-20 15:45:00+07:00 | wrote=True | file=emiten/cache_15m/AADI.JK.csv
✅ AALI.JK | base=1244 fresh=8 out=1250 | 2025-05-22 09:00:00+07:00 … 2025-08-20 15:45:00+07:00 | wrote=True | file=emiten/cache_15m/AALI.JK.csv
✅ ABBA.JK | base=224 fresh=2 out=225 | 2025-05-22 09:45:00+07:00 … 2025-08-20 14:45:00+07:00 | wrote=True | file=emiten/cache_15m/ABBA.JK.csv
✅ ABDA.JK | base=6 fresh=0 out=6 | 2025-06-05 14:45:00+07:00 … 2025-08-06 09:45:00+07:00 | wrote=True | file=emiten/cache_15m/ABDA.JK.csv
✅ ABMM.JK | base=1247 fresh=8 out=1253 | 2025-05-22 09:00:00+07:00 … 2025-08-20 15:45:00+07:00 | wrote=True | file=emiten/cache_15m/ABMM.JK.csv
✅ ACES.JK | base=1252 fresh=8 out=1258 | 2025-05-22 09:00:00+07:00 … 2025-08-20 15:45:00+07:00 | wrote=True | file=emiten/cache_15m/ACES.JK.csv
✅ ACRO.JK | base=1236 fresh=8 out=1242 | 2025-05-22 09:00:00+07:00 … 2025-08-20 15:45:00+07:00 | wrote=True | file=emiten/cache_15m/ACRO.JK.csv


### Service daily fetch and append

In [None]:
# ============================================
# DAILY APPEND FIX (semua .csv di cache_daily)
# - Robust tz handling (Asia/Jakarta everywhere)
# - Base UTUH (tidak dipangkas), fresh pakai window lookback
# - Merge: drop-dup by Datetime (keep last)
# - Overwrite atomik (safe write)
# ============================================

from pathlib import Path
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import yfinance as yf
import tempfile, shutil

# ---------- CONFIG ----------
FOLDER        = Path("emiten/cache_daily")
YF_PERIOD     = "730d"               # 2 tahun ke belakang
YF_INTERVAL   = "1d"
LOOKBACK_DAY  = 5                    # ambil fresh mulai last_dt - 5 hari
DRY_RUN       = False
STANDARD_COLS = ["Open","High","Low","Close","Adj Close","Volume"]
# ----------------------------

def _atomic_write_csv(fp: Path, df: pd.DataFrame):
    fp.parent.mkdir(parents=True, exist_ok=True)
    tmpdir = Path(tempfile.mkdtemp(prefix="tmp_write_"))
    tmpfp = tmpdir / (fp.name + ".tmp")
    df.to_csv(tmpfp, index=False)
    shutil.move(str(tmpfp), str(fp))
    shutil.rmtree(tmpdir, ignore_errors=True)

def _parse_jakarta(x: pd.Series) -> pd.Series:
    dt = pd.to_datetime(x, errors="coerce", utc=False)
    if getattr(dt.dt, "tz", None) is None:
        dt = dt.dt.tz_localize("Asia/Jakarta")
    else:
        dt = dt.dt.tz_convert("Asia/Jakarta")
    return dt

def _fetch_fresh_daily(ticker: str) -> pd.DataFrame:
    df = yf.download(
        ticker, period=YF_PERIOD, interval=YF_INTERVAL,
        auto_adjust=False, threads=False, progress=False
    )
    if df is None or df.empty:
        return pd.DataFrame(columns=STANDARD_COLS)

    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [c[0] for c in df.columns]

    if "Price" in df.columns and "Close" in df.columns:
        df = df.drop(columns=["Price"])
    elif "Price" in df.columns and "Close" not in df.columns:
        df = df.rename(columns={"Price": "Close"})

    for c in STANDARD_COLS:
        if c not in df.columns:
            df[c] = pd.NA
    df = df[STANDARD_COLS]

    for c in STANDARD_COLS:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    df = df.dropna(how="all", subset=["Open", "High", "Low", "Close", "Volume"])

    df.index = pd.DatetimeIndex(df.index).tz_localize("UTC").tz_convert("Asia/Jakarta")
    return df

def _read_last_dt(fp: Path):
    if not fp.exists() or fp.stat().st_size == 0:
        return None
    try:
        d = pd.read_csv(fp, usecols=["Datetime"])
        if d.empty:
            return None
        dt = _parse_jakarta(d["Datetime"])
        return dt.max()
    except Exception:
        return None

def _merge_append_write(ticker: str, out_csv: Path) -> dict:
    fresh = _fetch_fresh_daily(ticker)
    if fresh.empty:
        return {"ticker": ticker, "status": "no-fresh", "wrote": False}

    last_dt = _read_last_dt(out_csv)
    merge_start = None
    if last_dt is None:
        merge_start = None
    else:
        merge_start = last_dt - pd.Timedelta(days=LOOKBACK_DAY)

    if merge_start is not None:
        fresh = fresh.loc[fresh.index >= merge_start].copy()

    if out_csv.exists() and out_csv.stat().st_size > 0:
        base = pd.read_csv(out_csv, low_memory=False)
        if base.empty or "Datetime" not in base.columns:
            base = pd.DataFrame(columns=["Datetime"] + STANDARD_COLS)
        else:
            base["Datetime"] = _parse_jakarta(base["Datetime"])
            for c in STANDARD_COLS:
                if c not in base.columns:
                    base[c] = pd.NA
            base = base[["Datetime"] + STANDARD_COLS]
    else:
        base = pd.DataFrame(columns=["Datetime"] + STANDARD_COLS)

    fresh_out = fresh.reset_index()
    dt_col = fresh_out.columns[0]
    fresh_out = fresh_out.rename(columns={dt_col: "Datetime"})
    for c in STANDARD_COLS:
        if c not in fresh_out.columns:
            fresh_out[c] = pd.NA
    fresh_out = fresh_out[["Datetime"] + STANDARD_COLS]


    frames = [x for x in (base, fresh_out) if not x.empty]
    if frames:
        merged = pd.concat(frames, ignore_index=True)
        merged = (
            merged.drop_duplicates(subset=["Datetime"], keep="last")
                  .sort_values("Datetime")
        )
    else:
        merged = pd.DataFrame(columns=["Datetime"] + STANDARD_COLS)

    if not DRY_RUN:
        _atomic_write_csv(out_csv, merged)

    return {
        "ticker": ticker,
        "status": "ok",
        "rows_base": len(base),
        "rows_fresh": len(fresh_out),
        "rows_out": len(merged),
        "min_out": merged["Datetime"].min() if not merged.empty else None,
        "max_out": merged["Datetime"].max() if not merged.empty else None,
        "merge_start": merge_start,
        "last_dt_before": last_dt,
        "wrote": not DRY_RUN,
        "file": str(out_csv),
    }

# --------------- RUN ---------------
for file in sorted(FOLDER.glob("*.csv")):
    tkr = file.stem.upper()
    out_csv = file
    res = _merge_append_write(tkr, out_csv)
    if res.get("status") == "ok":
        print(f"✅ {tkr} | base={res['rows_base']} fresh={res['rows_fresh']} out={res['rows_out']} | "
              f"{res['min_out']} … {res['max_out']} | wrote={res['wrote']} | file={res['file']}")
    else:
        print(f"⚠️  {tkr} | {res.get('status')} | wrote={res.get('wrote')} | file={out_csv}")


✅ DADA.JK | base=729 fresh=3 out=729 | 2022-08-02 07:00:00+07:00 … 2025-08-20 07:00:00+07:00 | wrote=True | file=emiten/cache_daily/DADA.JK.csv
✅ DART.JK | base=729 fresh=3 out=729 | 2022-08-02 07:00:00+07:00 … 2025-08-20 07:00:00+07:00 | wrote=True | file=emiten/cache_daily/DART.JK.csv
✅ DATA.JK | base=305 fresh=3 out=305 | 2024-05-07 07:00:00+07:00 … 2025-08-20 07:00:00+07:00 | wrote=True | file=emiten/cache_daily/DATA.JK.csv
✅ DAYA.JK | base=729 fresh=3 out=729 | 2022-08-02 07:00:00+07:00 … 2025-08-20 07:00:00+07:00 | wrote=True | file=emiten/cache_daily/DAYA.JK.csv
✅ DCII.JK | base=729 fresh=3 out=729 | 2022-08-02 07:00:00+07:00 … 2025-08-20 07:00:00+07:00 | wrote=True | file=emiten/cache_daily/DCII.JK.csv
✅ DEAL.JK | base=728 fresh=3 out=728 | 2022-08-02 07:00:00+07:00 … 2025-08-19 07:00:00+07:00 | wrote=True | file=emiten/cache_daily/DEAL.JK.csv
✅ DEFI.JK | base=729 fresh=3 out=729 | 2022-08-02 07:00:00+07:00 … 2025-08-20 07:00:00+07:00 | wrote=True | file=emiten/cache_daily/DEFI

## Sanity Check

### Sanity For Update Folder Date

In [6]:
from collections import Counter
import pandas as pd
from pathlib import Path

# ========= PARAMETER YANG ANDA GANTI SAJA =========
FREQ = "15m"   # pilih: "1m", "5m", atau "15m"
# ==================================================

BASE_DIR = Path("emiten")
folder = BASE_DIR / f"cache_{FREQ}"
LIMIT_ROWS = 20_000
TZ = "Asia/Jakarta"

if FREQ not in {"1m", "5m", "15m"}:
    raise ValueError('FREQ harus salah satu dari: "1m", "5m", "15m"')

if not folder.exists():
    raise FileNotFoundError(f"Folder tidak ditemukan: {folder.resolve()}")

label_menit = {"1m": "1 menit", "5m": "5 menit", "15m": "15 menit"}[FREQ]

dates = []
for fp in folder.glob("*.csv"):
    try:
        # Baca kolom Datetime saja agar cepat
        s = pd.read_csv(fp, usecols=["Datetime"], nrows=LIMIT_ROWS)["Datetime"]
        dt = pd.to_datetime(s, errors="coerce")

        # Pastikan semua di TZ Asia/Jakarta
        if dt.dt.tz is None:
            # Datetime naive → lokalize ke Asia/Jakarta
            dt = dt.dt.tz_localize(TZ, nonexistent="NaT", ambiguous="NaT")
        else:
            # Sudah tz-aware → konversi ke Asia/Jakarta
            dt = dt.dt.tz_convert(TZ)

        got = dt.dt.date.dropna()
        if not got.empty:
            dates.append(got.max())
    except Exception:
        # Lewati file yang rusak/format tak sesuai
        pass

cnt = Counter(dates)
print(f"Top tanggal di cache {label_menit} (terbanyak muncul):")
for d, n in cnt.most_common(5):
    print(d, n)


Top tanggal di cache 15 menit (terbanyak muncul):
2025-08-20 847
2025-08-19 48
2025-06-26 8
2025-08-12 3
2025-07-30 2


### Sanity No 1

In [6]:
# ============================================================
# SANITY TEST untuk emiten/clean_1m/*.csv
# - Validasi format kolom: Datetime, Open, High, Low, Close, Adj Close, Volume
# - Parse tz Asia/Jakarta, sort, duplicate, NA, out-of-session, weekend
# - Bandingkan min_dt/max_dt per file vs global
# - Simpan laporan rinci ke _logs/ + print ringkasan
# ============================================================
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime

# ---------------- CONFIG ----------------
CLEAN_DIR     = Path("emiten/cache_1m")
REQUIRED      = ["Datetime","Open","High","Low","Close","Adj Close","Volume"]
TIMEZONE      = "Asia/Jakarta"
SESSION_START = "09:00"
SESSION_END   = "15:59"  # pakai :59 supaya menit terakhir ikut
RANDOM_SAMPLE = 10
LOG_DIR       = CLEAN_DIR / "_logs"
STAMP         = datetime.now().strftime("%Y%m%d_%H%M%S")
OUT_FULL      = LOG_DIR / f"sanity_report_{STAMP}.csv"
OUT_MISS_S    = LOG_DIR / f"sanity_mismatch_start_{STAMP}.csv"
OUT_MISS_E    = LOG_DIR / f"sanity_mismatch_end_{STAMP}.csv"
# ---------------------------------------

def _parse_dt_series(s: pd.Series) -> pd.Series:
    """
    Parse Datetime yang mungkin:
    - sudah offset-aware (contoh: 2025-08-19 15:49:00+07:00)
    - atau masih naive → dilokalize ke Asia/Jakarta
    """
    dt = pd.to_datetime(s, errors="coerce")
    # kalau dtype datetime64[ns] (naive), lokalize ke Asia/Jakarta
    try:
        is_naive = (dt.dt.tz is None)
    except Exception:
        is_naive = True
    if is_naive:
        try:
            dt = dt.dt.tz_localize(TIMEZONE, nonexistent="NaT", ambiguous="NaT")
        except Exception:
            # fallback: localize Jakarta tanpa arg optional (untuk versi pandas lama)
            dt = dt.dt.tz_localize(TIMEZONE)
    else:
        # sudah tz-aware → pastikan benar ke Asia/Jakarta
        try:
            dt = dt.dt.tz_convert(TIMEZONE)
        except Exception:
            # kalau sudah Asia/Jakarta ya lanjut
            pass
    return dt

def _read_csv_safe(fp: Path) -> pd.DataFrame:
    # baca CSV; drop kemungkinan kolom index otomatis
    df = pd.read_csv(fp, low_memory=False)
    for c in list(df.columns):
        if str(c).startswith("Unnamed"):
            df = df.drop(columns=[c])
    return df

def _summarize_file(fp: Path) -> dict:
    rec = {
        "ticker": fp.stem, "status": "OK",
        "rows_csv": 0, "rows_valid": 0, "dup_dt": 0,
        "out_of_session": 0, "weekend_rows": 0,
        "min_dt": pd.NaT, "max_dt": pd.NaT,
        "missing_cols": "", "extra_cols": "", "file": str(fp),
        "na_Open": 0, "na_High": 0, "na_Low": 0, "na_Close": 0, "na_Adj Close": 0, "na_Volume": 0,
        "error": ""
    }
    try:
        df = _read_csv_safe(fp)
        rec["rows_csv"] = len(df)
        cols = list(df.columns)
        missing = [c for c in REQUIRED if c not in cols]
        extra   = [c for c in cols if c not in REQUIRED]
        if missing:
            rec["status"] = "MISSING_COLS"
            rec["missing_cols"] = ";".join(missing)
        if extra:
            rec["extra_cols"] = ";".join(extra)

        # parse datetime
        if "Datetime" not in df.columns:
            rec["status"] = "DT_ABSENT"
            return rec
        dt = _parse_dt_series(df["Datetime"])

        # index dan sort
        dfi = df.copy()
        dfi["Datetime"] = dt
        dfi = dfi.dropna(subset=["Datetime"]).set_index("Datetime").sort_index()
        rec["rows_valid"] = len(dfi)
        if rec["rows_valid"] == 0:
            return rec

        # cast ringan untuk NA count
        for c in ["Open","High","Low","Close","Adj Close"]:
            if c in dfi.columns:
                dfi[c] = pd.to_numeric(dfi[c], errors="coerce")
        if "Volume" in dfi.columns:
            dfi["Volume"] = pd.to_numeric(dfi["Volume"], errors="coerce")

        # duplicate timestamp
        rec["dup_dt"] = int(dfi.index.duplicated().sum())

        # NA per kolom
        for c in ["Open","High","Low","Close","Adj Close","Volume"]:
            if c in dfi.columns:
                rec[f"na_{c}"] = int(dfi[c].isna().sum())
            else:
                rec[f"na_{c}"] = rec["rows_valid"]

        # out-of-session & weekend
        try:
            in_sess = dfi.between_time(SESSION_START, SESSION_END)
            rec["out_of_session"] = int(len(dfi) - len(in_sess))
        except Exception:
            rec["out_of_session"] = 0
        wd_ok = int((dfi.index.dayofweek < 5).sum())
        rec["weekend_rows"] = int(len(dfi) - wd_ok)

        # range
        rec["min_dt"] = dfi.index.min()
        rec["max_dt"] = dfi.index.max()
        return rec
    except Exception as e:
        rec["status"] = "READ_FAIL"
        rec["error"] = f"{e}"
        return rec

# ---------------- RUN ----------------
files = sorted(CLEAN_DIR.glob("*.csv"))
if not files:
    print(f"[!] Tidak ada file CSV di {CLEAN_DIR.resolve()}")
else:
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    results = [_summarize_file(fp) for fp in files]
    report = pd.DataFrame(results)

    # filter file OK & punya data
    okmask = (report["status"] == "OK") & report["rows_valid"].gt(0)
    okrep = report[okmask].copy()

    global_min = okrep["min_dt"].min() if not okrep.empty else pd.NaT
    global_max = okrep["max_dt"].max() if not okrep.empty else pd.NaT

    mism_start = okrep[okrep["min_dt"] != global_min][["ticker","min_dt"]].sort_values("ticker")
    mism_end   = okrep[okrep["max_dt"] != global_max][["ticker","max_dt"]].sort_values("ticker")

    # Simpan CSV laporan
    report.sort_values("ticker").to_csv(OUT_FULL, index=False)
    mism_start.to_csv(OUT_MISS_S, index=False)
    mism_end.to_csv(OUT_MISS_E, index=False)

    # Cetak ringkasan
    total = len(report)
    ok_ct = int((report["status"]=="OK").sum())
    non_ok_ct = total - ok_ct
    with_data = int(report["rows_valid"].gt(0).sum())
    any_dups = int(report["dup_dt"].gt(0).sum())
    any_oos  = int(report["out_of_session"].gt(0).sum())
    any_wend = int(report["weekend_rows"].gt(0).sum())

    print("\n================= SANITY SUMMARY =================")
    print(f"Total files         : {total}")
    print(f"OK files            : {ok_ct}")
    print(f"Non-OK files        : {non_ok_ct}")
    print(f"Files with data     : {with_data}")
    print(f"Global min datetime : {global_min}")
    print(f"Global max datetime : {global_max}")
    print(f"Mismatch start count: {len(mism_start)}")
    print(f"Mismatch end count  : {len(mism_end)}")
    print(f"Any duplicate rows  : {any_dups}")
    print(f"Any out-of-session  : {any_oos}")
    print(f"Any weekend rows    : {any_wend}")
    print("==================================================\n")

    # Tampilkan beberapa baris contoh dari report
    to_show = min(20, total)
    print(">>> Contoh 20 entri pertama dari laporan per-file:")
    print(report.sort_values("ticker").head(to_show).to_string(index=False))

    # Random sample (kalau mau lihat cepat)
    try:
        sample = okrep.sample(min(RANDOM_SAMPLE, len(okrep)), random_state=42)[
            ["ticker","rows_valid","min_dt","max_dt","dup_dt","out_of_session","weekend_rows"]
        ].sort_values("ticker")
        if len(sample) > 0:
            print("\n>>> Random sample ~{} tickers:".format(len(sample)))
            print(sample.to_string(index=False))
    except Exception:
        pass

    print(f"\n[✓] Laporan lengkap: {OUT_FULL}")
    print(f"[✓] Mismatch start: {OUT_MISS_S}")
    print(f"[✓] Mismatch end  : {OUT_MISS_E}")



Total files         : 773
OK files            : 773
Non-OK files        : 0
Files with data     : 773
Global min datetime : 2025-08-11 09:00:00+07:00
Global max datetime : 2025-08-20 14:51:00+07:00
Mismatch start count: 0
Mismatch end count  : 735
Any duplicate rows  : 0
Any out-of-session  : 0
Any weekend rows    : 0

>>> Contoh 20 entri pertama dari laporan per-file:
 ticker status  rows_csv  rows_valid  dup_dt  out_of_session  weekend_rows                    min_dt                    max_dt missing_cols extra_cols                        file  na_Open  na_High  na_Low  na_Close  na_Adj Close  na_Volume error
AADI.JK     OK      3128        3128       0               0             0 2025-08-11 09:00:00+07:00 2025-08-20 14:48:00+07:00                         emiten/cache_1m/AADI.JK.csv     1042     1042    1042      1042          1042       1042      
AALI.JK     OK      3120        3120       0               0             0 2025-08-11 09:00:00+07:00 2025-08-20 14:47:00+07:00         

### Sanity No 2

In [8]:
# ============================================================
# ALIGN + COVERAGE CHECK (1m, Asia/Jakarta)
# - Bangun indeks referensi 1m hanya pada weekdays & 09:00–15:50 (inklusif)
# - Overwrite semua CSV di emiten/clean_1m agar punya window & panjang sama
# - Hitung coverage per hari & metrik hari terakhir
# ============================================================
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime, time

# ---------------- CONFIG ----------------
CLEAN_DIR      = Path("emiten/cache_1m")            # folder input & output (overwrite)
LOG_DIR        = CLEAN_DIR / "_logs"
TZ             = "Asia/Jakarta"
SESSION_START  = time(9, 0)    # 09:00
SESSION_END    = time(15, 50)  # 15:50 (inklusif)
# Pakai hasil sanity summary kamu:
START_DATE_STR = "2025-08-11"
END_DATE_STR   = "2025-08-19"
# ---------------------------------------

def _drop_unnamed_cols(df: pd.DataFrame) -> pd.DataFrame:
    for c in list(df.columns):
        if str(c).startswith("Unnamed"):
            df = df.drop(columns=[c])
    return df

def _parse_jakarta(s: pd.Series) -> pd.Series:
    """Parse Datetime -> tz-aware Asia/Jakarta.
    - Jika naive: lokalize ke Asia/Jakarta
    - Jika sudah tz-aware: convert ke Asia/Jakarta
    """
    dt = pd.to_datetime(s, errors="coerce")
    try:
        # dt.dt.tz is None jika naive
        is_naive = (dt.dt.tz is None)
    except Exception:
        is_naive = True
    if is_naive:
        dt = dt.dt.tz_localize(TZ, nonexistent="NaT", ambiguous="NaT")
    else:
        dt = dt.dt.tz_convert(TZ)
    return dt

def build_ref_index(start_date: str, end_date: str, tz: str) -> pd.DatetimeIndex:
    """Bangun index referensi menit 09:00..15:50 (inklusif) untuk setiap weekday dalam rentang."""
    d0 = pd.Timestamp(start_date).tz_localize(tz)   # 00:00:00+07:00
    d1 = pd.Timestamp(end_date).tz_localize(tz)     # 00:00:00+07:00
    days = pd.date_range(d0.normalize(), d1.normalize(), freq="D")  # tz-aware, tanpa tz= arg
    pieces = []
    for d in days:
        if d.weekday() >= 5:  # skip weekend (5=Sat, 6=Sun)
            continue
        start_dt = d.replace(hour=SESSION_START.hour, minute=SESSION_START.minute, second=0, microsecond=0)
        end_dt   = d.replace(hour=SESSION_END.hour, minute=SESSION_END.minute, second=0, microsecond=0)
        rng = pd.date_range(start_dt, end_dt, freq="1min")  # tz-aware; JANGAN pass tz=
        pieces.append(rng)
    if not pieces:
        return pd.DatetimeIndex([], tz=tz)
    return pieces[0].append(pieces[1:])  # concat cepat

def minutes_expected_per_day(tz: str) -> int:
    """Hitung menit per hari dalam sesi (inklusif)."""
    d = pd.Timestamp("2000-01-03").tz_localize(tz)  # Senin
    s = d.replace(hour=SESSION_START.hour, minute=SESSION_START.minute)
    e = d.replace(hour=SESSION_END.hour, minute=SESSION_END.minute)
    # +1 karena inklusif (mis: 09:00..09:02 = 3 menit: 00,01,02)
    return int((e - s).total_seconds() // 60) + 1

def coverage_by_day(df: pd.DataFrame, ref_idx: pd.DatetimeIndex) -> pd.DataFrame:
    """Hitung coverage per hari: jumlah menit yang punya Close (non-NaN) dibanding expected."""
    # Valid minute: Close non-NaN (boleh ganti ke 'Volume' atau gabungan sesuai preferensi)
    has_data = df["Close"].notna() if "Close" in df.columns else pd.Series(False, index=df.index)
    dd = pd.DataFrame({"has_data": has_data})
    dd["date"] = dd.index.tz_convert(TZ).normalize()
    cov = dd.groupby("date")["has_data"].sum().reset_index(name="valid_minutes")

    # Expected per day = jumlah menit di ref_idx untuk tanggal itu
    exp = pd.DataFrame({"ts": ref_idx})
    exp["date"] = exp["ts"].dt.normalize()
    exp_ct = exp.groupby("date").size().rename("expected_minutes").reset_index()

    out = cov.merge(exp_ct, on="date", how="right").fillna({"valid_minutes": 0})
    out["coverage_pct"] = np.where(out["expected_minutes"] > 0,
                                   100.0 * out["valid_minutes"] / out["expected_minutes"],
                                   np.nan)
    return out.sort_values("date")

# ---------------- MAIN ----------------
STAMP = datetime.now().strftime("%Y%m%d_%H%M%S")
LOG_DIR.mkdir(parents=True, exist_ok=True)

# 1) Bangun indeks referensi
REF_INDEX = build_ref_index(START_DATE_STR, END_DATE_STR, TZ)
EXPECTED_PER_DAY = minutes_expected_per_day(TZ)

# 2) Proses setiap file
rows = []
files = sorted(CLEAN_DIR.glob("*.csv"))
for fp in files:
    try:
        df = pd.read_csv(fp, low_memory=False)
        df = _drop_unnamed_cols(df)
        if "Datetime" not in df.columns:
            rows.append({"ticker": fp.stem, "status": "NO_DATETIME", "file": str(fp)})
            continue

        # Parse ke Asia/Jakarta
        dt = _parse_jakarta(df["Datetime"])
        df = df.dropna(subset=["Datetime"]).copy()
        df["Datetime"] = dt
        df = df.dropna(subset=["Datetime"]).set_index("Datetime").sort_index()

        # Pastikan semua kolom OHLCV ada (kalau tidak, tambahkan kosong)
        for c in ["Open","High","Low","Close","Adj Close","Volume"]:
            if c not in df.columns:
                df[c] = pd.NA
        # Cast numerik santai
        for c in ["Open","High","Low","Close","Adj Close","Volume"]:
            df[c] = pd.to_numeric(df[c], errors="coerce")

        # Filter ke jam sesi saja (aman kalau sudah bersih)
        df = df.between_time(SESSION_START.strftime("%H:%M"), SESSION_END.strftime("%H:%M"))

        # 3) Reindex ke REF_INDEX (overwrite total)
        before_len = len(df)
        df = df.reindex(REF_INDEX)   # Index tz sudah sama → aman
        after_len = len(df)

        # 4) Coverage per hari + metrik
        cov = coverage_by_day(df, REF_INDEX)
        last_day = pd.Timestamp(END_DATE_STR).tz_localize(TZ).normalize()
        last_cov_row = cov[cov["date"] == last_day]
        last_cov_pct = float(last_cov_row["coverage_pct"].iloc[0]) if not last_cov_row.empty else np.nan
        has_last_day = bool(last_cov_pct == last_cov_pct and last_cov_pct > 0)  # True jika ada menit valid

        # 5) Tulis balik (overwrite)
        df_out = df.reset_index().rename(columns={"index":"Datetime"})
        df_out.to_csv(fp, index=False)

        rows.append({
            "ticker": fp.stem,
            "status": "OK",
            "file": str(fp),
            "rows_before": before_len,
            "rows_after": after_len,
            "min_dt": df.index.min(),
            "max_dt": df.index.max(),
            "na_total": int(df_out[["Open","High","Low","Close","Adj Close","Volume"]].isna().sum().sum()),
            "expected_per_day": EXPECTED_PER_DAY,
            "last_day_coverage_pct": round(last_cov_pct, 2) if last_cov_pct == last_cov_pct else np.nan,
            "has_last_day": has_last_day
        })
    except Exception as e:
        rows.append({"ticker": fp.stem, "status": "ERROR", "file": str(fp), "error": str(e)})

# 6) Laporan
report = pd.DataFrame(rows).sort_values("ticker")
out_csv = LOG_DIR / f"align_report_{STAMP}.csv"
report.to_csv(out_csv, index=False)

# 7) Ringkasan ke layar
ok_ct = int((report["status"] == "OK").sum())
err_ct = int((report["status"] == "ERROR").sum())
no_dt_ct = int((report["status"] == "NO_DATETIME").sum())
with_data_last_day = int(report.get("has_last_day", pd.Series([], dtype=bool)).sum())

print("=============== ALIGN + COVERAGE SUMMARY ===============")
print(f"Files total         : {len(report)}")
print(f"OK                  : {ok_ct}")
print(f"NO_DATETIME         : {no_dt_ct}")
print(f"ERROR               : {err_ct}")
if "rows_after" in report.columns:
    print(f"Window rows (per file): {int(report['rows_after'].dropna().iloc[0]) if ok_ct else 0}")
print(f"Expected minutes/day: {EXPECTED_PER_DAY} (09:00–15:50 inkl.)")
print(f"Has data on last day: {with_data_last_day}")
print("========================================================\n")

# Tampilkan beberapa baris contoh
to_show = min(20, len(report))
print(">>> Contoh 20 baris pertama laporan:")
print(report.head(to_show).to_string(index=False))

print(f"\n[✓] Laporan lengkap: {out_csv}")


Files total         : 773
OK                  : 773
NO_DATETIME         : 0
ERROR               : 0
Window rows (per file): 2877
Expected minutes/day: 411 (09:00–15:50 inkl.)
Has data on last day: 755

>>> Contoh 20 baris pertama laporan:
 ticker status                        file  rows_before  rows_after                    min_dt                    max_dt  na_total  expected_per_day  last_day_coverage_pct  has_last_day
AADI.JK     OK emiten/cache_1m/AADI.JK.csv         2877        2877 2025-08-11 09:00:00+07:00 2025-08-19 15:50:00+07:00      6252               411                  77.37          True
AALI.JK     OK emiten/cache_1m/AALI.JK.csv         2877        2877 2025-08-11 09:00:00+07:00 2025-08-19 15:50:00+07:00      9324               411                  68.37          True
ABDA.JK     OK emiten/cache_1m/ABDA.JK.csv         2877        2877 2025-08-11 09:00:00+07:00 2025-08-19 15:50:00+07:00     17262               411                   0.00         False
ABMM.JK     OK emiten

## Core Service BSJP

### BSJP V.2.0 (GPT Version)

In [4]:
# ======================================================================
# BSJP v2.4-core — Intraday Spike Detector (IHSG) — SINGLE CELL
# ----------------------------------------------------------------------
# - Core engine TANPA hardcode daftar ARA / evaluator
# - Otomatis pilih "latest trading day" <= TODAY jika data hari ini belum ada
# - TZ: Asia/Jakarta, cutoff 14:15
# - Filters: daily_return 1%–40%, vol_pace > 1.2x
# - Score v2.2: price_term * log1p(min(pace, 50))
# - vol_pace fallback: 1m → 5m → daily (daily dikoreksi faktor 0.75)
# - Diagnostics ringkas: ringkasan alasan drop (Counter)
# - Output: Top-N (default 30) dengan kolom terformat
# ======================================================================

import pandas as pd, numpy as np
from pathlib import Path
from datetime import datetime, timedelta, date
from typing import Optional, List, Tuple, Dict
from collections import defaultdict, Counter
from IPython.display import display

# ================== CONFIG ==================
FOLDER_1M     = Path("emiten/cache_1m")
FOLDER_5M     = Path("emiten/cache_5m")      # fallback intraday
FOLDER_DAILY  = Path("emiten/cache_daily")   # last fallback
SESSION_TZ    = "Asia/Jakarta"
CUTOFF_STR    = "09:30"
CUTOFF_TIME   = datetime.strptime(CUTOFF_STR, "%H:%M").time()
TODAY         = pd.Timestamp("today", tz=SESSION_TZ).normalize().date()

BASELINE_DAYS = 60
PACE_MIN      = 1.2
RETURN_MIN    = 0.01
RETURN_MAX    = 0.40
TOP_N         = 10

# Target date behavior
AUTO_SELECT_LATEST      = True          # True: auto pakai hari intraday terbaru <= TODAY
TARGET_DATE_OVERRIDE    = date(2025, 8, 11)          # Contoh: date(2025, 8, 19) — kalau None, pakai auto / TODAY

print(f"Hari ini: {TODAY}, cutoff: {CUTOFF_STR}")
print(f"[INFO] Ditemukan {len(list(FOLDER_1M.glob('*.csv')))} file 1m di {FOLDER_1M.resolve()}")

# =============== UTIL =================
def to_jkt(series: pd.Series) -> pd.Series:
    s = pd.to_datetime(series, errors="coerce")
    try:
        if s.dt.tz is None:
            return s.dt.tz_localize(SESSION_TZ)
        return s.dt.tz_convert(SESSION_TZ)
    except Exception:
        s = pd.to_datetime(s, errors="coerce", utc=True).dt.tz_convert(SESSION_TZ)
        return s

def pick_work_date(df_dt: pd.Series, today_date: date) -> Optional[date]:
    dlist = pd.Series(df_dt.dt.date.unique()).dropna().sort_values().tolist()
    if not dlist: return None
    for d in reversed(dlist):
        if d <= today_date: return d
    return None

def read_daily_flex(path: Path) -> Optional[pd.DataFrame]:
    if not path.exists(): return None
    try:
        df = pd.read_csv(path, low_memory=False)
        # map date col
        for dc in ("Date","date","Datetime"):
            if dc in df.columns:
                df["Date"] = pd.to_datetime(df[dc], errors="coerce").dt.date
                break
        else:
            return None
        # map price/volume
        if "Close" not in df.columns and "Adj Close" in df.columns:
            df["Close"] = pd.to_numeric(df["Adj Close"], errors="coerce")
        if "Volume" in df.columns:
            df["Volume"] = pd.to_numeric(df["Volume"], errors="coerce")
        if "Close" not in df.columns or "Volume" not in df.columns: return None
        return df.dropna(subset=["Date","Close","Volume"]).sort_values("Date").reset_index(drop=True)
    except Exception:
        return None

def read_intraday(folder: Path, ticker: str) -> Optional[pd.DataFrame]:
    fp = folder / f"{ticker}.csv"
    if not fp.exists(): return None
    try:
        df = pd.read_csv(fp, low_memory=False)
        if "Datetime" not in df.columns: return None
        df["Datetime"] = to_jkt(df["Datetime"])
        for c in ("Open","High","Low","Close","Volume"):
            if c in df.columns: df[c] = pd.to_numeric(df[c], errors="coerce")
        return df.dropna(subset=["Datetime","Close","Volume"])
    except Exception:
        return None

def intraday_cut_volume(df_intraday: pd.DataFrame, work_date: date) -> float:
    mask = (df_intraday["Datetime"].dt.date == work_date) & (df_intraday["Datetime"].dt.time <= CUTOFF_TIME)
    return float(df_intraday.loc[mask, "Volume"].sum())

def intraday_hist_cut_volumes(df_intraday: pd.DataFrame, work_date: date, n_days: int) -> List[float]:
    days = sorted([d for d in df_intraday["Datetime"].dt.date.unique() if d < work_date])[-n_days:]
    vols = []
    for d in days:
        m = (df_intraday["Datetime"].dt.date == d) & (df_intraday["Datetime"].dt.time <= CUTOFF_TIME)
        v = float(df_intraday.loc[m, "Volume"].sum())
        if v > 0: vols.append(v)
    return vols

def vol_pace_robust(ticker: str, work_date: date, vol_today_cut_1m: Optional[float], df_1m: Optional[pd.DataFrame]) -> float:
    """Return pace; fallback 1m → 5m → daily (daily dikoreksi 0.75 utk cutoff)."""
    # 1) 1m baseline
    try:
        if df_1m is not None:
            vol_today_cut = vol_today_cut_1m if vol_today_cut_1m is not None else intraday_cut_volume(df_1m, work_date)
            vols_hist_1m = intraday_hist_cut_volumes(df_1m, work_date, BASELINE_DAYS)
            if len(vols_hist_1m) >= 10:
                base_1m = float(np.median(vols_hist_1m))
                if base_1m > 0: return vol_today_cut / base_1m
    except Exception:
        pass
    # 2) 5m baseline
    try:
        df_5m = read_intraday(FOLDER_5M, ticker)
        if df_5m is not None:
            vol_today_cut_5m = intraday_cut_volume(df_5m, work_date)
            vols_hist_5m = intraday_hist_cut_volumes(df_5m, work_date, BASELINE_DAYS)
            if len(vols_hist_5m) >= 10:
                base_5m = float(np.median(vols_hist_5m))
                if base_5m > 0: return vol_today_cut_5m / base_5m
    except Exception:
        pass
    # 3) Daily baseline (coarser)
    try:
        df_daily = read_daily_flex(FOLDER_DAILY / f"{ticker}.csv")
        if df_daily is not None:
            hist_daily = df_daily[df_daily["Date"] < work_date].tail(BASELINE_DAYS)
            if len(hist_daily) >= 20:
                base_daily = float(hist_daily["Volume"].median())
                if base_daily > 0:
                    vol_today_cut = vol_today_cut_1m
                    if vol_today_cut is None:
                        try:
                            if 'df_5m' in locals() and df_5m is not None:
                                vol_today_cut = intraday_cut_volume(df_5m, work_date)
                        except Exception:
                            pass
                    if vol_today_cut is None:
                        return np.nan
                    return vol_today_cut / (base_daily * 0.75)
    except Exception:
        pass
    return np.nan

def detect_latest_intraday_date(folder_1m: Path, today_date: date) -> Optional[date]:
    """Scan ringan: cari tanggal kerja terbaru yang tersedia di 1m (<= today)."""
    latest = None
    for fp in folder_1m.glob("*.csv"):
        try:
            df = pd.read_csv(fp, usecols=["Datetime"], low_memory=False)
            dt = to_jkt(df["Datetime"])
            wd = pick_work_date(dt, today_date)
            if wd and (latest is None or wd > latest):
                latest = wd
        except Exception:
            continue
    return latest

# =============== ENGINE ===============
def bsjp_candidates(target_date: date, diag: bool=True) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
    SUMMARY = []
    drop_reasons = defaultdict(list) if diag else None

    for fp in sorted(FOLDER_1M.glob("*.csv")):
        ticker = fp.stem
        try:
            df_1m = read_intraday(FOLDER_1M, ticker)
            if df_1m is None:
                if diag: drop_reasons[ticker].append("no_1m_file_or_parse_fail")
                continue

            work_date = pick_work_date(df_1m["Datetime"], target_date)
            if (work_date is None) or (work_date != target_date):
                if diag: drop_reasons[ticker].append(f"not_target_date:{work_date}")
                continue

            df_dwork = df_1m[df_1m["Datetime"].dt.date == work_date].copy().dropna(subset=["Close","Volume"])
            if df_dwork.empty or df_dwork["Volume"].sum() == 0:
                if diag: drop_reasons[ticker].append("no_intraday_or_zero_vol")
                continue

            # prev close dari daily
            df_daily = read_daily_flex(FOLDER_DAILY / f"{ticker}.csv")
            prev_close = np.nan
            if df_daily is not None:
                prev_day = df_daily[df_daily["Date"] < work_date]
                if not prev_day.empty:
                    prev_close = pd.to_numeric(prev_day.iloc[-1]["Close"], errors="coerce")
            if pd.isna(prev_close) or prev_close <= 0:
                if diag: drop_reasons[ticker].append("no_prev_close_daily")
                continue

            # metrik dasar
            high_px = float(df_dwork['High'].max())
            low_px  = float(df_dwork['Low'].min())
            last_px = float(df_dwork['Close'].iloc[-1])
            daily_return = (last_px / prev_close) - 1.0
            if not (RETURN_MIN < daily_return < RETURN_MAX):
                if diag: drop_reasons[ticker].append("daily_return_out_of_range")
                continue

            vol_today_cut_1m = intraday_cut_volume(df_1m, work_date)
            pace = vol_pace_robust(ticker, work_date, vol_today_cut_1m, df_1m)
            if not (pd.notna(pace) and pace > PACE_MIN):
                if diag: drop_reasons[ticker].append(f"pace_insufficient:{pace}")
                continue

            # metrik lanjutan
            daily_range = high_px - low_px
            closing_strength = (last_px - low_px) / daily_range if daily_range > 0 else 1.0

            start_time = df_dwork['Datetime'].min()
            first_5min = df_dwork[df_dwork['Datetime'] <= start_time + timedelta(minutes=5)]
            if not first_5min.empty and first_5min['Volume'].sum() > 0:
                stable_open = float((first_5min["Close"] * first_5min["Volume"]).sum() / first_5min["Volume"].sum())
            else:
                stable_open = float(df_dwork['Open'].iloc[0])
            afternoon_power = (last_px / stable_open) - 1.0 if stable_open > 0 else 0.0

            # skor v2.2
            price_term  = (1 + daily_return) * (1 + max(0.0, afternoon_power)) * closing_strength
            volume_term = np.log1p(min(pace, 50))
            score = price_term * volume_term
            # --- harga pada cutoff ---
            cut_mask = df_dwork["Datetime"].dt.time <= CUTOFF_TIME
            if cut_mask.any():
                price_at_cutoff = float(df_dwork.loc[cut_mask, "Close"].iloc[-1])
            else:
                price_at_cutoff = np.nan

            SUMMARY.append({
                "ticker": ticker, "date": work_date, "score": score, "last": last_px,
                "daily_return": daily_return, "closing_strength": closing_strength,
                "afternoon_power": afternoon_power, "vol_pace": pace,
                "price_at_cutoff": price_at_cutoff
            })

        except Exception as e:
            if diag: drop_reasons[ticker].append(f"exception:{type(e).__name__}")
            continue

    df_result = pd.DataFrame(SUMMARY).sort_values("score", ascending=False).reset_index(drop=True) if SUMMARY else pd.DataFrame(
        columns=["ticker","date","score","last","daily_return","closing_strength","afternoon_power","vol_pace"]
    )
    return df_result, (drop_reasons or {})

# =============== RUN ===============
if TARGET_DATE_OVERRIDE is not None:
    target_date = TARGET_DATE_OVERRIDE
elif AUTO_SELECT_LATEST:
    target_date = detect_latest_intraday_date(FOLDER_1M, TODAY) or TODAY
else:
    target_date = TODAY

print(f"[INFO] Target work_date: {target_date}")

df_result, drop_reasons = bsjp_candidates(target_date, diag=True)

if df_result.empty:
    print("❌ Tidak ada kandidat yang lolos filter awal.")
else:
    # tampilan Top-N terformat
    def fmt_pct(x):  return f"{x:,.2%}" if pd.notna(x) else "N/A"
    def fmt_x(x):    return f"{x:.2f}x"   if pd.notna(x) else "N/A"
    def fmt_f3(x):   return f"{x:.3f}"    if pd.notna(x) else "N/A"

    df_show = df_result.copy()
    df_show["score"]            = df_show["score"].map(fmt_f3)
    df_show["daily_return"]     = df_show["daily_return"].map(fmt_pct)
    df_show["closing_strength"] = df_show["closing_strength"].map(fmt_pct)
    df_show["afternoon_power"]  = df_show["afternoon_power"].map(fmt_pct)
    df_show["vol_pace"]         = df_show["vol_pace"].map(fmt_x)

    print("\n[✓] TOP CANDIDATES — BSJP v2.4-core")
    print(f"(work_date = {target_date}, cutoff = {CUTOFF_STR}, filters: return {int(RETURN_MIN*100)}–{int(RETURN_MAX*100)}%, pace > {PACE_MIN}x)")
    display(df_show.head(TOP_N))

    # Ringkasan alasan drop (Top 8)
    flat_reasons = [r for reasons in drop_reasons.values() for r in reasons]
    if flat_reasons:
        print("\n[DIAG] Alasan drop teratas:")
        for k, v in Counter(flat_reasons).most_common(8):
            print(f"- {k}: {v}")

    # ============= SAVE REKOMENDASI ============
    outdir = Path("rekomendasi")
    outdir.mkdir(exist_ok=True)
    outfile = outdir / f"bsjp_rekomendasi_{target_date}.csv"
     # hanya simpan Top-N
    df_result.head(TOP_N).to_csv(outfile, index=False)

    print(f"\n[✔] Top {TOP_N} rekomendasi disimpan ke: {outfile.resolve()}")



Hari ini: 2025-08-21, cutoff: 09:30
[INFO] Ditemukan 773 file 1m di /home/mkemalw/Projects/SSSAHAM_SERVICE/emiten/cache_1m
[INFO] Target work_date: 2025-08-11

[✓] TOP CANDIDATES — BSJP v2.4-core
(work_date = 2025-08-11, cutoff = 09:30, filters: return 1–40%, pace > 1.2x)


Unnamed: 0,ticker,date,score,last,daily_return,closing_strength,afternoon_power,vol_pace,price_at_cutoff
0,PPRE.JK,2025-08-11,6.91,79.0,33.90%,100.00%,31.25%,438.12x,78.0
1,CHEM.JK,2025-08-11,6.275,105.0,34.62%,100.00%,18.56%,344.83x,97.0
2,FUTR.JK,2025-08-11,4.985,145.0,30.63%,91.67%,19.71%,31.38x,119.0
3,MLPT.JK,2025-08-11,4.556,51600.0,20.00%,100.00%,20.00%,22.67x,47975.0
4,CASH.JK,2025-08-11,4.443,66.0,10.00%,100.00%,8.20%,40.80x,60.0
5,WEGE.JK,2025-08-11,4.358,62.0,21.57%,75.00%,21.57%,71.66x,54.0
6,BULL.JK,2025-08-11,3.582,151.0,17.05%,95.24%,10.51%,17.31x,135.0
7,BRNA.JK,2025-08-11,3.486,835.0,24.63%,100.00%,26.52%,8.12x,660.0
8,SKLT.JK,2025-08-11,3.485,190.0,1.60%,100.00%,0.00%,29.87x,190.0
9,COIN.JK,2025-08-11,3.13,1625.0,1.56%,68.00%,15.28%,66.17x,1405.0



[DIAG] Alasan drop teratas:
- daily_return_out_of_range: 480
- not_target_date:None: 20
- pace_insufficient:0.0: 20
- no_intraday_or_zero_vol: 6
- pace_insufficient:0.6021684737281068: 1
- pace_insufficient:0.41871847375518156: 1
- pace_insufficient:0.7692307692307693: 1
- pace_insufficient:0.29555704826792056: 1

[✔] Top 10 rekomendasi disimpan ke: /home/mkemalw/Projects/SSSAHAM_SERVICE/rekomendasi/bsjp_rekomendasi_2025-08-11.csv


### BSJP V.2.1 (GPT Version)

In [3]:
# ======================================================================
# BSJP v2.4-core — Intraday Spike Detector (IHSG) — 5-MINUTE ONLY
# ----------------------------------------------------------------------
# - SUMBER DATA: emiten/cache_5m/<TICKER>.csv (kolom: Datetime, Open, High, Low, Close, Volume)
# - Daily prev close: emiten/cache_daily/<TICKER>.csv (kolom: Date, Close, Volume)
# - Proses N hari bursa terakhir (union dari semua file 5m yang ada)
# - Cutoff: default "09:30" (bisa "10:00" atau lainnya)
# - Pace: median baseline dari historis 5m (≤ cutoff)
# - Output: rekomendasi/bsjp_rekomendasi_YYYY-MM-DD.csv (Top-N)
# ======================================================================

import pandas as pd, numpy as np
from pathlib import Path
from datetime import datetime, timedelta, date
from typing import Optional, List, Tuple, Dict
from collections import defaultdict, Counter
from IPython.display import display

# ================== CONFIG ==================
FOLDER_5M     = Path("emiten/cache_5m")
FOLDER_DAILY  = Path("emiten/cache_daily")
OUT_DIR       = Path("rekomendasi")
OUT_DIR.mkdir(exist_ok=True)

SESSION_TZ    = "Asia/Jakarta"
CUTOFF_STR    = "09:30"                 # contoh lain: "10:00" / "11:30" / "14:15"
CUTOFF_TIME   = datetime.strptime(CUTOFF_STR, "%H:%M").time()

# Engine params (boleh disesuaikan tipis sesuai data)
BASELINE_DAYS = 60                      # histori maksimum untuk median pace
MIN_BASELINE_FOR_PACE = 5               # minimal hari histori agar pace dinilai stabil
PACE_MIN      = 1.10                    # ambang pace (5m cenderung lebih stabil)
RETURN_MIN    = 0.01                    # 1% < daily_return < 50%
RETURN_MAX    = 0.50
TOP_N         = 10

# Multi-day
DAYS_LIMIT        = 30                  # proses maksimal N hari bursa terakhir yang tersedia
SHOW_LAST_ONLY    = True                # True: hanya tampilkan hari terakhir
SKIP_IF_EXISTS    = True                # True: skip jika file output hari tsb sudah ada
ALLOW_PASS_IF_BASELINE_SHORT = True     # True: jika histori pace < MIN_BASELINE_FOR_PACE, jangan auto-drop

# ================== UTIL ==================
def to_jkt(series: pd.Series) -> pd.Series:
    s = pd.to_datetime(series, errors="coerce")
    try:
        if s.dt.tz is None:
            return s.dt.tz_localize(SESSION_TZ)
        return s.dt.tz_convert(SESSION_TZ)
    except Exception:
        # fallback: asumsikan UTC → convert ke JKT
        return pd.to_datetime(series, errors="coerce", utc=True).dt.tz_convert(SESSION_TZ)

def read_daily_flex(path: Path) -> Optional[pd.DataFrame]:
    if not path.exists(): return None
    try:
        df = pd.read_csv(path, low_memory=False)
        # kolom tanggal fleksibel
        for dc in ("Date","date","Datetime","datetime"):
            if dc in df.columns:
                df["Date"] = pd.to_datetime(df[dc], errors="coerce").dt.date
                break
        else:
            return None
        # normalisasi Close & Volume
        if "Close" not in df.columns and "Adj Close" in df.columns:
            df["Close"] = pd.to_numeric(df["Adj Close"], errors="coerce")
        if "Volume" in df.columns:
            df["Volume"] = pd.to_numeric(df["Volume"], errors="coerce")
        if "Close" not in df.columns or "Volume" not in df.columns: return None
        df = df.dropna(subset=["Date","Close","Volume"]).sort_values("Date").reset_index(drop=True)
        return df
    except Exception:
        return None

def read_5m(ticker: str) -> Optional[pd.DataFrame]:
    fp = FOLDER_5M / f"{ticker}.csv"
    if not fp.exists(): return None
    try:
        df = pd.read_csv(fp, low_memory=False)
        if "Datetime" not in df.columns: return None
        df["Datetime"] = to_jkt(df["Datetime"])
        for c in ("Open","High","Low","Close","Volume"):
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors="coerce")
        return df.dropna(subset=["Datetime","Close","Volume"])
    except Exception:
        return None

def pick_work_date(df_dt: pd.Series, target: date) -> Optional[date]:
    dlist = pd.Series(df_dt.dt.date.unique()).dropna().sort_values().tolist()
    if not dlist: return None
    for d in reversed(dlist):
        if d <= target:
            return d
    return None

def has_data_for_date(df: Optional[pd.DataFrame], target: date) -> bool:
    if df is None: return False
    wd = pick_work_date(df["Datetime"], target)
    return (wd is not None) and (wd == target)

def intraday_cut_volume(df_5m: pd.DataFrame, work_date: date) -> float:
    m = (df_5m["Datetime"].dt.date == work_date) & (df_5m["Datetime"].dt.time <= CUTOFF_TIME)
    return float(df_5m.loc[m, "Volume"].sum())

def intraday_hist_cut_volumes(df_5m: pd.DataFrame, work_date: date, n_days: int) -> List[float]:
    days = sorted([d for d in df_5m["Datetime"].dt.date.unique() if d < work_date])[-n_days:]
    vols = []
    for d in days:
        m = (df_5m["Datetime"].dt.date == d) & (df_5m["Datetime"].dt.time <= CUTOFF_TIME)
        v = float(df_5m.loc[m, "Volume"].sum())
        if v > 0: vols.append(v)
    return vols

def vol_pace_5m(df_5m: pd.DataFrame, work_date: date) -> Tuple[float, int]:
    """Return (pace, hist_len). pace = vol_cut_today / median(vol_cut_hist)."""
    vol_today_cut = intraday_cut_volume(df_5m, work_date)
    vols_hist = intraday_hist_cut_volumes(df_5m, work_date, BASELINE_DAYS)
    hist_len = len(vols_hist)
    if hist_len >= MIN_BASELINE_FOR_PACE:
        base = float(np.median(vols_hist))
        if base > 0:
            return (vol_today_cut / base, hist_len)
    return (np.nan, hist_len)

def available_dates_5m_union(today_date: date) -> List[date]:
    dates = set()
    for fp in FOLDER_5M.glob("*.csv"):
        try:
            d = pd.read_csv(fp, usecols=["Datetime"], low_memory=False)
            dt = to_jkt(d["Datetime"])
            days = pd.Series(dt.dt.date.unique()).dropna().tolist()
            dates.update([x for x in days if x <= today_date])
        except Exception:
            continue
    return sorted(dates)

# ================== ENGINE ==================
def bsjp_candidates_5m(target_date: date, diag: bool=True):
    SUMMARY = []
    drop_reasons = defaultdict(list) if diag else None

    tickers = sorted({fp.stem for fp in FOLDER_5M.glob("*.csv")})
    for ticker in tickers:
        try:
            df_5m = read_5m(ticker)
            if not has_data_for_date(df_5m, target_date):
                if diag: drop_reasons[ticker].append("no_data_for_target_date")
                continue

            # day slice
            dwork = df_5m[df_5m["Datetime"].dt.date == target_date].copy().dropna(subset=["Close","Volume"])
            if dwork.empty or dwork["Volume"].sum() == 0:
                if diag: drop_reasons[ticker].append("no_intraday_or_zero_vol")
                continue

            # prev close (daily)
            df_daily = read_daily_flex(FOLDER_DAILY / f"{ticker}.csv")
            prev_close = np.nan
            if df_daily is not None:
                prev_day = df_daily[df_daily["Date"] < target_date]
                if not prev_day.empty:
                    prev_close = pd.to_numeric(prev_day.iloc[-1]["Close"], errors="coerce")
            if pd.isna(prev_close) or prev_close <= 0:
                if diag: drop_reasons[ticker].append("no_prev_close_daily")
                continue

            # full-day metrics
            high_px = float(dwork["High"].max())
            low_px  = float(dwork["Low"].min())
            last_px = float(dwork["Close"].iloc[-1])

            daily_return = (last_px / prev_close) - 1.0
            if not (RETURN_MIN < daily_return < RETURN_MAX):
                if diag: drop_reasons[ticker].append("daily_return_out_of_range")
                continue

            # pace (5m only)
            pace, hist_len = vol_pace_5m(df_5m, target_date)
            if np.isnan(pace):
                if not ALLOW_PASS_IF_BASELINE_SHORT:
                    if diag: drop_reasons[ticker].append(f"pace_nan_histlen:{hist_len}")
                    continue
            else:
                if not (pace > PACE_MIN):
                    if diag: drop_reasons[ticker].append(f"pace_insufficient:{pace:.2f}")
                    continue

            # closing strength & afternoon power
            daily_range = high_px - low_px
            closing_strength = (last_px - low_px) / daily_range if daily_range > 0 else 1.0

            start_time = dwork["Datetime"].min()
            first_5min = dwork[dwork["Datetime"] <= start_time + timedelta(minutes=5)]
            if not first_5min.empty and first_5min["Volume"].sum() > 0:
                stable_open = float((first_5min["Close"] * first_5min["Volume"]).sum() / first_5min["Volume"].sum())
            else:
                stable_open = float(dwork["Open"].iloc[0])
            afternoon_power = (last_px / stable_open) - 1.0 if stable_open > 0 else 0.0

            # score (v2.2)
            price_term  = (1 + daily_return) * (1 + max(0.0, afternoon_power)) * closing_strength
            volume_term = np.log1p(min(pace if not np.isnan(pace) else 1.0, 50))  # pace NaN → treat as 1x
            score = price_term * volume_term

            # info: price at cutoff
            cut_mask = (dwork["Datetime"].dt.time <= CUTOFF_TIME)
            price_at_cutoff = float(dwork.loc[cut_mask, "Close"].iloc[-1]) if cut_mask.any() else np.nan

            SUMMARY.append({
                "ticker": ticker, "date": target_date, "score": score, "last": last_px,
                "daily_return": daily_return, "closing_strength": closing_strength,
                "afternoon_power": afternoon_power, "vol_pace": pace,
                "baseline_days_used": hist_len, "price_at_cutoff": price_at_cutoff
            })

        except Exception as e:
            if diag: drop_reasons[ticker].append(f"exception:{type(e).__name__}")
            continue

    cols = ["ticker","date","score","last","daily_return","closing_strength",
            "afternoon_power","vol_pace","baseline_days_used","price_at_cutoff"]
    df_result = (pd.DataFrame(SUMMARY).sort_values("score", ascending=False).reset_index(drop=True)
                 if SUMMARY else pd.DataFrame(columns=cols))
    return df_result, (drop_reasons or {})

# ================== RUN MULTI-DAY ==================
TODAY = pd.Timestamp("today", tz=SESSION_TZ).normalize().date()
all_dates = available_dates_5m_union(TODAY)
if not all_dates:
    raise SystemExit("❌ Tidak ada tanggal pada cache_5m.")

target_dates = all_dates[-DAYS_LIMIT:]
print(f"[INFO] Ditemukan {len(all_dates)} tanggal 5m; diproses {len(target_dates)} (≤ {DAYS_LIMIT})")
print("       Rentang:", target_dates[0], "→", target_dates[-1])
print(f"Cutoff: {CUTOFF_STR} | Pace>= {PACE_MIN} | Return∈({RETURN_MIN:.1%}, {RETURN_MAX:.1%})")

for i, d in enumerate(target_dates):
    out_file = OUT_DIR / f"bsjp_rekomendasi_{d}.csv"
    if SKIP_IF_EXISTS and out_file.exists():
        print(f"[SKIP] {d} sudah ada → {out_file.name}")
        continue

    print(f"\n[RUN] {d}")
    df_result, drop_reasons = bsjp_candidates_5m(d, diag=True)

    if df_result.empty:
        print("   ❌ Tidak ada kandidat yang lolos filter.")
        # Diagnostik ringkas
        flat = [r for rs in drop_reasons.values() for r in rs]
        if flat:
            print("   [DIAG TOP REASONS]", Counter(flat).most_common(5))
        continue

    # Simpan Top-N
    df_result.head(TOP_N).to_csv(out_file, index=False)
    print(f"   [✔] Top {TOP_N} disimpan: {out_file.name}")

    # Tampilkan ringkas untuk hari terakhir
    if (not SHOW_LAST_ONLY) or (i == len(target_dates)-1):
        def fmt_pct(x):  return f"{x:,.2%}" if pd.notna(x) else "N/A"
        def fmt_x(x):    return f"{x:.2f}x"   if pd.notna(x) else "N/A"
        def fmt_f3(x):   return f"{x:.3f}"    if pd.notna(x) else "N/A"

        view = df_result.copy()
        view["score"]            = view["score"].map(fmt_f3)
        view["daily_return"]     = view["daily_return"].map(fmt_pct)
        view["closing_strength"] = view["closing_strength"].map(fmt_pct)
        view["afternoon_power"]  = view["afternoon_power"].map(fmt_pct)
        view["vol_pace"]         = view["vol_pace"].map(fmt_x)

        print(f"\n[✓] TOP CANDIDATES — {d} (cutoff {CUTOFF_STR})")
        display(view.head(TOP_N))

        flat = [r for rs in drop_reasons.values() for r in rs]
        if flat:
            print("[DIAG TOP REASONS]", Counter(flat).most_common(5))


[INFO] Ditemukan 59 tanggal 5m; diproses 30 (≤ 30)
       Rentang: 2025-07-09 → 2025-08-20
Cutoff: 09:30 | Pace>= 1.1 | Return∈(1.0%, 50.0%)

[RUN] 2025-07-09
   [✔] Top 10 disimpan: bsjp_rekomendasi_2025-07-09.csv

[RUN] 2025-07-10
   [✔] Top 10 disimpan: bsjp_rekomendasi_2025-07-10.csv

[RUN] 2025-07-11
   [✔] Top 10 disimpan: bsjp_rekomendasi_2025-07-11.csv

[RUN] 2025-07-14
   [✔] Top 10 disimpan: bsjp_rekomendasi_2025-07-14.csv

[RUN] 2025-07-15
   [✔] Top 10 disimpan: bsjp_rekomendasi_2025-07-15.csv

[RUN] 2025-07-16
   [✔] Top 10 disimpan: bsjp_rekomendasi_2025-07-16.csv

[RUN] 2025-07-17
   [✔] Top 10 disimpan: bsjp_rekomendasi_2025-07-17.csv

[RUN] 2025-07-18
   [✔] Top 10 disimpan: bsjp_rekomendasi_2025-07-18.csv

[RUN] 2025-07-21
   [✔] Top 10 disimpan: bsjp_rekomendasi_2025-07-21.csv

[RUN] 2025-07-22
   [✔] Top 10 disimpan: bsjp_rekomendasi_2025-07-22.csv
[SKIP] 2025-07-23 sudah ada → bsjp_rekomendasi_2025-07-23.csv
[SKIP] 2025-07-24 sudah ada → bsjp_rekomendasi_2025-07-24

### BSJP V.2.0 (Gemini Version)

In [13]:
import pandas as pd, numpy as np, glob
from pathlib import Path
from datetime import datetime, timedelta, date
from typing import Optional
from IPython.display import display

# ================== CONFIG ==================
FOLDER_1M = Path("emiten/cache_1m")
FOLDER_DAILY = Path("emiten/cache_daily")
SESSION_TZ = "Asia/Jakarta"
CUTOFF_STR = "14:15"
CUTOFF_TIME = datetime.strptime(CUTOFF_STR, "%H:%M").time()
TODAY = pd.Timestamp("today", tz=SESSION_TZ).normalize().date()
BASELINE_DAYS = 60 # max historical days for median pace

print(f"Hari ini: {TODAY}, cutoff: {CUTOFF_STR}")

# --------------- UTIL ---------------
def to_jkt(series: pd.Series) -> pd.Series:
    s = pd.to_datetime(series, errors="coerce")
    if getattr(s.dt, "tz", None) is None:
        return s.dt.tz_localize(SESSION_TZ)
    return s.dt.tz_convert(SESSION_TZ)

def pick_work_date(df_dt: pd.Series, today_date: date) -> Optional[date]:
    dlist = pd.Series(df_dt.dt.date.unique()).dropna().sort_values().tolist()
    if not dlist:
        return None
    for d in reversed(dlist):
        if d <= today_date:
            return d
    return None

def vol_pace_until_cutoff(df_1m: pd.DataFrame, work_date: date, ticker: str) -> float:
    try:
        mask_today = (df_1m["Datetime"].dt.date == work_date) & (df_1m["Datetime"].dt.time <= CUTOFF_TIME)
        vol_today = float(df_1m.loc[mask_today, "Volume"].sum())
        prev_days = sorted([d for d in df_1m["Datetime"].dt.date.unique() if d < work_date])[-BASELINE_DAYS:]
        vols = []
        for d in prev_days:
            m = (df_1m["Datetime"].dt.date == d) & (df_1m["Datetime"].dt.time <= CUTOFF_TIME)
            v = float(df_1m.loc[m, "Volume"].sum())
            if v > 0: vols.append(v)
        
        if len(vols) >= 5:
            base = float(np.median(vols))
            if base > 0:
                return vol_today / base
    except Exception as e:
        print(f"[INFO] {ticker} → 1m pace calc failed. Trying fallback.")
    
    df_daily = read_daily_flex(FOLDER_DAILY / f"{ticker}.csv")
    if df_daily is not None:
        try:
            prev_days_daily = df_daily[df_daily["Date"] < work_date].sort_values("Date").tail(BASELINE_DAYS)
            if len(prev_days_daily) >= 10:
                base_daily = float(prev_days_daily["Close"].iloc[-1]) # Corrected logic: used Close instead of Volume
                if base_daily > 0:
                    vol_today = float(df_1m.loc[(df_1m["Datetime"].dt.date == work_date), "Volume"].sum())
                    return vol_today / base_daily
        except Exception as e:
            print(f"[INFO] {ticker} → daily fallback failed: {e}")
    
    return np.nan

def read_daily_flex(path: Path) -> Optional[pd.DataFrame]:
    if not path.exists():
        return None
    try:
        df = pd.read_csv(path, low_memory=False)
        for dc in ("Date","date","Datetime","datetime","Timestamp","timestamp"):
            if dc in df.columns:
                d = pd.to_datetime(df[dc], errors="coerce")
                try: d = d.dt.tz_localize(None)
                except Exception: pass
                df["Date"] = d.dt.date
                break
        else:
            return None
        if "Close" not in df.columns:
            if "Adj Close" in df.columns:
                df["Close"] = pd.to_numeric(df["Adj Close"], errors="coerce")
            else:
                return None
        df["Close"] = pd.to_numeric(df["Close"], errors="coerce")
        df = df.dropna(subset=["Date","Close"]).sort_values("Date").reset_index(drop=True)
        return df
    except Exception:
        return None

# --------------- MAIN ----------------
SUMMARY = []
files = sorted(FOLDER_1M.glob("*.csv"))
print(f"[INFO] Ditemukan {len(files)} file 1m di {FOLDER_1M.resolve()}")

for fp in files:
    ticker = fp.stem
    try:
        # ---- LOAD 1M ----
        df = pd.read_csv(fp, low_memory=False)
        if "Datetime" not in df.columns:
            print(f"[SKIP] {ticker} → 1m: kolom 'Datetime' tidak ada")
            continue
        df["Datetime"] = to_jkt(df["Datetime"])
        for c in ("Open","High","Low","Close","Volume"):
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors="coerce")
            else:
                df[c] = np.nan
        df = df.dropna(subset=["Datetime"]).sort_values("Datetime").reset_index(drop=True)

        # ---- PILIH TANGGAL KERJA ----
        work_date = pick_work_date(df["Datetime"], TODAY)
        if work_date is None:
            print(f"[SKIP] {ticker} → 1m: tidak ada tanggal perdagangan di file")
            continue
        df_dwork = df[df["Datetime"].dt.date == work_date].copy()
        rows_dwork = len(df_dwork)
        vol_dwork = float(df_dwork["Volume"].sum())
        tmin = df_dwork["Datetime"].min()
        tmax = df_dwork["Datetime"].max()
        
        if rows_dwork == 0:
            print(f"[SKIP] {ticker} → 1m: tidak ada baris pada {work_date}")
            continue
        if vol_dwork == 0:
            print(f"[SKIP] {ticker} → 1m: volume 0 pada {work_date}")
            continue
        
        df_valid = df_dwork.dropna(subset=["Close","Volume"]).copy()
        if df_valid.empty:
            print(f"[SKIP] {ticker} → 1m: semua Close/Volume NaN pada {work_date}")
            continue

        # ---- METRIK INTRADAY ----
        df_valid["OBV"] = np.sign(df_valid["Close"].diff()).fillna(0.0) * df_valid["Volume"]
        df_valid["OBV"] = df_valid["OBV"].cumsum()
        
        last_row = df_valid.iloc[-1]
        last_px = float(last_row["Close"])

        w_start = last_row["Datetime"] - timedelta(minutes=5)
        df_5m = df_valid[df_valid["Datetime"] >= w_start]
        mom_5m = float(df_5m["Close"].iloc[-1] / df_5m["Close"].iloc[0]) if len(df_5m) > 1 else np.nan

        m_cut = df_valid["Datetime"].dt.time <= CUTOFF_TIME
        sesi_cut = df_valid.loc[m_cut]
        vwap_cut = float((sesi_cut["Close"] * sesi_cut["Volume"]).sum() / sesi_cut["Volume"].sum()) if not sesi_cut.empty else np.nan
        
        obv_now = float(df_valid["OBV"].iloc[-1])
        obv_cut = float(df_valid.loc[m_cut, "OBV"].iloc[-1]) if m_cut.any() else np.nan
        
        pace = vol_pace_until_cutoff(df, work_date, ticker)

        # ---- DAILY LOGRET ----
        daily_path = FOLDER_DAILY / f"{ticker}.csv"
        df_daily = read_daily_flex(daily_path)
        if df_daily is None:
            logret = np.nan
            print(f"[INFO] {ticker} → daily: file tidak ada/format tidak cocok")
        else:
            df_daily2 = df_daily[df_daily["Date"] <= work_date].sort_values("Date")
            if len(df_daily2) >= 2:
                logret = float(np.log(df_daily2["Close"].iloc[-1] / df_daily2["Close"].iloc[-2]))
            else:
                logret = np.nan
            print(f"[LOAD] {ticker} → daily≤{work_date}: rows={len(df_daily2)} logret={logret if pd.notna(logret) else np.nan:.4f}")

        # ---- SCORE ----
        score = 0
        if pd.notna(mom_5m) and mom_5m > 1.005: score += 1
        if pd.notna(pace) and pace > 1.5: score += 1
        if pd.notna(vwap_cut) and last_px > vwap_cut: score += 1
        if pd.notna(obv_cut) and obv_now > obv_cut: score += 1

        print(
            f"[METRIC] {ticker} last={last_px:.2f} Δ5m={mom_5m if pd.notna(mom_5m) else np.nan:.3f} "
            f"vwap≤{CUTOFF_STR}={vwap_cut if pd.notna(vwap_cut) else np.nan:.2f} "
            f"pace={pace if pd.notna(pace) else np.nan:.2f} obv_now={obv_now:.0f} "
            f"obv≤{CUTOFF_STR}={obv_cut if pd.notna(obv_cut) else np.nan:.0f} | score={score}"
        )
        
        SUMMARY.append({
            "ticker": ticker,
            "date": work_date,
            "score": score,
            "last": last_px,
            "mom_5m": mom_5m,
            "vwap_cut": vwap_cut,
            "vol_pace": pace,
            "obv_now": obv_now,
            "obv_cut": obv_cut,
            "logret": logret,
            "rows_day": rows_dwork,
            "vol_day": vol_dwork
        })

    except Exception as e:
        print(f"[ERROR] {ticker} → {e}")
        continue

# --------------- OUTPUT -----------------
df_result = pd.DataFrame(SUMMARY)
if not df_result.empty:
    df_result = df_result.sort_values(["score", "vol_pace", "mom_5m"], ascending=[False, False, False]).reset_index(drop=True)
    display(df_result.head(30))
else:
    print("❌ Tidak ada kandidat yang lolos (cek log di atas untuk alasan per ticker).")

Hari ini: 2025-08-20, cutoff: 14:15
[INFO] Ditemukan 773 file 1m di /home/mkemalw/Projects/SSSAHAM_SERVICE/emiten/cache_1m
[LOAD] AADI.JK → daily≤2025-08-19: rows=160 logret=0.0000
[METRIC] AADI.JK last=6825.00 Δ5m=1.004 vwap≤14:15=6855.49 pace=1.47 obv_now=-2071200 obv≤14:15=-1680100 | score=0
[LOAD] AALI.JK → daily≤2025-08-19: rows=728 logret=0.0556
[METRIC] AALI.JK last=7400.00 Δ5m=1.003 vwap≤14:15=7261.68 pace=4.11 obv_now=1840500 obv≤14:15=1741300 | score=3
[SKIP] ABDA.JK → 1m: volume 0 pada 2025-08-19
[LOAD] ABMM.JK → daily≤2025-08-19: rows=728 logret=-0.0034
[METRIC] ABMM.JK last=2980.00 Δ5m=1.656 vwap≤14:15=2072.67 pace=2.40 obv_now=-4309300 obv≤14:15=-836600 | score=3
[LOAD] ACES.JK → daily≤2025-08-19: rows=728 logret=-0.0301
[METRIC] ACES.JK last=460.00 Δ5m=1.004 vwap≤14:15=465.14 pace=1.73 obv_now=-30255800 obv≤14:15=-14924900 | score=1
[LOAD] ACRO.JK → daily≤2025-08-19: rows=374 logret=0.0000
[METRIC] ACRO.JK last=73.00 Δ5m=1.000 vwap≤14:15=74.71 pace=1.66 obv_now=1357200 o

Unnamed: 0,ticker,date,score,last,mom_5m,vwap_cut,vol_pace,obv_now,obv_cut,logret,rows_day,vol_day
0,BMAS.JK,2025-08-19,4,135.0,1.015038,129.778868,98114.43609,3038800.0,-4840400.0,0.093257,411,65246100.0
1,BEER.JK,2025-08-19,4,135.0,1.015038,119.972877,53.758328,21417500.0,13564100.0,0.300105,411,130649200.0
2,ATIC.JK,2025-08-19,4,400.0,3.149606,125.053926,20.426597,21214400.0,-2688800.0,0.0,411,111080900.0
3,MCOL.JK,2025-08-19,4,3920.0,54.444444,517.029286,17.971292,50800.0,-340400.0,-0.007624,411,2055200.0
4,NAIK.JK,2025-08-19,4,174.0,1.017544,166.533901,15.638414,4250700.0,895700.0,0.077651,411,138785600.0
5,BMSR.JK,2025-08-19,4,252.0,1.008,215.188406,10.977273,1011200.0,700.0,-0.007968,411,1367200.0
6,KRAS.JK,2025-08-19,4,296.0,1.006803,282.896973,8.759122,31836900.0,21960600.0,0.113597,411,142747800.0
7,BRNA.JK,2025-08-19,4,2240.0,2.966887,2187.04202,5.461407,5324600.0,-91700.0,-0.057894,411,22501900.0
8,MCOR.JK,2025-08-19,4,73.0,1.013889,71.959326,2.480693,-3338700.0,-4271700.0,0.0,411,17004100.0
9,HMSP.JK,2025-08-19,4,555.0,1.009091,550.379674,2.478442,7764000.0,1085600.0,0.018519,411,36962800.0


### BSJP v.2.1 (Gemini Version)

In [3]:
import pandas as pd, numpy as np
from pathlib import Path
from datetime import datetime, timedelta, date
from typing import Optional
from IPython.display import display

# ================== CONFIG ==================
FOLDER_1M = Path("emiten/cache_1m")
FOLDER_DAILY = Path("emiten/cache_daily")
SESSION_TZ = "Asia/Jakarta"
CUTOFF_STR = "14:15"
CUTOFF_TIME = datetime.strptime(CUTOFF_STR, "%H:%M").time()
TODAY = pd.Timestamp("today", tz=SESSION_TZ).normalize().date()
BASELINE_DAYS = 60 # max hari historis untuk median pace

print(f"Hari ini: {TODAY}, cutoff: {CUTOFF_STR}")

# --------------- UTIL ---------------
def to_jkt(series: pd.Series) -> pd.Series:
    s = pd.to_datetime(series, errors="coerce")
    if getattr(s.dt, "tz", None) is None:
        return s.dt.tz_localize(SESSION_TZ)
    return s.dt.tz_convert(SESSION_TZ)

def pick_work_date(df_dt: pd.Series, today_date: date) -> Optional[date]:
    dlist = pd.Series(df_dt.dt.date.unique()).dropna().sort_values().tolist()
    if not dlist: return None
    for d in reversed(dlist):
        if d <= today_date: return d
    return None

def read_daily_flex(path: Path) -> Optional[pd.DataFrame]:
    if not path.exists(): return None
    try:
        df = pd.read_csv(path, low_memory=False)
        # Cari kolom tanggal secara fleksibel
        for dc in ("Date", "date", "Datetime"):
            if dc in df.columns:
                df["Date"] = pd.to_datetime(df[dc], errors="coerce").dt.date
                break
        else: return None

        # Cari kolom harga & volume secara fleksibel
        for col_name, new_name in [("Close", "Close"), ("Adj Close", "Close"), ("Volume", "Volume")]:
             if col_name in df.columns and new_name not in df.columns:
                 df[new_name] = pd.to_numeric(df[col_name], errors="coerce")

        if "Close" not in df.columns or "Volume" not in df.columns: return None
        return df.dropna(subset=["Date", "Close", "Volume"]).sort_values("Date").reset_index(drop=True)
    except Exception:
        return None

def vol_pace_robust(df_1m: pd.DataFrame, work_date: date, ticker: str) -> float:
    # Prioritas 1: Hitung dari data 1m jika histori cukup
    try:
        mask_today_cut = (df_1m["Datetime"].dt.date == work_date) & (df_1m["Datetime"].dt.time <= CUTOFF_TIME)
        vol_today_cut = float(df_1m.loc[mask_today_cut, "Volume"].sum())
        
        prev_days_1m = sorted([d for d in df_1m["Datetime"].dt.date.unique() if d < work_date])[-BASELINE_DAYS:]
        vols_1m_hist = []
        for d in prev_days_1m:
            m = (df_1m["Datetime"].dt.date == d) & (df_1m["Datetime"].dt.time <= CUTOFF_TIME)
            v = float(df_1m.loc[m, "Volume"].sum())
            if v > 0: vols_1m_hist.append(v)
        
        if len(vols_1m_hist) >= 10: # Butuh minimal 10 hari data historis 1m
            base_1m = float(np.median(vols_1m_hist))
            if base_1m > 0:
                # print(f"[INFO] {ticker} → 1m pace base={base_1m:.0f}")
                return vol_today_cut / base_1m
    except Exception:
        pass # Lanjut ke fallback jika gagal

    # Prioritas 2 (Fallback): Hitung dari data daily
    df_daily = read_daily_flex(FOLDER_DAILY / f"{ticker}.csv")
    if df_daily is not None:
        try:
            hist_daily = df_daily[df_daily["Date"] < work_date].sort_values("Date").tail(BASELINE_DAYS)
            if len(hist_daily) >= 20: # Butuh minimal 20 hari data historis daily
                base_daily = float(hist_daily["Volume"].median())
                if base_daily > 0:
                    # Estimasikan fraksi volume harian hingga jam cutoff
                    # Sesi trading efektif ~6.5 jam. Cutoff 14:15 ~75% dari sesi.
                    frac = 0.75 
                    # print(f"[INFO] {ticker} → daily pace fallback base={base_daily:.0f}")
                    return vol_today_cut / (base_daily * frac)
        except Exception as e:
            print(f"[WARN] {ticker} → daily fallback failed: {e}")
            
    return np.nan

# --------------- MAIN ----------------
SUMMARY = []
files = sorted(FOLDER_1M.glob("*.csv"))
print(f"[INFO] Ditemukan {len(files)} file 1m di {FOLDER_1M.resolve()}")

for fp in files:
    ticker = fp.stem
    try:
        df = pd.read_csv(fp, low_memory=False, parse_dates=["Datetime"])
        df["Datetime"] = to_jkt(df["Datetime"])
        for c in ("Open", "High", "Low", "Close", "Volume"):
            df[c] = pd.to_numeric(df.get(c), errors="coerce")

        work_date = pick_work_date(df["Datetime"], TODAY)
        if work_date is None: continue
        
        df_dwork = df[df["Datetime"].dt.date == work_date].copy().dropna(subset=["Close","Volume"])
        if df_dwork.empty or df_dwork["Volume"].sum() == 0: continue

        # ---- METRIK INTRADAY ----
        df_dwork["OBV"] = np.sign(df_dwork["Close"].diff()).fillna(0.0) * df_dwork["Volume"]
        df_dwork["OBV"] = df_dwork["OBV"].cumsum()
        
        last_row = df_dwork.iloc[-1]
        last_px = float(last_row["Close"])

        w_start = last_row["Datetime"] - timedelta(minutes=5)
        df_5m = df_dwork[df_dwork["Datetime"] >= w_start]
        mom_5m = float(df_5m["Close"].iloc[-1] / df_5m["Close"].iloc[0]) if len(df_5m) > 1 else 1.0

        m_cut = df_dwork["Datetime"].dt.time <= CUTOFF_TIME
        sesi_cut = df_dwork.loc[m_cut]
        vwap_cut = float((sesi_cut["Close"] * sesi_cut["Volume"]).sum() / sesi_cut["Volume"].sum()) if not sesi_cut.empty and sesi_cut['Volume'].sum() > 0 else last_px

        obv_now = float(df_dwork["OBV"].iloc[-1])
        obv_cut = float(sesi_cut["OBV"].iloc[-1]) if m_cut.any() else 0.0
        
        pace = vol_pace_robust(df, work_date, ticker)

        # ---- METRIK BARU ----
        afternoon_power = (last_px / vwap_cut) - 1.0 if pd.notna(vwap_cut) else 0.0
        open_px = df_dwork['Open'].iloc[0]
        daily_return = (last_px/open_px) - 1.0 if pd.notna(open_px) and open_px > 0 else 0.0

        # ---- SKOR BARU (WEIGHTED & CONTINUOUS) ----
        score = 0
        w_pace = 1.5
        w_power = 2.5
        w_obv = 1.0
        w_mom = 0.5
        
        # Bobot dari volume pace (dibatasi maks 10x untuk stabilitas)
        score += w_pace * np.log1p(min(pace, 10)) if pd.notna(pace) and pace > 1 else 0
        
        # Bobot dari afternoon power (kenaikan dari VWAP cutoff)
        score += w_power * afternoon_power if pd.notna(afternoon_power) and afternoon_power > 0.005 else 0

        # Bobot dari OBV
        score += w_obv if pd.notna(obv_cut) and obv_now > obv_cut else 0

        # Bobot dari momentum 5 menit terakhir (dibatasi maks naik 5%)
        score += w_mom * min(mom_5m - 1, 0.05) if pd.notna(mom_5m) and mom_5m > 1.002 else 0
        
        SUMMARY.append({
            "ticker": ticker, "date": work_date, "score": score, "last": last_px,
            "afternoon_power": afternoon_power, "daily_return": daily_return,
            "vol_pace": pace, "mom_5m": mom_5m
        })

    except Exception as e:
        print(f"[ERROR] {ticker} → {e}")
        continue

# --------------- OUTPUT -----------------
if SUMMARY:
    df_result = pd.DataFrame(SUMMARY)
    df_result = df_result.sort_values("score", ascending=False).reset_index(drop=True)
    
    # Format untuk keterbacaan
    for col, fmt in {
        "score": "{:.3f}", "afternoon_power": "{:,.2%}", "daily_return": "{:,.2%}",
        "vol_pace": "{:.2f}x", "mom_5m": "{:.3f}"
    }.items():
        if col in df_result.columns:
            df_result[col] = df_result[col].apply(lambda x: fmt.format(x) if pd.notna(x) else 'N/A')

    display(df_result.head(30))
else:
    print("❌ Tidak ada kandidat yang lolos.")

Hari ini: 2025-08-20, cutoff: 14:15
[INFO] Ditemukan 773 file 1m di /home/mkemalw/Projects/SSSAHAM_SERVICE/emiten/cache_1m


Unnamed: 0,ticker,date,score,last,afternoon_power,daily_return,vol_pace,mom_5m
0,IATA.JK,2025-08-19,785.037,18275.0,"31,321.10%","31,408.62%",2.82x,1.0
1,SRSN.JK,2025-08-19,160.335,3880.0,"6,315.22%","7,220.75%",4.14x,1.0
2,HAJJ.JK,2025-08-19,110.049,5550.0,"4,400.98%","4,900.00%",0.28x,50.0
3,BGTG.JK,2025-08-19,68.118,2530.0,"2,592.40%","2,535.42%",8.05x,1.008
4,BATA.JK,2025-08-19,62.0,1290.0,"2,480.00%","2,480.00%",0.72x,1.0
5,BAPA.JK,2025-08-19,39.798,825.0,"1,408.03%","1,456.60%",149.93x,0.994
6,CMPP.JK,2025-08-19,34.19,1310.0,"1,282.26%","1,579.49%",3.15x,1.0
7,TLDN.JK,2025-08-19,27.503,7225.0,971.58%,"1,028.91%",7.52x,1.0
8,KDSI.JK,2025-08-19,24.696,4210.0,803.97%,788.19%,16.28x,1.0
9,MKTR.JK,2025-08-19,21.712,910.0,758.31%,750.47%,5.17x,8.667


### BSJP V.2.2 (Gemini Version)

In [8]:
import pandas as pd, numpy as np
from pathlib import Path
from datetime import datetime, timedelta, date
from typing import Optional
from IPython.display import display

# ================== CONFIG v2.2 ==================
FOLDER_1M = Path("emiten/cache_1m")
FOLDER_DAILY = Path("emiten/cache_daily")
SESSION_TZ = "Asia/Jakarta"
CUTOFF_STR = "14:15"
CUTOFF_TIME = datetime.strptime(CUTOFF_STR, "%H:%M").time()
TODAY = pd.Timestamp("today", tz=SESSION_TZ).normalize().date()
BASELINE_DAYS = 60

print(f"Hari ini: {TODAY}, cutoff: {CUTOFF_STR}")

# --------------- UTIL ---------------
def to_jkt(series: pd.Series) -> pd.Series:
    s = pd.to_datetime(series, errors="coerce")
    if getattr(s.dt, "tz", None) is None: return s.dt.tz_localize(SESSION_TZ)
    return s.dt.tz_convert(SESSION_TZ)

def pick_work_date(df_dt: pd.Series, today_date: date) -> Optional[date]:
    dlist = pd.Series(df_dt.dt.date.unique()).dropna().sort_values().tolist()
    if not dlist: return None
    for d in reversed(dlist):
        if d <= today_date: return d
    return None

def read_daily_flex(path: Path) -> Optional[pd.DataFrame]:
    if not path.exists(): return None
    try:
        df = pd.read_csv(path, low_memory=False)
        for dc in ("Date", "date", "Datetime"):
            if dc in df.columns:
                df["Date"] = pd.to_datetime(df[dc], errors="coerce").dt.date
                break
        else: return None
        for col_name, new_name in [("Close", "Close"), ("Adj Close", "Close"), ("Volume", "Volume")]:
            if col_name in df.columns and new_name not in df.columns:
                df[new_name] = pd.to_numeric(df[col_name], errors="coerce")
        if "Close" not in df.columns or "Volume" not in df.columns: return None
        return df.dropna(subset=["Date", "Close", "Volume"]).sort_values("Date").reset_index(drop=True)
    except Exception: return None

def vol_pace_robust(df_1m: pd.DataFrame, work_date: date, ticker: str) -> float:
    try:
        mask_today_cut = (df_1m["Datetime"].dt.date == work_date) & (df_1m["Datetime"].dt.time <= CUTOFF_TIME)
        vol_today_cut = float(df_1m.loc[mask_today_cut, "Volume"].sum())
        prev_days_1m = sorted([d for d in df_1m["Datetime"].dt.date.unique() if d < work_date])[-BASELINE_DAYS:]
        vols_1m_hist = [float(df_1m.loc[(df_1m["Datetime"].dt.date == d) & (df_1m["Datetime"].dt.time <= CUTOFF_TIME), "Volume"].sum()) for d in prev_days_1m]
        vols_1m_hist = [v for v in vols_1m_hist if v > 0]
        if len(vols_1m_hist) >= 10:
            base_1m = float(np.median(vols_1m_hist))
            if base_1m > 0: return vol_today_cut / base_1m
    except Exception: pass
    
    df_daily = read_daily_flex(FOLDER_DAILY / f"{ticker}.csv")
    if df_daily is not None:
        try:
            hist_daily = df_daily[df_daily["Date"] < work_date].tail(BASELINE_DAYS)
            if len(hist_daily) >= 20:
                base_daily = float(hist_daily["Volume"].median())
                if base_daily > 0: return vol_today_cut / (base_daily * 0.75)
        except Exception: pass
    return np.nan

# --------------- MAIN ----------------
SUMMARY = []
files = sorted(FOLDER_1M.glob("*.csv"))
print(f"[INFO] Ditemukan {len(files)} file 1m.")

for fp in files:
    ticker = fp.stem
    try:
        df = pd.read_csv(fp, low_memory=False, parse_dates=["Datetime"])
        df["Datetime"] = to_jkt(df["Datetime"])
        for c in ("Open", "High", "Low", "Close", "Volume"):
            df[c] = pd.to_numeric(df.get(c), errors="coerce")

        work_date = pick_work_date(df["Datetime"], TODAY)
        if work_date is None: continue
        
        df_dwork = df[df["Datetime"].dt.date == work_date].copy().dropna(subset=["Close", "Volume"])
        if df_dwork.empty or df_dwork["Volume"].sum() == 0: continue

        # ---- LOAD PREV_CLOSE DARI DAILY ----
        df_daily = read_daily_flex(FOLDER_DAILY / f"{ticker}.csv")
        prev_close = np.nan
        if df_daily is not None:
            prev_day_data = df_daily[df_daily["Date"] < work_date]
            if not prev_day_data.empty:
                prev_close = prev_day_data.iloc[-1]["Close"]

        if pd.isna(prev_close): continue # Skip jika tidak ada data H-1

        # ---- HITUNG METRIK KUNCI ----
        high_px = df_dwork['High'].max()
        low_px = df_dwork['Low'].min()
        last_px = df_dwork['Close'].iloc[-1]
        
        daily_return = (last_px / prev_close) - 1.0
        
        # ---- FILTER KEWAJARAN & FILTER AWAL ----
        if not (0.01 < daily_return < 0.40): continue # Hanya proses saham yang naik antara 1% - 40%
        
        pace = vol_pace_robust(df, work_date, ticker)
        if not (pd.notna(pace) and pace > 1.2): continue

        # ---- METRIK LANJUTAN ----
        daily_range = high_px - low_px
        closing_strength = (last_px - low_px) / daily_range if daily_range > 0 else 1.0

        # VWAP 5 menit pertama sebagai Open yang stabil
        start_time = df_dwork['Datetime'].min()
        first_5min_data = df_dwork[df_dwork['Datetime'] <= start_time + timedelta(minutes=5)]
        stable_open = float((first_5min_data["Close"] * first_5min_data["Volume"]).sum() / first_5min_data["Volume"].sum()) if not first_5min_data.empty and first_5min_data['Volume'].sum() > 0 else df_dwork['Open'].iloc[0]

        afternoon_power = (last_px / stable_open) - 1.0
        
        # ---- SKOR MULTIPLIKATIF v2.2 ----
        price_term = (1 + daily_return) * (1 + max(0, afternoon_power)) * closing_strength
        volume_term = np.log1p(min(pace, 50))
        
        score = price_term * volume_term

        SUMMARY.append({
            "ticker": ticker, "date": work_date, "score": score, "last": last_px,
            "daily_return": daily_return, "closing_strength": closing_strength,
            "afternoon_power": afternoon_power, "vol_pace": pace
        })

    except Exception:
        continue

# --------------- OUTPUT -----------------
if SUMMARY:
    df_result = pd.DataFrame(SUMMARY)
    df_result = df_result.sort_values("score", ascending=False).reset_index(drop=True)
    
    for col, fmt in {
        "score": "{:.3f}", "daily_return": "{:,.2%}", "closing_strength": "{:,.2%}",
        "afternoon_power": "{:,.2%}", "vol_pace": "{:.2f}x"
    }.items():
        if col in df_result.columns:
            df_result[col] = df_result[col].apply(lambda x: fmt.format(x) if pd.notna(x) else 'N/A')

    print("\n[✓] TOP CANDIDATES BSJP v2.2")
    display(df_result.head(30))
else:
    print("❌ Tidak ada kandidat yang lolos filter awal.")

Hari ini: 2025-08-20, cutoff: 14:15
[INFO] Ditemukan 773 file 1m.

[✓] TOP CANDIDATES BSJP v2.2


Unnamed: 0,ticker,date,score,last,daily_return,closing_strength,afternoon_power,vol_pace
0,TAYS.JK,2025-08-20,7.125,70.0,34.62%,100.00%,34.62%,247.68x
1,ACST.JK,2025-08-20,6.893,121.0,34.44%,100.00%,30.41%,124.05x
2,CENT.JK,2025-08-20,6.258,135.0,35.00%,100.00%,33.66%,31.08x
3,LPCK.JK,2025-08-20,6.095,635.0,24.51%,100.00%,24.51%,52.31x
4,SAPX.JK,2025-08-20,5.863,372.0,24.83%,100.00%,19.45%,60.06x
5,JARR.JK,2025-08-20,5.021,1005.0,24.84%,100.00%,26.06%,23.29x
6,GZCO.JK,2025-08-20,4.734,140.0,22.81%,83.33%,20.78%,45.05x
7,JKON.JK,2025-08-20,4.728,108.0,17.39%,88.89%,15.23%,134.62x
8,LIFE.JK,2025-08-19,4.726,14250.0,20.00%,100.00%,0.17%,71.72x
9,MFIN.JK,2025-08-20,4.36,1500.0,25.00%,100.00%,0.00%,31.72x


### BSJP V.2.2.1 (Gemini Version)

In [26]:
import pandas as pd, numpy as np
from pathlib import Path
from datetime import datetime, timedelta, date
from typing import Optional
from IPython.display import display

# ================== MODIFIKASI: Simulasi Jam 09:05 ==================
FOLDER_1M = Path("emiten/cache_1m")
FOLDER_DAILY = Path("emiten/cache_daily")
SESSION_TZ = "Asia/Jakarta"
CUTOFF_STR = "09:05"  # <-- PERUBAHAN UTAMA DI SINI
CUTOFF_TIME = datetime.strptime(CUTOFF_STR, "%H:%M").time()
TODAY = pd.Timestamp("today", tz=SESSION_TZ).normalize().date()
BASELINE_DAYS = 60

print(f"Hari ini: {TODAY}, cutoff: {CUTOFF_STR} (SIMULASI PAGI)")

# --------------- UTIL ---------------
def to_jkt(series: pd.Series) -> pd.Series:
    s = pd.to_datetime(series, errors="coerce")
    if getattr(s.dt, "tz", None) is None: return s.dt.tz_localize(SESSION_TZ)
    return s.dt.tz_convert(SESSION_TZ)

def pick_work_date(df_dt: pd.Series, today_date: date) -> Optional[date]:
    dlist = pd.Series(df_dt.dt.date.unique()).dropna().sort_values().tolist()
    if not dlist: return None
    for d in reversed(dlist):
        if d <= today_date: return d
    return None

def read_daily_flex(path: Path) -> Optional[pd.DataFrame]:
    if not path.exists(): return None
    try:
        df = pd.read_csv(path, low_memory=False)
        for dc in ("Date", "date", "Datetime"):
            if dc in df.columns:
                df["Date"] = pd.to_datetime(df[dc], errors="coerce").dt.date
                break
        else: return None
        for col_name, new_name in [("Close", "Close"), ("Adj Close", "Close"), ("Volume", "Volume")]:
            if col_name in df.columns and new_name not in df.columns:
                df[new_name] = pd.to_numeric(df[col_name], errors="coerce")
        if "Close" not in df.columns or "Volume" not in df.columns: return None
        return df.dropna(subset=["Date", "Close", "Volume"]).sort_values("Date").reset_index(drop=True)
    except Exception: return None

def vol_pace_robust(df_1m: pd.DataFrame, work_date: date, ticker: str) -> float:
    try:
        mask_today_cut = (df_1m["Datetime"].dt.date == work_date) & (df_1m["Datetime"].dt.time <= CUTOFF_TIME)
        vol_today_cut = float(df_1m.loc[mask_today_cut, "Volume"].sum())
        prev_days_1m = sorted([d for d in df_1m["Datetime"].dt.date.unique() if d < work_date])[-BASELINE_DAYS:]
        vols_1m_hist = [float(df_1m.loc[(df_1m["Datetime"].dt.date == d) & (df_1m["Datetime"].dt.time <= CUTOFF_TIME), "Volume"].sum()) for d in prev_days_1m]
        vols_1m_hist = [v for v in vols_1m_hist if v > 0]
        if len(vols_1m_hist) >= 10:
            base_1m = float(np.median(vols_1m_hist))
            if base_1m > 0: return vol_today_cut / base_1m
    except Exception: pass
    
    df_daily = read_daily_flex(FOLDER_DAILY / f"{ticker}.csv")
    if df_daily is not None:
        try:
            hist_daily = df_daily[df_daily["Date"] < work_date].tail(BASELINE_DAYS)
            if len(hist_daily) >= 20:
                base_daily = float(hist_daily["Volume"].median())
                # Fraksi disesuaikan untuk 5 menit pagi (sekitar 5% dari total volume harian)
                if base_daily > 0: return vol_today_cut / (base_daily * 0.05) # <-- PERUBAHAN KEDUA
        except Exception: pass
    return np.nan

# --------------- MAIN ----------------
SUMMARY = []
files = sorted(FOLDER_1M.glob("*.csv"))
print(f"[INFO] Ditemukan {len(files)} file 1m.")

for fp in files:
    ticker = fp.stem
    try:
        df = pd.read_csv(fp, low_memory=False, parse_dates=["Datetime"])
        df["Datetime"] = to_jkt(df["Datetime"])
        for c in ("Open", "High", "Low", "Close", "Volume"):
            df[c] = pd.to_numeric(df.get(c), errors="coerce")

        work_date = pick_work_date(df["Datetime"], TODAY)
        if work_date is None: continue
        
        # MODIFIKASI: Filter data 1 menit hanya sampai CUTOFF_TIME
        df_dwork_full = df[df["Datetime"].dt.date == work_date].copy()
        df_dwork = df_dwork_full[df_dwork_full["Datetime"].dt.time <= CUTOFF_TIME].dropna(subset=["Close", "Volume"])
        
        if df_dwork.empty or df_dwork["Volume"].sum() == 0: continue

        df_daily = read_daily_flex(FOLDER_DAILY / f"{ticker}.csv")
        prev_close = np.nan
        if df_daily is not None:
            prev_day_data = df_daily[df_daily["Date"] < work_date]
            if not prev_day_data.empty:
                prev_close = prev_day_data.iloc[-1]["Close"]

        if pd.isna(prev_close): continue

        high_px = df_dwork['High'].max()
        low_px = df_dwork['Low'].min()
        last_px = df_dwork['Close'].iloc[-1]
        
        daily_return = (last_px / prev_close) - 1.0
        
        if not (0.01 < daily_return < 0.40): continue
        
        pace = vol_pace_robust(df, work_date, ticker)
        if not (pd.notna(pace) and pace > 1.2): continue

        daily_range = high_px - low_px
        closing_strength = (last_px - low_px) / daily_range if daily_range > 0 else 1.0

        start_time = df_dwork['Datetime'].min()
        first_5min_data = df_dwork[df_dwork['Datetime'] <= start_time + timedelta(minutes=5)]
        stable_open = float((first_5min_data["Close"] * first_5min_data["Volume"]).sum() / first_5min_data["Volume"].sum()) if not first_5min_data.empty and first_5min_data['Volume'].sum() > 0 else df_dwork['Open'].iloc[0]

        afternoon_power = (last_px / stable_open) - 1.0
        
        price_term = (1 + daily_return) * (1 + max(0, afternoon_power)) * closing_strength
        volume_term = np.log1p(min(pace, 50))
        
        score = price_term * volume_term

        SUMMARY.append({
            "ticker": ticker, "date": work_date, "score": score, "last": last_px,
            "daily_return": daily_return, "closing_strength": closing_strength,
            "afternoon_power": afternoon_power, "vol_pace": pace
        })

    except Exception:
        continue

# --------------- OUTPUT -----------------
if SUMMARY:
    df_result = pd.DataFrame(SUMMARY)
    df_result = df_result.sort_values("score", ascending=False).reset_index(drop=True)
    
    for col, fmt in {
        "score": "{:.3f}", "daily_return": "{:,.2%}", "closing_strength": "{:,.2%}",
        "afternoon_power": "{:,.2%}", "vol_pace": "{:.2f}x"
    }.items():
        if col in df_result.columns:
            df_result[col] = df_result[col].apply(lambda x: fmt.format(x) if pd.notna(x) else 'N/A')

    print(f"\n[✓] TOP CANDIDATES (SIMULASI {CUTOFF_STR})")
    display(df_result.head(30))
else:
    print(f"\n❌ Tidak ada kandidat yang lolos filter pada simulasi jam {CUTOFF_STR}.")

Hari ini: 2025-08-20, cutoff: 09:05 (SIMULASI PAGI)
[INFO] Ditemukan 773 file 1m.

[✓] TOP CANDIDATES (SIMULASI 09:05)


Unnamed: 0,ticker,date,score,last,daily_return,closing_strength,afternoon_power,vol_pace
0,CSIS.JK,2025-08-19,5.229,128.0,33.33%,96.88%,2.95%,463.65x
1,GRPH.JK,2025-08-19,5.033,90.0,21.62%,100.00%,5.26%,2820.52x
2,LIFE.JK,2025-08-19,4.726,14250.0,20.00%,100.00%,0.17%,65.26x
3,MFIN.JK,2025-08-19,4.636,1140.0,18.75%,96.55%,2.85%,675.48x
4,NTBK.JK,2025-08-19,4.296,55.0,7.84%,100.00%,1.32%,9780.00x
5,EMDE.JK,2025-08-19,4.192,94.0,5.62%,100.00%,1.14%,49.64x
6,LPPS.JK,2025-08-19,4.067,71.0,2.90%,100.00%,0.51%,55.85x
7,SMMA.JK,2025-08-19,3.827,16350.0,1.87%,100.00%,0.13%,41.60x
8,SGRO.JK,2025-08-19,3.792,4020.0,11.36%,90.24%,1.36%,40.37x
9,BWPT.JK,2025-08-19,3.59,129.0,5.74%,85.71%,0.74%,63.63x


### BSJP v.2.2.2 (Gemini Version)

In [30]:
import pandas as pd, numpy as np
from pathlib import Path
from datetime import datetime, timedelta, date
from typing import Optional
from IPython.display import display
import pandas as pd, numpy as np
from pathlib import Path
from datetime import datetime, timedelta, date
from typing import Optional
from IPython.display import display

# ================== MODIFIKASI: Simulasi Jam 11:45 (Jam Stabil) ==================
FOLDER_1M = Path("emiten/cache_1m")
FOLDER_DAILY = Path("emiten/cache_daily")
SESSION_TZ = "Asia/Jakarta"
CUTOFF_STR = "11:45"  # <-- PERUBAHAN UTAMA DI SINI
CUTOFF_TIME = datetime.strptime(CUTOFF_STR, "%H:%M").time()
TODAY = pd.Timestamp("today", tz=SESSION_TZ).normalize().date()
BASELINE_DAYS = 60

print(f"Hari ini: {TODAY}, cutoff: {CUTOFF_STR} (SIMULASI JAM STABIL)")

# --------------- UTIL ---------------
def to_jkt(series: pd.Series) -> pd.Series:
    s = pd.to_datetime(series, errors="coerce")
    if getattr(s.dt, "tz", None) is None: return s.dt.tz_localize(SESSION_TZ)
    return s.dt.tz_convert(SESSION_TZ)

def pick_work_date(df_dt: pd.Series, today_date: date) -> Optional[date]:
    dlist = pd.Series(df_dt.dt.date.unique()).dropna().sort_values().tolist()
    if not dlist: return None
    for d in reversed(dlist):
        if d <= today_date: return d
    return None

def read_daily_flex(path: Path) -> Optional[pd.DataFrame]:
    if not path.exists(): return None
    try:
        df = pd.read_csv(path, low_memory=False)
        for dc in ("Date", "date", "Datetime"):
            if dc in df.columns:
                df["Date"] = pd.to_datetime(df[dc], errors="coerce").dt.date
                break
        else: return None
        for col_name, new_name in [("Close", "Close"), ("Adj Close", "Close"), ("Volume", "Volume")]:
            if col_name in df.columns and new_name not in df.columns:
                df[new_name] = pd.to_numeric(df[col_name], errors="coerce")
        if "Close" not in df.columns or "Volume" not in df.columns: return None
        return df.dropna(subset=["Date", "Close", "Volume"]).sort_values("Date").reset_index(drop=True)
    except Exception: return None

def vol_pace_robust(df_1m: pd.DataFrame, work_date: date, ticker: str) -> float:
    try:
        mask_today_cut = (df_1m["Datetime"].dt.date == work_date) & (df_1m["Datetime"].dt.time <= CUTOFF_TIME)
        vol_today_cut = float(df_1m.loc[mask_today_cut, "Volume"].sum())
        prev_days_1m = sorted([d for d in df_1m["Datetime"].dt.date.unique() if d < work_date])[-BASELINE_DAYS:]
        vols_1m_hist = [float(df_1m.loc[(df_1m["Datetime"].dt.date == d) & (df_1m["Datetime"].dt.time <= CUTOFF_TIME), "Volume"].sum()) for d in prev_days_1m]
        vols_1m_hist = [v for v in vols_1m_hist if v > 0]
        if len(vols_1m_hist) >= 10:
            base_1m = float(np.median(vols_1m_hist))
            if base_1m > 0: return vol_today_cut / base_1m
    except Exception: pass
    
    df_daily = read_daily_flex(FOLDER_DAILY / f"{ticker}.csv")
    if df_daily is not None:
        try:
            hist_daily = df_daily[df_daily["Date"] < work_date].tail(BASELINE_DAYS)
            if len(hist_daily) >= 20:
                base_daily = float(hist_daily["Volume"].median())
                # Fraksi disesuaikan untuk sesi pagi-siang (sekitar 50% dari total volume harian)
                if base_daily > 0: return vol_today_cut / (base_daily * 0.50) # <-- PERUBAHAN KEDUA
        except Exception: pass
    return np.nan

# --------------- MAIN ----------------
SUMMARY = []
files = sorted(FOLDER_1M.glob("*.csv"))
print(f"[INFO] Ditemukan {len(files)} file 1m.")

for fp in files:
    ticker = fp.stem
    try:
        df = pd.read_csv(fp, low_memory=False, parse_dates=["Datetime"])
        df["Datetime"] = to_jkt(df["Datetime"])
        for c in ("Open", "High", "Low", "Close", "Volume"):
            df[c] = pd.to_numeric(df.get(c), errors="coerce")

        work_date = pick_work_date(df["Datetime"], TODAY)
        if work_date is None: continue
        
        # MODIFIKASI: Filter data 1 menit hanya sampai CUTOFF_TIME
        df_dwork_full = df[df["Datetime"].dt.date == work_date].copy()
        df_dwork = df_dwork_full[df_dwork_full["Datetime"].dt.time <= CUTOFF_TIME].dropna(subset=["Close", "Volume"])
        
        if df_dwork.empty or df_dwork["Volume"].sum() == 0: continue

        df_daily = read_daily_flex(FOLDER_DAILY / f"{ticker}.csv")
        prev_close = np.nan
        if df_daily is not None:
            prev_day_data = df_daily[df_daily["Date"] < work_date]
            if not prev_day_data.empty:
                prev_close = prev_day_data.iloc[-1]["Close"]

        if pd.isna(prev_close): continue

        high_px = df_dwork['High'].max()
        low_px = df_dwork['Low'].min()
        last_px = df_dwork['Close'].iloc[-1]
        
        daily_return = (last_px / prev_close) - 1.0
        
        if not (0.01 < daily_return < 0.40): continue
        
        pace = vol_pace_robust(df, work_date, ticker)
        if not (pd.notna(pace) and pace > 1.2): continue

        daily_range = high_px - low_px
        closing_strength = (last_px - low_px) / daily_range if daily_range > 0 else 1.0

        start_time = df_dwork['Datetime'].min()
        first_5min_data = df_dwork[df_dwork['Datetime'] <= start_time + timedelta(minutes=5)]
        stable_open = float((first_5min_data["Close"] * first_5min_data["Volume"]).sum() / first_5min_data["Volume"].sum()) if not first_5min_data.empty and first_5min_data['Volume'].sum() > 0 else df_dwork['Open'].iloc[0]

        afternoon_power = (last_px / stable_open) - 1.0
        
        price_term = (1 + daily_return) * (1 + max(0, afternoon_power)) * closing_strength
        volume_term = np.log1p(min(pace, 50))
        
        score = price_term * volume_term

        SUMMARY.append({
            "ticker": ticker, "date": work_date, "score": score, "last": last_px,
            "daily_return": daily_return, "closing_strength": closing_strength,
            "afternoon_power": afternoon_power, "vol_pace": pace
        })

    except Exception:
        continue

# --------------- OUTPUT -----------------
if SUMMARY:
    df_result = pd.DataFrame(SUMMARY)
    df_result = df_result.sort_values("score", ascending=False).reset_index(drop=True)
    
    for col, fmt in {
        "score": "{:.3f}", "daily_return": "{:,.2%}", "closing_strength": "{:,.2%}",
        "afternoon_power": "{:,.2%}", "vol_pace": "{:.2f}x"
    }.items():
        if col in df_result.columns:
            df_result[col] = df_result[col].apply(lambda x: fmt.format(x) if pd.notna(x) else 'N/A')

    print(f"\n[✓] TOP CANDIDATES (SIMULASI {CUTOFF_STR})")
    display(df_result.head(30))
else:
    print(f"\n❌ Tidak ada kandidat yang lolos filter pada simulasi jam {CUTOFF_STR}.")

Hari ini: 2025-08-20, cutoff: 11:45 (SIMULASI JAM STABIL)
[INFO] Ditemukan 773 file 1m.

[✓] TOP CANDIDATES (SIMULASI 11:45)


Unnamed: 0,ticker,date,score,last,daily_return,closing_strength,afternoon_power,vol_pace
0,KBLV.JK,2025-08-19,7.412,178.0,34.85%,100.00%,39.80%,325.89x
1,MAYA.JK,2025-08-19,6.725,254.0,31.61%,100.00%,29.96%,108.73x
2,MFIN.JK,2025-08-19,5.321,1200.0,25.00%,100.00%,8.26%,190.16x
3,CSIS.JK,2025-08-19,4.981,127.0,32.29%,93.75%,2.15%,173.40x
4,MINE.JK,2025-08-19,4.888,630.0,24.75%,100.00%,24.02%,22.55x
5,BEER.JK,2025-08-19,4.737,123.0,23.00%,78.05%,25.51%,100.65x
6,PSKT.JK,2025-08-19,4.289,84.0,9.09%,100.00%,0.00%,82.89x
7,BMAS.JK,2025-08-19,4.101,730.0,9.77%,100.00%,0.00%,40.92x
8,ASLC.JK,2025-08-19,4.095,85.0,19.72%,73.68%,18.06%,139.73x
9,DFAM.JK,2025-08-19,4.082,54.0,1.89%,100.00%,1.89%,100.30x


## Simulation

### Simulation Time Based 9.30

In [15]:
# === BELI 09:30, JUAL DI TUTUP HARI ===
# nominal rata Rp1.000.000 per ticker (IDX lot 100), fee round-trip 0.30%

import pandas as pd, numpy as np
from pathlib import Path

# ---------------- CONFIG ----------------
FOLDER_1M = Path("emiten/cache_1m")
TICKERS   = ["ACST.JK","SAPX.JK","JKON.JK","MFIN.JK","UANG.JK",
             "SOSS.JK","TAYS.JK","PNBS.JK","BWPT.JK","DATA.JK"]
TRADE_DATE  = "2025-08-20"
ENTRY_TIME  = "09:30"        # beli di bar >= 09:30
CLOSE_TIME  = "14:55"        # ambil bar terakhir hingga 14:55–15:00
CAPITAL_PER_TICKER = 1_000_000
ROUND_TRIP_FEE = 0.003       # 0.30% (beli+jual)
IDX_LOT = 100
TZ = "Asia/Jakarta"

# --------------- UTIL -------------------
def to_jkt(series: pd.Series) -> pd.Series:
    s = pd.to_datetime(series, errors="coerce")
    s = s.dt.tz_localize(TZ) if s.dt.tz is None else s.dt.tz_convert(TZ)
    return s

def load_1m(tik: str) -> pd.DataFrame:
    fp = (FOLDER_1M / tik).with_suffix(".JK.csv")
    df = pd.read_csv(fp)
    df["Datetime"] = to_jkt(df["Datetime"])
    df = df.set_index("Datetime").sort_index()
    return df[["Open","High","Low","Close"]]

def bar_at_or_after(df: pd.DataFrame, dt):
    ix = df.index.searchsorted(dt)
    if ix >= len(df): return None, None
    return df.index[ix], float(df.iloc[ix]["Close"])   # eksekusi konservatif di Close bar tsb

def last_close(df: pd.DataFrame, date_str: str):
    s = pd.Timestamp(f"{date_str} 09:00", tz=TZ)
    e = pd.Timestamp(f"{date_str} {CLOSE_TIME}", tz=TZ)
    dfd = df.loc[(df.index>=s) & (df.index<=e)]
    if dfd.empty: return None, None
    return dfd.index[-1], float(dfd.iloc[-1]["Close"])

def lot_qty(price, capital):
    raw = np.floor(capital / price)
    lots = int(raw // IDX_LOT * IDX_LOT)
    return max(lots, 0)

# --------------- CORE -------------------
rows = []
for tik in TICKERS:
    try:
        df = load_1m(tik)
    except Exception as e:
        rows.append({"ticker": tik, "status": f"DATA_ERROR: {e}"})
        continue

    e_dt = pd.Timestamp(f"{TRADE_DATE} {ENTRY_TIME}", tz=TZ)
    x_dt = pd.Timestamp(f"{TRADE_DATE} {CLOSE_TIME}", tz=TZ)

    ent_dt, ent_px = bar_at_or_after(df, e_dt)
    cls_dt, cls_px = last_close(df, TRADE_DATE)

    if ent_dt is None or cls_dt is None:
        rows.append({"ticker": tik, "status": "NO_BARS"})
        continue

    qty = lot_qty(ent_px, CAPITAL_PER_TICKER)
    if qty == 0:
        rows.append({"ticker": tik, "status": "CAPITAL_TOO_SMALL_FOR_LOT", "entry_price": round(ent_px,2)})
        continue

    buy_val  = ent_px * qty
    sell_val = cls_px * qty
    fee = ROUND_TRIP_FEE * (buy_val + sell_val)
    gross = (cls_px - ent_px) * qty
    net   = gross - fee
    ret_pct = net / buy_val if buy_val>0 else np.nan

    rows.append({
        "ticker": tik,
        "entry_time": ent_dt.strftime("%H:%M"),
        "entry_price": round(ent_px,2),
        "exit_time": cls_dt.strftime("%H:%M"),
        "exit_price": round(cls_px,2),
        "qty": int(qty),
        "notional_buy": round(buy_val,0),
        "fee": round(fee,0),
        "gross_pnl": round(gross,0),
        "net_pnl": round(net,0),
        "net_return_%": round(ret_pct*100,2),
        "status": "OK"
    })

result = pd.DataFrame(rows)
# urutkan sesuai input
result["__order"] = result["ticker"].map({t:i for i,t in enumerate(TICKERS)})
result = result.sort_values("__order").drop(columns="__order")

# total
valid = result[result["status"]=="OK"].copy()
total_capital = len(valid) * CAPITAL_PER_TICKER
total_net_pnl = int(valid["net_pnl"].sum()) if not valid.empty else 0
avg_return_pct = round(valid["net_return_%"].mean(), 2) if not valid.empty else 0.0

print("=== Hasil per-ticker ===")
print(result.to_string(index=False))
print("\n=== TOTAL ===")
print(f"Tickers traded : {len(valid)} dari {len(TICKERS)}")
print(f"Total capital  : Rp{total_capital:,.0f}")
print(f"Total NET PnL  : Rp{total_net_pnl:,.0f}")
print(f"Avg return/tkr : {avg_return_pct}%")

# (opsional) simpan ke CSV
# result.to_csv("pnl_0930_to_close_2025-08-20.csv", index=False)


=== Hasil per-ticker ===
 ticker entry_time  entry_price exit_time  exit_price   qty  notional_buy    fee  gross_pnl  net_pnl  net_return_% status
ACST.JK      09:30        119.0     14:53       121.0  8400      999600.0 6048.0    16800.0  10752.0          1.08     OK
SAPX.JK      09:30        360.0     14:55       368.0  2700      972000.0 5897.0    21600.0  15703.0          1.62     OK
JKON.JK      09:30        107.0     14:55       105.0  9300      995100.0 5915.0   -18600.0 -24515.0         -2.46     OK
MFIN.JK      09:31       1500.0     14:52      1500.0   600      900000.0 5400.0        0.0  -5400.0         -0.60     OK
UANG.JK      09:30        660.0     14:54       660.0  1500      990000.0 5940.0        0.0  -5940.0         -0.60     OK
SOSS.JK      09:32        675.0     14:28       675.0  1400      945000.0 5670.0        0.0  -5670.0         -0.60     OK
TAYS.JK      09:30         53.0     14:55        65.0 18800      996400.0 6655.0   225600.0 218945.0         21.97     OK

### Simulation Time Flight or Hold 9.30 + 14.30

In [17]:
# === Time-based switch strategy ===
# 1. Buy tickers at 09:30 cutoff
# 2. If not in 14:15 cutoff, sell at 14:15 bar
# 3. Buy new tickers at 14:15 cutoff (not in 09:30)
# 4. Close everything at 14:55

import pandas as pd, numpy as np
from pathlib import Path

FOLDER_1M = Path("emiten/cache_1m")
TZ = "Asia/Jakarta"
CAPITAL_PER_TICKER = 1_000_000
ROUND_TRIP_FEE = 0.003
IDX_LOT = 100
TRADE_DATE = "2025-08-20"
ENTRY_0930 = "09:30"
CHECK_1415 = "14:15"
CLOSE_TIME = "14:55"

tickers_0930 = ["ACST.JK","SAPX.JK","JKON.JK","MFIN.JK","UANG.JK",
                "SOSS.JK","TAYS.JK","PNBS.JK","BWPT.JK","DATA.JK"]
tickers_1415 = ["TAYS.JK","ACST.JK","CENT.JK","LPCK.JK","SAPX.JK",
                "JARR.JK","GZCO.JK","JKON.JK","MFIN.JK","PSKT.JK"]

def to_jkt(series): 
    s = pd.to_datetime(series, errors="coerce")
    return s.dt.tz_localize(TZ) if s.dt.tz is None else s.dt.tz_convert(TZ)

def load_1m(tik):
    fp = FOLDER_1M / f"{tik}.csv"
    df = pd.read_csv(fp)
    df["Datetime"] = to_jkt(df["Datetime"])
    df = df.set_index("Datetime").sort_index()
    return df[["Open","High","Low","Close"]]

def bar_price(df, dt):
    ix = df.index.searchsorted(dt)
    if ix >= len(df): return None
    return float(df.iloc[ix]["Close"])

def lot_qty(price, capital=CAPITAL_PER_TICKER):
    raw = np.floor(capital/price)
    lots = int(raw // IDX_LOT * IDX_LOT)
    return max(lots,0)

rows=[]
all_tickers = set(tickers_0930) | set(tickers_1415)

for tik in sorted(all_tickers):
    try:
        df = load_1m(tik)
    except: 
        rows.append({"ticker":tik,"status":"NO DATA"}); continue

    e0930 = pd.Timestamp(f"{TRADE_DATE} {ENTRY_0930}", tz=TZ)
    e1415 = pd.Timestamp(f"{TRADE_DATE} {CHECK_1415}", tz=TZ)
    eclose= pd.Timestamp(f"{TRADE_DATE} {CLOSE_TIME}", tz=TZ)

    # harga-harga
    p0930 = bar_price(df, e0930)
    p1415 = bar_price(df, e1415)
    pclose= bar_price(df, eclose)
    if p0930 is None or p1415 is None or pclose is None:
        rows.append({"ticker":tik,"status":"MISSING PRICES"}); continue

    result = {"ticker":tik, "status":"OK"}
    qty=0; buy_val=0; pnl=0; fee=0

    if tik in tickers_0930:
        qty = lot_qty(p0930)
        buy_val = p0930*qty
        # jika masih muncul 14:15, hold sampai close
        if tik in tickers_1415:
            gross = (pclose - p0930)*qty
            fee   = ROUND_TRIP_FEE*(buy_val+pclose*qty)
            pnl   = gross - fee
            result.update(entry="09:30",exit="Close",entry_px=p0930,exit_px=pclose,
                          qty=qty,net_pnl=round(pnl,0))
        else:
            # jual di 14:15
            gross = (p1415 - p0930)*qty
            fee   = ROUND_TRIP_FEE*(buy_val+p1415*qty)
            pnl   = gross - fee
            result.update(entry="09:30",exit="14:15",entry_px=p0930,exit_px=p1415,
                          qty=qty,net_pnl=round(pnl,0))
    if tik in tickers_1415 and tik not in tickers_0930:
        qty2=lot_qty(p1415)
        buy_val2=p1415*qty2
        gross2=(pclose - p1415)*qty2
        fee2=ROUND_TRIP_FEE*(buy_val2+pclose*qty2)
        pnl2=gross2-fee2
        result.update(entry="14:15",exit="Close",entry_px=p1415,exit_px=pclose,
                      qty=qty2,net_pnl=round(pnl2,0))
    rows.append(result)

res=pd.DataFrame(rows)
print(res)
print("Total PnL:", res["net_pnl"].fillna(0).sum())


     ticker status  entry   exit  entry_px  exit_px    qty   net_pnl
0   ACST.JK     OK  09:30  Close     119.0    121.0   8400   10752.0
1   BWPT.JK     OK  09:30  14:15     145.0    170.0   6800  163574.0
2   CENT.JK     OK  14:15  Close     135.0    135.0   7400   -5994.0
3   DATA.JK     OK  09:30  14:15    2340.0   2360.0    400    2360.0
4   GZCO.JK     OK  14:15  Close     143.0    138.0   6900  -40317.0
5   JARR.JK     OK  14:15  Close    1005.0   1005.0    900   -5427.0
6   JKON.JK     OK  09:30  Close     107.0    105.0   9300  -24515.0
7   LPCK.JK     OK  14:15  Close     615.0    620.0   1600    2072.0
8   MFIN.JK     OK  09:30  Close    1500.0   1500.0    600   -5400.0
9   PNBS.JK     OK  09:30  14:15      50.0     50.0  20000   -6000.0
10  PSKT.JK     OK  14:15  Close      92.0     92.0  10800   -5962.0
11  SAPX.JK     OK  09:30  Close     360.0    368.0   2700   15703.0
12  SOSS.JK     OK  09:30  14:15     675.0    675.0   1400   -5670.0
13  TAYS.JK     OK  09:30  Close  

### Top Gainer Generator

In [12]:
import pandas as pd
from pathlib import Path

# ===================== CONFIG =====================
FOLDER_DAILY = Path("emiten/cache_daily")
LAST_N_DAYS  = 100             # ambil 9 HARI TERAKHIR
TOP_N_PER_DAY = 10           # berapa banyak top gainer tiap hari
MIN_PCT_FILTER = None        # contoh: 0.20 untuk >20%; None = tanpa filter
TICKER_SUFFIX = ".JK"        # tambahkan jika nama file tanpa suffix

# ==================== LOADER ======================
def load_daily_csv(fp: Path) -> pd.DataFrame:
    df = pd.read_csv(fp)
    # Normalisasi kolom tanggal
    date_col = None
    for c in ["Date", "Datetime", "date", "DATE", "timestamp", "Timestamp"]:
        if c in df.columns:
            date_col = c
            break
    if date_col is None:
        raise ValueError(f"Tidak ada kolom tanggal di {fp.name}")
    df["Date"] = pd.to_datetime(df[date_col], errors="coerce").dt.date
    # Pastikan kolom Close ada
    close_col = "Close" if "Close" in df.columns else "close"
    if close_col not in df.columns:
        raise ValueError(f"Tidak ada kolom Close di {fp.name}")
    out = df[["Date", close_col]].rename(columns={close_col: "Close"}).dropna()
    out = out.sort_values("Date")
    return out

# ================== KUMPUL DATA ===================
all_frames = []
for fp in FOLDER_DAILY.glob("*.csv"):
    try:
        base = fp.stem  # contoh: ACST.JK atau ACST
        ticker = base if base.endswith(TICKER_SUFFIX) else f"{base}{TICKER_SUFFIX}"
        d = load_daily_csv(fp)
        d.insert(0, "ticker", ticker)
        all_frames.append(d)
    except Exception as e:
        print(f"[SKIP] {fp.name}: {e}")

if not all_frames:
    raise SystemExit("Tidak ada data yang berhasil dibaca dari cache_daily.")

df = pd.concat(all_frames, ignore_index=True)

# =========== HITUNG DAILY RETURN PER TICKER ===========
# Return = Close / Close_{t-1} - 1 (per ticker)
df = df.sort_values(["ticker", "Date"])
df["prev_close"] = df.groupby("ticker")["Close"].shift(1)
df["daily_return"] = (df["Close"] / df["prev_close"]) - 1

# Buang baris pertama per ticker (tidak ada prev_close)
df = df.dropna(subset=["prev_close"])

# Ambil 9 HARI KALENDER TERAKHIR YANG ADA DI DATA
last_days = sorted(df["Date"].unique())[-LAST_N_DAYS:]

df_last = df[df["Date"].isin(last_days)].copy()

# (Opsional) filter minimal persentase kenaikan harian
if MIN_PCT_FILTER is not None:
    df_last = df_last[df_last["daily_return"] >= MIN_PCT_FILTER].copy()

# Urutkan dari terbesar
df_last = df_last.sort_values(["Date", "daily_return"], ascending=[True, False])

# ================== OUTPUT GABUNGAN ===================
# Tabel gabungan semua hari (urut per hari, return desc)
df_last["return_%"] = (df_last["daily_return"] * 100).round(2)
df_last["Close"] = df_last["Close"].round(2)
df_last["prev_close"] = df_last["prev_close"].round(2)
cols = ["Date", "ticker", "prev_close", "Close", "return_%"]
daily_gainers_all = df_last[cols].reset_index(drop=True)

print("=== Top Gainer Harian (gabungan 9 hari terakhir) ===")
print(daily_gainers_all.to_string(index=False))

# =========== RINGKASAN: TOP N PER HARI ===========
def topn_per_day(df_input, topn=TOP_N_PER_DAY):
    out = []
    for d, grp in df_input.groupby("Date"):
        g = grp.sort_values("return_%", ascending=False).head(topn)
        g = g.assign(rank=range(1, len(g)+1))
        out.append(g)
    return pd.concat(out, ignore_index=True) if out else pd.DataFrame(columns=cols+["rank"])

daily_gainers_topn = topn_per_day(daily_gainers_all, TOP_N_PER_DAY)

print("\n=== Ringkasan TOP N per hari ===")
print(daily_gainers_topn[["Date","rank","ticker","prev_close","Close","return_%"]].to_string(index=False))

# (Opsional) simpan ke CSV
# daily_gainers_all.to_csv("top_gainers_9hari_all.csv", index=False)
daily_gainers_topn.to_csv("simulation/top_gainers_9hari_topN.csv", index=False)


=== Top Gainer Harian (gabungan 9 hari terakhir) ===
      Date  ticker  prev_close     Close  return_%
2025-03-11 MINA.JK      189.00    254.00     34.39
2025-03-11 RELI.JK      354.00    442.00     24.86
2025-03-11 MINE.JK      270.00    336.00     24.44
2025-03-11 BAIK.JK       78.00     97.00     24.36
2025-03-11 SONA.JK     3750.00   4600.00     22.67
2025-03-11 BTEK.JK        6.00      7.00     16.67
2025-03-11 JGLE.JK        6.00      7.00     16.67
2025-03-11 NAIK.JK      555.00    630.00     13.51
2025-03-11 RDTX.JK    12100.00  13475.00     11.36
2025-03-11 BESS.JK      900.00    990.00     10.00
2025-03-11 GLOB.JK       90.00     99.00     10.00
2025-03-11 KBAG.JK       20.00     22.00     10.00
2025-03-11 KLIN.JK      140.00    154.00     10.00
2025-03-11 REAL.JK       10.00     11.00     10.00
2025-03-11 SOSS.JK      340.00    374.00     10.00
2025-03-11 UDNG.JK       50.00     55.00     10.00
2025-03-11 DCII.JK   169950.00 186925.00      9.99
2025-03-11 KSIX.JK      202.0

### Simulasi Rekomendasi vs Top Gainer

In [2]:
# ONE-CELL ANALYSIS: Rekom 09:30 vs Top Gainer (>20%)
# - Scan semua file: rekomendasi/bsjp_rekomendasi_*.csv  (RELATIF ke notebook)
# - Merge dengan simulation/top_gainers_9hari_topN.csv
# - Hitung precision & recall per hari + overall

import pandas as pd
from pathlib import Path
from typing import List
from IPython.display import display

# ================= CONFIG =================
REKOM_GLOB = "rekomendasi/bsjp_rekomendasi_*.csv"   # folder relatif
TOP_GAINERS_PATH = "simulation/top_gainers_9hari_topN.csv"
TICKER_SUFFIX = ".JK"   # pastikan konsisten (biarkan .JK)

# ============== HELPERS ===================
def _norm_date(col):
    dt = pd.to_datetime(col, errors="coerce")
    return dt.dt.date

def load_rekom_files(glob_pattern: str) -> pd.DataFrame:
    files = sorted(Path().glob(glob_pattern))   # <- PAKAI RELATIF, BUKAN Path("/").glob(...)
    frames: List[pd.DataFrame] = []
    for fp in files:
        try:
            df = pd.read_csv(fp)
            df.columns = [c.strip().lower() for c in df.columns]
            if "ticker" not in df.columns:
                continue
            # date: pakai kolom kalau ada; jika tidak, infer dari nama file: bsjp_rekomendasi_YYYY-MM-DD.csv
            if "date" in df.columns:
                df["date"] = _norm_date(df["date"])
            else:
                stem = fp.stem  # bsjp_rekomendasi_2025-08-12
                d = pd.to_datetime(stem.split("_")[-1], errors="coerce")
                df["date"] = d.date() if pd.notna(d) else pd.NaT

            df["ticker"] = df["ticker"].astype(str).str.upper()

            # Pastikan suffix .JK tidak hilang / double
            if TICKER_SUFFIX:
                df["ticker"] = df["ticker"].str.replace(f"{TICKER_SUFFIX}$", "", regex=True) + TICKER_SUFFIX

            frames.append(df[["date","ticker"]].dropna())
        except Exception as e:
            print(f"[SKIP] {fp}: {e}")

    if not frames:
        return pd.DataFrame(columns=["date","ticker"])

    out = pd.concat(frames, ignore_index=True)
    out = (out.dropna(subset=["date","ticker"])
              .drop_duplicates(subset=["date","ticker"])
              .sort_values(["date","ticker"]))
    return out

def load_top_gainers(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    df.columns = [c.strip().lower() for c in df.columns]

    # normalisasi nama kolom standar
    if "date" not in df.columns and "Date" in df.columns:
        df.rename(columns={"Date": "date"}, inplace=True)
    if "ticker" not in df.columns and "Ticker" in df.columns:
        df.rename(columns={"Ticker": "ticker"}, inplace=True)

    df["date"] = _norm_date(df["date"])
    df["ticker"] = df["ticker"].astype(str).str.upper()
    if TICKER_SUFFIX:
        df["ticker"] = df["ticker"].str.replace(f"{TICKER_SUFFIX}$", "", regex=True) + TICKER_SUFFIX

    # standarkan kolom return jika ada
    if "return_%" not in df.columns and "return_pct" in df.columns:
        df.rename(columns={"return_pct": "return_%"}, inplace=True)

    keep = ["date","ticker"]
    for extra in ["return_%","prev_close","close","rank"]:
        if extra in df.columns: keep.append(extra)
    df = (df[keep]
            .dropna(subset=["date","ticker"])
            .drop_duplicates(subset=["date","ticker"])
            .sort_values(["date","ticker"]))
    return df

# ============== MAIN PIPELINE ==============
rekom = load_rekom_files(REKOM_GLOB)
gainers = load_top_gainers(TOP_GAINERS_PATH)

# gabungan: semua rekom → cek apakah termasuk top gainer hari itu
merged = rekom.merge(gainers[["date","ticker"]], on=["date","ticker"], how="left", indicator=True)
merged["status"] = merged["_merge"].map({
    "both":"Overlap",
    "left_only":"False Positive",
    "right_only":"(n/a)"
}).astype("string")
merged = merged.drop(columns=["_merge"]).sort_values(["date","ticker"]).reset_index(drop=True)

# precision per hari (proporsi overlap di antara rekom hari itu)
daily_prec = (
    merged.groupby("date")
          .agg(
              rekom_count=("ticker", "size"),
              overlap=("status", lambda s: (s == "Overlap").sum())
          )
          .reset_index()
          .sort_values("date")
)
daily_prec["precision_%"] = (
    (daily_prec["overlap"] / daily_prec["rekom_count"])
    .fillna(0).mul(100).round(1)
)


# recall per hari (berapa banyak top gainer hari itu yang tertangkap rekomendasi)
tg_daily = gainers.groupby("date")["ticker"].nunique().reset_index(name="topgainer_count")
overlap_daily = (merged[merged["status"]=="Overlap"]
                 .groupby("date")["ticker"]
                 .nunique()
                 .reset_index(name="overlap_unique"))

daily = (daily_prec
         .merge(tg_daily, on="date", how="left")
         .merge(overlap_daily, on="date", how="left"))
daily["overlap_unique"] = daily["overlap_unique"].fillna(0).astype(int)
daily["topgainer_count"] = daily["topgainer_count"].fillna(0).astype(int)
daily["recall_%"] = daily.apply(
    lambda r: round((r["overlap_unique"]/r["topgainer_count"]*100),1) if r["topgainer_count"]>0 else 0.0,
    axis=1
)

# overall metrics
total_rekom = int(len(merged))
total_overlap = int((merged["status"]=="Overlap").sum())
overall_precision = round((total_overlap/total_rekom*100), 1) if total_rekom else 0.0

total_topgainer_daysum = int(tg_daily["topgainer_count"].sum()) if not tg_daily.empty else 0
total_overlap_daysum = int(overlap_daily["overlap_unique"].sum()) if not overlap_daily.empty else 0
overall_recall = round((total_overlap_daysum/total_topgainer_daysum*100), 1) if total_topgainer_daysum else 0.0

summary_overall = pd.DataFrame([{
    "days": int(daily.shape[0]),
    "rekom_total": total_rekom,
    "overlap_total": total_overlap,
    "overall_precision_%": overall_precision,
    "topgainer_total(uniq/day sum)": total_topgainer_daysum,
    "overlap_unique(day sum)": total_overlap_daysum,
    "overall_recall_%": overall_recall
}])

# ============== DISPLAY ====================
print("=== Detail Join (head) ===")
display(merged.head(30))
print("\n=== Metrics per Hari ===")
display(daily)
print("\n=== Summary Overall ===")
display(summary_overall)

# (opsional) simpan
merged.to_csv("simulation/out_0930_vs_topgainer_detail.csv", index=False)
daily.to_csv("simulation/out_0930_vs_topgainer_metrics_per_hari.csv", index=False)
summary_overall.to_csv("simulation/out_0930_vs_topgainer_summary.csv", index=False)


[SKIP] rekomendasi/bsjp_rekomendasi_2025-07-28.csv: No columns to parse from file
=== Detail Join (head) ===


Unnamed: 0,date,ticker,status
0,2025-07-23,AMAR.JK,Overlap
1,2025-07-23,ARGO.JK,Overlap
2,2025-07-23,ARTA.JK,Overlap
3,2025-07-23,BKSW.JK,False Positive
4,2025-07-23,DCII.JK,Overlap
5,2025-07-23,IMAS.JK,Overlap
6,2025-07-23,IMJS.JK,Overlap
7,2025-07-23,MEGA.JK,False Positive
8,2025-07-23,MLBI.JK,False Positive
9,2025-07-23,RUNS.JK,False Positive



=== Metrics per Hari ===


Unnamed: 0,date,rekom_count,overlap,precision_%,topgainer_count,overlap_unique,recall_%
0,2025-07-23,10,6,60.0,10,6,60.0
1,2025-07-24,10,3,30.0,10,3,30.0
2,2025-07-25,10,5,50.0,10,5,50.0
3,2025-07-29,10,6,60.0,10,6,60.0
4,2025-07-30,10,7,70.0,10,7,70.0
5,2025-07-31,10,6,60.0,10,6,60.0
6,2025-08-01,10,4,40.0,10,4,40.0
7,2025-08-04,10,6,60.0,10,6,60.0
8,2025-08-05,10,5,50.0,10,5,50.0
9,2025-08-06,10,4,40.0,10,4,40.0



=== Summary Overall ===


Unnamed: 0,days,rekom_total,overlap_total,overall_precision_%,topgainer_total(uniq/day sum),overlap_unique(day sum),overall_recall_%
0,19,190,108,56.8,1000,108,10.8


# NEWS MARKET AGREGATOR

In [1]:
pip install requests feedparser pandas python-dateutil beautifulsoup4 lxml

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting lxml
  Downloading lxml-6.0.1-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
Downloading lxml-6.0.1-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl (5.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m7.5 MB/s[0m  [33m0:00:00[0m6m0:00:01[0m0:01[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (pyproject.toml) ... [?25ldone
[?25h  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6089 sha256=54fb3771d2919ca824ed78fd64b93a0cb8ab5460abf0ec

In [None]:
# pip install feedparser pandas python-dateutil beautifulsoup4 lxml requests
import feedparser, pandas as pd, time, hashlib, requests, re
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs
from datetime import datetime
from dateutil import tz

WIB = tz.gettz("Asia/Jakarta")
def _to_wib(struct_or_none):
    if struct_or_none:
        return datetime.fromtimestamp(time.mktime(struct_or_none), tz=WIB)
    return datetime.now(WIB)

def _clean_html(s): return BeautifulSoup(s or "", "lxml").get_text(" ", strip=True)

def _unwrap_gnews_link(link:str)->str:
    # sebagian link RSS GNews berbentuk news.google.com/... dgn ?url=<publisher>
    if "news.google." in link:
        qs = parse_qs(urlparse(link).query)
        if "url" in qs and qs["url"]:
            return qs["url"][0]
    return link

def fetch_gnews(query:str, when="1d", lang="id", country="ID"):
    # Contoh query: 'market saham OR emiten second liner site:cnbcindonesia.com OR site:kontan.co.id'
    q = f"{query} when:{when}" if when else query
    rss = f"https://news.google.com/rss/search?q={requests.utils.quote(q)}&hl={lang}&gl={country}&ceid={country}%3A{lang}"
    parsed = feedparser.parse(rss)
    rows = []
    for e in parsed.entries:
        title   = _clean_html(getattr(e,"title",""))
        summary = _clean_html(getattr(e,"summary","") or getattr(e,"description",""))
        link    = _unwrap_gnews_link(getattr(e,"link",""))
        pub     = _to_wib(getattr(e,"published_parsed", None) or getattr(e,"updated_parsed", None))
        rows.append({"source":"GoogleNews", "title":title, "summary":summary, "link":link, "published_wib":pub})
    df = pd.DataFrame(rows)
    if df.empty: return df
    # dedup by link (atau hash title+summary)
    def _key(r):
        if r["link"]: return r["link"]
        return hashlib.sha1((r["title"]+"||"+r["summary"]).encode("utf-8")).hexdigest()
    df["dedup_key"] = df.apply(_key, axis=1)
    df = df.sort_values("published_wib", ascending=False).drop_duplicates("dedup_key")
    return df[["published_wib","source","title","summary","link"]]

# CONTOH PEMAKAIAN
# Fokus media lokal + 1 hari terakhir
df = fetch_gnews(
    query="(saham hari ini OR emiten)) "
    when="1d"
)
print(len(df))
df.head(10)


3


Unnamed: 0,published_wib,source,title,summary,link
0,2025-08-27 10:47:43+07:00,GoogleNews,Rotasi Dana Investor Asing Poles Saham Lapis D...,Rotasi Dana Investor Asing Poles Saham Lapis D...,https://news.google.com/rss/articles/CBMiugFBV...
1,2025-08-27 10:05:11+07:00,GoogleNews,"Was-Was Dirut Summarecon (SMRA) Soal Pinjol, H...","Was-Was Dirut Summarecon (SMRA) Soal Pinjol, H...",https://news.google.com/rss/articles/CBMixwFBV...
2,2025-08-27 06:32:42+07:00,GoogleNews,Serangan Drone Ukraina Picu Kebakaran dan Evak...,Serangan Drone Ukraina Picu Kebakaran dan Evak...,https://news.google.com/rss/articles/CBMiqgFBV...
