In [4]:
#!/usr/bin/env python3
import pandas as pd
from pathlib import Path

def check_gdelt_folder(base_dir: str) -> None:
    """Print a quick health summary of a GDELT download folder.

    Counts the ``*.parquet`` files directly under ``raw/`` and ``filtered/``
    and the ``*.json`` files directly under ``_manifest/``, then reads every
    filtered parquet file to report the total row count, the files with zero
    rows and the files that could not be read.

    Args:
        base_dir: Folder expected to contain the ``raw``, ``filtered`` and
            ``_manifest`` sub-folders. A missing sub-folder counts as 0 files.

    Returns:
        None. The report is written to stdout.
    """
    base = Path(base_dir)
    raw_dir = base / "raw"
    flt_dir = base / "filtered"
    man_dir = base / "_manifest"

    print(f"[INFO] Checking folder: {base.resolve()}")

    # 1. Count files. The filtered listing is built once and reused for the
    #    row check below so that both steps look at the same set of files.
    # NOTE(review): every *.parquet in filtered/ is included; if aggregate
    # ("merged") files are also written there they are counted too — confirm
    # whether that is intended.
    flt_files = sorted(flt_dir.glob("*.parquet")) if flt_dir.exists() else []
    n_raw = len(list(raw_dir.glob("*.parquet"))) if raw_dir.exists() else 0
    n_man = len(list(man_dir.glob("*.json"))) if man_dir.exists() else 0
    print(f"  Raw files: {n_raw}")
    print(f"  Filtered files: {len(flt_files)}")
    print(f"  Manifest files: {n_man}")

    if not flt_files:
        print("  (no filtered files to check)")
        return

    # 2. Load every filtered file and record its row count. Failures are kept
    #    separately so they can be reported instead of being silently dropped.
    total_rows = 0
    empty_files = []
    unreadable = []  # (file name, error message)
    for p in flt_files:
        try:
            n_rows = len(pd.read_parquet(p))
        except Exception as e:
            unreadable.append((p.name, str(e)))
            continue
        total_rows += n_rows
        if n_rows == 0:
            empty_files.append(p.name)

    print(f"  Total rows in filtered: {total_rows}")
    print(f"  Empty filtered files: {len(empty_files)}")
    if empty_files:
        print("   → " + ", ".join(empty_files[:10]) + (" ..." if len(empty_files) > 10 else ""))
    if unreadable:
        print(f"  [WARN] Unreadable filtered files: {len(unreadable)}")
        # Show only the first few so a badly broken folder does not flood the output.
        for name, err in unreadable[:10]:
            print(f"   → {name}: {err}")
        if len(unreadable) > 10:
            print("   → ...")


# Example run
check_gdelt_folder("../../work/data/gdelt_full_hardfix_split")


[INFO] Checking folder: /home/kylh/phd/tw_fin_rl/work/data/gdelt_full_hardfix_split
  Raw files: 7400
  Filtered files: 7402
  Manifest files: 7400
  Total rows in filtered: 210769
  Empty filtered files: 1354
   → gdelt_filtered_BNB_2020-08-12_0900_1200_134f6fc592247854.parquet, gdelt_filtered_BNB_2020-08-13_0900_1200_726b3689988d0ded.parquet, gdelt_filtered_BNB_2020-08-14_0900_1200_61116ddafbae152e.parquet, gdelt_filtered_BNB_2020-08-15_0900_1200_af6e51e65c7989d9.parquet, gdelt_filtered_BNB_2020-08-19_0900_1200_1415df204170dc7b.parquet, gdelt_filtered_BNB_2020-08-22_0900_1200_402b59824b3b077c.parquet, gdelt_filtered_BNB_2020-08-23_0900_1200_a99747c1bbdb9e89.parquet, gdelt_filtered_BNB_2020-08-24_0900_1200_bd6d4d7686c327fe.parquet, gdelt_filtered_BNB_2020-08-28_0900_1200_5c05ad5a1032fe99.parquet, gdelt_filtered_BNB_2020-08-29_0900_1200_7159e84c0ad99880.parquet ...


In [6]:
#!/usr/bin/env python3
"""
Quick check for MERGED parquet files:
- Automatically finds files that have 'merged' in their name or live in a 'merged/' directory
- Counts rows without loading the whole file (uses pyarrow ParquetFile)
- Prints statistics: number of files, total rows, number of empty files, and the names of a few empty files

Usage:
  python check_merged_parquet.py /path/to/base_dir
  # optional: specify a custom glob
  python check_merged_parquet.py /data --glob "**/merged/*.parquet"
"""
import sys
from pathlib import Path

try:
    import pyarrow.parquet as pq
except ImportError:
    print("[ERROR] Please install pyarrow: pip install pyarrow", file=sys.stderr)
    sys.exit(1)

def count_rows_parquet(fp: Path) -> int:
    """Return the number of rows in a parquet file using only its metadata.

    The per-row-group row counts from the file metadata are added up, so the
    column data itself is not loaded.

    Args:
        fp: Path of the parquet file.

    Returns:
        The total row count, or ``-1`` if the file could not be read (a
        warning naming the file is printed in that case).
    """
    try:
        metadata = pq.ParquetFile(fp).metadata
        n_rows = 0
        for group_idx in range(metadata.num_row_groups):
            n_rows += metadata.row_group(group_idx).num_rows
    except Exception as e:
        print(f"[WARN] Can't read {fp}: {e}")
        return -1  # sentinel for "unreadable"
    return n_rows

def looks_merged(p: Path) -> bool:
    """Tell whether a path refers to a "merged" file.

    A path qualifies when its file name contains ``merged`` or when any of its
    path components is exactly ``merged``; both checks ignore case.

    Args:
        p: Path to inspect.

    Returns:
        True if either check matches, False otherwise.
    """
    if "merged" in p.name.lower():
        return True
    for component in p.parts:
        if component.lower() == "merged":
            return True
    return False

def find_merged_files(base: Path, custom_glob: str | None) -> list[Path]:
    if custom_glob:
        return sorted(base.glob(custom_glob))
    # Mặc định: hai pattern phổ biến
    found = set()
    for pat in ("**/*merged*.parquet", "**/merged/*.parquet"):
        for fp in base.glob(pat):
            found.add(fp)
    # Fallback: toàn bộ parquet rồi lọc bằng looks_merged
    if not found:
        found = {p for p in base.glob("**/*.parquet") if looks_merged(p)}
    return sorted(found)

def main():
    """Command-line entry point: summarise the merged parquet files.

    Reads ``sys.argv``: the first argument is the base directory; an optional
    ``--glob <pattern>`` pair in the next two positions overrides the default
    search patterns. Prints the number of merged files, their total row count,
    the number of empty files (listing at most 30 names) and, if any, the
    number of unreadable files.

    Raises:
        SystemExit: with status 1 when no base directory is given.
    """
    if len(sys.argv) < 2:
        print("Usage: python check_merged_parquet.py <base_dir> [--glob \"**/merged/*.parquet\"]")
        sys.exit(1)

    base = Path(sys.argv[1]).expanduser().resolve()
    custom_glob = None
    if len(sys.argv) >= 4 and sys.argv[2] == "--glob":
        custom_glob = sys.argv[3]

    merged_files = find_merged_files(base, custom_glob)
    print(f"[INFO] Checking MERGED under: {base}")
    print(f"  Merged files: {len(merged_files)}")

    max_listed = 30  # cap on how many empty file names are printed
    total_rows = 0
    n_empty = 0      # true number of empty files, independent of the display cap
    empty_names = []
    unreadable = 0

    for fp in merged_files:
        rows = count_rows_parquet(fp)
        if rows < 0:  # negative row count marks an unreadable file
            unreadable += 1
            continue
        total_rows += rows
        if rows == 0:
            n_empty += 1
            if len(empty_names) < max_listed:
                empty_names.append(fp.name)

    print(f"  Total rows in merged: {total_rows}")
    print(f"  Empty merged files: {n_empty}")
    if empty_names:
        # Add the ellipsis only when some names were actually left out.
        print("   → " + ", ".join(empty_names) + (" ..." if n_empty > len(empty_names) else ""))
    if unreadable:
        print(f"  [WARN] Unreadable merged files: {unreadable}")

import sys

# Simulate the command line:
# python check_merged_parquet.py /home/kylh/phd/tw_fin_rl/work/data/gdelt_full_hardfix_split
sys.argv = [
    "check_merged_parquet.py",
    "/home/kylh/phd/tw_fin_rl/work/data/gdelt_full_hardfix_split"
]

# Call main() from inside the notebook
main()



[INFO] Checking MERGED under: /home/kylh/phd/tw_fin_rl/work/data/gdelt_full_hardfix_split
  Merged files: 2
  Total rows in merged: 160804
  Empty merged files: 0


In [7]:
import pandas as pd
from pathlib import Path

def analyze_merged(base_dir: str, glob_pat: str = "**/*merged*.parquet") -> tuple[dict, pd.DataFrame]:
    """Load merged parquet files and summarise the articles per token.

    Every file under ``base_dir`` matching ``glob_pat`` is read with pandas and
    the frames are concatenated. Rows are grouped by the ``symbol`` column when
    it exists; otherwise a fixed token list is matched case-insensitively
    against the ``text`` column.

    Args:
        base_dir: Root folder to search (``~`` is expanded).
        glob_pat: Glob pattern, relative to ``base_dir``, selecting the files.

    Returns:
        ``(results, data)``. ``results`` maps each token that has at least one
        row to a dict with the keys ``n_articles``, ``time_range``,
        ``sources``, ``langs`` and ``sample_titles``; a value is ``None`` when
        the column it is derived from is missing. ``data`` is the concatenated
        frame. If no file could be loaded the result is ``({}, <empty
        DataFrame>)`` so callers can always unpack two values.
    """
    base = Path(base_dir).expanduser().resolve()
    # NOTE(review): every file matched by glob_pat is concatenated into one
    # frame; if the pattern matches overlapping aggregates the same article
    # may be counted more than once — confirm the pattern selects what you want.
    files = sorted(base.glob(glob_pat))

    dfs = []
    for fp in files:
        print(f"[INFO] Loading {fp} ...")
        try:
            dfs.append(pd.read_parquet(fp))
        except Exception as e:
            print(f"  [WARN] Can't read {fp}: {e}")
    if not dfs:
        # Keep the return shape identical to the normal path; returning None
        # here made `stats, df = analyze_merged(...)` fail with a TypeError.
        print(f"  [WARN] No readable files matched {glob_pat!r} under {base}")
        return {}, pd.DataFrame()

    data = pd.concat(dfs, ignore_index=True)

    # Determine the tokens from the 'symbol' column when it is available
    has_symbol = "symbol" in data.columns
    if has_symbol:
        tokens = data["symbol"].unique().tolist()
    elif "text" in data.columns:
        # Fallback: guess from the article text
        tokens = ["BTC", "ETH", "BNB", "SOL"]
    else:
        # Nothing to group by; report it rather than raising a bare KeyError.
        print("  [WARN] Neither 'symbol' nor 'text' column found; no per-token stats")
        return {}, data

    results = {}
    for token in tokens:
        if has_symbol:
            sub = data[data["symbol"] == token]
        else:
            # Case-insensitive pattern search anywhere in the text
            mask = data["text"].str.contains(token, case=False, na=False)
            sub = data[mask]

        if sub.empty:
            continue

        results[token] = {
            "n_articles": len(sub),
            "time_range": (sub["date"].min(), sub["date"].max()) if "date" in sub.columns else None,
            "sources": sub["source"].nunique() if "source" in sub.columns else None,
            "langs": sub["lang"].value_counts().head(5).to_dict() if "lang" in sub.columns else None,
            "sample_titles": sub["title"].dropna().head(3).tolist() if "title" in sub.columns else None,
        }
    return results, data

# Run the analysis and pretty-print the per-token statistics
stats, df_all = analyze_merged("/home/kylh/phd/tw_fin_rl/work/data/gdelt_full_hardfix_split")

import pprint
pprint.pprint(stats)


[INFO] Loading /home/kylh/phd/tw_fin_rl/work/data/gdelt_full_hardfix_split/filtered/gdelt_filtered_merged_2020-08-11_2025-09-03.parquet ...
[INFO] Loading /home/kylh/phd/tw_fin_rl/work/data/gdelt_full_hardfix_split/filtered/gdelt_raw_merged_2020-08-11_2025-09-03.parquet ...
{'BNB': {'langs': None,
         'n_articles': 3266,
         'sample_titles': ['Binance Cryptocurrency Auto Trading App',
                           'Elliptic adds BNB to its blockchain analytics '
                           'platform',
                           'US 1844907O588 Binance ! # AK56 #! ~recognizes '
                           'importances Customer of Awareness ddsSS - '
                           'Minnesota Twins Talk'],
         'sources': None,
         'time_range': (datetime.date(2020, 8, 16), datetime.date(2025, 9, 3))},
 'BTC': {'langs': None,
         'n_articles': 24930,
         'sample_titles': ['Bitcoin Breaks All - Time High Against Argentine '
                           'Peso , Turkish Lir