In [1]:
# ============================================================
# 001_arxiv_new_feed
# ============================================================
#
# Overview
# --------
# This notebook is responsible for fetching and normalizing newly published
# papers from arXiv on a periodic basis.
#
# The primary goal is to build a lightweight, reproducible "research intake"
# layer that continuously monitors arXiv categories of interest and converts
# raw feed data into a structured format suitable for downstream processing
# (e.g., summarization, classification, note-taking, and slide generation).
#
# This notebook is intentionally scoped to:
#   - Retrieval of new arXiv entries
#   - Minimal cleaning / normalization
#   - Deduplication and basic metadata handling
#
# Any heavy analysis, LLM-based interpretation, or knowledge synthesis should
# be handled in subsequent notebooks.
#
#
# Structure
# ---------
# 1. Configuration
#    - arXiv categories
#    - query parameters (date range, max results, update window)
#    - output paths / storage settings
#
# 2. Feed Retrieval
#    - Fetch recent arXiv entries via API / RSS
#    - Handle pagination and update windows
#
# 3. Normalization
#    - Standardize metadata (title, authors, abstract, published date, URL)
#    - Convert into a tabular / JSON-friendly format
#
# 4. Deduplication & Filtering
#    - Remove already-seen papers
#    - Apply basic keyword or category-level filters if needed
#
# 5. Output
#    - Persist normalized results for downstream notebooks
#    - Log execution summary (counts, timestamps)
#
#
# Notes
# -----
# - This notebook is designed to be idempotent when run with the same inputs.
# - Downstream notebooks should treat its outputs as append-only sources.
# - Keep this notebook free of model-specific or product-specific logic
#   to ensure long-term maintainability.
#
# - Naming convention:
#   001_xxx notebooks = "data intake / ingestion" layer
#
# ============================================================


In [2]:
# ============================================================
# 1. Configuration (Widgets) ‚Äî show output only after Apply
# ============================================================

from dataclasses import dataclass
from datetime import datetime, timezone, timedelta
import os

import ipywidgets as widgets
from IPython.display import display, clear_output


@dataclass
class ArxivFeedConfig:
    query_text: str
    max_results: int
    lookback_days: int | None

    sort_by: str = "submittedDate"
    sort_order: str = "descending"
    output_format: str = "csv"    
    output_basename: str = "arxiv_new_feed"
    enable_dedup: bool = True


# ------------------------
# Widgets (User Inputs)
# ------------------------
w_query = widgets.Textarea(
    value="venture capital OR startup OR innovation policy",
    description="query_text",
    layout=widgets.Layout(width="900px", height="80px"),
)

w_max = widgets.IntSlider(
    value=30, min=1, max=200, step=1,
    description="max_results",
    continuous_update=False,
    layout=widgets.Layout(width="900px"),
)

w_lookback = widgets.IntSlider(
    value=14, min=0, max=365, step=1,
    description="lookback_days",
    continuous_update=False,
    layout=widgets.Layout(width="900px"),
)

btn_apply = widgets.Button(description="Apply", button_style="primary")
out = widgets.Output()


# ------------------------
# Paths (Project Layout)
# ------------------------
NOTEBOOK_DIR = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(NOTEBOOK_DIR, ".."))  # assume ./notebooks
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
ARXIV_DIR = os.path.join(DATA_DIR, "arxiv")
CACHE_DIR = os.path.join(DATA_DIR, "cache")

os.makedirs(ARXIV_DIR, exist_ok=True)
os.makedirs(CACHE_DIR, exist_ok=True)


# ------------------------
# Apply Handler
# ------------------------
def apply_config(_):
    global CFG, RUN_TS, CUTOFF_DT, OUTPUT_PATH

    lookback_days = int(w_lookback.value)
    lookback_days = None if lookback_days == 0 else lookback_days

    CFG = ArxivFeedConfig(
        query_text=w_query.value.strip(),
        max_results=int(w_max.value),
        lookback_days=lookback_days,
    )

    RUN_TS = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

    CUTOFF_DT = None
    if CFG.lookback_days is not None:
        CUTOFF_DT = datetime.now(timezone.utc) - timedelta(days=int(CFG.lookback_days))

    OUTPUT_PATH = os.path.join(
        ARXIV_DIR,
        f"{CFG.output_basename}_{RUN_TS}.{CFG.output_format}"
    )

    with out:
        clear_output(wait=True)
        print("‚úÖ Configuration applied")
        print(f"  query_text   : {CFG.query_text}")
        print(f"  max_results  : {CFG.max_results}")
        print(f"  lookback_days: {CFG.lookback_days}")
        print(f"  cutoff_dt    : {CUTOFF_DT}")
        print(f"  output_path  : {OUTPUT_PATH}")


btn_apply.on_click(apply_config)


# ------------------------
# Render UI (NO auto-apply)
# ------------------------
display(widgets.VBox([
    widgets.HTML("<h3>arXiv New Feed ‚Äî Configuration</h3>"),
    w_query,
    w_max,
    w_lookback,
    btn_apply,
    out
]))


VBox(children=(HTML(value='<h3>arXiv New Feed ‚Äî Configuration</h3>'), Textarea(value='venture capital OR start‚Ä¶

In [8]:
# ============================================================
# 2. Feed Retrieval
# ============================================================
#
# This section fetches recent arXiv entries using the official arXiv API (Atom feed).
# It is intentionally lightweight:
#   - Build a query from CFG
#   - Fetch entries (with pagination safety)
#   - Parse Atom -> list[dict]
#   - (Optional) apply lookback filter using CUTOFF_DT
#
# Output:
#   - entries: List[dict] with normalized raw fields (still "pre-DataFrame")
#
# Prereq:
#   - Run "1. Configuration" and click Apply so CFG / CUTOFF_DT exist.

import time
import urllib.parse
import requests
import feedparser


# ------------------------
# Helpers
# ------------------------
def build_arxiv_api_url(
    query_text: str,
    start: int = 0,
    max_results: int = 30,
    sort_by: str = "submittedDate",
    sort_order: str = "descending",
) -> str:
    base = "http://export.arxiv.org/api/query"
    params = {
        "search_query": f"all:{query_text}",
        "start": start,
        "max_results": max_results,
        "sortBy": sort_by,
        "sortOrder": sort_order,
    }
    return f"{base}?{urllib.parse.urlencode(params)}"


def parse_arxiv_entry(e) -> dict:
    """
    Convert feedparser entry into a dict with stable keys.
    """
    # arXiv id is usually in e.id (e.g., http://arxiv.org/abs/xxxx.xxxxxv1)
    arxiv_id = None
    if getattr(e, "id", None):
        arxiv_id = e.id.split("/abs/")[-1]

    # "published_parsed" is a time.struct_time
    published_dt = None
    if getattr(e, "published_parsed", None):
        published_dt = datetime(*e.published_parsed[:6], tzinfo=timezone.utc)

    updated_dt = None
    if getattr(e, "updated_parsed", None):
        updated_dt = datetime(*e.updated_parsed[:6], tzinfo=timezone.utc)

    authors = []
    if getattr(e, "authors", None):
        authors = [a.get("name") for a in e.authors if a.get("name")]

    # Primary link (abs page)
    abs_url = getattr(e, "link", None)

    # PDF link (sometimes in links)
    pdf_url = None
    if getattr(e, "links", None):
        for l in e.links:
            if l.get("type") == "application/pdf":
                pdf_url = l.get("href")
                break

    # Categories
    categories = []
    if getattr(e, "tags", None):
        categories = [t.get("term") for t in e.tags if t.get("term")]

    return {
        "arxiv_id": arxiv_id,
        "title": (getattr(e, "title", "") or "").strip().replace("\n", " "),
        "authors": authors,
        "abstract": (getattr(e, "summary", "") or "").strip().replace("\n", " "),
        "published_dt": published_dt,
        "updated_dt": updated_dt,
        "abs_url": abs_url,
        "pdf_url": pdf_url,
        "categories": categories,
    }


def fetch_arxiv_entries(
    cfg,
    cutoff_dt=None,
    timeout_sec: int = 30,
    polite_sleep_sec: float = 1.0,
) -> list[dict]:
    """
    Fetch entries from arXiv API (single page).
    NOTE: We request cfg.max_results in one shot. If you later need >2000,
          add pagination (start offsets).
    """
    url = build_arxiv_api_url(
        query_text=cfg.query_text,
        start=0,
        max_results=cfg.max_results,
        sort_by=cfg.sort_by,
        sort_order=cfg.sort_order,
    )

    # arXiv requests polite usage; we also set a User-Agent.
    headers = {"User-Agent": "researchOS/001_arxiv_new_feed (contact: you@example.com)"}

    resp = requests.get(url, headers=headers, timeout=timeout_sec)
    resp.raise_for_status()

    # polite delay (helps avoid being rate-limited if you run repeatedly)
    time.sleep(polite_sleep_sec)

    feed = feedparser.parse(resp.text)

    entries = []
    for e in feed.entries:
        d = parse_arxiv_entry(e)

        # optional lookback filter
        if cutoff_dt is not None and d["published_dt"] is not None:
            if d["published_dt"] < cutoff_dt:
                continue

        entries.append(d)

    return entries


# ------------------------
# Run Retrieval
# ------------------------
if "CFG" not in globals():
    raise RuntimeError("CFG is not defined. Run '1. Configuration' and click Apply first.")

print("üîé Fetching arXiv entries...")
entries = fetch_arxiv_entries(CFG, cutoff_dt=CUTOFF_DT)

print(f"‚úÖ Retrieved {len(entries)} entries")
if len(entries) > 0:
    print("  - latest published:", max(e["published_dt"] for e in entries if e["published_dt"] is not None))
    print("  - oldest  published:", min(e["published_dt"] for e in entries if e["published_dt"] is not None))
    print("  - example arxiv_id :", entries[0]["arxiv_id"])


üîé Fetching arXiv entries...
‚úÖ Retrieved 30 entries
  - latest published: 2025-12-29 18:59:33+00:00
  - oldest  published: 2025-12-29 08:26:27+00:00
  - example arxiv_id : 2512.23707v1


In [9]:
# ============================================================
# 3. Normalization
# ============================================================
#
# This section converts raw `entries` (list[dict]) into a normalized DataFrame
# with stable, downstream-friendly columns.
#
# Goals:
#   - Ensure consistent schema
#   - Create a few convenience fields (e.g., authors_str)
#   - Basic text cleanup (whitespace)
#   - Prepare for dedup/filter/output steps
#
# Output:
#   - df: normalized pandas.DataFrame

import pandas as pd
import re


# ------------------------
# Helpers
# ------------------------
def _clean_text(s: str) -> str:
    if s is None:
        return ""
    s = str(s)
    s = s.replace("\n", " ").replace("\r", " ").strip()
    s = re.sub(r"\s+", " ", s)
    return s


def normalize_entries_to_df(entries: list[dict]) -> pd.DataFrame:
    if entries is None:
        entries = []

    rows = []
    for e in entries:
        authors = e.get("authors") or []
        categories = e.get("categories") or []

        row = {
            "arxiv_id": e.get("arxiv_id"),
            "title": _clean_text(e.get("title", "")),
            "authors": authors,
            "authors_str": ", ".join([_clean_text(a) for a in authors if a]),
            "abstract": _clean_text(e.get("abstract", "")),
            "published_dt": e.get("published_dt"),
            "updated_dt": e.get("updated_dt"),
            "abs_url": e.get("abs_url"),
            "pdf_url": e.get("pdf_url"),
            "categories": categories,
            "categories_str": ", ".join([_clean_text(c) for c in categories if c]),
            "source": "arxiv_api",
        }
        rows.append(row)

    df = pd.DataFrame(rows)

    # Enforce column order (stable schema)
    desired_cols = [
        "arxiv_id",
        "title",
        "authors_str",
        "authors",
        "abstract",
        "published_dt",
        "updated_dt",
        "abs_url",
        "pdf_url",
        "categories_str",
        "categories",
        "source",
    ]
    for c in desired_cols:
        if c not in df.columns:
            df[c] = None
    df = df[desired_cols]

    # Normalize dtypes
    if not df.empty:
        df["published_dt"] = pd.to_datetime(df["published_dt"], utc=True, errors="coerce")
        df["updated_dt"] = pd.to_datetime(df["updated_dt"], utc=True, errors="coerce")

        # Ensure strings are strings (avoid NaN surprises downstream)
        for c in ["arxiv_id", "title", "authors_str", "abstract", "abs_url", "pdf_url", "categories_str", "source"]:
            df[c] = df[c].fillna("").astype(str).map(_clean_text)

    return df


# ------------------------
# Run Normalization
# ------------------------
if "entries" not in globals():
    raise RuntimeError("entries is not defined. Run '2. Feed Retrieval' first.")

df = normalize_entries_to_df(entries)

print("‚úÖ Normalization complete")
print("  rows   :", len(df))
print("  columns:", list(df.columns))

# Quick peek
display(df.head(10))


‚úÖ Normalization complete
  rows   : 30
  columns: ['arxiv_id', 'title', 'authors_str', 'authors', 'abstract', 'published_dt', 'updated_dt', 'abs_url', 'pdf_url', 'categories_str', 'categories', 'source']


Unnamed: 0,arxiv_id,title,authors_str,authors,abstract,published_dt,updated_dt,abs_url,pdf_url,categories_str,categories,source
0,2512.23707v1,Training AI Co-Scientists Using Rubric Rewards,"Shashwat Goel, Rishi Hazra, Dulhan Jayalath, T...","[Shashwat Goel, Rishi Hazra, Dulhan Jayalath, ...",AI co-scientists are emerging as a tool to ass...,2025-12-29 18:59:33+00:00,2025-12-29 18:59:33+00:00,https://arxiv.org/abs/2512.23707v1,https://arxiv.org/pdf/2512.23707v1,"cs.LG, cs.CL, cs.HC","[cs.LG, cs.CL, cs.HC]",arxiv_api
1,2512.23703v1,Robo-Dopamine: General Process Reward Modeling...,"Huajie Tan, Sixiang Chen, Yijie Xu, Zixiao Wan...","[Huajie Tan, Sixiang Chen, Yijie Xu, Zixiao Wa...",The primary obstacle for applying reinforcemen...,2025-12-29 18:57:44+00:00,2025-12-29 18:57:44+00:00,https://arxiv.org/abs/2512.23703v1,https://arxiv.org/pdf/2512.23703v1,cs.RO,[cs.RO],arxiv_api
2,2512.23694v1,Bellman Calibration for V-Learning in Offline ...,"Lars van der Laan, Nathan Kallus","[Lars van der Laan, Nathan Kallus]","We introduce Iterated Bellman Calibration, a s...",2025-12-29 18:52:18+00:00,2025-12-29 18:52:18+00:00,https://arxiv.org/abs/2512.23694v1,https://arxiv.org/pdf/2512.23694v1,"stat.ML, cs.LG, econ.EM","[stat.ML, cs.LG, econ.EM]",arxiv_api
3,2512.23688v1,Unlocking WebRTC for End User Driven Innovation,Kundan Singh,[Kundan Singh],We present a software architecture to enable e...,2025-12-29 18:44:59+00:00,2025-12-29 18:44:59+00:00,https://arxiv.org/abs/2512.23688v1,https://arxiv.org/pdf/2512.23688v1,"cs.MM, cs.NI","[cs.MM, cs.NI]",arxiv_api
4,2512.23650v1,Do You Have Freestyle? Expressive Humanoid Loc...,"Zhe Li, Cheng Chi, Yangyang Wei, Boan Zhu, Tao...","[Zhe Li, Cheng Chi, Yangyang Wei, Boan Zhu, Ta...","Humans intuitively move to sound, but current ...",2025-12-29 17:59:24+00:00,2025-12-29 17:59:24+00:00,https://arxiv.org/abs/2512.23650v1,https://arxiv.org/pdf/2512.23650v1,cs.RO,[cs.RO],arxiv_api
5,2512.23649v1,RoboMirror: Understand Before You Imitate for ...,"Zhe Li, Cheng Chi, Yangyang Wei, Boan Zhu, Tao...","[Zhe Li, Cheng Chi, Yangyang Wei, Boan Zhu, Ta...",Humans learn locomotion through visual observa...,2025-12-29 17:59:19+00:00,2025-12-29 17:59:19+00:00,https://arxiv.org/abs/2512.23649v1,https://arxiv.org/pdf/2512.23649v1,"cs.RO, cs.CV","[cs.RO, cs.CV]",arxiv_api
6,2512.23626v1,Regret-Based Federated Causal Discovery with U...,"Federico Baldo, Charles K. Assaad","[Federico Baldo, Charles K. Assaad]",Most causal discovery methods recover a comple...,2025-12-29 17:30:01+00:00,2025-12-29 17:30:01+00:00,https://arxiv.org/abs/2512.23626v1,https://arxiv.org/pdf/2512.23626v1,"cs.AI, cs.LG","[cs.AI, cs.LG]",arxiv_api
7,2512.23618v1,Verifiable Off-Chain Governance,"Jake Hartnell, Eugenio Battaglia","[Jake Hartnell, Eugenio Battaglia]",Current DAO governance praxis limits organizat...,2025-12-29 17:24:10+00:00,2025-12-29 17:24:10+00:00,https://arxiv.org/abs/2512.23618v1,https://arxiv.org/pdf/2512.23618v1,"cs.GT, cs.CE","[cs.GT, cs.CE]",arxiv_api
8,2512.23617v1,Le Cam Distortion: A Decision-Theoretic Framew...,Deniz Akdemir,[Deniz Akdemir],Distribution shift is the defining challenge o...,2025-12-29 17:21:44+00:00,2025-12-29 17:21:44+00:00,https://arxiv.org/abs/2512.23617v1,https://arxiv.org/pdf/2512.23617v1,"cs.LG, cs.AI, math.ST, stat.ME, stat.ML","[cs.LG, cs.AI, math.ST, stat.ME, stat.ML]",arxiv_api
9,2512.23611v1,Close the Loop: Synthesizing Infinite Tool-Use...,"Yuwen Li, Wei Zhang, Zelong Huang, Mason Yang,...","[Yuwen Li, Wei Zhang, Zelong Huang, Mason Yang...",Enabling Large Language Models (LLMs) to relia...,2025-12-29 17:12:39+00:00,2025-12-29 17:12:39+00:00,https://arxiv.org/abs/2512.23611v1,https://arxiv.org/pdf/2512.23611v1,cs.CL,[cs.CL],arxiv_api


In [6]:
# ============================================================
# 4. Deduplication & Filtering
# ============================================================
#
# This section removes already-seen papers and applies lightweight filters.
#
# Dedup strategy:
#   - Primary key: arxiv_id
#   - Maintain an "append-only" history index file under CACHE_DIR
#   - On each run: load seen_ids -> drop -> update seen_ids
#
# Filtering (optional):
#   - Drop rows with empty title / arxiv_id
#   - (Optional) keyword include/exclude rules (lightweight; keep simple here)
#
# Output:
#   - df_new: DataFrame containing only "new" entries for this run

import os
import json
import pandas as pd


# ------------------------
# Configurable knobs (optional)
# ------------------------
# Lightweight include/exclude keyword rules (case-insensitive).
# Leave empty lists to disable.
INCLUDE_KEYWORDS = []   # e.g., ["startup", "venture", "innovation"]
EXCLUDE_KEYWORDS = []   # e.g., ["survey", "benchmark"]

SEEN_IDS_PATH = os.path.join(CACHE_DIR, "arxiv_seen_ids.json")


# ------------------------
# Helpers
# ------------------------
def load_seen_ids(path: str) -> set[str]:
    if not os.path.exists(path):
        return set()
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            return set([str(x) for x in data])
        if isinstance(data, dict) and "seen_ids" in data:
            return set([str(x) for x in data["seen_ids"]])
    except Exception:
        # If corrupted, fail softly and rebuild from scratch
        return set()
    return set()


def save_seen_ids(path: str, seen_ids: set[str]) -> None:
    tmp_path = path + ".tmp"
    payload = {
        "seen_ids": sorted(list(seen_ids)),
        "updated_at_utc": datetime.now(timezone.utc).isoformat(),
    }
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, path)


def apply_keyword_filters(df: pd.DataFrame,
                          include_keywords: list[str],
                          exclude_keywords: list[str]) -> pd.DataFrame:
    if df.empty:
        return df

    text = (df["title"].fillna("") + " " + df["abstract"].fillna("")).str.lower()

    # Include: keep rows that match ANY include keyword
    if include_keywords:
        inc = pd.Series(False, index=df.index)
        for kw in include_keywords:
            kw = kw.strip().lower()
            if kw:
                inc = inc | text.str.contains(kw, regex=False)
        df = df[inc].copy()

    # Exclude: drop rows that match ANY exclude keyword
    if exclude_keywords:
        exc = pd.Series(False, index=df.index)
        for kw in exclude_keywords:
            kw = kw.strip().lower()
            if kw:
                exc = exc | text.str.contains(kw, regex=False)
        df = df[~exc].copy()

    return df


# ------------------------
# Run Dedup & Filtering
# ------------------------
if "df" not in globals():
    raise RuntimeError("df is not defined. Run '3. Normalization' first.")

df_work = df.copy()

# Basic sanity filter
df_work = df_work[df_work["arxiv_id"].str.len() > 0].copy()
df_work = df_work[df_work["title"].str.len() > 0].copy()

# Load seen ids
seen_ids = load_seen_ids(SEEN_IDS_PATH)

before = len(df_work)

# Dedup within current batch (safety)
df_work = df_work.drop_duplicates(subset=["arxiv_id"], keep="first").copy()

# Dedup vs history
if getattr(CFG, "enable_dedup", True):
    is_new = ~df_work["arxiv_id"].isin(seen_ids)
    df_new = df_work[is_new].copy()
else:
    df_new = df_work.copy()

# Optional keyword filtering
df_new = apply_keyword_filters(df_new, INCLUDE_KEYWORDS, EXCLUDE_KEYWORDS)

after = len(df_new)

# Update seen ids with ALL ids we encountered this run (not only df_new),
# so reruns won't re-emit the same papers.
seen_ids_updated = seen_ids | set(df_work["arxiv_id"].tolist())
save_seen_ids(SEEN_IDS_PATH, seen_ids_updated)

print("‚úÖ Deduplication & Filtering complete")
print(f"  input rows (normalized) : {len(df)}")
print(f"  after sanity filters    : {before}")
print(f"  new rows (this run)     : {after}")
print(f"  seen_ids file           : {SEEN_IDS_PATH}")

display(df_new.head(20))


‚úÖ Deduplication & Filtering complete
  input rows (normalized) : 30
  after sanity filters    : 30
  new rows (this run)     : 0
  seen_ids file           : /Users/yuetoya/Desktop/researchOS100-private/data/cache/arxiv_seen_ids.json


Unnamed: 0,arxiv_id,title,authors_str,authors,abstract,published_dt,updated_dt,abs_url,pdf_url,categories_str,categories,source


In [7]:
# ============================================================
# 5. Output
# ============================================================
#
# Persist deduplicated results as CSV.
# No optional dependencies required.

import os
import shutil


# ------------------------
# Run Output
# ------------------------
if "df_new" not in globals():
    raise RuntimeError("df_new is not defined. Run '4. Deduplication & Filtering' first.")

if "OUTPUT_PATH" not in globals():
    raise RuntimeError("OUTPUT_PATH is not defined. Run '1. Configuration' first.")

if df_new.empty:
    print("‚ö†Ô∏è No new entries to save. Skipping output.")
else:
    df_new.to_csv(OUTPUT_PATH, index=False)

    latest_path = os.path.join(
        ARXIV_DIR,
        f"{CFG.output_basename}_latest.csv"
    )

    # overwrite latest pointer (copy, not symlink for portability)
    shutil.copy2(OUTPUT_PATH, latest_path)

    print("‚úÖ Output saved")
    print(f"  rows written : {len(df_new)}")
    print(f"  output_path  : {OUTPUT_PATH}")
    print(f"  latest_path  : {latest_path}")


# ------------------------
# Run Summary
# ------------------------
print("\nüìå Run Summary")
print("  query_text     :", CFG.query_text)
print("  max_results    :", CFG.max_results)
print("  lookback_days  :", CFG.lookback_days)
print("  retrieved_rows :", len(df))
print("  new_rows       :", len(df_new))


‚ö†Ô∏è No new entries to save. Skipping output.

üìå Run Summary
  query_text     : venture capital OR startup OR innovation policy
  max_results    : 30
  lookback_days  : 14
  retrieved_rows : 30
  new_rows       : 0
