# Imports

In [None]:
import sqlite3
import pandas as pd
from pathlib import Path

In [7]:
# Load and merge the signal table from all .db files
base_dir = Path("rsds-20241113")
parts = []

# Loop over all files and process them in order
for db_path in sorted(base_dir.glob("*.db")):
    con = sqlite3.connect(db_path)
    tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", con)["name"].tolist()

    # Only load data if the 'signal' table exists
    if "signal" in tables:
        # Read the columns we care about from the signal table into a df
        df = pd.read_sql("SELECT time, panel, kind, args FROM signal;", con)
        # add new column that records which db file this data came from
        df["source_db"] = db_path.name
        parts.append(df)

    con.close()

# Combine separate dataframes into one dataframe and sort by time and source data base
signal_all = pd.concat(parts, ignore_index=True)
signal_all = signal_all.sort_values(["source_db", "time"]).reset_index(drop=True)
signal_all.head()

Unnamed: 0,time,panel,kind,args,source_db
0,1731471896989,id-cc25aca1-c264-46cb-a3f1-ba0e6d1ac4d8,INotebookModel.changed:cellsChange,"{""delta"":[{""op"":""delete"",""arg"":1},{""op"":""inser...",001.db
1,1731471896990,id-cc25aca1-c264-46cb-a3f1-ba0e6d1ac4d8,INotebookModel.changed:metadataChange,"{""delta"":[{""key"":""kernelspec"",""act"":""update"",""...",001.db
2,1731471896990,id-cc25aca1-c264-46cb-a3f1-ba0e6d1ac4d8,INotebookModel.changed:nbformatChanged,"{""key"":""nbformat"",""val"":4}",001.db
3,1731471896990,id-cc25aca1-c264-46cb-a3f1-ba0e6d1ac4d8,INotebookModel.changed:nbformatChanged,"{""key"":""nbformat_minor"",""val"":5}",001.db
4,1731471897275,id-cc25aca1-c264-46cb-a3f1-ba0e6d1ac4d8,ISessionContext.sessionChanged,"{""val"":""5d642d19-ab4c-417a-ae37-cf7900863696""}",001.db


# Analysis

The research question being analyzed in this notebook is as follows: What types of actions most frequently lead to non-reproducible notebook states? 

We attempt to answer this question using two methods: proxy-based detection and replay-based reproducibility validation

### Proxy-based detection
A notebook enters a potentially non-reproducible state when:
- the kernel/session changes, or
- a cell is executed without a preceding code edit (i.e., execution happens while code hasn’t changed since the prior event)

These actions can create hidden states.

After flagging the proxies, we will analyze what action occurred right before it. Then we will compare how frequently such an action happened right before a triggering event to how often it happened in general using lift.
Lift is defined as the ratio between an event’s conditional probability in a target context and its overall probability.

In [None]:
# Implement proxies as boolean flags
signal_all["prev_kind"] = signal_all.groupby("source_db")["kind"].shift(1)
signal_all["kernel_change"] = (signal_all["kind"] == "ISessionContext.sessionChanged")
signal_all["exec_event"] = (signal_all["kind"] == "ISharedCell.changed:executionCountChange")
signal_all["exec_without_edit"] = (signal_all["exec_event"] & (signal_all["prev_kind"] != "ISharedCell.changed:sourceChange"))
signal_all["non_repro_state"] = signal_all["kernel_change"] | signal_all["exec_without_edit"]

In [14]:
# Filter events that indicate a non-reproducible state
non_repro_events = signal_all[signal_all["non_repro_state"]].copy()
non_repro_events.shape[0]

2899

In [17]:
# Identify which actions lead to it, by analyzing the immediately preceding action
non_repro_events["pos_cause_kind"] = non_repro_events["prev_kind"]
pos_cause_counts = non_repro_events["pos_cause_kind"].value_counts()
pos_cause_counts

pos_cause_kind
ISharedCell.changed:outputsChange           1571
ISharedCell.changed:executionCountChange    1219
INotebookModel.changed:cellsChange            82
INotebookModel.changed:nbformatChanged        22
ISharedCell.changed:metadataChange             2
INotebookModel.changed:metadataChange          2
ISharedCell.changed:sourceChange               1
Name: count, dtype: int64

In [19]:
# Compare frequencies to baseline: how much more often does an action appear right before non-reproducable states than in general?
baseline = signal_all["kind"].value_counts(normalize=True)
before_nonrepro = non_repro_events["pos_cause_kind"].value_counts(normalize=True)
comparison = (pd.concat([baseline.rename("overall_share"),  before_nonrepro.rename("share_before_nonrepro")], axis=1).fillna(0))
comparison["lift"] = comparison["share_before_nonrepro"] / comparison["overall_share"].replace(0, pd.NA)
comparison.sort_values("lift", ascending=False).head(15)

Unnamed: 0,overall_share,share_before_nonrepro,lift
INotebookModel.changed:nbformatChanged,0.001121,0.007589,6.772266
ISharedCell.changed:executionCountChange,0.081331,0.42049,5.17009
ISharedCell.changed:outputsChange,0.264635,0.541911,2.04777
INotebookModel.changed:cellsChange,0.014276,0.028286,1.981325
INotebookModel.changed:metadataChange,0.001031,0.00069,0.669196
ISharedCell.changed:metadataChange,0.001412,0.00069,0.48862
ISharedCell.changed:sourceChange,0.635612,0.000345,0.000543
ISessionContext.sessionChanged,0.000583,0.0,0.0


Nieks manier

for each session:

-reconstruct the observed final notebook (from signals/snapshots),

-execute it clean top-to-bottom in a fresh kernel,

-compare outputs → label reproducible vs non-reproducible,

-then analyze which actions predict that label.

In [7]:
# Step 1: generate notebook snapshot(s) from each db
# 1A) install redspot CLI 

In [8]:
import sqlite3
import pandas as pd
from pathlib import Path

db_path = Path("rsds-20241113/001.db")

con = sqlite3.connect(db_path)
signal = pd.read_sql("SELECT time, panel, kind, args FROM signal ORDER BY time", con)
con.close()

signal["panel"].value_counts().head(10)

panel
id-cc25aca1-c264-46cb-a3f1-ba0e6d1ac4d8    2520
Name: count, dtype: int64

In [9]:
import json

def parse_args(x):
    try:
        return json.loads(x)
    except Exception:
        return None

signal["args_parsed"] = signal["args"].apply(parse_args)
signal[["kind", "args_parsed"]].head(5)


Unnamed: 0,kind,args_parsed
0,INotebookModel.changed:cellsChange,"{'delta': [{'op': 'delete', 'arg': 1}, {'op': ..."
1,INotebookModel.changed:metadataChange,"{'delta': [{'key': 'kernelspec', 'act': 'updat..."
2,INotebookModel.changed:nbformatChanged,"{'key': 'nbformat', 'val': 4}"
3,INotebookModel.changed:nbformatChanged,"{'key': 'nbformat_minor', 'val': 5}"
4,ISessionContext.sessionChanged,{'val': '5d642d19-ab4c-417a-ae37-cf7900863696'}


In [10]:
def apply_text_delta(old: str, delta_ops: list) -> str:
    """
    Apply Redspot text delta operations to a string.
    Each op is one of:
      {"op":"retain","arg":N}
      {"op":"delete","arg":N}
      {"op":"insert","arg":"..."}
    """
    if old is None:
        old = ""
    i = 0  # cursor into old string
    out = []

    for op in delta_ops:
        t = op.get("op")
        a = op.get("arg")

        if t == "retain":
            out.append(old[i:i+a])
            i += a
        elif t == "delete":
            i += a
        elif t == "insert":
            out.append(a)
        else:
            # unknown op -> ignore safely
            pass

    # append any remaining old text (usually not needed, but safe)
    out.append(old[i:])
    return "".join(out)


In [11]:
def apply_list_delta(old: list, delta_ops: list) -> list:
    """
    Apply Redspot list delta operations to a list.
    Ops:
      {"op":"retain","arg":N}
      {"op":"delete","arg":N}
      {"op":"insert","arg":[...]}  # arg is a list to insert
    """
    if old is None:
        old = []
    i = 0
    out = []

    for op in delta_ops:
        t = op.get("op")
        a = op.get("arg")

        if t == "retain":
            out.extend(old[i:i+a])
            i += a
        elif t == "delete":
            i += a
        elif t == "insert":
            # insert expects a list
            if isinstance(a, list):
                out.extend(a)
            else:
                out.append(a)
        else:
            pass

    out.extend(old[i:])
    return out


In [12]:
def reconstruct_panel(panel_df: pd.DataFrame):
    """
    Reconstruct final notebook state for one panel by applying signals in time order.
    Returns:
      cell_order: list[str]
      cells: dict[cell_id] -> cell dict
    """
    cell_order = []
    cells = {}  # cell_id -> {"cell_type":..., "source":..., "outputs":..., "execution_count":..., "metadata":...}

    for _, row in panel_df.iterrows():
        kind = row["kind"]
        a = row["args_parsed"] or {}

        # 1) cellsChange: edits the notebook cell list (insert/delete/retain)
        if kind == "INotebookModel.changed:cellsChange":
            delta = a.get("delta", [])
            new_order = []
            cursor = 0

            for op in delta:
                t = op.get("op")
                arg = op.get("arg")

                if t == "retain":
                    new_order.extend(cell_order[cursor:cursor+arg])
                    cursor += arg

                elif t == "delete":
                    # delete arg cells from the existing order
                    cursor += arg

                elif t == "insert":
                    # arg is a list of cell objects
                    for cell_obj in arg:
                        cid = cell_obj["id"]
                        # store/overwrite cell content
                        cells[cid] = {
                            "cell_type": cell_obj.get("cell_type", "code"),
                            "source": cell_obj.get("source", ""),
                            "outputs": cell_obj.get("outputs", []) or [],
                            "execution_count": cell_obj.get("execution_count", None),
                            "metadata": cell_obj.get("metadata", {}) or {},
                        }
                        new_order.append(cid)

            # append any remaining cells that weren't consumed (safety)
            new_order.extend(cell_order[cursor:])
            cell_order = new_order

        # 2) sourceChange: apply text delta to a specific cell's source
        elif kind == "ISharedCell.changed:sourceChange":
            cid = a.get("cell")
            delta = a.get("delta", [])
            if cid is not None:
                if cid not in cells:
                    # create placeholder if we haven't seen it via cellsChange yet
                    cells[cid] = {"cell_type": "code", "source": "", "outputs": [], "execution_count": None, "metadata": {}}
                cells[cid]["source"] = apply_text_delta(cells[cid].get("source", ""), delta)

        # 3) outputsChange: apply list delta to a specific cell's outputs
        elif kind == "ISharedCell.changed:outputsChange":
            cid = a.get("cell")
            delta = a.get("delta", [])
            if cid is not None:
                if cid not in cells:
                    cells[cid] = {"cell_type": "code", "source": "", "outputs": [], "execution_count": None, "metadata": {}}
                cells[cid]["outputs"] = apply_list_delta(cells[cid].get("outputs", []), delta)

        # 4) executionCountChange: set execution_count
        elif kind == "ISharedCell.changed:executionCountChange":
            cid = a.get("cell")
            val = a.get("val")
            if cid is not None:
                if cid not in cells:
                    cells[cid] = {"cell_type": "code", "source": "", "outputs": [], "execution_count": None, "metadata": {}}
                cells[cid]["execution_count"] = val

        # ignore metadata/session changes for reconstruction (you can add later)

    return cell_order, cells


In [15]:
import nbformat

def normalize_outputs(outputs):
    """
    Convert a list of output dicts from RSDS into nbformat output objects
    so nbformat.write(...) won't crash.
    """
    if outputs is None:
        return []

    normalized = []
    for o in outputs:
        if isinstance(o, dict):
            # Convert dict -> NotebookNode output
            normalized.append(nbformat.from_dict(o))
        else:
            # Already a NotebookNode or something else, keep it
            normalized.append(o)
    return normalized


In [16]:
def state_to_notebook(cell_order, cells):
    nb = new_notebook()
    nb.cells = []

    for cid in cell_order:
        c = cells.get(cid)
        if not c:
            continue

        cell_type = c.get("cell_type", "code")

        if cell_type == "markdown":
            nb.cells.append(new_markdown_cell(source=c.get("source", "")))
        else:
            code = new_code_cell(source=c.get("source", ""))

            # ✅ FIX: normalize outputs
            code["outputs"] = normalize_outputs(c.get("outputs", []) or [])

            code["execution_count"] = c.get("execution_count", None)
            code["metadata"] = c.get("metadata", {}) or {}
            nb.cells.append(code)

    return nb


In [17]:
out_dir = Path("reconstructed")
out_dir.mkdir(exist_ok=True)

panel_name = signal["panel"].iloc[0]  # pick first panel for now
panel_df = signal[signal["panel"] == panel_name].copy()

cell_order, cells = reconstruct_panel(panel_df)
nb_obs = state_to_notebook(cell_order, cells)

obs_path = out_dir / f"{db_path.stem}_{panel_name}_observed.ipynb"
nbformat.write(nb_obs, obs_path)

obs_path


WindowsPath('reconstructed/001_id-cc25aca1-c264-46cb-a3f1-ba0e6d1ac4d8_observed.ipynb')

In [51]:
# check csv

import pandas as pd

df = pd.read_csv("train.csv")
print(df.columns.tolist())
df.head()

['passenger_id', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked']


Unnamed: 0,passenger_id,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [52]:
import pandas as pd
import re

def normalize_titanic_schema_v2(path):
    df = pd.read_csv(path)

    rename_map = {
        "Sex": "sex",
        "SibSp": "sibsp",
        "Parch": "parch",
        "Pclass": "pclass",
        "Fare": "fare",
        "Age": "age",
        "Embarked": "embarked",
        "Cabin": "cabin",
        "Ticket": "ticket",
        "Name": "name",
        "PassengerId": "passenger_id",   # ✅ underscore version expected by notebook
        "Survived": "survived",
        # If you've already lowercased without underscore earlier:
        "passengerid": "passenger_id",   # ✅ catch your current column name too
    }

    # Keep only applicable renames
    rename_map = {k: v for k, v in rename_map.items() if k in df.columns}

    df = df.rename(columns=rename_map)
    #df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
    df.to_csv(path, index=False)

    print(f"{path} -> renamed: {rename_map}")
    print("Columns now:", df.columns.tolist())

normalize_titanic_schema_v2("train.csv")
normalize_titanic_schema_v2("test.csv")


train.csv -> renamed: {}
Columns now: ['passenger_id', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked']
test.csv -> renamed: {}
Columns now: ['passenger_id', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked', 'survived']


In [53]:
import pandas as pd

test = pd.read_csv("test.csv")
if "survived" not in test.columns:
    test["survived"] = 0
    test.to_csv("test.csv", index=False)
    print("Added dummy survived column to test.csv")
else:
    print("test.csv already has survived")

test.csv already has survived


In [54]:
import nbformat
from nbformat.v4 import new_code_cell

# 1) Load the observed notebook
nb = nbformat.read(obs_path, as_version=4)

# 2) The patch cell: sanitize train_x/test_x columns right before LightGBM
patch = new_code_cell(
    """
import re
import unicodedata
import pandas as pd

def _safe_feature_names(cols):
    out = []
    seen = {}
    for c in cols:
        s = unicodedata.normalize("NFKC", str(c)).strip()
        s = re.sub(r"\\s+", "_", s)
        s = re.sub(r"[^0-9a-zA-Z_]+", "_", s)
        s = re.sub(r"_+", "_", s).strip("_")
        if s == "":
            s = "col"
        # ensure uniqueness
        k = s
        if k in seen:
            seen[k] += 1
            s = f"{k}_{seen[k]}"
        else:
            seen[k] = 0
        out.append(s)
    return out

# apply if these variables exist in the notebook at this point
if "train_x" in globals():
    train_x.columns = _safe_feature_names(train_x.columns)
if "test_x" in globals():
    test_x.columns = _safe_feature_names(test_x.columns)

print("Sanitized feature names. Example:", list(train_x.columns)[:10])
"""
)

# 3) Insert patch right before the cell that contains "LightGBMTunerCV" or "lgb.Dataset"
insert_at = None
for i, cell in enumerate(nb.cells):
    if cell.cell_type == "code":
        src = cell.source
        if "LightGBMTunerCV" in src or "lgb.Dataset" in src:
            insert_at = i
            break

if insert_at is None:
    insert_at = len(nb.cells)

nb.cells.insert(insert_at, patch)

patched_obs_path = obs_path.with_name(obs_path.stem + "_patched.ipynb")
nbformat.write(nb, patched_obs_path)

patched_obs_path


WindowsPath('reconstructed/001_id-cc25aca1-c264-46cb-a3f1-ba0e6d1ac4d8_observed_patched.ipynb')

In [56]:
from nbformat.v4 import new_code_cell
import nbformat

nb = nbformat.read(patched_obs_path, as_version=4)

align_patch = new_code_cell(
    """
# --- PATCH: align test_x columns to train_x columns ---
# LightGBM requires test features to match training features exactly.

train_cols = list(train_x.columns)

# Add any missing columns to test_x with zeros
missing = [c for c in train_cols if c not in test_x.columns]
for c in missing:
    test_x[c] = 0

# Drop any extra columns not seen during training
extra = [c for c in test_x.columns if c not in train_cols]
if len(extra) > 0:
    test_x = test_x.drop(columns=extra)

# Reorder to match training
test_x = test_x[train_cols]

print("Aligned test_x to train_x:",
      "train_x:", train_x.shape,
      "test_x:", test_x.shape,
      "missing added:", len(missing),
      "extra dropped:", len(extra))
"""
)

# Insert right before the first cell that calls ".predict(" OR that defines best_model/model training
insert_at = None
for i, cell in enumerate(nb.cells):
    if cell.cell_type == "code" and (".predict(" in cell.source or "best_model" in cell.source):
        insert_at = i
        break

if insert_at is None:
    insert_at = len(nb.cells)

nb.cells.insert(insert_at, align_patch)

patched2_path = patched_obs_path.with_name(patched_obs_path.stem + "_aligned.ipynb")
nbformat.write(nb, patched2_path)

patched2_path



WindowsPath('reconstructed/001_id-cc25aca1-c264-46cb-a3f1-ba0e6d1ac4d8_observed_patched_aligned.ipynb')

In [57]:
from nbclient import NotebookClient

def execute_notebook(input_path: Path, output_path: Path, timeout=600):
    nb = nbformat.read(input_path, as_version=4)
    client = NotebookClient(nb, timeout=timeout, kernel_name="python3")
    client.execute()
    nbformat.write(nb, output_path)

replay_dir = Path("replayed")
replay_dir.mkdir(exist_ok=True)

# replay_path = replay_dir / obs_path.name.replace("_observed", "_replayed")
# execute_notebook(obs_path, replay_path)

# replay_path = Path("replayed") / patched_obs_path.name.replace("_patched", "_replayed")
# execute_notebook(patched_obs_path, replay_path)

replay_path = Path("replayed") / patched2_path.name.replace("_aligned", "_replayed")
execute_notebook(patched2_path, replay_path)

replay_path


WindowsPath('replayed/001_id-cc25aca1-c264-46cb-a3f1-ba0e6d1ac4d8_observed_patched_replayed.ipynb')

In [58]:
# Compare observed vs replay

import nbformat
import hashlib
import json

def normalize_output(output):
    """
    Convert a single output object into a stable JSON-serializable dict.
    This removes volatile fields (like execution count) and keeps content.
    """
    out = dict(output)

    # Remove volatile fields that change even when notebook is "reproducible"
    out.pop("execution_count", None)

    # For errors, keep only the essentials
    if out.get("output_type") == "error":
        out = {
            "output_type": "error",
            "ename": out.get("ename"),
            "evalue": out.get("evalue"),
        }

    return out

def outputs_fingerprint(nb_path):
    """
    Create a hash representing all cell outputs in order.
    If this hash differs between observed and replayed -> outputs differ.
    """
    nb = nbformat.read(nb_path, as_version=4)

    all_outputs = []
    for cell in nb.cells:
        if cell.get("cell_type") == "code":
            outs = cell.get("outputs", [])
            all_outputs.append([normalize_output(o) for o in outs])

    payload = json.dumps(all_outputs, sort_keys=True, ensure_ascii=False).encode("utf-8")
    return hashlib.sha256(payload).hexdigest()

obs_fp = outputs_fingerprint(obs_path)         # your observed notebook path
rep_fp = outputs_fingerprint(replay_path)      # your replayed notebook path

print("Observed outputs hash:", obs_fp)
print("Replayed  outputs hash:", rep_fp)
print("Same outputs?", obs_fp == rep_fp)


Observed outputs hash: 584ac5279520e243ecb921f74a55e22f754622ad5f14107099ed18f5259ef599
Replayed  outputs hash: f7a19b4f94bb5c45dc234a1299eea4d856c8eaaffb391927055c9707d7c830a2
Same outputs? False


In [59]:
def diff_cells(obs_path, rep_path):
    obs = nbformat.read(obs_path, as_version=4)
    rep = nbformat.read(rep_path, as_version=4)

    diffs = []
    for i, (c1, c2) in enumerate(zip(obs.cells, rep.cells)):
        if c1.get("cell_type") != "code":
            continue
        o1 = [normalize_output(o) for o in c1.get("outputs", [])]
        o2 = [normalize_output(o) for o in c2.get("outputs", [])]
        if o1 != o2:
            diffs.append(i)
    return diffs

diff_idxs = diff_cells(obs_path, replay_path)
print("Cells with different outputs:", diff_idxs[:20], "..." if len(diff_idxs) > 20 else "")
print("Total differing cells:", len(diff_idxs))


Cells with different outputs: [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15] 
Total differing cells: 15


In [61]:
import nbformat
from pprint import pprint

def print_differing_cells(obs_path, rep_path):
    obs = nbformat.read(obs_path, as_version=4)
    rep = nbformat.read(rep_path, as_version=4)

    for i, (c1, c2) in enumerate(zip(obs.cells, rep.cells)):
        if c1.get("cell_type") != "code":
            continue

        o1 = [normalize_output(o) for o in c1.get("outputs", [])]
        o2 = [normalize_output(o) for o in c2.get("outputs", [])]

        if o1 != o2:
            print("=" * 80)
            print(f"Cell {i}")
            print("- Source code:")
            print(c1.source)
            print("- Observed outputs:")
            pprint(o1)
            print("- Replayed outputs:")
            pprint(o2)

print_differing_cells(obs_path, replay_path)


Cell 0
- Source code:
!pip install pandas
import pandas as pd
- Observed outputs:
[{'name': 'stdout',
  'output_type': 'stream',
          '/kernel/lib/python3.12/site-packages (2.2.3)\n'
          '/kernel/lib/python3.12/site-packages (from pandas) (2.1.3)\n'
          '/kernel/lib/python3.12/site-packages (from pandas) (2.9.0.post0)\n'
          '/kernel/lib/python3.12/site-packages (from pandas) (2024.2)\n'
          '/kernel/lib/python3.12/site-packages (from pandas) (2024.2)\n'
          '/kernel/lib/python3.12/site-packages (from '
          'python-dateutil>=2.8.2->pandas) (1.16.0)\n'
          '\n'
          '\x1b[1m[\x1b[0m\x1b[34;49mnotice\x1b[0m\x1b[1;39;49m]\x1b[0m\x1b[39;49m '
          'A new release of pip is available: '
          '\x1b[0m\x1b[31;49m24.2\x1b[0m\x1b[39;49m -> '
          '\x1b[0m\x1b[32;49m24.3.1\x1b[0m\n'
          '\x1b[1m[\x1b[0m\x1b[34;49mnotice\x1b[0m\x1b[1;39;49m]\x1b[0m\x1b[39;49m '
          'To update, run: \x1b[0m\x1b[32;49mpython -m pip instal

In [62]:
def summarize_differences(obs_path, rep_path):
    obs = nbformat.read(obs_path, as_version=4)
    rep = nbformat.read(rep_path, as_version=4)

    for i, (c1, c2) in enumerate(zip(obs.cells, rep.cells)):
        if c1.get("cell_type") != "code":
            continue

        if c1.get("outputs", []) != c2.get("outputs", []):
            print(f"Cell {i}: outputs differ | source length = {len(c1.source)} chars")

summarize_differences(obs_path, replay_path)


Cell 0: outputs differ | source length = 39 chars
Cell 1: outputs differ | source length = 55 chars
Cell 2: outputs differ | source length = 52 chars
Cell 3: outputs differ | source length = 132 chars
Cell 4: outputs differ | source length = 133 chars
Cell 5: outputs differ | source length = 157 chars
Cell 7: outputs differ | source length = 73 chars
Cell 8: outputs differ | source length = 612 chars
Cell 9: outputs differ | source length = 20 chars
Cell 10: outputs differ | source length = 19 chars
Cell 11: outputs differ | source length = 2230 chars
Cell 12: outputs differ | source length = 195 chars
Cell 13: outputs differ | source length = 383 chars
Cell 14: outputs differ | source length = 111 chars
Cell 15: outputs differ | source length = 0 chars
