# Prepare Datasets
## Setting:

In [1]:
from pathlib import Path
import sys
import os
from pathlib import Path
import pandas as pd
import subprocess
from pathlib import Path

def _add_notebooks_src_to_path():
    """Locate the nearest ``notebooks/src`` directory and put it on ``sys.path``.

    The search starts at the resolved current working directory and then walks
    up through its parents; the first directory that contains ``notebooks/src``
    wins.

    Returns:
        Path: the ``notebooks/src`` directory that was found.

    Raises:
        FileNotFoundError: if neither the working directory nor any of its
            parents contains ``notebooks/src``.
    """
    start = Path.cwd().resolve()
    helpers_dir = next(
        (
            root / "notebooks" / "src"
            for root in (start, *start.parents)
            if (root / "notebooks" / "src").is_dir()
        ),
        None,
    )
    if helpers_dir is None:
        raise FileNotFoundError("Could not find 'notebooks/src' from current working directory.")

    helpers_entry = str(helpers_dir)
    if helpers_entry not in sys.path:
        # Prepended, so this directory is searched before the existing entries.
        sys.path.insert(0, helpers_entry)
    return helpers_dir

# Put notebooks/src on sys.path first; presumably that is where `constants` lives,
# so this call must run before the import below.
print("Using helpers from:", _add_notebooks_src_to_path())

from constants import (
    REPO_ROOT, PKG_DIR, RAW_DATA_DIR, PROCESSED_DATA_DIR, MODELS_ROOT, ensure_repo_importable
)
# Also put the repository root on sys.path (only once).
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
datasets_name = ["wikipedia", "reddit", "ucim" ]   # dataset names; NOTE(review): not referenced in the visible cells
bipartite = True  # NOTE(review): not referenced in the visible cells
RAW_DATA_FILE = RAW_DATA_DIR  # NOTE(review): despite the name this is the raw-data directory, not a file


print("REPO_ROOT       :", REPO_ROOT)
print("RAW_DATA_FILE    :", RAW_DATA_FILE)

Using helpers from: /Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain/notebooks/src
REPO_ROOT       : /Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain
RAW_DATA_FILE    : /Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain/resources/datasets/raw


## 1. Download data

The raw dataset files should have the same format as the Wikipedia dataset; that is: 

- First column: Source node ids
- Second column: Target node ids
- Third column: UNIX timestamp
- Fourth column: State label (not necessary for link prediction task)
- Fifth column and onwards: Comma-separated list of edge features

### Download Reddit

In [2]:
#!curl http://snap.stanford.edu/jodie/reddit.csv > "{RAW_DATA_DIR}/reddit.csv"

### Download Wikipedia

In [3]:
#!curl http://snap.stanford.edu/jodie/wikipedia.csv > "{RAW_DATA_DIR}/wikipedia.csv"

### Download UCI-Messages

In [4]:
#!curl http://opsahl.co.uk/tnet/datasets/OCnodeslinks.txt > "{RAW_DATA_DIR}/UCI-Messages.txt"

#### Reformat UCI-Messages:
The UCI dataset does not come in this format by default. To make the conversion easier, the 
[format_uci_data.py](./format_uci_data.py) script is provided. First download the dataset from the website 
mentioned above to a file. Then convert the downloaded file to an appropriately formatted .csv file; the cell below applies that conversion directly in the notebook:

In [5]:
# Convert the raw UCI-Messages file into a CSV with the columns
# user_id, item_id, timestamp (seconds since 1970-01-01), state_label.
input_path = f"{RAW_DATA_DIR}/UCI-Messages.txt"
output_path = RAW_DATA_DIR / "ucim.csv"

inp = Path(input_path)
outp = Path(output_path)

# Fail early with an actionable message: the download cell above is commented
# out, so on a fresh checkout the raw file may not exist yet.
if not inp.is_file():
    raise FileNotFoundError(
        f"Raw UCI-Messages file not found at {inp}. "
        "Run the 'Download UCI-Messages' cell first."
    )
outp.parent.mkdir(parents=True, exist_ok=True)

# --- Read raw ---
# Original script uses sep=" ". If your file has variable whitespace, consider sep=r"\s+", engine="python".
raw_data = pd.read_csv(inp, sep=" ", header=None)

# --- Transform ---
# NOTE(review): the column names follow the original script. Confirm that the
# second raw column really is the target (item) and the third the source (user);
# if the raw file lists the sender first, this mapping reverses edge direction.
raw_data.columns = ['timestamp', 'item_id', 'user_id', 'state_label']
# Parse the timestamp column and express it as whole seconds since 1970-01-01.
raw_data['timestamp'] = pd.to_datetime(raw_data['timestamp'])
raw_data['timestamp'] = (raw_data['timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
reordered_data = raw_data[['user_id', 'item_id', 'timestamp', 'state_label']]

# --- Save ---
reordered_data.to_csv(outp, index=False)
print(f"Reformatted file saved to {outp}")

# (Optional) Peek at the first few rows.
# Only a missing IPython triggers the plain-text fallback; errors raised while
# rendering are no longer swallowed.
try:
    from IPython.display import display
except ImportError:
    print(reordered_data.head())
else:
    display(reordered_data.head())

Reformatted file saved to /Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain/resources/datasets/raw/ucim.csv


Unnamed: 0,user_id,item_id,timestamp,state_label
0,3,3,1080072715,1
1,1,1,1080072740,1
2,15,15,1080699604,1
3,141,141,1081348428,1
4,2,2,1081994911,1


## 2. Generate Synthetic Datasets

In [6]:
from time_to_explain.data.generate_synthetic_dataset import prepare_dataset

# Keyword arguments that are identical for both synthetic datasets.
synthetic_config_dir = REPO_ROOT / "time_to_explain" / "data" / "configs"
shared_options = dict(
    project_root=REPO_ROOT,
    split=(0.8, 0.1, 0.1),
    visualize=True,
    overwrite=True,
)

results = {}

# erdos_small: any error is allowed to propagate.
results["erdos_small"] = prepare_dataset(
    dataset_name="erdos_small",
    recipe="erdos_temporal",
    config_path=synthetic_config_dir / "erdos_small.json",
    **shared_options,
)

# hawkes_small: a RuntimeError is reported and the dataset is left out of `results`.
try:
    hawkes_summary = prepare_dataset(
        dataset_name="hawkes_small",
        recipe="hawkes_exp",
        config_path=synthetic_config_dir / "hawkes_small.json",
        **shared_options,
    )
except RuntimeError as exc:
    print(f"Skipping hawkes_small: {exc}")
else:
    results["hawkes_small"] = hawkes_summary

results


Generated dataset 'erdos_small' using recipe 'erdos_temporal'.
{
  "num_interactions": 637,
  "min_ts": 0.015524928120638215,
  "max_ts": 7.994895618651639,
  "num_users": 44,
  "num_items": 46,
  "label_balance": {
    "0": 622,
    "1": 15
  }
}
Processed data   : /Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain/resources/datasets/processed/erdos_small
Visualization dir: /Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain/resources/datasets/processed/erdos_small/plots



[HawkesExp] Branching ratio spectral radius ≈ 1.245 ≥ 1; the process may be highly clustered or explosive on long horizons.



Generated dataset 'hawkes_small' using recipe 'hawkes_exp'.
{
  "num_interactions": 15437,
  "min_ts": 0.09196483972970995,
  "max_ts": 9.99994409423994,
  "num_users": 50,
  "num_items": 50,
  "label_balance": {
    "-1": 14327,
    "1": 555,
    "0": 555
  }
}
Processed data   : /Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain/resources/datasets/processed/hawkes_small
Visualization dir: /Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain/resources/datasets/processed/hawkes_small/plots


{'erdos_small': {'dataset': 'erdos_small',
  'recipe': 'erdos_temporal',
  'config': {'num_nodes': 50,
   'p': 0.06,
   'horizon': 8.0,
   'rate': 0.6,
   'positive_block_size': 8,
   'node_feat_dim': 4,
   'seed': 7},
  'stats': {'num_interactions': 637,
   'min_ts': 0.015524928120638215,
   'max_ts': 7.994895618651639,
   'num_users': 44,
   'num_items': 46,
   'label_balance': {0: 622, 1: 15}},
  'split': {'fractions': [0.8, 0.1, 0.1],
   'counts': {'train': 509, 'val': 64, 'test': 64},
   'cutoffs': {'train_max_ts': 6.513623874806881,
    'val_max_ts': 7.2725773226354296}},
  'processed_dir': '/Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain/resources/datasets/processed/erdos_small',
  'raw_path': '/Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain/resources/datasets/raw/erdos_small.csv',
  'visualizations': {'event_count': ['/Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain/resources/datasets/processed/e

## 3. Preprocess Datasets

In [7]:
from time_to_explain.data.tgnn_setup import setup_tgnn_data


print(f"REPO_ROOT:{REPO_ROOT}")
# common options, collected in one place and unpacked into the call below
tgnn_setup_options = {
    "root": REPO_ROOT,      # repo root; defaults to cwd or env ROOT/REPO_ROOT/PROJECT_ROOT
    # "only": ["wikipedia", "reddit"],   # subset: "reddit", "wikipedia", "simulate_v1", "simulate_v2"
    "force": False,         # re-download even if file exists
    "do_process": True,     # process real datasets (wikipedia/reddit)
    "do_index": True,       # generate explain indices
    "seed": 42,
    "index_size": 500,
}
setup_tgnn_data(**tgnn_setup_options)

REPO_ROOT:/Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain
ROOT: /Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain
✔ Skipping (exists): /Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain/resources/datasets/raw/reddit.csv
✔ Skipping (exists): /Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain/resources/datasets/raw/wikipedia.csv
✔ Skipping (exists): /Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain/resources/datasets/raw/simulate_v1.csv
✔ Skipping (exists): /Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain/resources/datasets/processed/ml_simulate_v1.csv
✔ Skipping (exists): /Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain/resources/datasets/processed/ml_simulate_v1.npy
✔ Skipping (exists): /Users/juliawenkmann/Documents/CodingProjects/master_thesis/time_to_explain/resources/datasets/processed/ml_simulate_v1_

## 4. Visualize

In [8]:
!pip install plotly kaleido



In [None]:

import json
from pathlib import Path

import pandas as pd
from time_to_explain.core.registry import available_datasets
from time_to_explain.data.io import load_processed_dataset
from time_to_explain.data.generate_synthetic_dataset import prepare_dataset
from time_to_explain.utils.visualization import (
    visualize_folder,
    plot_bipartite_graph,
    animate_bipartite_graph,
    plot_explain_timeline,
    visualize_to_files,
)

# Alias of REPO_ROOT; passed as `project_root` when a missing dataset is generated in the loop below.
repo_root = REPO_ROOT

def choose_explain_indices(num_events: int, *, count: int = 3) -> list[int]:
    """Pick up to ``count`` distinct event indices: first, middle and last event.

    The candidates are ``0``, ``num_events // 2`` and ``num_events - 1``, in that
    order; duplicates (which occur for very small ``num_events``) are dropped
    while keeping the order.

    Args:
        num_events: Total number of events; indices are drawn from
            ``range(num_events)``.
        count: Maximum number of indices to return. At most three indices are
            available, so larger values still yield at most three.

    Returns:
        A list of plain ``int`` indices. Empty when ``num_events <= 0`` or
        ``count <= 0``.
    """
    # Previously a non-positive `count` was never matched by the length check
    # and all anchors were returned; treat it as "no indices requested".
    if num_events <= 0 or count <= 0:
        return []
    anchors = (0, num_events // 2, num_events - 1)
    # dict.fromkeys removes duplicates while preserving first-seen order.
    distinct = [int(anchor) for anchor in dict.fromkeys(anchors)]
    return distinct[:count]

datasets = available_datasets()
print(f"Available synthetic recipes: {datasets}")

# For every registered recipe: load its processed dataset (generating it first
# when no processed files exist), then render all visualizations.
for recipe_name in datasets:
    print(f"=== {recipe_name} ===")
    try:
        bundle = load_processed_dataset(recipe_name)
    except FileNotFoundError:
        # Nothing processed yet: generate the dataset under the recipe's own
        # name and load it again. Any failure here skips the recipe.
        try:
            summary = prepare_dataset(
                project_root=repo_root,
                dataset_name=recipe_name,
                recipe=recipe_name,
                visualize=False,
                overwrite=True,
            )
            print(f"Generated processed dataset at {summary['processed_dir']}")
            bundle = load_processed_dataset(recipe_name)
        except Exception as exc:
            print(f"Skipping {recipe_name}: {exc}")
            continue

    interactions = None if bundle is None else bundle["interactions"]
    num_events = 0 if interactions is None else len(interactions)
    explain_indices = choose_explain_indices(num_events)

    # Best effort: a failing plot is reported and the loop moves on.
    try:
        visualize_folder(recipe_name)
        plot_bipartite_graph(recipe_name, max_users=50, max_items=50)
        # Two animations: the default one, then one with pruned=0.25.
        for extra_options in ({}, {"pruned": 0.25}):
            animate_bipartite_graph(recipe_name, bins=25, cumulative=True, **extra_options)
        if not explain_indices:
            print(f"No events found in {recipe_name}; skipping explain timeline.")
        else:
            plot_explain_timeline(
                recipe_name,
                event_indices=explain_indices,
                window=15,
                max_base_points=20000,
            )
        visualize_to_files(
            recipe_name,
            out_dir=Path('viz_out') / recipe_name,
            explain_indices=explain_indices,
            explain_time_window=15,
        )
    except Exception as exc:
        print(f"Visualization failed for {recipe_name}: {exc}")

processed_root = PROCESSED_DATA_DIR
synthetic_names = set(datasets)

# Names of all sub-directories of the processed-data root (empty when the root is missing).
processed_datasets: list[str] = (
    [child.name for child in processed_root.iterdir() if child.is_dir()]
    if processed_root.exists()
    else []
)

def is_synthetic_folder(name: str) -> bool:
    """Tell whether the processed folder ``name`` holds a synthetic dataset.

    A folder counts as synthetic when its name is one of ``synthetic_names``, or
    when ``<processed_root>/<name>/ml_<name>.json`` records a recipe, either as a
    top-level ``"recipe"`` entry or under ``"metadata"``.

    Metadata that cannot be read or parsed is reported with ``print`` and the
    folder is treated as non-synthetic.
    """
    if name in synthetic_names:
        return True

    meta_path = processed_root / name / f"ml_{name}.json"
    if not meta_path.exists():
        return False

    try:
        with meta_path.open("r", encoding="utf-8") as handle:
            meta = json.load(handle)
        if meta.get("recipe"):
            return True
        # The top-level entry is absent or falsy: fall back to the nested one,
        # where any value other than None counts.
        nested_recipe = (meta.get("metadata") or {}).get("recipe")
        return nested_recipe is not None
    except Exception as exc:
        print(f"Could not parse metadata for {name}: {exc}")
        return False

def ensure_ml_format(dataset_name: str) -> None:
    """Create ``ml_<dataset_name>.csv`` in the dataset's processed folder if missing.

    The first ``*_data.csv`` (or, failing that, the first ``*.csv``) in
    ``processed_root / dataset_name`` is used as the source. Known column aliases
    are renamed to ``u``/``i``/``ts``/``label``/``idx``; missing ``label``,
    ``idx`` and ``e_idx`` columns are filled in; rows are ordered by ``ts``
    (ties keep their source order); and the columns
    ``u, i, ts, label, idx, e_idx`` are written to the target file.

    Every failure mode (no source CSV, unreadable CSV, missing required columns,
    ambiguous duplicate columns, values that cannot be cast) is reported with
    ``print`` and leaves the folder untouched; none of them raises.

    Args:
        dataset_name: Name of the sub-folder under ``processed_root``.
    """
    dataset_dir = processed_root / dataset_name
    ml_csv = dataset_dir / f"ml_{dataset_name}.csv"
    if ml_csv.exists():
        return

    # Prefer "<something>_data.csv"; otherwise take any CSV (alphabetically first).
    alt_csv = None
    for pattern in ("*_data.csv", "*.csv"):
        candidates = sorted(dataset_dir.glob(pattern))
        if candidates:
            alt_csv = candidates[0]
            break
    if alt_csv is None:
        print(f"No CSV source found for {dataset_name}; cannot build ml_ files.")
        return

    try:
        df = pd.read_csv(alt_csv)
    except Exception as exc:
        print(f"Failed to read {alt_csv}: {exc}")
        return

    rename_map = {
        "user_id": "u",
        "item_id": "i",
        "timestamp": "ts",
        "state_label": "label",
        "event_time": "ts",
        "event_id": "idx",
    }
    df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})

    # Two aliases can map to the same name (e.g. "timestamp" and "event_time");
    # the resulting duplicate labels would make the sort below fail.
    if df.columns.duplicated().any():
        print(f"Skipping {dataset_name}: duplicate columns after renaming in {alt_csv.name}")
        return

    required = {"u", "i", "ts"}
    if not required.issubset(df.columns):
        print(f"Skipping {dataset_name}: columns {required} missing in {alt_csv.name}")
        return

    if "label" not in df.columns:
        df["label"] = 0
    # Stable sort: events with equal timestamps keep their order from the source file.
    df = df.sort_values("ts", kind="mergesort").reset_index(drop=True)
    if "idx" not in df.columns:
        df["idx"] = df.index.astype(int)
    if "e_idx" not in df.columns:
        df["e_idx"] = df["idx"] + 1

    # Ensure dtypes. Missing or non-numeric values make the cast fail; report
    # that like the other failure modes instead of raising.
    try:
        df = df.astype(
            {"u": int, "i": int, "idx": int, "e_idx": int, "ts": float, "label": int}
        )
    except (TypeError, ValueError) as exc:
        print(f"Skipping {dataset_name}: cannot convert columns of {alt_csv.name}: {exc}")
        return

    df[["u", "i", "ts", "label", "idx", "e_idx"]].to_csv(ml_csv, index=False)
    print(f"Created {ml_csv.name} from {alt_csv.name} for visualization support.")

processed_datasets = sorted(set(processed_datasets))
print(f"Processed datasets: {processed_datasets}")
non_synthetic = list(filter(lambda name: not is_synthetic_folder(name), processed_datasets))

print(f"Processed datasets (non-synthetic): {non_synthetic}")

# Render the same set of visualizations for every non-synthetic processed dataset.
for dataset_name in non_synthetic:
    print(f"=== processed: {dataset_name} ===")
    # Build ml_<name>.csv first when it is missing, so the loader can find it.
    ensure_ml_format(dataset_name)
    try:
        bundle = load_processed_dataset(dataset_name)
    except FileNotFoundError as exc:
        print(f"Skipping {dataset_name}: {exc}")
        continue
    else:
        interactions = bundle["interactions"]
        explain_indices = choose_explain_indices(len(interactions))

    # Best effort: a failing plot is reported and the loop moves on.
    try:
        visualize_folder(dataset_name)
        plot_bipartite_graph(dataset_name, max_users=50, max_items=50)
        animate_bipartite_graph(dataset_name, bins=25, cumulative=True)
        animate_bipartite_graph(dataset_name, bins=25, cumulative=True, pruned=0.25)
        if not explain_indices:
            print(f"No events found in {dataset_name}; skipping explain timeline.")
        else:
            plot_explain_timeline(
                dataset_name,
                event_indices=explain_indices,
                window=15,
                max_base_points=20000,
            )
        visualize_to_files(
            dataset_name,
            out_dir=Path('viz_out') / dataset_name,
            explain_indices=explain_indices,
            explain_time_window=15,
        )
    except Exception as exc:
        print(f"Visualization failed for {dataset_name}: {exc}")


Available synthetic recipes: ['erdos_temporal', 'hawkes_exp']
=== erdos_temporal ===


=== hawkes_exp ===


Processed datasets: ['erdos_small', 'erdos_temporal', 'hawkes_exp', 'hawkes_small', 'reddit', 'uci_forums', 'ucim', 'wikipedia']
Processed datasets (non-synthetic): ['reddit', 'uci_forums', 'ucim', 'wikipedia']
=== processed: reddit ===
