In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import glob
import csv
import json
from collections import defaultdict, Counter
import pandas as pd

# ---------------- CONFIG ----------------
# Folder scanned for the *_boxes_reach.csv input files.
INPUT_DIR  = "tdrive_boxes_time_MINUTES"      # where *_boxes_reach.csv live
# Folder that receives one CSV per starting big box plus day_index_map.csv.
OUTPUT_DIR = "tdrive_popularity_sparse_1hour_250" # one file per starting big box

# Expected columns in input
COL_FILE       = "file"
COL_DATE       = "date"
COL_SLOT       = "slot_id"
# Semicolon-separated labels; each label is '/'-separated and only the first
# two fields (big id, small id) are used by the parser.
COL_BOX_LABELS = "box_labels"   # "big/small/time;big/small/time;..."
# ----------------------------------------


def parse_label_loose(token: str):
    """
    Extract the (big_id, small_id) integer pair from a box label.

    Accepts 'big/small/time' as well as 'big/small'; any field after the
    second one (such as the time component) is ignored.
    Returns None when the token has fewer than two '/'-separated fields or
    when either of the first two fields cannot be converted to int.
    """
    try:
        fields = token.strip().split("/")
        if len(fields) >= 2:
            return int(fields[0].strip()), int(fields[1].strip())
    except Exception:
        # Any failure (non-numeric field, non-string token, ...) means "no label".
        pass
    return None


def build_day_index_map(all_dates):
    """
    Assign every distinct date string a 1-based index.

    Indices follow the ascending sort order of the distinct strings, so the
    smallest string gets 1 and the largest gets N.
    """
    ordered = sorted(set(all_dates))
    return dict(zip(ordered, range(1, len(ordered) + 1)))


def process_folder(input_dir: str, output_dir: str):
    """
    Aggregate small-box crossing counts per starting big box and write them out.

    Reads every ``*_boxes_reach.csv`` in ``input_dir``. For each usable row the
    starting big box is taken from the first label of the label column, and
    every parsable label in the row adds one crossing to its small box
    (revisits are counted again). Counts are grouped by
    (starting big box, day_idx, slot_id).

    Outputs, all written into ``output_dir`` (created if needed):
      - big_<ID>_popularity_sparse.csv : day_idx, slot_id, nnz,
        total_crossings, counts_json
      - day_index_map.csv              : date, day_idx

    Files that cannot be read, or that lack the date / slot / label column,
    are skipped with a warning. Rows with an empty date, a non-integer slot,
    or an unparsable first label are ignored. Empty dates do not receive a
    day index, so they cannot shift the indices of real dates.

    Args:
        input_dir: Folder containing the ``*_boxes_reach.csv`` inputs.
        output_dir: Folder that receives the output CSVs.
    """
    os.makedirs(output_dir, exist_ok=True)

    # 1) Load all *_boxes_reach.csv
    paths = sorted(glob.glob(os.path.join(input_dir, "*_boxes_reach.csv")))
    if not paths:
        print("No *_boxes_reach.csv files found.")
        return

    required = [COL_DATE, COL_SLOT, COL_BOX_LABELS]
    frames = []
    for p in paths:
        try:
            df = pd.read_csv(p, dtype={COL_FILE: str, COL_DATE: str}, keep_default_na=False)
        except Exception as e:
            print(f"[WARN] Skipping {os.path.basename(p)}: {e}")
            continue
        # A file without one of these columns would inject NaN into the
        # concatenated frame (or make the date lookup fail), so drop it here.
        absent = [c for c in required if c not in df.columns]
        if absent:
            print(f"[WARN] Skipping {os.path.basename(p)}: missing columns {absent}")
            continue
        frames.append(df[required].copy())

    if not frames:
        print("No valid data after reading files.")
        return

    data = pd.concat(frames, ignore_index=True)
    if data.empty:
        print("Input data is empty.")
        return

    # 2) Build day indices from non-empty dates only. keep_default_na=False
    #    leaves blank cells as '', which would otherwise sort first and take
    #    day_idx 1.
    day_map = build_day_index_map([d for d in data[COL_DATE] if d])

    # 3) Accumulator:
    #    counts[start_big][(day_idx, slot_id)] = Counter({small_id: crossings})
    counts = defaultdict(lambda: defaultdict(Counter))

    # 4) Iterate rows; count EVERY crossing (revisits included).
    #    Zipping the columns avoids building a Series per row.
    for date_str, slot_val, labels_str in zip(
        data[COL_DATE], data[COL_SLOT], data[COL_BOX_LABELS]
    ):
        if not date_str or not isinstance(labels_str, str) or not labels_str:
            continue

        try:
            slot_id = int(slot_val)
        except (TypeError, ValueError, OverflowError):
            continue

        tokens = [tok.strip() for tok in labels_str.split(";") if tok.strip()]
        if not tokens:
            continue

        # Starting big box comes from the first label; a row whose first
        # label cannot be parsed is dropped entirely.
        first = parse_label_loose(tokens[0])
        if first is None:
            continue
        start_big, first_small = first

        ctr = counts[start_big][(day_map[date_str], slot_id)]
        ctr[first_small] += 1

        # Remaining labels: count every occurrence, skipping unparsable ones.
        for tok in tokens[1:]:
            parsed = parse_label_loose(tok)
            if parsed is None:
                continue
            ctr[parsed[1]] += 1

    # 5) Write one CSV per starting big box
    for start_big, bucket in counts.items():
        out_path = os.path.join(output_dir, f"big_{start_big}_popularity_sparse.csv")
        with open(out_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            # Header: day_idx, slot_id, nnz (#unique small boxes), total_crossings, counts_json
            writer.writerow(["day_idx", "slot_id", "nnz", "total_crossings", "counts_json"])
            for (day_idx, slot_id), ctr in sorted(bucket.items()):
                # Keys become strings for strict JSON; sort_keys orders them
                # as strings (lexicographically), not numerically.
                counts_json = json.dumps(
                    {str(k): int(v) for k, v in ctr.items()},
                    separators=(',', ':'),
                    sort_keys=True,
                )
                writer.writerow([day_idx, slot_id, len(ctr), int(sum(ctr.values())), counts_json])

        print(f"[OK] Wrote {os.path.basename(out_path)} with {len(bucket)} rows")

    # Also write day index map for reference (explicit columns keep the
    # header even if no date received an index).
    pd.DataFrame(
        sorted(day_map.items(), key=lambda kv: kv[1]), columns=["date", "day_idx"]
    ).to_csv(os.path.join(output_dir, "day_index_map.csv"), index=False)
    print("[OK] Wrote day_index_map.csv")
def main():
    """Run the aggregation using the module-level INPUT_DIR and OUTPUT_DIR."""
    process_folder(INPUT_DIR, OUTPUT_DIR)
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


In [None]:
"""
Choosing the cellular areas (big_boxes) for the experiments.
Enumerate small_ids across selected big boxes, align rows/columns, and write:
  - global_small_ids.csv        : col_idx (0-based), small_id
  - reverse_small_ids.csv       : small_id, col_idx (0-based)
  - big_<ID>_vectors.csv        : day_idx, slot_id, vector (zero-filled, aligned)

Behavior:
- Columns = UNION of small_ids observed in the SELECTED files only (not the whole folder).
- Rows    = UNION of (day_idx, slot_id) across the SELECTED files; zero-fill where missing.
- Values  = parsed from counts_json as float (works for both popularity ints and UWP floats).

Adjust SPARSE_DIR / OUT_DIR / BIG_IDS as needed.
"""

import os
import glob
import csv
import json
import pandas as pd

# ---------------- CONFIG ----------------
SPARSE_DIR = "tdrive_popularity_sparse_1hour_250"  # folder with the sparse big_* input files (e.g. the popularity folder)
OUT_DIR    = "Popularity_ENUM_Minutes"    # output folder
FILE_GLOB  = "big_*_popularity_sparse.csv"

# Select ONLY these big boxes; the global small_id enumeration is built from them.
# Edit as needed: to reproduce the T-Drive results use [21, 22];
# for San Francisco use [12, 13].
BIG_IDS = [21, 22]


# Number of decimals written per vector value. Values are serialized as floats
# to support UWP; pure-int popularity counts work too.
ROUND_DECIMALS = 6
# ----------------------------------------


def safe_load_json(s: str):
    """
    Parse a JSON object string into a dict without raising.

    Returns ``{}`` when ``s`` is not a non-empty string (for instance a float
    NaN), when it is not valid JSON, or when the decoded value is not a JSON
    object. Callers iterate the result with ``.items()`` / ``.keys()``, so a
    decoded list, number or string would otherwise crash them.
    """
    if not isinstance(s, str) or not s:
        return {}
    try:
        parsed = json.loads(s)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers
        # pathologically nested input.
        return {}
    return parsed if isinstance(parsed, dict) else {}


def list_available_files(sparse_dir: str, pattern: str):
    """
    Map big-box id -> file path for the files in sparse_dir matching pattern.

    Only names of the form big_<int>_popularity_sparse.csv are kept; names
    whose middle part is not an integer are ignored.
    """
    prefix, suffix = "big_", "_popularity_sparse.csv"
    found = {}
    for path in glob.glob(os.path.join(sparse_dir, pattern)):
        name = os.path.basename(path)
        if not (name.startswith(prefix) and name.endswith(suffix)):
            continue
        id_text = name[len(prefix):-len(suffix)]
        try:
            big_id = int(id_text)
        except Exception:
            continue
        found[big_id] = path
    return found


def collect_union_small_ids(files_map):
    """
    Gather every small_id that appears as a counts_json key in the given files.

    Files are visited in ascending big-id order; a file that cannot be read
    is skipped, as is any key that is not an integer. Returns the distinct
    ids as an ascending list.
    """
    seen = set()
    for big_id in sorted(files_map):
        try:
            frame = pd.read_csv(files_map[big_id], usecols=["counts_json"])
        except Exception:
            continue
        for cell in frame["counts_json"]:
            for key in safe_load_json(cell).keys():
                try:
                    seen.add(int(key))
                except Exception:
                    continue
    return sorted(seen)


def collect_union_day_slot(files_map):
    """
    Gather the distinct (day_idx, slot_id) pairs found in the given files.

    Unreadable files are skipped, as are rows where either value is missing
    or cannot be converted to int. Returns a DataFrame with columns
    day_idx and slot_id, sorted ascending; it has no rows when nothing
    was found.
    """
    columns = ["day_idx", "slot_id"]
    seen = set()
    for path in files_map.values():
        try:
            frame = pd.read_csv(path, usecols=columns)
        except Exception:
            continue
        # Select by name so the (day, slot) order does not depend on the
        # column order inside the file.
        for day, slot in frame[columns].to_numpy():
            if pd.isna(day) or pd.isna(slot):
                continue
            try:
                seen.add((int(day), int(slot)))
            except Exception:
                continue
    if seen:
        return pd.DataFrame(sorted(seen), columns=columns)
    return pd.DataFrame(columns=columns)


def write_global_index(out_dir, small_ids_sorted):
    """
    Write global_small_ids.csv into out_dir (created if needed).

    Each row pairs a 0-based column index with the small_id at that position
    in small_ids_sorted.
    """
    os.makedirs(out_dir, exist_ok=True)
    target = os.path.join(out_dir, "global_small_ids.csv")
    with open(target, "w", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(["col_idx", "small_id"])
        # enumerate yields (col_idx, small_id) pairs, already in column order.
        writer.writerows(enumerate(small_ids_sorted))
    print(f"[OK] Wrote global_small_ids.csv (K={len(small_ids_sorted)})")


def write_reverse_mapping(out_dir, small_ids_sorted):
    """
    Write reverse_small_ids.csv into out_dir (created if needed).

    Each row pairs a small_id with its 0-based column index, i.e. its
    position in small_ids_sorted.
    """
    # Create the folder here as well, so this writer does not depend on
    # write_global_index having been called first.
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, "reverse_small_ids.csv")
    with open(path, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["small_id", "col_idx"])
        w.writerows((sid, i) for i, sid in enumerate(small_ids_sorted))  # 0-based
    print(f"[OK] Wrote reverse_small_ids.csv (K={len(small_ids_sorted)})")


def vector_to_string(vec):
    """Serialize vec as '[v0,v1,...]', each value in fixed-point with ROUND_DECIMALS decimals."""
    spec = f".{ROUND_DECIMALS}f"
    return "[" + ",".join(format(value, spec) for value in vec) + "]"


def load_sparse_map(path):
    """
    Read one big_* sparse file into a nested dict:
      {(day_idx, slot_id): {small_id: float_value, ...}}

    Returns {} when the file cannot be read or lacks any of the day_idx,
    slot_id, counts_json columns. Rows with a missing or non-integer
    day/slot are skipped; counts entries whose key is not an int or whose
    value is not a float are dropped. A later row with the same
    (day_idx, slot_id) replaces an earlier one.
    """
    try:
        frame = pd.read_csv(path)
    except Exception:
        return {}

    needed = ("day_idx", "slot_id", "counts_json")
    if any(col not in frame.columns for col in needed):
        return {}

    result = {}
    for _, record in frame.iterrows():
        day, slot = record.get("day_idx"), record.get("slot_id")
        if pd.isna(day) or pd.isna(slot):
            continue
        try:
            day_slot = (int(day), int(slot))
        except Exception:
            continue

        values = {}
        for raw_id, raw_val in safe_load_json(record.get("counts_json", "")).items():
            try:
                # float() accepts both int popularity counts and UWP floats.
                values[int(raw_id)] = float(raw_val)
            except Exception:
                continue
        result[day_slot] = values
    return result


def convert_to_vectors(files_map, small_ids_sorted, union_day_slot, out_dir):
    """
    Write big_<ID>_vectors.csv into out_dir for every entry of files_map.

    Each output has one row per (day_idx, slot_id) of union_day_slot and a
    'vector' column whose positions follow small_ids_sorted; positions with
    no value in the source file stay 0.0.
    """
    os.makedirs(out_dir, exist_ok=True)
    position_of = {sid: pos for pos, sid in enumerate(small_ids_sorted)}
    width = len(small_ids_sorted)

    for big_id, src in sorted(files_map.items()):
        sparse = load_sparse_map(src)
        records = []

        for _, pair in union_day_slot.iterrows():
            day = int(pair["day_idx"])
            slot = int(pair["slot_id"])

            dense = [0.0] * width
            # A (day, slot) absent from this file yields an all-zero vector.
            for small_id, value in sparse.get((day, slot), {}).items():
                pos = position_of.get(small_id)
                if pos is None:
                    continue
                dense[pos] = value

            records.append(
                {"day_idx": day, "slot_id": slot, "vector": vector_to_string(dense)}
            )

        out_name = f"big_{big_id}_vectors.csv"
        out_df = pd.DataFrame(records).sort_values(["day_idx", "slot_id"])
        out_df.to_csv(os.path.join(out_dir, out_name), index=False)
        print(f"[OK] Wrote {out_name}  rows={len(out_df)}  vec_dim={width}")


def main():
    """
    Build the aligned vector files for the big boxes listed in BIG_IDS.

    Steps: locate the selected sparse files, enumerate the union of their
    small_ids (written as forward and reverse mappings), collect the union
    of (day_idx, slot_id) rows, then write one zero-filled vectors CSV per
    selected big box.
    """
    available = list_available_files(SPARSE_DIR, FILE_GLOB)

    # Split the requested ids into those present in the folder and those not.
    files_map = {}
    missing = []
    for bid in BIG_IDS:
        if bid in available:
            files_map[bid] = available[bid]
        else:
            missing.append(bid)

    if missing:
        print(f"[WARN] Missing big IDs: {missing}")
    if not files_map:
        print("ERROR: none of the requested big IDs exist in the folder.")
        return

    # 1) Column enumeration: union of small_ids -> indices 0..K-1
    small_ids_sorted = collect_union_small_ids(files_map)
    write_global_index(OUT_DIR, small_ids_sorted)
    write_reverse_mapping(OUT_DIR, small_ids_sorted)

    # 2) Row set: union of (day_idx, slot_id) across the selected files
    union_day_slot = collect_union_day_slot(files_map)
    if union_day_slot.empty:
        print("[INFO] No (day_idx, slot_id) pairs found. Nothing to write.")
        return

    # 3) One aligned, zero-filled vectors file per selected big box
    convert_to_vectors(files_map, small_ids_sorted, union_day_slot, OUT_DIR)

    print(f"All done. Outputs at: {OUT_DIR}")


if __name__ == "__main__":
    main()

In [None]:
import os
import ast
import numpy as np
import pandas as pd
# ---------------- CONFIG ----------------
INPUT_DIR = "Popularity_ENUM_Minutes"      # Folder where your CSVs are
FILES = ["big_21_vectors.csv", "big_22_vectors.csv"]  # Add more if needed
# Edit FILES as needed: to reproduce the T-Drive results use big boxes 21 and 22;
# for San Francisco use 12 and 13.
SAVE_PATH = "big_Popularity_tensor.npy"        # Save in current directory
# ----------------------------------------


def parse_vector(vec_str):
    """Turn a '[0,1,2,...]' literal into a list of floats; return [] if it cannot be parsed."""
    try:
        literal = ast.literal_eval(vec_str)
        return list(map(float, literal))
    except Exception:
        return []

# One 2-D array per successfully loaded file, in FILES order.
tensors = []

for fname in FILES:
    path = os.path.join(INPUT_DIR, fname)
    if not os.path.exists(path):
        # A skipped file shortens axis 0 of the final tensor.
        print(f"[WARN] File not found: {path}")
        continue

    # Read CSV and parse vector column
    df = pd.read_csv(path)
    df["vector_list"] = df["vector"].apply(parse_vector)

    # Convert to NumPy array (num_timeslots, vector_len). Forcing a float
    # dtype makes rows of unequal length (e.g. a vector that failed to parse
    # and came back as []) fail here instead of producing an object array.
    try:
        arr = np.array(df["vector_list"].to_list(), dtype=float)
    except ValueError as exc:
        raise ValueError(
            f"{fname}: vector rows have inconsistent lengths or could not be parsed"
        ) from exc
    tensors.append(arr)
    print(f"[OK] Loaded {fname} → shape {arr.shape}")

if not tensors:
    raise RuntimeError("No valid CSV files loaded. Exiting.")

# Stack all arrays into one 3D tensor: (num_big_boxes, num_timeslots, vector_len).
# np.stack requires every per-file array to have the same shape.
tensor = np.stack(tensors, axis=0)
print(f"\n Final tensor shape: {tensor.shape}")

# Save the tensor to SAVE_PATH (relative to the current directory)
np.save(SAVE_PATH, tensor)
print(f"[OK] Saved combined tensor → {os.path.abspath(SAVE_PATH)}")