In [None]:
import random
import pandas as pd
from tqdm import tqdm

# --- Configuration ---
# Number of tasks to generate for the train and test partitions.
TRAIN_SIZE = 333_333
TEST_SIZE = 3_333
# Inclusive (min, max) bounds on the number of cells in a sampled start state.
SEQ_LEN_RANGE = (12, 20)
# Inclusive (min, max) bounds on the number of simulated steps per task type.
STEPS_RANGE_FWD = (10, 15) # For Deduction/Induction
STEPS_RANGE_REV = (1, 3)  # For Abduction (multi-step reverse)
# Seed used for Python's `random` module and for the pandas row shuffles.
SEED = 42

# Wolfram Classes
# Maps a class id (1-4) to the rule numbers that are sampled for that class.
WOLFRAM_CLASSES = {
    1: [0, 8, 32, 40, 128, 136, 160, 168],
    2: [1, 19, 23, 29, 37, 50, 108, 178],
    3: [30, 45, 60, 90, 105, 126, 150],
    4: [54, 106, 110, 124, 137, 147, 193]
}
# Weights for data balancing
# One sampling weight per class, matched by position to the key order of
# WOLFRAM_CLASSES (i.e. classes 1, 2, 3, 4).
CLASS_PROBS = [0.1, 0.2, 0.35, 0.35]

# Seed the global RNG once so all `random` draws below start from a fixed state.
random.seed(SEED)

# --- Core Logic ---

def get_rule_bit(rule, l, c, r):
    """Return the output bit that `rule` assigns to one 3-cell neighborhood.

    The left, center and right cells are packed into a 3-bit index with the
    left cell as the most significant bit; the result is the bit of `rule`
    found at that index (0 or 1).
    """
    index = (l << 2) | (c << 1) | r
    return (rule >> index) & 1

def get_next_state(state, rule):
    """Apply one step of an elementary CA rule to `state` and return a new list.

    The state is treated as a ring: the left neighbor of the first cell is
    the last cell and the right neighbor of the last cell is the first one.
    Each output cell is the bit of `rule` selected by its (left, center,
    right) neighborhood, with the left cell as the most significant bit.
    """
    size = len(state)
    return [
        (rule >> ((state[(pos - 1) % size] << 2) | (cell << 1) | state[(pos + 1) % size])) & 1
        for pos, cell in enumerate(state)
    ]

def get_rule_description(rule_num):
    """Render the rule's truth table as text, from pattern 111 down to 000.

    Each entry has the form "<3-bit pattern>-><output bit>" and entries are
    joined with ", ".
    """
    return ", ".join(
        f"{format(pattern, '03b')}->{(rule_num >> pattern) & 1}"
        for pattern in reversed(range(8))
    )

def invert_rule_map(rule):
    """
    Group the eight neighborhood patterns by the output bit the rule gives them.

    Returns a dict {0: [patterns...], 1: [patterns...]} where each pattern is
    an integer 0-7 and each list is in ascending order.
    Example (rule 30): {0: [0, 5, 6, 7], 1: [1, 2, 3, 4]}
    """
    return {
        outcome: [pattern for pattern in range(8) if ((rule >> pattern) & 1) == outcome]
        for outcome in (0, 1)
    }

# --- TASK 1: DEDUCTION (ConvNet Trace) ---

def generate_deduction_task(start_state, rule, steps):
    """Build a deduction task: simulate `rule` forward and log every window.

    For each of the `steps` iterations the trace records one "Scan" line
    listing every cell's (left, center, right) window and its output bit,
    followed by an "Output" line with the resulting state. The returned dict
    has the keys "task", "input", "cot" and "target"; "target" is the state
    after the last step.
    """
    def render(cells):
        return "".join(map(str, cells))

    rule_desc = get_rule_description(rule)
    state = list(start_state)
    log = []

    for step_no in range(1, steps + 1):
        width = len(state)
        # Collect all wrap-around windows first, then evaluate the rule on each.
        windows = [
            (state[(pos - 1) % width], state[pos], state[(pos + 1) % width])
            for pos in range(width)
        ]
        outputs = [get_rule_bit(rule, *window) for window in windows]

        scan = " | ".join(
            f"{pos}:[{left}{center}{right}]->{out}"
            for pos, ((left, center, right), out) in enumerate(zip(windows, outputs))
        )
        log.append(f"Step {step_no} Scan: " + scan)
        log.append(f"Step {step_no} Output: {render(outputs)}")
        state = outputs

    return {
        "task": "deduction",
        "input": f"Task: Deduction, Rule: {rule} [{rule_desc}], Steps: {steps}, Start: {render(start_state)}",
        "cot": "\n".join(log),
        "target": render(state)
    }

# --- TASK 2: INDUCTION (Full Search Sweep) ---

def generate_induction_task(start_state, rule, steps, w_class):
    """
    Build an induction task: recover the rule from a start/end state pair.

    The end state is obtained by running `rule` for `steps` steps. The trace
    then tries the rules of class `w_class` (plus `rule` itself if it is not
    listed there) in shuffled order, logging every simulated step of every
    attempt, and stops at the first candidate whose final state equals the
    end state. "target" is the truth-table description of that candidate.
    """
    def render(cells):
        return "".join(map(str, cells))

    # Forward-simulate the true rule to get the observation to explain.
    observed = list(start_state)
    for _ in range(steps):
        observed = get_next_state(observed, rule)

    target_str = render(observed)
    start_str = render(start_state)

    # Candidate pool: the class members, guaranteed to include the true rule.
    candidates = list(WOLFRAM_CLASSES[w_class])
    if rule not in candidates:
        candidates.append(rule)
    random.shuffle(candidates)

    trace_lines = [f"Task: Find rule in Class {w_class}. Candidates: {candidates}"]

    for attempt_no, candidate in enumerate(candidates, start=1):
        candidate_desc = get_rule_description(candidate)
        trace_lines.append(f"\nAttempt {attempt_no}: Testing Rule {candidate} [{candidate_desc}]")

        simulated = list(start_state)
        for step_no in range(1, steps + 1):
            simulated = get_next_state(simulated, candidate)
            trace_lines.append(f"  Sim Step {step_no}: {render(simulated)}")

        if render(simulated) == target_str:
            trace_lines.append("Verification: Exact Match. Rule Confirmed.")
            found_rule_desc = candidate_desc
            break

        trace_lines.append(f"Verification: Mismatch (Expected {target_str}). Next...")
    else:
        # No candidate matched: fall back to describing the true rule.
        found_rule_desc = get_rule_description(rule)

    return {
        "task": "induction",
        "input": f"Task: Induction, Start: {start_str}, End: {target_str}, Steps: {steps}, Hint: Class {w_class}",
        "cot": "\n".join(trace_lines),
        "target": found_rule_desc
    }

# --- TASK 3: ABDUCTION (Likelihood Propagation) ---

def generate_probabilistic_abduction(target_state, rule, steps, true_start_state):
    """
    Backpropagates constraints using 'Likelihood of Priors' reasoning.

    Starting from the state reached after `steps` forward applications of
    `rule` to `true_start_state`, the trace walks backwards one step at a
    time. For each step it lists, per cell, the neighborhoods that could have
    produced that cell, turns the resulting votes into a per-cell likelihood
    of being 1, takes the greedy (>= 50%) candidate and verifies it by
    simulating one step forward. If the candidate fails, the ground-truth
    parent is revealed instead.

    Args:
        target_state: End state as supplied by the caller. It is not read
            here; the end state is recomputed from `true_start_state`, so the
            two are expected to agree.
        rule: Elementary CA rule number.
        steps: Number of steps to reverse.
        true_start_state: State that evolves into the end state after
            `steps` steps; used to build the ground-truth trajectory.

    Returns:
        dict with the keys "task", "input", "cot" and "target". "target" is
        the state the trace ends on, which evolves into the end state after
        `steps` steps.
    """
    def render(cells):
        return "".join(map(str, cells))

    # Ground-truth trajectory: checkpoints[k] is the state after k forward steps.
    checkpoints = [list(true_start_state)]
    for _ in range(steps):
        checkpoints.append(get_next_state(checkpoints[-1], rule))

    rule_desc = get_rule_description(rule)
    inv_map = invert_rule_map(rule)

    # Trace logic
    trace_lines = [f"Goal: Reverse {steps} steps by calculating likelihoods of priors."]

    current_target = list(checkpoints[-1])

    # Iterate backwards from T to 0
    for t in range(steps, 0, -1):
        L = len(current_target)
        trace_lines.append(f"\nStep -{steps-t+1}: Inverting state {render(current_target)}")

        # 1. Calculate and Log Priors (The Likelihood Part)
        trace_lines.append("Calculating Local Priors:")

        # bit_counts[i] = [votes for 0, votes for 1] for the parent cell i.
        bit_counts = [[0, 0] for _ in range(L)]

        for i, val in enumerate(current_target):
            valid_pats = inv_map[val]  # patterns (0-7) that output `val`

            valid_bins = [format(p, '03b') for p in valid_pats]
            trace_lines.append(f"  Idx {i} requires neighborhood in {valid_bins}")

            # A pattern centered on cell i votes on three parent cells:
            # its bit 2 on i-1, its bit 1 on i and its bit 0 on i+1 (ring).
            for p in valid_pats:
                bit_counts[(i - 1) % L][(p >> 2) & 1] += 1
                bit_counts[i][(p >> 1) & 1] += 1
                bit_counts[(i + 1) % L][p & 1] += 1

        # 2. Compute Likelihoods and Sample
        trace_lines.append("Aggregating Probabilities & Sampling Greedy Candidate:")

        candidate = []
        prob_strs = []

        for i, (c0, c1) in enumerate(bit_counts):
            total = c0 + c1
            # No votes at all (no pattern yields the required outputs):
            # fall back to an uninformative 0.5.
            prob_1 = c1 / total if total else 0.5

            # Greedy choice: ties go to 1.
            candidate.append(1 if prob_1 >= 0.5 else 0)

            # Log concise probability
            prob_strs.append(f"{i}:{int(prob_1*100)}%")

        # Log probability map
        trace_lines.append("  Likelihood(1): " + " | ".join(prob_strs))
        trace_lines.append(f"  Greedy Candidate: {render(candidate)}")

        # 3. Verify that the candidate really evolves into the state being inverted.
        if get_next_state(candidate, rule) == current_target:
            trace_lines.append("  Verification: Match. Moving to next step.")
            current_target = candidate
            continue

        # Greedy failed: reveal the ground-truth parent for this level.
        true_prev = checkpoints[t - 1]
        true_prev_str = render(true_prev)

        if current_target == checkpoints[t]:
            trace_lines.append(f"  Verification: Mismatch. Correction applied -> {true_prev_str}")
        else:
            # An earlier greedy parent verified but left the ground-truth
            # trajectory, so checkpoints[t-1] is NOT a parent of the state
            # just inverted. Say so explicitly and resume from the known
            # state at this level, keeping every step of the trace valid.
            anchor_str = render(checkpoints[t])
            trace_lines.append(
                "  Verification: Mismatch. Greedy branch abandoned; "
                f"backtracking to {anchor_str}. Correction applied -> {true_prev_str}"
            )
        current_target = true_prev

    return {
        "task": "abduction",
        "input": f"Task: Abduction, Rule: {rule} [{rule_desc}], Steps: {steps}, End: {render(checkpoints[-1])}",
        "cot": "\n".join(trace_lines),
        # The state the walk ended on; for steps == 0 this is the end state itself.
        "target": render(current_target)
    }

# --- Main Generator Loop ---

def generate_dataset_partition(count, seen_hashes):
    """Generate `count` unique tasks, cycling deduction, induction, abduction.

    Each round samples a class, a rule and a random start state, then tries
    to add one task of each type. A task is skipped when its signature is
    already in `seen_hashes`; new signatures are added to that set, so
    passing the same set to successive calls keeps their outputs disjoint.
    Returns the list of task dicts.
    """
    tasks = []
    class_ids = list(WOLFRAM_CLASSES.keys())

    with tqdm(total=count) as progress:

        def admit(signature, build):
            # Build and record the task only if its signature is new.
            if signature in seen_hashes:
                return
            seen_hashes.add(signature)
            tasks.append(build())
            progress.update(1)

        while len(tasks) < count:
            w_class = random.choices(class_ids, weights=CLASS_PROBS)[0]
            rule = random.choice(WOLFRAM_CLASSES[w_class])
            length = random.randint(*SEQ_LEN_RANGE)
            start_state = [random.randint(0, 1) for _ in range(length)]
            start_key = tuple(start_state)

            # 1. Deduction
            ded_steps = random.randint(*STEPS_RANGE_FWD)
            admit(
                (rule, start_key, ded_steps, 'ded'),
                lambda: generate_deduction_task(start_state, rule, ded_steps),
            )
            if len(tasks) >= count:
                break

            # 2. Induction
            ind_steps = random.randint(*STEPS_RANGE_FWD)
            admit(
                (rule, start_key, ind_steps, 'ind'),
                lambda: generate_induction_task(start_state, rule, ind_steps, w_class),
            )
            if len(tasks) >= count:
                break

            # 3. Abduction (Multi-step Probabilistic)
            abd_steps = random.randint(*STEPS_RANGE_REV)
            # Simulate forward first so the end state is known to be reachable.
            end_state = list(start_state)
            for _ in range(abd_steps):
                end_state = get_next_state(end_state, rule)

            admit(
                (rule, tuple(end_state), abd_steps, 'abd'),
                lambda: generate_probabilistic_abduction(end_state, rule, abd_steps, start_state),
            )
            if len(tasks) >= count:
                break

    return tasks

if __name__ == "__main__":
    # One shared signature set keeps the test partition disjoint from train.
    seen_hashes = set()

    print(f"Generating {TRAIN_SIZE} Training Tasks...")
    train_data = generate_dataset_partition(TRAIN_SIZE, seen_hashes)

    print(f"Generating {TEST_SIZE} Testing Tasks (Disjoint)...")
    test_data = generate_dataset_partition(TEST_SIZE, seen_hashes)

    # Build each frame and shuffle its rows with a fixed seed.
    df_train = pd.DataFrame(train_data).sample(frac=1, random_state=SEED).reset_index(drop=True)
    df_test = pd.DataFrame(test_data).sample(frac=1, random_state=SEED).reset_index(drop=True)

    # Preview the first 800 characters of the first abduction trace.
    divider = "-" * 60
    print("\nSample Abduction Trace (Likelihood/Backprop):")
    print(divider)
    first_abduction_cot = df_train.loc[df_train['task'] == 'abduction', 'cot'].iloc[0]
    print(first_abduction_cot[:800] + "...")
    print(divider)

    for frame, path in ((df_train, "eca_reasoning_train.csv"), (df_test, "eca_reasoning_test.csv")):
        frame.to_csv(path, index=False)
    print("Files saved.")

Generating 333333 Training Tasks...


100%|██████████| 333333/333333 [01:53<00:00, 2927.74it/s]


Generating 3333 Testing Tasks (Disjoint)...


100%|██████████| 3333/3333 [00:01<00:00, 3214.05it/s]



Sample Abduction Trace (Likelihood/Backprop):
------------------------------------------------------------
Goal: Reverse 2 steps by calculating likelihoods of priors.

Step -1: Inverting state 10101001011101011
Calculating Local Priors:
  Idx 0 requires neighborhood in ['000', '011', '101', '110']
  Idx 1 requires neighborhood in ['001', '010', '100', '111']
  Idx 2 requires neighborhood in ['000', '011', '101', '110']
  Idx 3 requires neighborhood in ['001', '010', '100', '111']
  Idx 4 requires neighborhood in ['000', '011', '101', '110']
  Idx 5 requires neighborhood in ['001', '010', '100', '111']
  Idx 6 requires neighborhood in ['001', '010', '100', '111']
  Idx 7 requires neighborhood in ['000', '011', '101', '110']
  Idx 8 requires neighborhood in ['001', '010', '100', '111']
  Idx 9 requires neighborhood in ['000', '011', '101', '110']
  Idx 10 requires neighborhood in ['000', '011', '...
------------------------------------------------------------
Files saved.


In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from huggingface_hub import login

# --- Configuration ---
# CHANGE THIS to your Hugging Face username and desired repo name
HF_USERNAME = "kreasof-ai"
REPO_NAME = "ECA-Zero"
# Repository id in "<owner>/<name>" form; this is what gets passed to push_to_hub.
REPO_ID = f"{HF_USERNAME}/{REPO_NAME}"

# Optional: If you didn't run 'huggingface-cli login',
# uncomment the line below and paste your token.
# login(token="hf_...")

def upload_dataset():
    """Load the generated train/test CSV files and push them to the Hub.

    Reads "eca_reasoning_train.csv" and "eca_reasoning_test.csv" from the
    working directory, wraps them in a DatasetDict with "train" and "test"
    splits and pushes it to REPO_ID as a public dataset. Missing CSV files
    and upload failures are reported on stdout; nothing is raised or
    returned.
    """
    print("Loading CSV files...")
    try:
        # Every column (task, input, cot, target) is text. Without an explicit
        # dtype pandas infers types per column, so all-digit strings such as
        # the binary state targets can be parsed as integers and lose their
        # leading zeros. keep_default_na=False stops tokens like "NA" or an
        # empty field from being turned into NaN.
        read_opts = {"dtype": str, "keep_default_na": False}
        df_train = pd.read_csv("eca_reasoning_train.csv", **read_opts)
        df_test = pd.read_csv("eca_reasoning_test.csv", **read_opts)
    except FileNotFoundError:
        print("Error: CSV files not found. Please run the generation script first.")
        return

    print("Converting to Hugging Face Dataset format...")
    print(f"Train rows: {len(df_train)}")
    print(f"Test rows:  {len(df_test)}")

    # Convert Pandas DataFrames to HF Datasets and combine them into a
    # DatasetDict (standard HF format).
    dataset_dict = DatasetDict({
        "train": Dataset.from_pandas(df_train),
        "test": Dataset.from_pandas(df_test)
    })

    # Columns carried over from the CSV files:
    # - task: string (deduction/abduction/induction)
    # - input: string (The prompt)
    # - cot: string (The Chain of Thought / Trace)
    # - target: string (The final answer)

    print(f"Pushing to Hugging Face Hub: {REPO_ID} ...")

    # Push to Hub
    try:
        dataset_dict.push_to_hub(
            REPO_ID,
            private=False, # Set to True if you want it private
            commit_message="Initial upload of ECA-Zero Reasoning benchmark"
        )
    except Exception as e:
        # Top-level boundary of the script: report the failure rather than crash.
        print(f"\n❌ Error uploading: {e}")
        print("Make sure you are logged in via 'huggingface-cli login' or provided a token.")
    else:
        print("\n✅ Success!")
        print(f"View your dataset here: https://huggingface.co/datasets/{REPO_ID}")

# Run the upload only when this cell/script is executed directly.
if __name__ == "__main__":
    upload_dataset()

Loading CSV files...
Converting to Hugging Face Dataset format...
Train rows: 333333
Test rows:  3333
Pushing to Hugging Face Hub: kreasof-ai/ECA-Zero ...


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/167 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :  32%|###1      | 33.5MB /  106MB            

Creating parquet from Arrow format:   0%|          | 0/167 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :  32%|###1      | 33.5MB /  106MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              : 100%|##########| 2.14MB / 2.14MB            

README.md: 0.00B [00:00, ?B/s]


✅ Success!
View your dataset here: https://huggingface.co/datasets/kreasof-ai/ECA-Zero
