In [2]:
# STEP 1: Upload CSV
from google.colab import files
import pandas as pd
import io

# Prompt for an upload; the result is indexed by filename and holds each
# file's raw bytes (it is wrapped in BytesIO below).
uploaded = files.upload()
if not uploaded:
    # A cancelled dialog yields an empty mapping; fail with a clear message
    # instead of an IndexError from indexing an empty list.
    raise ValueError("No file was uploaded; re-run this cell and choose a CSV file.")

# Only the first uploaded file is used.
filename = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[filename]))
print(f"✅ Loaded file: {filename}")

# STEP 2: Clean and prepare data
# Rows without a Step ID cannot be placed in the graph.
df.dropna(subset=['Step ID'], inplace=True)


def _as_step_id(value):
    """Render an ID cell as text, writing whole-number floats without '.0'.

    A numeric column that contains blanks is loaded as float, so 2 would
    otherwise become '2.0' and never match the ID '2' in another column.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# "No next step" is either an empty cell (NaN) or the '—' placeholder.
# Flag both BEFORE converting to text: Series.replace(x, None) changed
# meaning across pandas versions (older releases pad-filled from the previous
# row), so the sentinel is written explicitly with mask() instead.
next_ids = df['Next Step ID']
no_next_step = next_ids.isna() | (next_ids == '—')

df['Step ID'] = df['Step ID'].map(_as_step_id)
# 'None' (a string) is the sentinel the later steps test for.
df['Next Step ID'] = next_ids.map(_as_step_id).mask(no_next_step, 'None')

# STEP 3: Build complete graph
import networkx as nx

# Node universe: every Step ID plus every referenced Next Step ID. The string
# 'None' is the "no next step" sentinel, not a node.
all_step_ids = set(df['Step ID']) | (set(df['Next Step ID']) - {'None'})

# Map each Step ID to a display label. Labels are always strings because they
# are later combined with ' → '.join(...), which rejects non-string items
# (a blank Target Field is loaded as a float NaN).
step_id_to_field = {}
for sid, field in zip(df['Step ID'], df['Target Field']):
    if sid == 'None':
        continue
    if pd.isna(field):
        # Keep a real label if another row for this step already supplied one.
        step_id_to_field.setdefault(sid, f"[Unnamed Step {sid}]")
    else:
        # As before, the last named row for a given Step ID wins.
        step_id_to_field[sid] = str(field)

# Steps that are only ever referenced as a "next" step have no row of their own.
for sid in all_step_ids:
    step_id_to_field.setdefault(sid, f"[Unknown Step {sid}]")

# One node per known step, one directed edge per (step -> next step) row.
G = nx.DiGraph()
for sid, label in step_id_to_field.items():
    G.add_node(sid, label=label)

for src, tgt in zip(df['Step ID'], df['Next Step ID']):
    if src != 'None' and tgt != 'None':
        G.add_edge(src, tgt)

print(f"✅ Total nodes: {len(G.nodes)}")
print(f"✅ Total edges: {len(G.edges)}")

# STEP 4: Terminal nodes
# The search starts from the step whose ID is '1'.
start_node = '1'
# A terminal node is one with no outgoing edges.
terminal_nodes = [node for node in G.nodes if not G.out_degree(node)]

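# STEP 5: Enumerate paths with a depth-first search (no depth limit)
# Safeguard against infinite loops: a node may appear at most `max_visits` times in a
# path, so the default of 1 yields cycle-free paths; raise it to allow revisits.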
def dfs_unlimited(G, current, target, path=None, visited_counts=None, max_visits=1):
    """Collect every path from ``current`` to ``target`` by depth-first search.

    Args:
        G: Graph object; only ``G.successors(node)`` is used.
        current: Node to start (or continue) the walk from.
        target: Node at which a path is complete; the walk does not continue
            past it.
        path: Nodes already walked before ``current``; they prefix every
            returned path. Not modified.
        visited_counts: Visit counts accumulated before ``current``. Not
            modified.
        max_visits: Maximum number of times a single node may appear in one
            path. Branches that would exceed it are abandoned.

    Returns:
        A list of paths, each a list of nodes ending at ``target``, in
        depth-first order of ``G.successors``. Empty if no path qualifies.
    """
    # Work on private copies so the caller's arguments are left untouched.
    trail = [] if path is None else list(path)
    tally = {} if visited_counts is None else dict(visited_counts)
    complete = []

    def walk(node):
        # Enter the node, explore below it, then undo the bookkeeping so the
        # shared trail/tally are correct for the next sibling branch.
        trail.append(node)
        tally[node] = tally.get(node, 0) + 1
        try:
            if tally[node] > max_visits:
                return
            if node == target:
                complete.append(trail.copy())
                return
            for successor in G.successors(node):
                walk(successor)
        finally:
            tally[node] -= 1
            trail.pop()

    walk(current)
    return complete

# STEP 6: Run full path search
# For every terminal node reachable from the start, enumerate the ID paths and
# translate each one into its field labels.
# NOTE: with max_visits=1 no node repeats within a path, so the enumerated
# paths contain no cycles.
all_paths = []
for terminal in terminal_nodes:
    if not nx.has_path(G, start_node, terminal):
        continue
    all_paths.extend(
        [step_id_to_field[sid] for sid in id_path]
        for id_path in dfs_unlimited(G, start_node, terminal, max_visits=1)
    )

print(f"✅ Found {len(all_paths)} total paths from Step 1 to terminal nodes (no depth limit, cycles allowed)")

# Display sample results
preview = all_paths[:10]
for number, labels in enumerate(preview, start=1):
    print(f"Path {number}: {' → '.join(labels)}")
# An ellipsis signals that more paths exist than were previewed.
if len(all_paths) > 10:
    print("...")

# STEP 7: Save to CSV
# One row per path, with the labels joined into a single 'Path' column.
joined_paths = [' → '.join(labels) for labels in all_paths]
paths_df = pd.DataFrame({'Path': joined_paths})
paths_df.to_csv("all_permutation_paths.csv", index=False)
files.download("all_permutation_paths.csv")


# Debug
# Reachability diagnostics for the start node.

# Nodes with no outgoing edges.
terminal_nodes = [n for n in G.nodes if G.out_degree(n) == 0]

# One traversal answers every reachability question below, replacing a
# separate nx.has_path search per node.
# nx.descendants excludes the source itself, so the "Reachable nodes" count
# does not include the start node.
reachable = nx.descendants(G, start_node)
# has_path(G, s, s) is True, so the start node counts as reachable from itself.
reachable_or_start = reachable | {start_node}

reachable_terminals = [n for n in terminal_nodes if n in reachable_or_start]
print(f"Terminal nodes reachable from Step 1: {len(reachable_terminals)}")

print(f"Reachable nodes from Step 1: {len(reachable)}")

unreachable_nodes = [n for n in G.nodes if n not in reachable_or_start]
print(f"Unreachable nodes from Step 1: {len(unreachable_nodes)}")
# Show a small sample with labels to help locate disconnected rows in the CSV.
for n in unreachable_nodes[:10]:
    print(f"{n}: {step_id_to_field.get(n)}")


Saving Permutations - Sheet1-4.csv to Permutations - Sheet1-4.csv
✅ Loaded file: Permutations - Sheet1-4.csv
✅ Total nodes: 93
✅ Total edges: 102
✅ Found 580 total paths from Step 1 to terminal nodes (no depth limit, cycles allowed)
Path 1: legal_name → prefered_name → start_date → actual_name → birth_date → email → add_phone_number → phone_number → additional_contact_consent → additional_contact_phone_duplicate
Path 2: legal_name → prefered_name → start_date → actual_name → birth_date → email → add_phone_number → additional_contact_consent → additional_contact_phone_duplicate
Path 3: legal_name → prefered_name → start_date → actual_name → birth_date → email → add_phone_number → phone_number → additional_contact_consent → preferred_contact → age_ok → guardian_name → guardian_relationship → guardian_contact → guardian_phone → guardian_email
Path 4: legal_name → prefered_name → start_date → actual_name → birth_date → email → add_phone_number → additional_contact_consent → preferred_conta

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Terminal nodes reachable from Step 1: 3
Reachable nodes from Step 1: 92
Unreachable nodes from Step 1: 0
