In [None]:
import random
import csv
import os
from faker import Faker

fake = Faker()

# ----------------------------------------------------------------------
# CONFIG
# ----------------------------------------------------------------------
NUM_COMPANIES = 500
NUM_SHAREHOLDERS = 250
NUM_AUDITORS = 10
NUM_INVOICES = 1500

OUTPUT_DIR = "output_csv"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ----------------------------------------------------------------------
# NODE CREATION
# ----------------------------------------------------------------------
companies = [
    {
        "company_id": f"C{i}",
        "name": fake.company(),
        "country": random.choice(["US", "IN", "DE", "SG"]),
        "sector": random.choice(["FIN", "MFG", "TECH", "RETAIL"]),
        "risk_score": 0.0,
        "opportunity_score": 0.0,
        "is_supplier": False,
        "_pattern_marker": None,  # Track which pattern this entity belongs to
    }
    for i in range(NUM_COMPANIES)
]

shareholders = [
    {
        "shareholder_id": f"S{i}",
        "name": fake.name(),
        "type": random.choice(["INDIVIDUAL", "FUND", "INSTITUTION"]),
        "risk_score": 0.0,
        "opportunity_score": 0.0,
        "_pattern_marker": None,
    }
    for i in range(NUM_SHAREHOLDERS)
]

auditors = [
    {
        "auditor_id": f"A{i}",
        "name": fake.company() + " Audit",
        "risk_level": "HIGH" if i < 2 else random.choice(["LOW", "MEDIUM"]),
    }
    for i in range(NUM_AUDITORS)
]

invoices = [
    {
        "invoice_id": f"INV{i}",
        "amount": round(random.uniform(1000, 100000), 2),
        "status": random.choice(["PAID", "PENDING"]),
        "is_simulated": True,
        "_reserved": False,  # Track if invoice is reserved for patterns
    }
    for i in range(NUM_INVOICES)
]

# ----------------------------------------------------------------------
# EDGE ACCUMULATOR
# ----------------------------------------------------------------------
edges = []
# Format: (type, from, to, props_dict)

# Track used invoices to avoid duplication
used_invoice_ids = set()

# ----------------------------------------------------------------------
# PATTERN 1 - SHELL COMPANY CHAIN (FIXED)
# ----------------------------------------------------------------------
def embed_shell_pattern(companies, auditors, invoices):
    """
    Creates a shell company structure:
    - Chain of 3+ intermediary companies (we use 4 for robustness)
    - All audited by the SAME high-risk auditor
    - Each company has unusually LOW invoice count (1-2 invoices max)
    """
    high_risk = [a for a in auditors if a["risk_level"] == "HIGH"]
    if not high_risk:
        print("Warning: No high-risk auditors found!")
        return []
    
    auditor = random.choice(high_risk)
    
    # Select companies not already in patterns
    available = [c for c in companies if c["_pattern_marker"] is None]
    if len(available) < 5:
        print("Warning: Not enough companies for shell pattern!")
        return []
    
    main = random.choice(available)
    main["_pattern_marker"] = "PATTERN1_MAIN"
    
    remaining = [c for c in available if c != main]
    chain = random.sample(remaining, 4)
    for idx, c in enumerate(chain):
        c["_pattern_marker"] = f"PATTERN1_CHAIN_{idx}"

    pattern_edges = []
    
    # Build subsidiary chain: chain[0] -> main, chain[1] -> chain[0], etc.
    parent = main
    for c in chain:
        pattern_edges.append(
            ("SUBSIDIARY_OF", c["company_id"], parent["company_id"], 
             {"since_year": random.randint(2018, 2022)})
        )
        parent = c

    # All companies audited by same HIGH-RISK auditor
    for c in [main] + chain:
        pattern_edges.append(("AUDITED_BY", c["company_id"], auditor["auditor_id"], {}))

    # CRITICAL FIX: Each company issues only 1 invoice (unusually low activity)
    available_invs = [inv for inv in invoices if not inv["_reserved"]]
    if len(available_invs) < 5:
        print("Warning: Not enough invoices for shell pattern!")
        return pattern_edges
    
    shell_invoices = random.sample(available_invs, 5)
    for idx, c in enumerate([main] + chain):
        inv = shell_invoices[idx]
        inv["_reserved"] = True
        used_invoice_ids.add(inv["invoice_id"])
        pattern_edges.append(("ISSUES_TO", c["company_id"], inv["invoice_id"], {}))

    print(f"✓ Pattern 1 (Shell Company): Main={main['company_id']}, Chain={[c['company_id'] for c in chain]}, Auditor={auditor['auditor_id']}")
    
    return pattern_edges

# ----------------------------------------------------------------------
# PATTERN 2 - CIRCULAR TRADE (ENHANCED)
# ----------------------------------------------------------------------
def embed_circular_trade(companies):
    """
    Creates a closed circular supply loop:
    - Cycle of 3-5 companies using SUPPLIES relationships
    - Forms a complete cycle without external flows
    - High transaction volumes to simulate revenue inflation
    """
    available = [c for c in companies if c["_pattern_marker"] is None]
    if len(available) < 4:
        print("Warning: Not enough companies for circular trade!")
        return []
    
    # Create cycle of 4 companies (can be 3-5)
    cycle_size = random.randint(3, 5)
    cycle_nodes = random.sample(available, min(cycle_size, len(available)))
    
    for idx, c in enumerate(cycle_nodes):
        c["_pattern_marker"] = f"PATTERN2_CYCLE_{idx}"

    pattern_edges = []
    
    # Create circular SUPPLIES relationships
    for i in range(len(cycle_nodes)):
        a = cycle_nodes[i]["company_id"]
        b = cycle_nodes[(i + 1) % len(cycle_nodes)]["company_id"]
        # High volumes to indicate suspicious circular trading
        pattern_edges.append(
            ("SUPPLIES", a, b, {"annual_volume": random.randint(80, 150)})
        )

    print(f"✓ Pattern 2 (Circular Trade): Cycle={[c['company_id'] for c in cycle_nodes]}")
    
    return pattern_edges

# ----------------------------------------------------------------------
# PATTERN 3 - HIDDEN INFLUENCE (FIXED)
# ----------------------------------------------------------------------
def embed_hidden_influence(companies, shareholders, invoices):
    """
    Creates hidden influence pattern:
    - Influential shareholder owns >25% of supplier company
    - Supplier provides >80% of target's invoices
    - Shareholder is NOT a direct supplier themselves
    """
    available_companies = [c for c in companies if c["_pattern_marker"] is None]
    available_shareholders = [s for s in shareholders if s["_pattern_marker"] is None]
    
    if len(available_companies) < 2 or len(available_shareholders) < 1:
        print("Warning: Not enough entities for hidden influence pattern!")
        return []
    
    target = random.choice(available_companies)
    target["_pattern_marker"] = "PATTERN3_TARGET"
    
    remaining_companies = [c for c in available_companies if c != target]
    supplier = random.choice(remaining_companies)
    supplier["_pattern_marker"] = "PATTERN3_SUPPLIER"
    supplier["is_supplier"] = True
    
    influencer = random.choice(available_shareholders)
    influencer["_pattern_marker"] = "PATTERN3_INFLUENCER"

    pattern_edges = []
    
    # Influencer owns significant stake (>25%) in supplier
    ownership_pct = random.randint(30, 55)
    pattern_edges.append(
        ("OWNS_SHARE", influencer["shareholder_id"], supplier["company_id"],
         {"percentage": ownership_pct})
    )
    
    # CRITICAL FIX: Supplier provides >80% of target's invoices
    # We'll allocate 100 invoices from supplier to target
    # Then only 20-25 invoices from other sources to target (ensuring >80% concentration)
    
    available_invs = [inv for inv in invoices if not inv["_reserved"]]
    if len(available_invs) < 120:
        print("Warning: Not enough invoices for hidden influence pattern!")
        return pattern_edges
    
    # Supplier issues 100 invoices that target pays
    supplier_invoices = random.sample(available_invs, 100)
    for inv in supplier_invoices:
        inv["_reserved"] = True
        used_invoice_ids.add(inv["invoice_id"])
        pattern_edges.append(("ISSUES_TO", supplier["company_id"], inv["invoice_id"], {}))
        pattern_edges.append(("PAYS", target["company_id"], inv["invoice_id"], {}))
    
    # Add SUPPLIES relationship with high volume indicator
    pattern_edges.append(
        ("SUPPLIES", supplier["company_id"], target["company_id"],
         {"annual_volume": 100})  # Volume = number of invoices
    )
    
    # Store target and supplier IDs for later use in background generation
    target["_p3_invoice_count"] = 100
    
    print(f"✓ Pattern 3 (Hidden Influence): Influencer={influencer['shareholder_id']}, "
          f"Supplier={supplier['company_id']} ({ownership_pct}%), Target={target['company_id']}")
    
    return pattern_edges, target, supplier

# ----------------------------------------------------------------------
# BACKGROUND EDGE GENERATORS (ENHANCED)
# ----------------------------------------------------------------------
def generate_random_subsidiaries(companies, count=200):
    """Generate random subsidiary relationships avoiding pattern companies"""
    out = []
    eligible = [c for c in companies if not c["_pattern_marker"] or 
                not c["_pattern_marker"].startswith("PATTERN1")]
    
    for _ in range(min(count, len(eligible) * 2)):
        if len(eligible) < 2:
            break
        ch, p = random.sample(eligible, 2)
        out.append(("SUBSIDIARY_OF", ch["company_id"], p["company_id"],
                    {"since_year": random.randint(1990, 2023)}))
    return out

def generate_random_ownership(companies, shareholders):
    """Generate ownership stakes, avoiding pattern 3 conflicts"""
    out = []
    for c in companies:
        # Skip if this company is the supplier in pattern 3 (already has influencer ownership)
        if c.get("_pattern_marker") == "PATTERN3_SUPPLIER":
            # Add a few other minor shareholders
            num_owners = random.randint(1, 2)
        else:
            num_owners = random.randint(1, 4)
        
        available_sh = [s for s in shareholders if s.get("_pattern_marker") != "PATTERN3_INFLUENCER"]
        if len(available_sh) < num_owners:
            continue
            
        owners = random.sample(available_sh, num_owners)
        remaining = 100 if c.get("_pattern_marker") != "PATTERN3_SUPPLIER" else 45
        N = len(owners)

        for i, o in enumerate(owners):
            if i == N - 1:
                pct = remaining
            else:
                max_allowed = max(5, remaining - 5 * (N - i - 1))
                pct = random.randint(5, min(max_allowed, 25))  # Cap at 25% to avoid conflicts

            remaining -= pct

            out.append(("OWNS_SHARE", o["shareholder_id"], c["company_id"], {"percentage": pct}))

    return out

def generate_random_audits(companies, auditors):
    """Generate audit relationships, skipping pattern 1 companies (already audited)"""
    out = []
    for c in companies:
        if c.get("_pattern_marker") and c["_pattern_marker"].startswith("PATTERN1"):
            continue  # Already has high-risk auditor
        auditor = random.choice(auditors)
        out.append(("AUDITED_BY", c["company_id"], auditor["auditor_id"], {}))
    return out

def generate_random_supplies(companies, count=500):
    """Generate supply relationships avoiding circular pattern"""
    out = []
    eligible = [c for c in companies if not c.get("_pattern_marker") or 
                not c["_pattern_marker"].startswith("PATTERN2")]
    
    for _ in range(count):
        if len(eligible) < 2:
            break
        a, b = random.sample(eligible, 2)
        out.append(("SUPPLIES", a["company_id"], b["company_id"], 
                   {"annual_volume": random.randint(10, 60)}))
    return out

def generate_random_invoices(companies, invoices, target_p3=None):
    """
    Generate invoice relationships avoiding pattern conflicts
    If target_p3 is provided, give it only ~20 additional invoices from other sources
    """
    out = []
    available_invs = [inv for inv in invoices if not inv["_reserved"]]
    
    # Handle pattern 3 target specially
    if target_p3 is not None:
        # Give target only 20 additional invoices from random sources
        target_extra_invs = random.sample(available_invs, min(20, len(available_invs)))
        for inv in target_extra_invs:
            inv["_reserved"] = True
            issuer = random.choice([c for c in companies if c != target_p3])
            out.append(("ISSUES_TO", issuer["company_id"], inv["invoice_id"], {}))
            out.append(("PAYS", target_p3["company_id"], inv["invoice_id"], {}))
        
        available_invs = [inv for inv in invoices if not inv["_reserved"]]
    
    # Distribute remaining invoices to other companies (realistic distribution)
    for inv in available_invs:
        issuer = random.choice(companies)
        payer = random.choice([c for c in companies if c != issuer])
        out.append(("ISSUES_TO", issuer["company_id"], inv["invoice_id"], {}))
        out.append(("PAYS", payer["company_id"], inv["invoice_id"], {}))
    
    return out

# ----------------------------------------------------------------------
# BUILD FINAL EDGE LIST
# ----------------------------------------------------------------------
print("=" * 70)
print("GENERATING GRAPH PATTERNS")
print("=" * 70)

# Generate patterns first
edges.extend(embed_shell_pattern(companies, auditors, invoices))
edges.extend(embed_circular_trade(companies))

pattern3_result = embed_hidden_influence(companies, shareholders, invoices)
if len(pattern3_result) == 3:
    pattern3_edges, target_p3, supplier_p3 = pattern3_result
    edges.extend(pattern3_edges)
else:
    edges.extend(pattern3_result)
    target_p3 = None

print("=" * 70)
print("GENERATING BACKGROUND DATA")
print("=" * 70)

# Generate background edges
edges.extend(generate_random_subsidiaries(companies))
edges.extend(generate_random_ownership(companies, shareholders))
edges.extend(generate_random_audits(companies, auditors))
edges.extend(generate_random_supplies(companies))
edges.extend(generate_random_invoices(companies, invoices, target_p3))

print(f"Total edges: {len(edges)}")
print("=" * 70)

# ----------------------------------------------------------------------
# WRITE NODE CSVS
# ----------------------------------------------------------------------
def write_csv(filename, rows, header):
    with open(os.path.join(OUTPUT_DIR, filename), "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=header, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(rows)

write_csv("companies.csv", companies,
          ["company_id", "name", "country", "sector", "risk_score", "opportunity_score", "is_supplier"])

write_csv("shareholders.csv", shareholders,
          ["shareholder_id", "name", "type", "risk_score", "opportunity_score"])

write_csv("auditors.csv", auditors,
          ["auditor_id", "name", "risk_level"])

write_csv("invoices.csv", invoices,
          ["invoice_id", "amount", "status", "is_simulated"])

# ----------------------------------------------------------------------
# SPLIT EDGES INTO SEPARATE CSV FILES WITH SPECIFIC COLUMN NAMES
# ----------------------------------------------------------------------
def edge_to_dict(rel_type, start, end, props):
    if rel_type == "SUBSIDIARY_OF":
        return {"child_company_id": start, "parent_company_id": end, "since_year": props.get("since_year")}
    elif rel_type == "OWNS_SHARE":
        return {"shareholder_id": start, "company_id": end, "percentage": props.get("percentage")}
    elif rel_type == "SUPPLIES":
        return {"supplier_company_id": start, "buyer_company_id": end, "annual_volume": props.get("annual_volume")}
    elif rel_type == "AUDITED_BY":
        return {"company_id": start, "auditor_id": end}
    elif rel_type == "ISSUES_TO":
        return {"company_id": start, "invoice_id": end}
    elif rel_type == "PAYS":
        return {"company_id": start, "invoice_id": end}
    return {"start": start, "end": end}

def write_edges_by_type(edges):
    buckets = {
        "SUBSIDIARY_OF": [],
        "OWNS_SHARE": [],
        "SUPPLIES": [],
        "AUDITED_BY": [],
        "ISSUES_TO": [],
        "PAYS": [],
    }

    for rel, start, end, props in edges:
        buckets[rel].append(edge_to_dict(rel, start, end, props))

    # Write each CSV with proper headers
    for rel, rows in buckets.items():
        if rel == "SUBSIDIARY_OF":
            header = ["child_company_id", "parent_company_id", "since_year"]
            filename = "subsidiary_of.csv"
        elif rel == "OWNS_SHARE":
            header = ["shareholder_id", "company_id", "percentage"]
            filename = "owns_share.csv"
        elif rel == "SUPPLIES":
            header = ["supplier_company_id", "buyer_company_id", "annual_volume"]
            filename = "supplies.csv"
        elif rel == "AUDITED_BY":
            header = ["company_id", "auditor_id"]
            filename = "audited_by.csv"
        elif rel == "ISSUES_TO":
            header = ["company_id", "invoice_id"]
            filename = "issues_to.csv"
        elif rel == "PAYS":
            header = ["company_id", "invoice_id"]
            filename = "pays.csv"

        with open(os.path.join(OUTPUT_DIR, filename), "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=header)
            writer.writeheader()
            writer.writerows(rows)
        
        print(f"✓ Written {filename}: {len(rows)} relationships")

write_edges_by_type(edges)

print("=" * 70)
print(f"✅ CSV files generated in: {OUTPUT_DIR}")
print("=" * 70)
print("\nPATTERN SUMMARY:")
print("• Pattern 1 (Shell Company): Chain of 5 companies with same HIGH-RISK auditor, 1 invoice each")
print("• Pattern 2 (Circular Trade): Cycle of 3-5 companies with circular SUPPLIES relationships")
print("• Pattern 3 (Hidden Influence): Shareholder owns >25% of supplier providing >80% of target's invoices")
print("=" * 70)

In [1]:
import random
import csv
import os
from faker import Faker

fake = Faker()

# ----------------------------------------------------------------------
# CONFIG
# ----------------------------------------------------------------------
NUM_COMPANIES = 500
NUM_SHAREHOLDERS = 250
NUM_AUDITORS = 10
NUM_INVOICES = 1500

OUTPUT_DIR = "output_csv"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ----------------------------------------------------------------------
# NODE CREATION
# ----------------------------------------------------------------------
companies = [
    {
        "company_id": f"C{i}",
        "name": fake.company(),
        "country": random.choice(["US", "IN", "DE", "SG"]),
        "sector": random.choice(["FIN", "MFG", "TECH", "RETAIL"]),
        "risk_score": 0.0,
        "opportunity_score": 0.0,
        "is_supplier": False,
    }
    for i in range(NUM_COMPANIES)
]

shareholders = [
    {
        "shareholder_id": f"S{i}",
        "name": fake.name(),
        "type": random.choice(["INDIVIDUAL", "FUND", "INSTITUTION"]),
        "risk_score": 0.0,
        "opportunity_score": 0.0,
    }
    for i in range(NUM_SHAREHOLDERS)
]

auditors = [
    {
        "auditor_id": f"A{i}",
        "name": fake.company() + " Audit",
        "risk_level": "HIGH" if i < 2 else random.choice(["LOW", "MEDIUM"]),
    }
    for i in range(NUM_AUDITORS)
]

invoices = [
    {
        "invoice_id": f"INV{i}",
        "amount": round(random.uniform(1000, 100000), 2),
        "status": random.choice(["PAID", "PENDING"]),
        "is_simulated": True,
    }
    for i in range(NUM_INVOICES)
]

# ----------------------------------------------------------------------
# EDGE ACCUMULATOR
# ----------------------------------------------------------------------
edges = []
# Format: (type, from, to, props_dict)

# ----------------------------------------------------------------------
# PATTERN 1 - SHELL COMPANY CHAIN
# ----------------------------------------------------------------------
def embed_shell_pattern(companies, auditors, invoices):
    high_risk = [a for a in auditors if a["risk_level"] == "HIGH"]
    auditor = random.choice(high_risk)

    main = random.choice(companies)
    chain = random.sample([c for c in companies if c != main], 4)

    pattern_edges = []
    parent = main

    for c in chain:
        pattern_edges.append(
            ("SUBSIDIARY_OF", c["company_id"], parent["company_id"], {"since_year": 2020})
        )
        parent = c

    for c in [main] + chain:
        pattern_edges.append(("AUDITED_BY", c["company_id"], auditor["auditor_id"], {}))

    sample_invs = random.sample(invoices, 5)
    for c in [main] + chain:
        inv = random.choice(sample_invs)
        pattern_edges.append(("ISSUES_TO", c["company_id"], inv["invoice_id"], {}))

    return pattern_edges

# ----------------------------------------------------------------------
# PATTERN 2 - CIRCULAR TRADE
# ----------------------------------------------------------------------
def embed_circular_trade(companies):
    cycle_nodes = random.sample(companies, 4)
    pattern_edges = []

    for i in range(len(cycle_nodes)):
        a = cycle_nodes[i]["company_id"]
        b = cycle_nodes[(i + 1) % len(cycle_nodes)]["company_id"]
        pattern_edges.append(("SUPPLIES", a, b, {"annual_volume": random.randint(50, 200)}))

    return pattern_edges

# ----------------------------------------------------------------------
# PATTERN 3 - HIDDEN INFLUENCE
# ----------------------------------------------------------------------
def embed_hidden_influence(companies, shareholders, invoices):
    target = random.choice(companies)
    supplier = random.choice([c for c in companies if c != target])
    influencer = random.choice(shareholders)

    pattern_edges = []

    # Influential ownership
    pattern_edges.append(
        ("OWNS_SHARE", influencer["shareholder_id"], supplier["company_id"],
         {"percentage": random.randint(30, 60)})
    )

    invs = random.sample(invoices, 50)
    for inv in invs:
        pattern_edges.append(("ISSUES_TO", supplier["company_id"], inv["invoice_id"], {}))
        pattern_edges.append(("PAYS", target["company_id"], inv["invoice_id"], {}))

    pattern_edges.append(
        ("SUPPLIES", supplier["company_id"], target["company_id"],
         {"annual_volume": 0.8 * len(invs)})
    )

    return pattern_edges

# ----------------------------------------------------------------------
# BACKGROUND EDGE GENERATORS
# ----------------------------------------------------------------------
def generate_random_subsidiaries(companies, count=200):
    out = []
    for _ in range(count):
        ch, p = random.sample(companies, 2)
        out.append(("SUBSIDIARY_OF", ch["company_id"], p["company_id"],
                    {"since_year": random.randint(1990, 2023)}))
    return out

def generate_random_ownership(companies, shareholders):
    out = []
    for c in companies:
        owners = random.sample(shareholders, random.randint(1, 4))
        remaining = 100
        N = len(owners)

        for i, o in enumerate(owners):
            if i == N - 1:
                pct = remaining
            else:
                max_allowed = max(5, remaining - 5 * (N - i - 1))
                pct = random.randint(5, max_allowed)

            remaining -= pct

            out.append(("OWNS_SHARE", o["shareholder_id"], c["company_id"], {"percentage": pct}))

    return out

def generate_random_audits(companies, auditors):
    out = []
    for c in companies:
        auditor = random.choice(auditors)
        out.append(("AUDITED_BY", c["company_id"], auditor["auditor_id"], {}))
    return out

def generate_random_supplies(companies, count=500):
    out = []
    for _ in range(count):
        a, b = random.sample(companies, 2)
        out.append(("SUPPLIES", a["company_id"], b["company_id"], {"annual_volume": random.randint(10, 80)}))
    return out

def generate_random_invoices(companies, invoices):
    out = []
    for inv in invoices:
        issuer = random.choice(companies)
        payer = random.choice(companies)
        out.append(("ISSUES_TO", issuer["company_id"], inv["invoice_id"], {}))
        out.append(("PAYS", payer["company_id"], inv["invoice_id"], {}))
    return out

# ----------------------------------------------------------------------
# BUILD FINAL EDGE LIST
# ----------------------------------------------------------------------
edges.extend(embed_shell_pattern(companies, auditors, invoices))
edges.extend(embed_circular_trade(companies))
edges.extend(embed_hidden_influence(companies, shareholders, invoices))

edges.extend(generate_random_subsidiaries(companies))
edges.extend(generate_random_ownership(companies, shareholders))
edges.extend(generate_random_audits(companies, auditors))
edges.extend(generate_random_supplies(companies))
edges.extend(generate_random_invoices(companies, invoices))

print("Total edges:", len(edges))

# ----------------------------------------------------------------------
# WRITE NODE CSVS
# ----------------------------------------------------------------------
def write_csv(filename, rows, header):
    with open(os.path.join(OUTPUT_DIR, filename), "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=header)
        writer.writeheader()
        writer.writerows(rows)

write_csv("companies.csv", companies,
          ["company_id", "name", "country", "sector", "risk_score", "opportunity_score", "is_supplier"])

write_csv("shareholders.csv", shareholders,
          ["shareholder_id", "name", "type", "risk_score", "opportunity_score"])

write_csv("auditors.csv", auditors,
          ["auditor_id", "name", "risk_level"])

write_csv("invoices.csv", invoices,
          ["invoice_id", "amount", "status", "is_simulated"])

# ----------------------------------------------------------------------
# SPLIT EDGES INTO SEPARATE CSV FILES WITH SPECIFIC COLUMN NAMES
# ----------------------------------------------------------------------
def edge_to_dict(rel_type, start, end, props):
    if rel_type == "SUBSIDIARY_OF":
        return {"company_id": start, "company_id": end, "since_year": props.get("since_year")}
    elif rel_type == "OWNS_SHARE":
        return {"shareholder_id": start, "company_id": end, "percentage": props.get("percentage")}
    elif rel_type == "SUPPLIES":
        return {"company_id": start, "company_id": end, "annual_volume": props.get("annual_volume")}
    elif rel_type == "AUDITED_BY":
        return {"company_id": start, "auditor_id": end}
    elif rel_type == "ISSUES_TO":
        return {"company_id": start, "invoice_id": end}
    elif rel_type == "PAYS":
        return {"company_id": start, "invoice_id": end}
    return {"start": start, "end": end}

def write_edges_by_type(edges):
    buckets = {
        "SUBSIDIARY_OF": [],
        "OWNS_SHARE": [],
        "SUPPLIES": [],
        "AUDITED_BY": [],
        "ISSUES_TO": [],
        "PAYS": [],
    }

    for rel, start, end, props in edges:
        buckets[rel].append(edge_to_dict(rel, start, end, props))

    # Write each CSV
    for rel, rows in buckets.items():
        if rel == "SUBSIDIARY_OF":
            header = ["company_id", "company_id", "since_year"]
            filename = "subsidiary_of.csv"
        elif rel == "OWNS_SHARE":
            header = ["shareholder_id", "company_id", "percentage"]
            filename = "owns_share.csv"
        elif rel == "SUPPLIES":
            header = ["company_id", "company_id", "annual_volume"]
            filename = "supplies.csv"
        elif rel == "AUDITED_BY":
            header = ["company_id", "auditor_id"]
            filename = "audited_by.csv"
        elif rel == "ISSUES_TO":
            header = ["company_id", "invoice_id"]
            filename = "issues_to.csv"
        elif rel == "PAYS":
            header = ["company_id", "invoice_id"]
            filename = "pays.csv"

        with open(os.path.join(OUTPUT_DIR, filename), "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=header)
            writer.writeheader()
            writer.writerows(rows)

write_edges_by_type(edges)

print("CSV files generated in:", OUTPUT_DIR)


Total edges: 5547
CSV files generated in: output_csv


In [6]:
embed_shell_pattern(companies, auditors, invoices)

[('SUBSIDIARY_OF', 'C64', 'C52', {'since_year': 2020}),
 ('SUBSIDIARY_OF', 'C42', 'C64', {'since_year': 2020}),
 ('SUBSIDIARY_OF', 'C330', 'C42', {'since_year': 2020}),
 ('SUBSIDIARY_OF', 'C84', 'C330', {'since_year': 2020}),
 ('AUDITED_BY', 'C52', 'A0', {}),
 ('AUDITED_BY', 'C64', 'A0', {}),
 ('AUDITED_BY', 'C42', 'A0', {}),
 ('AUDITED_BY', 'C330', 'A0', {}),
 ('AUDITED_BY', 'C84', 'A0', {}),
 ('ISSUES_TO', 'C52', 'INV152', {}),
 ('ISSUES_TO', 'C64', 'INV952', {}),
 ('ISSUES_TO', 'C42', 'INV437', {}),
 ('ISSUES_TO', 'C330', 'INV437', {}),
 ('ISSUES_TO', 'C84', 'INV133', {})]

In [16]:
invoices_df

Unnamed: 0,invoice_id,amount,status
0,INV0,50444.76,PENDING
1,INV1,50160.89,PAID
2,INV2,53576.49,PENDING
3,INV3,39249.60,PAID
4,INV4,93926.11,PAID
...,...,...,...
1495,INV1495,38023.28,PENDING
1496,INV1496,80995.12,PENDING
1497,INV1497,85349.95,PENDING
1498,INV1498,59059.12,PENDING


In [None]:
git_token = "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"