In [2]:
import pandas as pd 
import random 

# Load the source dataset; only its SUPPLIER_NAME column is used in this cell.
original_data = pd.read_csv('./tvae_synthetic_dataset_cleaned.csv')

# Keep at most the first N_SUPPLIERS distinct supplier names, in order of
# first appearance. Missing names (NaN) are dropped because later cells slice
# the supplier name as a string when building item codes.
N_SUPPLIERS = 80
unique_suppliers = list(original_data['SUPPLIER_NAME'].dropna().unique())[:N_SUPPLIERS]

# Supplier codes are "VD" followed by six digits. Drawing the numeric parts
# without replacement makes every code unique by construction, so no
# collide-and-regenerate loop is needed.
supplier_code_numbers = random.sample(range(100000, 1000000), len(unique_suppliers))
supplier_code_mapping = {
    supplier: f"VD{number}"
    for supplier, number in zip(unique_suppliers, supplier_code_numbers)
}

In [10]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Supplier names come from the earlier cell that loads the source dataset.
suppliers = unique_suppliers

# Seed the global RNG so that re-running the notebook from this cell onwards
# regenerates exactly the same synthetic dataset.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Item catalogue: category name -> list of (item name, part description).
# Insertion order matters because categories are picked by position.
categories = {
    "Motors & Drivers": [
        ("12V DC Motor", "Compact DC motor with high torque"),
        ("Stepper Motor", "Precision stepper motor with feedback"),
        ("Servo Motor Driver", "High-speed servo motor driver")
    ],
    "Sensors & Actuators": [
        ("Temperature Sensor", "RTD temperature sensor"),
        ("Pressure Sensor", "High-precision pressure sensor"),
        ("Proximity Sensor", "Inductive proximity sensor")
    ],
    "Control Systems": [
        ("PLC Controller", "Modular PLC for automation"),
        ("VFD Drive", "Variable Frequency Drive for motor control"),
        ("Control Cabinet", "Industrial control cabinet with locks")
    ],
    "Electrical Components": [
        ("10A Relay Switch", "SPDT relay switch for automation"),
        ("Resistor", "High-precision 10K resistor"),
        ("Power Supply", "24V DC power supply")
    ],
    "Robotics Systems": [
        ("Robotic Arm", "6-axis robotic arm"),
        ("Gripper", "Pneumatic gripper for robotic arms"),
        ("3D Printer", "Compact 3D printer for prototyping")
    ]
}

# Cross-category relationships: the only category pairs that may appear
# together on a single purchase order.
cross_category_combinations = [
    ("Motors & Drivers", "Control Systems"),
    ("Sensors & Actuators", "Control Systems"),
    ("Electrical Components", "Robotics Systems"),
    ("Motors & Drivers", "Sensors & Actuators"),
]

# Supplier codes are "VD" followed by six digits. The numeric parts are
# sampled without replacement, so the codes are unique by construction and
# no regenerate-on-collision loop is needed. This rebuilds (and replaces)
# any supplier_code_mapping created earlier in the notebook.
supplier_code_numbers = random.sample(range(100000, 1000000), len(suppliers))
supplier_code_mapping = {
    s: f"VD{number}" for s, number in zip(suppliers, supplier_code_numbers)
}

# Registry of item codes, keyed by (supplier, item).
item_code_mapping = {}

# Registry of base prices, keyed by (supplier, item).
base_prices = {}

def get_price(supplier, item, fluctuation=0.2, prices=None):
    """Return a unit price for a supplier-item pair.

    The first call for a pair draws a base price uniformly from [50, 5000],
    rounds it to 2 decimals, stores it in the registry and returns it as is.
    Every later call returns the stored base price multiplied by a random
    factor in [1 - fluctuation, 1 + fluctuation], rounded to 2 decimals.
    The stored base price itself is never modified.

    Args:
        supplier: Supplier identifier (must be hashable).
        item: Item identifier (must be hashable).
        fluctuation: Maximum relative deviation from the base price.
            The default of 0.2 (i.e. +/-20%) is the range the code has
            always applied; the earlier "+/-10%" wording did not match it.
            Pass 0.1 for a +/-10% band.
        prices: Optional dict used as the base-price registry. When omitted,
            the module-level ``base_prices`` dict is used.

    Returns:
        The price as a float rounded to 2 decimals.

    Raises:
        ValueError: If ``fluctuation`` is outside [0, 1); a value of 1 or
            more could produce zero or negative prices.
    """
    if not 0 <= fluctuation < 1:
        raise ValueError(f"fluctuation must be in [0, 1), got {fluctuation!r}")

    registry = base_prices if prices is None else prices
    key = (supplier, item)

    if key not in registry:
        # First occurrence: fix the base price for this pair and return it
        # without any fluctuation.
        base_price = round(random.uniform(50, 5000), 2)
        registry[key] = base_price
        return base_price

    # Later occurrences fluctuate around the stored base price.
    factor = random.uniform(1 - fluctuation, 1 + fluctuation)
    return round(registry[key] * factor, 2)

def generate_item_code(supplier, item, mapping=None):
    """Return the item code for a supplier-item pair, creating it if needed.

    A code is the first three characters of the supplier name, upper-cased,
    followed by a four-digit number (1000-9999). The code for a given pair is
    generated once and then reused on every later call. A newly generated
    code is guaranteed not to equal any code already in the registry, so two
    different pairs never share a code (previously two pairs with the same
    three-letter prefix could draw the same number).

    Args:
        supplier: Supplier name; must be a string, since it is sliced.
        item: Item identifier (must be hashable).
        mapping: Optional dict used as the code registry. When omitted, the
            module-level ``item_code_mapping`` dict is used.

    Returns:
        The item code string for the pair.

    Raises:
        RuntimeError: If all 9000 codes for the supplier's prefix are taken.
    """
    registry = item_code_mapping if mapping is None else mapping
    key = (supplier, item)
    if key in registry:
        return registry[key]

    prefix = supplier[:3].upper()
    taken = set(registry.values())

    # Common case: a single draw, exactly as before.
    code = f"{prefix}{random.randint(1000, 9999)}"
    if code in taken:
        # Collision: pick among the codes still unused for this prefix, which
        # always terminates (unlike redrawing until a free code turns up).
        free = [
            candidate
            for candidate in (f"{prefix}{number}" for number in range(1000, 10000))
            if candidate not in taken
        ]
        if not free:
            raise RuntimeError(f"No unused item codes left for prefix {prefix!r}")
        code = random.choice(free)

    registry[key] = code
    return code

def generate_faulted_parts(quantity):
    """Draw the number of faulted parts for an order of ``quantity`` units.

    A single uniform roll in [0, 1) selects one of four outcomes:
    below 0.7 -> no faulted parts; 0.7 up to 0.9 -> 5% of the quantity;
    0.9 up to 0.98 -> 10% of the quantity; 0.98 and above -> the whole
    quantity. Fractional results are truncated to an integer.
    """
    roll = random.random()

    # Check the rarest outcome first and fall through to the most common one.
    if roll >= 0.98:
        return quantity  # entire order faulted
    if roll >= 0.9:
        return int(quantity * 0.10)
    if roll >= 0.7:
        return int(quantity * 0.05)
    return 0

def select_category():
    """Pick the categories for one purchase order.

    Returns a one-element list holding a randomly chosen category name with
    probability 0.7; otherwise returns one of the allowed cross-category
    pairs, converted to a list.
    """
    single_category = random.random() < 0.7
    if not single_category:
        # Remaining 30%: one of the permitted category pairs.
        return list(random.choice(cross_category_combinations))
    return [random.choice(list(categories))]

# Generate purchase records: one row per line item, 1-4 line items per PO,
# and a single supplier per PO.
records = []
base_date = datetime(2023, 1, 1)
n_po_numbers = 30000  # Total number of POs (each with 1-4 items)

for po_num in range(100000, 100000 + n_po_numbers):
    # One or two categories, each contributing one or two distinct items.
    selected_categories = select_category()
    items_in_po = []
    for category in selected_categories:
        items_in_po.extend(random.sample(categories[category], random.randint(1, 2)))

    supplier = random.choice(suppliers)  # One supplier per PO
    supplier_code = supplier_code_mapping[supplier]  # same for every line of this PO

    for item, description in items_in_po:
        item_code = generate_item_code(supplier, item)
        quantity = random.randint(1, 300)
        price = get_price(supplier, item)
        # Round the monetary total to cents; the raw float product otherwise
        # carries binary artifacts (e.g. 59.970000000000006).
        po_value = round(quantity * price, 2)
        faulted_parts = generate_faulted_parts(quantity)
        # NOTE(review): dates are drawn per line item, so items of the same PO
        # can carry different down-payment dates - confirm this is intended.
        downpayment_date = base_date + timedelta(days=random.randint(0, 365))
        # Delivery follows the down payment by 3 to 100 days.
        delivery_date = downpayment_date + timedelta(days=random.randint(3, 100))

        records.append({
            "PO_NUM": po_num,
            "ITEM_NAME": item,
            "PART_DESCRIPTION": description,
            "ITEM_CODE": item_code,
            "SUPPLIER_NAME": supplier,
            "SUPPLIER_CODE": supplier_code,
            "ORDERED_QUANTITY": quantity,
            "FAULTED_PARTS": faulted_parts,
            "PRICE": price,
            "PO_VALUE": po_value,
            "DOWNPAYMENT_DATE": downpayment_date.strftime('%Y-%m-%d'),
            "DELIVERY_DATE": delivery_date.strftime('%Y-%m-%d')
        })

# Build the DataFrame once from the accumulated records (it is written to CSV
# in a later cell).
df = pd.DataFrame(records)



In [12]:
# Write the generated records to CSV, leaving out the DataFrame index column.
output_csv_path = './enlarged_dataset.csv'
df.to_csv(output_csv_path, index=False)