### Download the dataset

In [None]:
import kagglehub


# Kaggle handle of the dataset this notebook is built on.
DATASET_HANDLE = "lastman0800/sustainable-manufacturing-large-data"

# `path` is reused by the next cell to locate the downloaded CSV.
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

### Open the dataset and drop duplicates

In [None]:
import os
import pandas as pd


# Locate the CSV inside the directory stored in `path` by the download cell.
file_path = os.path.join(path, "Sustainable Manufacturing Large Data.csv")

# Read the file and discard exact duplicate rows in one chained expression,
# so `df` is bound only once, already de-duplicated.
df = pd.read_csv(file_path).drop_duplicates()

df

### Generate data for 2020 (Machine 1 - 10)

In [None]:
import warnings
import numpy as np
from sklearn.linear_model import LinearRegression

# NOTE(review): this silences every warning for the rest of the session;
# presumably added to hide warnings raised by the predict() calls below —
# confirm before narrowing it.
warnings.filterwarnings("ignore")
# Seed NumPy's global RNG so the generated samples are reproducible.
np.random.seed(42)

# Get all dates for the year 2020
dates = pd.date_range(start='2020-01-01', end='2020-12-31').tolist()

# Feature columns are picked by position (columns 2..9 of the raw frame).
# NOTE(review): assumes the raw CSV column order — confirm against the dataset.
feature_cols = df.columns[2:10]
dependent_features = [col for col in feature_cols if col != 'Operation_Hours']
machine_ids = df['Machine_ID'].unique()

# Fit models: each dependent feature ~ Operation_Hours
models = {}
# Noise scale per feature: 10% of that feature's standard deviation in the
# source data. Computed once here rather than for every generated value.
noise_scales = {}
for col in dependent_features:
    model = LinearRegression()
    model.fit(df[['Operation_Hours']], df[col])
    models[col] = model
    noise_scales[col] = df[col].std() * 0.1

synthetic_samples = []

# One synthetic row per (date, machine) pair.
for date in dates:
    for machine_id in machine_ids:
        # Sample the operation hours from a uniform distribution between 6 and 10 hours
        op_hours = round(np.random.uniform(6, 10), 2)

        sample_row = {
            'Date': date,
            'Machine_ID': machine_id,
            # Stored as a whole number of hours; the predictions below still
            # use the unrounded sample.
            'Operation_Hours': round(op_hours)
        }

        for col in dependent_features:
            prediction = models[col].predict([[op_hours]])[0]
            noise = np.random.normal(0, noise_scales[col])
            # Add the noise first and round the sum: rounding only the
            # prediction and then calling int() on the float sum would
            # truncate toward zero and bias the generated values.
            sample_row[col] = int(round(prediction + noise))

        synthetic_samples.append(sample_row)

# NOTE: `df` is rebound from the raw data to the synthetic data here, so
# re-running this cell without re-running the loading cell would fit the
# models on synthetic rows. Later cells read the synthetic frame via `df`.
df = pd.DataFrame(synthetic_samples)
print("Synthetic Dataframe shape:", df.shape)
df[0:100]

### Create the target JSON file

In [None]:
import json


# Target-schema key for every numeric column, listed in output order.
target_keys = {
    "Operation_Hours": "operation_hours",
    "Energy_Consumption_kWh": "energy_consumption_kWh",
    "Material_Used_kg": "material_used_kg",
    "Material_Waste_kg": "material_waste_kg",
    "CO2_Emissions_kg": "CO2_emissions_kg",
    "Water_Consumption_Liters": "water_consumption_liters",
    "Water_Recycled_Liters": "water_recycled_liters",
    "Product_Output_Units": "product_output_units",
}

# Maps each machine id to the list of its daily records.
json_data = {}

for _, record in df.iterrows():
    # The date always comes first and is serialised as YYYY-MM-DD.
    entry = {"date": record["Date"].strftime("%Y-%m-%d")}
    for column, target_key in target_keys.items():
        entry[target_key] = record[column]

    # Create the machine's list on first sight, then append the record.
    json_data.setdefault(record["Machine_ID"], []).append(entry)

# Save the JSON data to a file
with open("synthetic_data_target.json", "w") as out_file:
    json.dump(json_data, out_file, indent=4)

### Introduce noise and generate source JSON file

In [None]:
# Per-machine schema variations used to derive the "source" JSON.
# For every machine id, each DataFrame column name maps to a pair
# (output key, converter); the converter receives the column's value for one
# row and returns the value to store under the output key.
variation_functions = {
    # different key, correct unit, unit in key
    "M001": {
        "Date": ("CurrentDate", lambda x: x.strftime("%y-%m-%d")),
        "Operation_Hours": ("UptimeHours", lambda x: x),
        "Energy_Consumption_kWh": ("PowerKWh", lambda x: x),
        "Material_Used_kg": ("SubstanceUsedKg", lambda x: x),
        "Material_Waste_kg": ("SubstanceWasteKg", lambda x: x),
        "CO2_Emissions_kg": ("CarbonDioxideKg", lambda x: x),
        "Water_Consumption_Liters": ("WaterUsageLiters", lambda x: x),
        "Water_Recycled_Liters": ("WaterReclaimedLiters", lambda x: x),
        "Product_Output_Units": ("YieldUnits", lambda x: x),
    },
    # different key, different unit, unit in key
    "M002": {
        "Date": ("CurrentDateIso", lambda x: x.isoformat()),
        "Operation_Hours": ("UptimeSeconds", lambda x: x * 3600),
        "Energy_Consumption_kWh": ("PowerWh", lambda x: x * 1000),
        "Material_Used_kg": ("SubstanceUsedGrams", lambda x: x * 1000),
        "Material_Waste_kg": ("SubstanceWasteMilligrams", lambda x: x * 1000000),
        "CO2_Emissions_kg": ("CarbonDioxideTonnes", lambda x: x / 1000),
        "Water_Consumption_Liters": ("WaterUsageMilliliters", lambda x: x * 1000),
        "Water_Recycled_Liters": ("WaterReclaimedMilliliters", lambda x: x * 1000),
        "Product_Output_Units": ("YieldUnitsThousands", lambda x: x / 1000),
    },
    # different key, correct unit, unit in value
    "M003": {
        "Date": ("CurrentDate", lambda x: x.strftime("%y-%m-%d")),
        "Operation_Hours": ("Uptime", lambda x: f"{x} h"),
        "Energy_Consumption_kWh": ("Power", lambda x: f"{x} kWh"),
        "Material_Used_kg": ("SubstanceUsed", lambda x: f"{x} kg"),
        "Material_Waste_kg": ("SubstanceWaste", lambda x: f"{x} kg"),
        "CO2_Emissions_kg": ("CarbonDioxide", lambda x: f"{x} kg"),
        "Water_Consumption_Liters": ("WaterUsage", lambda x: f"{x} l"),
        "Water_Recycled_Liters": ("WaterReclaimed", lambda x: f"{x} l"),
        "Product_Output_Units": ("Yield", lambda x: f"{x} units"),
    },
    # different key, different unit, unit in value
    "M004": {
        "Date": ("CurrentDate", lambda x: x.strftime("%y-%m-%d")),
        "Operation_Hours": ("Uptime", lambda x: f"{x * 3600} s"),
        "Energy_Consumption_kWh": ("Power", lambda x: f"{x * 1000} Wh"),
        "Material_Used_kg": ("SubstanceUsed", lambda x: f"{x * 1000} g"),
        "Material_Waste_kg": ("SubstanceWaste", lambda x: f"{x * 1000000} mg"),
        "CO2_Emissions_kg": ("CarbonDioxide", lambda x: f"{x / 1000} t"),
        "Water_Consumption_Liters": ("WaterUsage", lambda x: f"{x * 1000} ml"),
        "Water_Recycled_Liters": ("WaterReclaimed", lambda x: f"{x * 1000} ml"),
        "Product_Output_Units": ("Yield", lambda x: f"{x / 1000} thousand units"),
    },
    # same key, different unit, unit in key
    "M005": {
        "Date": ("date_iso", lambda x: x.isoformat()),
        "Operation_Hours": ("operation_seconds", lambda x: x * 3600),
        "Energy_Consumption_kWh": ("energy_consumption_Wh", lambda x: x * 1000),
        "Material_Used_kg": ("material_used_g", lambda x: x * 1000),
        "Material_Waste_kg": ("material_waste_g", lambda x: x * 1000),
        "CO2_Emissions_kg": ("CO2_emissions_t", lambda x: x / 1000),
        "Water_Consumption_Liters": ("water_consumption_ml", lambda x: x * 1000),
        "Water_Recycled_Liters": ("water_recycled_ml", lambda x: x * 1000),
        "Product_Output_Units": ("product_output_units_thousands", lambda x: x / 1000),
    },
    # same key, different unit, unit in value
    "M006": {
        "Date": ("date", lambda x: x.isoformat()),
        "Operation_Hours": ("operation", lambda x: f"{x * 3600} s"),
        "Energy_Consumption_kWh": ("energy_consumption", lambda x: f"{x * 1000} Wh"),
        "Material_Used_kg": ("material_used", lambda x: f"{x * 1000} g"),
        "Material_Waste_kg": ("material_waste", lambda x: f"{x * 1000} g"),
        "CO2_Emissions_kg": ("CO2_emissions", lambda x: f"{x / 1000} t"),
        "Water_Consumption_Liters": ("water_consumption", lambda x: f"{x * 1000} ml"),
        "Water_Recycled_Liters": ("water_recycled", lambda x: f"{x * 1000} ml"),
        "Product_Output_Units": ("product_output", lambda x: f"{x / 1000} thousand units"),
    },
    # same key, correct unit, unit in value
    "M007": {
        "Date": ("date", lambda x: x.strftime("%Y-%m-%d")),
        "Operation_Hours": ("operation", lambda x: f"{x} h"),
        "Energy_Consumption_kWh": ("energy_consumption", lambda x: f"{x} kWh"),
        "Material_Used_kg": ("material_used", lambda x: f"{x} kg"),
        "Material_Waste_kg": ("material_waste", lambda x: f"{x} kg"),
        "CO2_Emissions_kg": ("CO2_emissions", lambda x: f"{x} kg"),
        "Water_Consumption_Liters": ("water_consumption", lambda x: f"{x} l"),
        "Water_Recycled_Liters": ("water_recycled", lambda x: f"{x} l"),
        "Product_Output_Units": ("product_output", lambda x: f"{x} units"),
    },
    # same key, correct unit, unit in key (IDENTITY MAPPING)
    "M008": {
        "Date": ("date", lambda x: x.strftime("%Y-%m-%d")),
        "Operation_Hours": ("operation_hours", lambda x: x),
        "Energy_Consumption_kWh": ("energy_consumption_kWh", lambda x: x),
        "Material_Used_kg": ("material_used_kg", lambda x: x),
        "Material_Waste_kg": ("material_waste_kg", lambda x: x),
        # Upper-case "CO2" so this key matches the target JSON schema exactly,
        # as the identity mapping requires.
        "CO2_Emissions_kg": ("CO2_emissions_kg", lambda x: x),
        "Water_Consumption_Liters": ("water_consumption_liters", lambda x: x),
        "Water_Recycled_Liters": ("water_recycled_liters", lambda x: x),
        "Product_Output_Units": ("product_output_units", lambda x: x),
    },
    # same key, nested value and unit (different unit)
    "M009": {
        "Date": ("date", lambda x: {"value": x.strftime("%Y-%m-%d"), "unit": "ISO"}),
        "Operation_Hours": ("operation", lambda x: {"value": x * 3600, "unit": "s"}),
        "Energy_Consumption_kWh": ("energy_consumption", lambda x: {"value": x * 1000, "unit": "Wh"}),
        "Material_Used_kg": ("material_used", lambda x: {"value": x * 1000, "unit": "g"}),
        "Material_Waste_kg": ("material_waste", lambda x: {"value": x * 1000, "unit": "g"}),
        "CO2_Emissions_kg": ("CO2_emissions", lambda x: {"value": x / 1000, "unit": "t"}),
        "Water_Consumption_Liters": ("water_consumption", lambda x: {"value": x * 1000, "unit": "ml"}),
        "Water_Recycled_Liters": ("water_recycled", lambda x: {"value": x * 1000, "unit": "ml"}),
        "Product_Output_Units": ("product_output", lambda x: {"value": x / 1000, "unit": "thousand units"}),
    },
    # same key, nested value and unit (same unit)
    "M010": {
        "Date": ("date", lambda x: {"value": x.strftime("%Y-%m-%d"), "unit": "ISO"}),
        "Operation_Hours": ("operation", lambda x: {"value": x, "unit": "h"}),
        "Energy_Consumption_kWh": ("energy_consumption", lambda x: {"value": x, "unit": "kWh"}),
        "Material_Used_kg": ("material_used", lambda x: {"value": x, "unit": "kg"}),
        "Material_Waste_kg": ("material_waste", lambda x: {"value": x, "unit": "kg"}),
        "CO2_Emissions_kg": ("CO2_emissions", lambda x: {"value": x, "unit": "kg"}),
        "Water_Consumption_Liters": ("water_consumption", lambda x: {"value": x, "unit": "l"}),
        "Water_Recycled_Liters": ("water_recycled", lambda x: {"value": x, "unit": "l"}),
        "Product_Output_Units": ("product_output", lambda x: {"value": x, "unit": "units"}),
    },
}


# Maps each machine id to the list of its daily records, expressed in that
# machine's own key names, units and value layout.
json_data = {}

for _, row in df.iterrows():
    machine_id = row["Machine_ID"]

    # Fail with an explicit message when a machine has no variation table,
    # instead of an opaque AttributeError from calling .items() on None.
    variations = variation_functions.get(machine_id)
    if variations is None:
        raise KeyError(
            f"No variation functions defined for machine {machine_id!r}"
        )

    # Rename every column and convert its value with the machine's converter.
    obj = {
        new_key: func(row[key])
        for key, (new_key, func) in variations.items()
    }

    # Create the machine's list on first sight, then append the record.
    json_data.setdefault(machine_id, []).append(obj)

# Save the JSON data to a file
with open("synthetic_data_src.json", "w") as f:
    json.dump(json_data, f, indent=4)