In [11]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Generate random dates between 2022-01-01 and 2024-12-31
start_date = datetime(2022, 1, 1)
end_date = datetime(2024, 12, 31)

date_list = [start_date + timedelta(days=random.randint(0, (end_date - start_date).days)) for _ in range(500)]

# Generate random Site_IDs
site_ids = [f"Site_{random.randint(1, 4)}" for _ in range(500)]

# Generate random CO2_Captured_Tons between 100 and 500 tons
co2_captured = [round(random.uniform(100, 500), 2) for _ in range(500)]

# Generate random Operational_Costs between 800 and 4500 USD
operational_costs = [round(random.uniform(800, 4500), 2) for _ in range(500)]

# Generate random Energy_Consumption between 2000 and 10000 kWh
energy_consumption = [round(random.uniform(2000, 10000), 2) for _ in range(500)]

# Generate random Revenue_CO2_Sales between 600 and 2500 USD
revenue_co2_sales = [round(random.uniform(600, 2500), 2) for _ in range(500)]

# Generate random Revenue_Carbon_Credits between 250 and 1000 USD
revenue_carbon_credits = [round(random.uniform(250, 1000), 2) for _ in range(500)]

# Generate random Plant_Uptime_Percentage between 80 and 100%
plant_uptime = [round(random.uniform(80, 100), 2) for _ in range(500)]

# Generate random Customer_Satisfaction_Score between 1 and 10
customer_satisfaction = [random.randint(1, 10) for _ in range(500)]

# Generate random latitude and longitude coordinates for each Site_ID
# Latitude ranges from -90 to 90
# Longitude ranges from -180 to 180
latitudes = [round(random.uniform(-90, 90), 4) for _ in range(500)]
longitudes = [round(random.uniform(-180, 180), 4) for _ in range(500)]

# Create DataFrame
df = pd.DataFrame({
    'Date': date_list,
    'Site_ID': site_ids,
    'Latitude': latitudes,
    'Longitude': longitudes,
    'CO2_Captured_Tons': co2_captured,
    'Operational_Costs': operational_costs,
    'Energy_Consumption': energy_consumption,
    'Revenue_CO2_Sales': revenue_co2_sales,
    'Revenue_Carbon_Credits': revenue_carbon_credits,
    'Plant_Uptime_Percentage': plant_uptime,
    'Customer_Satisfaction_Score': customer_satisfaction
})

# Introduce missing values randomly
for col in ['CO2_Captured_Tons', 'Operational_Costs', 'Energy_Consumption', 'Revenue_CO2_Sales', 'Revenue_Carbon_Credits', 'Plant_Uptime_Percentage']:
    indices = random.sample(range(500), 50)  # Select 50 random indices
    df.loc[indices, col] = np.nan

# Sort DataFrame by Date and Site_ID
df.sort_values(by=['Date', 'Site_ID'], inplace=True)
df.reset_index(drop=True, inplace=True)

# Display first few rows of the DataFrame
print(df.head())


        Date Site_ID  Latitude  Longitude  CO2_Captured_Tons  \
0 2022-01-01  Site_2   43.1288   143.8017             426.59   
1 2022-01-02  Site_1   26.4692   -89.5440                NaN   
2 2022-01-02  Site_1   73.7448   -11.1432             336.30   
3 2022-01-06  Site_1  -75.6948   114.4794             339.17   
4 2022-01-10  Site_4    0.5064    42.2252             219.77   

   Operational_Costs  Energy_Consumption  Revenue_CO2_Sales  \
0            2054.49             4660.83            1250.99   
1            3782.28             3793.76                NaN   
2                NaN             6523.91            1964.15   
3            2443.79             2966.91            1467.44   
4                NaN             6126.73            1159.69   

   Revenue_Carbon_Credits  Plant_Uptime_Percentage  \
0                  822.70                    87.51   
1                  618.78                    89.09   
2                  786.42                    85.25   
3                  3

In [12]:
# Export the DataFrame to a CSV file
df.to_csv('new_multi_site_factory_ccaaS_data.csv', index=False)

print("Dataset exported successfully!")


Dataset exported successfully!
