### Create Power Generation Dataframes


In [28]:
import pandas as pd
import numpy as np

# Data for France and Germany

# country = "Germany (DE)"
# country_short = "ger"
country = "France (FR)"
country_short = "fr"

new_df = pd.read_csv(
    f"{country_short}_2025.csv")
print(len(new_df))
new_df.head()



735840


Unnamed: 0,MTU (CET/CEST),Area,Production Type,Generation (MW)
0,01/01/2025 00:00:00 - 01/01/2025 00:15:00,France (FR),Biomass,342.31
1,01/01/2025 00:15:00 - 01/01/2025 00:30:00,France (FR),Biomass,341.56
2,01/01/2025 00:30:00 - 01/01/2025 00:45:00,France (FR),Biomass,362.69
3,01/01/2025 00:45:00 - 01/01/2025 01:00:00,France (FR),Biomass,361.64
4,01/01/2025 01:00:00 - 01/01/2025 01:15:00,France (FR),Biomass,361.11


In [29]:

# Format energy production data into desired structure

df = new_df.copy()

# Extract start time from MTU (remove CET/CEST)
start_str = (
    df["MTU (CET/CEST)"]
    .str.split(" - ").str[0]
    .str.replace(r"\s*\(.*\)", "", regex=True)
    .str.strip()
)

df["Time"] = pd.to_datetime(
    start_str,
    format="%d/%m/%Y %H:%M:%S",
    errors="raise"
)

df = df.set_index("Time").sort_index()

# Clean generation values
df["Generation (MW)"] = pd.to_numeric(df["Generation (MW)"], errors="coerce").fillna(0)

# Pivot to wide format
df_wide = df.pivot_table(
    index="Time",
    columns="Production Type",
    values="Generation (MW)",
    aggfunc="sum",
    fill_value=0
)

# Resample to 30‑minute intervals
df_30 = df_wide.resample("30min").mean()

# Get energy import data
flows = pd.read_csv(f"{country_short}_flows_2025.csv")

# Remove timezone suffix if present
flows["MTU_clean"] = flows["MTU"].str.replace(r"\s*\(.*?\)", "", regex=True)

# Extract start timestamp
flows["Start"] = flows["MTU_clean"].str.split(" - ").str[0]
flows["Start"] = pd.to_datetime(flows["Start"], format="%d/%m/%Y %H:%M:%S")

# Keep only imports to the country
flows = flows[flows["In Area"] == country].copy()

# Extract country code from Out Area
flows["Country"] = flows["Out Area"].str.extract(r"\((.*?)\)")

# Clean numeric flow values
flows["Physical Flow (MW)"] = (
    flows["Physical Flow (MW)"]
    .astype(str)
    .str.replace(",", ".", regex=False)
    .str.replace(r"[^\d\.\-]", "", regex=True)
)

flows["Physical Flow (MW)"] = pd.to_numeric(
    flows["Physical Flow (MW)"],
    errors="coerce"
).fillna(0)

# Convert hourly MW → 30‑minute MW for 30 minute sample size
flows["MW_30min"] = flows["Physical Flow (MW)"] / 2

# Expand hourly rows into two 30‑minute rows
flows_30 = pd.concat([
    flows.assign(Time=flows["Start"]),
    flows.assign(Time=flows["Start"] + pd.Timedelta(minutes=30))
])

# Pivot to wide format
flows_wide = flows_30.pivot_table(
    index="Time",
    columns="Country",
    values="MW_30min",
    aggfunc="sum",
    fill_value=0
)

# This removes nonexistent timestamps like 2025‑03‑30 02:00 and 02:30
df_30 = df_30.interpolate(limit_direction="both") 
flows_wide = flows_wide.interpolate(limit_direction="both")
# Merge generation and import data

final = pd.concat([df_30, flows_wide], axis=1)

# Clean numeric columns
numeric_cols = final.select_dtypes(include="number").columns
final[numeric_cols] = final[numeric_cols].replace([np.nan, np.inf, -np.inf], 0)
final[numeric_cols] = final[numeric_cols].clip(lower=0)
final[numeric_cols] = final[numeric_cols].round(0).astype(int)

# Save to csv

output_file = f"{country_short}_production_2025.csv"
final.to_csv(output_file)

final.head()


Unnamed: 0_level_0,Biomass,Energy storage,Fossil Brown coal/Lignite,Fossil Coal-derived gas,Fossil Gas,Fossil Hard coal,Fossil Oil,Fossil Oil shale,Fossil Peat,Geothermal,...,Waste,Wind Offshore,Wind Onshore,BE,CH,DE,ES,GB-CTY,IT,LU
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-01-01 00:00:00,342,0,0,0,2116,0,109,0,0,0,...,301,1366,9574,1111,0,0,0,1237,0,0
2025-01-01 00:30:00,362,0,0,0,1890,0,110,0,0,0,...,299,1366,9801,1111,0,0,0,1237,0,0
2025-01-01 01:00:00,360,0,0,0,1764,0,110,0,0,0,...,299,1383,9980,1092,0,0,0,1224,0,0
2025-01-01 01:30:00,360,0,0,0,1752,0,110,0,0,0,...,297,1375,10146,1092,0,0,0,1224,0,0
2025-01-01 02:00:00,359,0,0,0,1630,0,111,0,0,0,...,296,1382,10190,1285,0,0,0,1137,0,0


In [33]:
import pandas as pd
import numpy as np

# country = "Germany (DE)"
# country_short = "ger"
country = "France (FR)"
country_short = "fr"

# -----------------------------
# Load generation data
# -----------------------------
new_df = pd.read_csv(f"{country_short}_2026.csv")
df = new_df.copy()

# Extract start time from MTU (remove CET/CEST)
start_str = (
    df["MTU (CET/CEST)"]
    .str.replace(r"\s*\(.*?\)", "", regex=True)
    .str.split(" - ").str[0]
    .str.strip()
)

df["Time"] = pd.to_datetime(start_str, format="%d/%m/%Y %H:%M:%S")
df = df.set_index("Time").sort_index()

# Clean generation values
df["Generation (MW)"] = pd.to_numeric(df["Generation (MW)"], errors="coerce").fillna(0)

# Pivot to wide format
df_wide = df.pivot_table(
    index="Time",
    columns="Production Type",
    values="Generation (MW)",
    aggfunc="sum",
    fill_value=0
)

# Resample to 30‑minute intervals
df_30 = df_wide.resample("30min").mean()

# -----------------------------
# Load flows data (15‑minute)
# -----------------------------
flows = pd.read_csv(f"{country_short}_flows_2026.csv")

# Remove timezone suffix
flows["MTU_clean"] = flows["MTU"].str.replace(r"\s*\(.*?\)", "", regex=True)

# Extract start timestamp
flows["Start"] = flows["MTU_clean"].str.split(" - ").str[0]
flows["Start"] = pd.to_datetime(flows["Start"], format="%d/%m/%Y %H:%M:%S")

# Keep only imports to the country
flows = flows[flows["In Area"] == country].copy()

# Extract country code from Out Area
flows["Country"] = flows["Out Area"].str.extract(r"\((.*?)\)")

# Clean numeric flow values
flows["Physical Flow (MW)"] = (
    flows["Physical Flow (MW)"]
    .astype(str)
    .str.replace(",", ".", regex=False)
    .str.replace(r"[^\d\.\-]", "", regex=True)
)

flows["Physical Flow (MW)"] = pd.to_numeric(flows["Physical Flow (MW)"], errors="coerce").fillna(0)

# -----------------------------
# 2026 FIX: Collapse 15‑minute flows → 30‑minute flows
# -----------------------------
flows = flows.set_index("Start")

# Average every two 15‑min rows into one 30‑min row
flows_30 = flows.groupby([flows.index.floor("30min"), "Country"])["Physical Flow (MW)"].mean().unstack(fill_value=0)

# -----------------------------
# FORCE BOTH DATASETS TO 30‑MIN GRID
# -----------------------------
full_index = pd.date_range("2026-01-01 00:00", "2026-01-06 23:30", freq="30min")

df_30 = df_30.reindex(full_index).interpolate(limit_direction="both")
flows_30 = flows_30.reindex(full_index).interpolate(limit_direction="both")

# -----------------------------
# Merge generation + imports
# -----------------------------
final = pd.concat([df_30, flows_30], axis=1)

# Clean numeric columns
numeric_cols = final.select_dtypes(include="number").columns
final[numeric_cols] = final[numeric_cols].replace([np.nan, np.inf, -np.inf], 0)
final[numeric_cols] = final[numeric_cols].clip(lower=0)
final[numeric_cols] = final[numeric_cols].round(0).astype(int)

# Save to CSV
output_file = f"{country_short}_production_2026.csv"
final.to_csv(output_file)

final.head()


Unnamed: 0,Biomass,Energy storage,Fossil Brown coal/Lignite,Fossil Coal-derived gas,Fossil Gas,Fossil Hard coal,Fossil Oil,Fossil Oil shale,Fossil Peat,Geothermal,...,Waste,Wind Offshore,Wind Onshore,BE,CH,DE,ES,GB-CTY,IT,LU
2026-01-01 00:00:00,137,1,0,0,1534,52,35,0,0,0,...,177,330,1467,2529,0,1031,0,2261,0,0
2026-01-01 00:30:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2609,0,1241,0,2150,0,0
2026-01-01 01:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2690,0,1450,0,2038,0,0
2026-01-01 01:30:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2651,0,1520,0,2379,0,0
2026-01-01 02:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2612,0,1590,0,2720,0,0


In [31]:
import pandas as pd
import numpy as np

# Load the final output file
df = pd.read_csv("fr_production_2025.csv", parse_dates=["Time"], index_col="Time")

# Identify numeric columns (generation + imports)
num_cols = df.select_dtypes(include="number").columns
print("Numeric columns:", num_cols.tolist())
# --- 1. Detect "bad" rows: all zeros or nearly all zeros ---
# Allow up to 2 non-zero values
threshold_nonzero = 2

bad_mask = (df[num_cols] != 0).sum(axis=1) <= threshold_nonzero
bad_rows = df[bad_mask]

print("Bad rows detected:", len(bad_rows))

# --- 2. Function to compute replacement values ---
def fill_row(timestamp, row):
    month = timestamp.month
    minute_of_day = timestamp.hour * 60 + timestamp.minute

    # Select same month + same time-of-day
    same_time = df[
        (df.index.month == month) &
        ((df.index.hour * 60 + df.index.minute) == minute_of_day)
    ]

    # Exclude other bad rows
    same_time_good = same_time[(same_time[num_cols] != 0).sum(axis=1) > threshold_nonzero]

    if len(same_time_good) == 0:
        # No valid data to average → leave zeros
        return row

    # Compute mean across valid rows
    return same_time_good[num_cols].mean()


# --- 3. Apply the fix ---
df_fixed = df.copy()

for ts, row in bad_rows.iterrows():
    df_fixed.loc[ts, num_cols] = fill_row(ts, row)
numeric_cols = df_fixed.select_dtypes(include="number").columns
df_fixed[numeric_cols] = df_fixed[numeric_cols].replace([np.nan, np.inf, -np.inf], 0)
df_fixed[numeric_cols] = df_fixed[numeric_cols].clip(lower=0)
df_fixed[numeric_cols] = df_fixed[numeric_cols].round(0).astype(int)
# --- 4. Save corrected file ---

df_fixed = df_fixed.rename(columns={"GB-CTY": "GB"})

df_fixed.to_csv("fr_production_2025_fixed.csv")
print("Finished filling missing timestamps.")


Numeric columns: ['Biomass', 'Energy storage', 'Fossil Brown coal/Lignite', 'Fossil Coal-derived gas', 'Fossil Gas', 'Fossil Hard coal', 'Fossil Oil', 'Fossil Oil shale', 'Fossil Peat', 'Geothermal', 'Hydro Pumped Storage', 'Hydro Run-of-river and pondage', 'Hydro Water Reservoir', 'Marine', 'Nuclear', 'Other', 'Other renewable', 'Solar', 'Waste', 'Wind Offshore', 'Wind Onshore', 'BE', 'CH', 'DE', 'ES', 'GB-CTY', 'IT', 'LU']
Bad rows detected: 9
Finished filling missing timestamps.


  df_fixed.loc[ts, num_cols] = fill_row(ts, row)
  df_fixed.loc[ts, num_cols] = fill_row(ts, row)
  df_fixed.loc[ts, num_cols] = fill_row(ts, row)
  df_fixed.loc[ts, num_cols] = fill_row(ts, row)
  df_fixed.loc[ts, num_cols] = fill_row(ts, row)
  df_fixed.loc[ts, num_cols] = fill_row(ts, row)
  df_fixed.loc[ts, num_cols] = fill_row(ts, row)
  df_fixed.loc[ts, num_cols] = fill_row(ts, row)
  df_fixed.loc[ts, num_cols] = fill_row(ts, row)
  df_fixed.loc[ts, num_cols] = fill_row(ts, row)
  df_fixed.loc[ts, num_cols] = fill_row(ts, row)
  df_fixed.loc[ts, num_cols] = fill_row(ts, row)
  df_fixed.loc[ts, num_cols] = fill_row(ts, row)
  df_fixed.loc[ts, num_cols] = fill_row(ts, row)
  df_fixed.loc[ts, num_cols] = fill_row(ts, row)
  df_fixed.loc[ts, num_cols] = fill_row(ts, row)


In [7]:
import pandas as pd
import numpy as np

# Load the CSV
df = pd.read_csv("caiso_2025.csv")

# --- 1. Remove Interval Start and Interval End columns ---
df = df.drop(columns=["Interval Start", "Interval End"])

# --- 2. Convert the time column to LOCAL California time ---
df["Time"] = pd.to_datetime(df["Time"], utc=True)
df["Time"] = df["Time"].dt.tz_convert("America/Los_Angeles")

# --- 3. Remove the nonexistent DST hour (spring forward) ---
# In California, DST starts on 2025-03-09 at 02:00 (02:00–02:59 does not exist)
spring_gap = (
    (df["Time"].dt.date == pd.to_datetime("2025-03-09").date()) &
    (df["Time"].dt.hour == 2)
)
df = df[~spring_gap]

# --- 4. Remove timezone info but KEEP the local clock time ---
df["Time"] = df["Time"].dt.tz_localize(None)

# --- 5. Set as index for resampling ---
df = df.set_index("Time")

# --- 6. Resample to 30-minute intervals ---
df_30 = df.resample("30min").mean()

# --- 7. Build a perfect 30-minute index for the entire year ---
full_index = pd.date_range("2025-01-01 00:00", "2026-01-06 23:30", freq="30min")
df_30 = df_30.reindex(full_index)

# --- 8. Fill missing values (best for CAISO production data) ---
df_30 = df_30.interpolate(limit_direction="both")

# --- 9. Clean numeric columns ---
numeric_cols = df_30.select_dtypes(include="number").columns
df_30[numeric_cols] = df_30[numeric_cols].replace([np.nan, np.inf, -np.inf, -1], 0)
df_30[numeric_cols] = df_30[numeric_cols].clip(lower=0)
df_30[numeric_cols] = df_30[numeric_cols].round(0).astype(int)

# --- 10. Save cleaned CSV ---
df_30.to_csv("cal_production_2025.csv", index_label="Time")


In [10]:
# NEW GB Version

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# ============================================================
# 1. LOAD GB DATA (GENERATION + INTERCONNECTORS)
# ============================================================

gb = pd.read_csv("gb_2025.csv")

# Convert SettlementPeriod → timestamp (30‑min blocks)
def sp_to_time(date, sp):
    base = datetime.strptime(date, "%Y-%m-%d")
    minutes = (sp - 1) * 30
    return base + timedelta(minutes=minutes)

gb["Time"] = gb.apply(lambda x: sp_to_time(x["SettlementDate"], x["SettlementPeriod"]), axis=1)

# Domestic generation mapping
generation_map = {
    "BIOMASS": "Biomass",
    "CCGT": "Fossil Gas",
    "COAL": "Fossil Hard coal",
    "OIL": "Fossil Oil",
    "OCGT": "Fossil Gas",
    "NUCLEAR": "Nuclear",
    "WIND": "Wind Onshore",
    "NPSHYD": "Hydro Water Reservoir",
    "PS": "Hydro Pumped Storage",
    "OTHER": "Other",
}

interconnector_map = {
    "INTEW": "IE",      # Ireland (East-West)
    "INTIRL": "IE",     # Northern Ireland (Moyle) -> Tracked as Ireland
    "INTFR": "FR",      # France (IFA)
    "INTIFA2": "FR",    # France (IFA2)  <-- FIXED: now also FR
    "INTNED": "NL",     # Netherlands (BritNed)
    "INTNEM": "BE",     # Belgium (Nemo Link)
    "INTNSL": "NO",     # Norway (North Sea Link)
    "INTVKL": "DK",     # Denmark (Viking Link)
    "INTGRNL": "IE",    # Ireland (Greenlink)
    "INTELEC": "??",    # Unknown / legacy
}


# Separate domestic generation vs interconnectors
gb["GenCategory"] = gb["FuelType"].map(generation_map)
gb["ImpCategory"] = gb["FuelType"].map(interconnector_map)

# ============================================================
# 2. PROCESS DOMESTIC GENERATION
# ============================================================

gb_gen = gb[gb["GenCategory"].notna()].copy()

gb_wide = gb_gen.pivot_table(
    index="Time",
    columns="GenCategory",
    values="Generation",
    aggfunc="sum",
    fill_value=0
)

# ============================================================
# 3. PROCESS IMPORTS (INTERCONNECTORS)
# ============================================================

gb_imp = gb[gb["ImpCategory"].notna()].copy()

# Convert flows so that imports into GB are positive
gb_imp["ImportMW"] = -gb_imp["Generation"]

imp_wide = gb_imp.pivot_table(
    index="Time",
    columns="ImpCategory",
    values="ImportMW",
    aggfunc="sum",
    fill_value=0
)

# ============================================================
# 4. MERGE GENERATION + IMPORTS
# ============================================================

final = pd.concat([gb_wide, imp_wide], axis=1)

# ============================================================
# 5. FIX DST + ENSURE CONTINUOUS 30‑MIN TIMELINE
# ============================================================

# Convert to timezone-aware UK time
final = final.tz_localize("UTC").tz_convert("Europe/London")

# Remove nonexistent DST hour (spring forward)
spring_gap = (
    (final.index.date == pd.to_datetime("2025-03-30").date()) &
    (final.index.hour == 1)
)
final = final[~spring_gap]

# Remove timezone info but keep local clock time
final.index = final.index.tz_localize(None)

# Collapse duplicate timestamps (fall-back hour)
final = final.groupby(final.index).sum()

# Build perfect 30-minute index for the year
full_index = pd.date_range("2025-01-01 00:00", "2026-01-06 23:30", freq="30min")

# Reindex and fill missing values
final = final.reindex(full_index)
final = final.interpolate(limit_direction="both")

# ============================================================
# 6. CLEAN NUMERIC COLUMNS
# ============================================================

numeric_cols = final.select_dtypes(include="number").columns
final[numeric_cols] = final[numeric_cols].replace([np.nan, np.inf, -np.inf], 0)
final[numeric_cols] = final[numeric_cols].clip(lower=0)
final[numeric_cols] = final[numeric_cols].round(0).astype(int)

# ============================================================
# 7. SAVE OUTPUT
# ============================================================

final.to_csv("gb_production_2025.csv", index_label="Time")

print(final.head())


                     Biomass  Fossil Gas  Fossil Hard coal  Fossil Oil  \
2025-01-01 00:00:00      962        3711                 0           0   
2025-01-01 00:30:00     1097        3935                 0           0   
2025-01-01 01:00:00     1106        3769                 0           0   
2025-01-01 01:30:00     1083        3719                 0           0   
2025-01-01 02:00:00     1007        3675                 0           0   

                     Hydro Pumped Storage  Hydro Water Reservoir  Nuclear  \
2025-01-01 00:00:00                     0                    744     5062   
2025-01-01 00:30:00                     0                    745     5059   
2025-01-01 01:00:00                     0                    744     5056   
2025-01-01 01:30:00                     0                    745     5057   
2025-01-01 02:00:00                     0                    737     5057   

                     Other  Wind Onshore  ??   BE  DK    FR   IE  NL   NO  
2025-01-01 00:00

In [35]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# ============================================================
# 1. LOAD GB GENERATION DATA (domestic only)
# ============================================================

gb = pd.read_csv("gb_2025.csv")

# Convert SettlementPeriod → timestamp (30‑min blocks)
def sp_to_time(date, sp):
    base = datetime.strptime(date, "%Y-%m-%d")
    minutes = (sp - 1) * 30
    return base + timedelta(minutes=minutes)

gb["Time"] = gb.apply(lambda x: sp_to_time(x["SettlementDate"], x["SettlementPeriod"]), axis=1)

# Map GB fuel types → ENTSO‑E‑style categories
generation_map = {
    "BIOMASS": "Biomass",
    "CCGT": "Fossil Gas",
    "COAL": "Fossil Hard coal",
    "OIL": "Fossil Oil",
    "OCGT": "Fossil Gas",
    "NUCLEAR": "Nuclear",
    "WIND": "Wind Onshore",
    "NPSHYD": "Hydro Water Reservoir",
    "PS": "Hydro Pumped Storage",
    "OTHER": "Other",
}

gb["Category"] = gb["FuelType"].map(generation_map)

# Keep only domestic generation rows
gb_gen = gb[gb["Category"].notna()].copy()

# Pivot to wide format
gb_wide = gb_gen.pivot_table(
    index="Time",
    columns="Category",
    values="Generation",
    aggfunc="sum",
    fill_value=0
)

# ============================================================
# 2. LOAD IMPORT DATA FROM SEPARATE CSV
# ============================================================

imp = pd.read_csv("gb_flows_2025.csv")

# Remove optional timezone suffix like " (CET)" or " (UTC)"
imp["MTU_clean"] = imp["MTU"].str.replace(r"\s*\(.*?\)", "", regex=True)

# Extract the first timestamp (start of the MTU)
imp["Start"] = imp["MTU_clean"].str.split(" - ").str[0]

# Parse into datetime
imp["Start"] = pd.to_datetime(imp["Start"], format="%d/%m/%Y %H:%M:%S")

# Keep only imports INTO GB
imp = imp[imp["In Area"] == "United Kingdom (GB-CTY)"].copy()

# Extract country code from "Out Area"
imp["Country"] = imp["Out Area"].str.extract(r"\((.*?)\)")

# Convert hourly MW → 30‑minute MW
imp["MW_30min"] = imp["Physical Flow (MW)"] / 2

# Expand hourly rows into two 30‑minute rows
imp_30 = pd.concat([
    imp.assign(Time=imp["Start"]),
    imp.assign(Time=imp["Start"] + pd.Timedelta(minutes=30))
])

# Pivot to wide format
imp_wide = imp_30.pivot_table(
    index="Time",
    columns="Country",
    values="MW_30min",
    aggfunc="sum",
    fill_value=0
)

imp_wide.index.name = "Time"

# ============================================================
# 3. MERGE GENERATION + IMPORTS
# ============================================================

final = pd.concat([gb_wide, imp_wide], axis=1)

# ============================================================
# 4. FIX DST + ENSURE CONTINUOUS 30‑MIN TIMELINE
# ============================================================

# Convert to timezone-aware UK time
final = final.tz_localize("UTC").tz_convert("Europe/London")

# Remove nonexistent DST hour (spring forward)
spring_gap = (
    (final.index.date == pd.to_datetime("2025-03-30").date()) &
    (final.index.hour == 1)
)
final = final[~spring_gap]

# Remove timezone info but keep local clock time
final.index = final.index.tz_localize(None)

# ⭐ FIX: remove duplicate timestamps (fall‑back hour)
final = final.groupby(final.index).sum()

# Build perfect 30-minute index for the year
full_index = pd.date_range("2025-01-01 00:00", "2025-12-31 23:30", freq="30min")

# Reindex and fill missing values
final = final.reindex(full_index)
final = final.interpolate(limit_direction="both")





In [None]:
# # merge all 3 years
# df_2023 = pd.read_csv("germany_2023_generation.csv", index_col="Time", parse_dates=True)
# df_2024 = pd.read_csv("germany_2024_generation.csv", index_col="Time", parse_dates=True)
# df_2025 = pd.read_csv("germany_2025_generation.csv", index_col="Time", parse_dates=True)

# combined_df = pd.concat([df_2023, df_2024, df_2025])
# combined_df = combined_df.sort_index()

# # exclude duplicate time indices if any
# print(len(combined_df))
# combined_df = combined_df[~combined_df.index.duplicated(keep='first')]
# print(len(combined_df))
# combined_df.to_csv("germany_2325_generation.csv")

In [None]:
# combined_df.columns

In [None]:
# combined_df

### Add Carbon Intensity Column

In [None]:
# import pandas as pd


# ci_df = pd.read_csv("new_data/germany_2325_ci.csv")

# ci_df.head()

Unnamed: 0,Time,Biomass,Energy storage,Fossil Brown coal/Lignite,Fossil Coal-derived gas,Fossil Gas,Fossil Hard coal,Fossil Oil,Fossil Oil shale,Fossil Peat,...,Nuclear,Other,Other renewable,Solar,Waste,Wind Offshore,Wind Onshore,total_power_mw,emissions_weighted,carbon_intensity
0,2023-01-01 00:00:00,4014.0975,0.0,3859.6,651.375,1593.8225,2067.6225,306.4125,0.0,0.0,...,2459.17,187.3025,91.33,1.7925,735.2525,3059.0925,28947.15,49274.6825,8804439.0,178.68078
1,2023-01-01 01:00:00,3993.27,0.0,3866.365,629.275,1436.9025,2051.83,305.905,0.0,0.0,...,2458.6025,187.27,92.615,1.65,725.1,3586.26,29587.5575,50174.0675,8704868.0,173.493363
2,2023-01-01 02:00:00,3967.275,0.0,3860.135,570.95,1435.14,2034.2625,305.7125,0.0,0.0,...,2459.645,187.2525,92.4675,1.7975,718.67,3842.2825,29514.8475,50237.5625,8618429.0,171.553495
3,2023-01-01 03:00:00,3973.155,0.0,3864.61,579.375,1432.61,2037.06,306.0,0.0,0.0,...,2460.475,187.2025,91.7625,1.755,718.8425,3463.0525,27493.4675,47857.39,8604584.0,179.79634
4,2023-01-01 04:00:00,3996.42,0.0,3840.83,604.6,1430.85,2039.9775,306.0,0.0,0.0,...,2460.8025,187.2775,91.97,2.1275,721.325,3462.1925,26938.7425,47351.455,8603948.0,181.703985


In [None]:
# ci_df.columns

Index(['Time', 'Biomass', 'Energy storage', 'Fossil Brown coal/Lignite',
       'Fossil Coal-derived gas', 'Fossil Gas', 'Fossil Hard coal',
       'Fossil Oil', 'Fossil Oil shale', 'Fossil Peat', 'Geothermal',
       'Hydro Pumped Storage', 'Hydro Run-of-river and pondage',
       'Hydro Water Reservoir', 'Marine', 'Nuclear', 'Other',
       'Other renewable', 'Solar', 'Waste', 'Wind Offshore', 'Wind Onshore',
       'total_power_mw', 'emissions_weighted', 'carbon_intensity'],
      dtype='object')