In [452]:
import pandas as pd
import numpy as np

In [453]:
# CSV extracts to combine into one frame.
files = [
    "NGER.ID0075.csv", "NGER.ID0076.csv", "NGER.ID0077.csv",
    "NGER.ID0078.csv", "NGER.ID0079.csv", "NGER.ID0080.csv",
    "NGER.ID0081.csv", "NGER.ID0082.csv", "NGER.ID0083.csv",
    "NGER.ID0243.csv"
]

# The header of this file defines the canonical column names and their order.
REFERENCE_FILE = "NGER.ID0243.csv"

ref = pd.read_csv(REFERENCE_FILE)
standard_cols = ref.columns.tolist()
print("standard columns：", standard_cols)


def _column_key(name):
    """Return a comparison key that ignores case, outer whitespace and space/underscore differences."""
    return name.strip().lower().replace(" ", "_")


# Built once instead of being recomputed for every column of every file.
# If two standard names share a key, the later one wins (same as the old nested loop).
standard_by_key = {_column_key(std): std for std in standard_cols}

dfs = []

for fname in files:
    # The reference file is already in memory; rename() below returns a new
    # frame, so `ref` itself is never modified.
    df = ref if fname == REFERENCE_FILE else pd.read_csv(fname)

    # Rename headers that match a standard column after normalisation;
    # headers with no match keep their original name.
    df = df.rename(columns=lambda c: standard_by_key.get(_column_key(c), c))

    # Keep only the standard columns this file actually has. Missing columns
    # are deliberately NOT pre-filled with pd.NA: all-NA object columns are the
    # kind of entry pandas' concat deprecation warning asks callers to exclude
    # (presumably the warning echoed under this cell). pd.concat aligns the
    # columns and fills the gaps with missing values on its own.
    df = df.loc[:, df.columns.isin(standard_cols)]

    dfs.append(df)

# Vertical merge, then restore the canonical column order in one step.
NGER = pd.concat(dfs, ignore_index=True, sort=False).reindex(columns=standard_cols)
print("combined size：", NGER.shape)

standard columns： ['Reporting entity', 'Facility name', 'Type', 'State', 'Electricity production GJ', 'Electricity production MWh', 'Total scope 1 emissions t CO2 e', 'Total scope 2 emissions t CO2 e', 'Total emissions t CO2 e', 'Emission intensity t CO2 e MWh', 'Grid connected', 'Grid', 'Primary fuel', 'Important notes']
combined size： (5942, 14)


  NGER = pd.concat(dfs, ignore_index=True, sort=False)


In [454]:
# Discard rows that cannot be used downstream: rows named "Corporate Total",
# rows whose Type is missing or the "-" placeholder, and rows with no
# "Electricity production GJ" value.
is_corporate_total = NGER["Facility name"] == "Corporate Total"
type_unusable = NGER["Type"].isna() | (NGER["Type"] == "-")
production_missing = NGER["Electricity production GJ"].isna()

rows_to_drop = NGER.index[is_corporate_total | type_unusable | production_missing]
NGER = NGER.drop(index=rows_to_drop)

In [455]:
# Impute missing scope 1 / scope 2 emission values with each column's own median.
emission_cols = ["Total scope 1 emissions t CO2 e", "Total scope 2 emissions t CO2 e"]
column_medians = NGER[emission_cols].median()
NGER[emission_cols] = NGER[emission_cols].fillna(column_medians)

In [456]:
# Fill in emission intensity (total emissions divided by MWh produced) for
# rows where it is missing and the MWh figure is strictly positive.
intensity_col = "Emission intensity t CO2 e MWh"
mask = NGER[intensity_col].isna() & NGER["Electricity production MWh"].gt(0)

rows_to_fill = NGER.loc[mask]
NGER.loc[mask, intensity_col] = rows_to_fill["Total emissions t CO2 e"].div(
    rows_to_fill["Electricity production MWh"]
)

In [457]:
# In the grid columns, replace both missing values and the "-" placeholder
# with that column's off-grid label.
grid_defaults = {"Grid connected": "Off", "Grid": "Off-grid"}
for column, default in grid_defaults.items():
    filled = NGER[column].fillna(default)
    NGER[column] = filled.replace("-", default)

In [458]:
# Remove the "Important notes" column; errors="ignore" makes this a no-op
# when the column is already absent (e.g. when the cell is re-run).
NGER = NGER.drop("Important notes", axis="columns", errors="ignore")

In [459]:
# Convert the label-like columns to the pandas "category" dtype in one call.
cat_cols = [
    'Reporting entity', 'Facility name', 'Type', 'State',
    'Grid connected', 'Grid', 'Primary fuel',
]
NGER = NGER.astype({name: 'category' for name in cat_cols})

In [460]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line after the report.
NGER.info()


<class 'pandas.core.frame.DataFrame'>
Index: 4877 entries, 0 to 5940
Data columns (total 13 columns):
 #   Column                           Non-Null Count  Dtype   
---  ------                           --------------  -----   
 0   Reporting entity                 4476 non-null   category
 1   Facility name                    4877 non-null   category
 2   Type                             4877 non-null   category
 3   State                            4877 non-null   category
 4   Electricity production GJ        4877 non-null   float64 
 5   Electricity production MWh       4877 non-null   float64 
 6   Total scope 1 emissions t CO2 e  4877 non-null   float64 
 7   Total scope 2 emissions t CO2 e  4877 non-null   float64 
 8   Total emissions t CO2 e          4877 non-null   int64   
 9   Emission intensity t CO2 e MWh   4877 non-null   float64 
 10  Grid connected                   4877 non-null   category
 11  Grid                             4877 non-null   category
 12  Primary fue

In [461]:
# Number of distinct values per column; missing values are not counted.
NGER.nunique(axis=0, dropna=True)

Reporting entity                    314
Facility name                       853
Type                                  2
State                                 8
Electricity production GJ          4699
Electricity production MWh         4519
Total scope 1 emissions t CO2 e    2543
Total scope 2 emissions t CO2 e    1156
Total emissions t CO2 e            3003
Emission intensity t CO2 e MWh     1272
Grid connected                        2
Grid                                  6
Primary fuel                         22
dtype: int64