In [5]:
from pathlib import Path
import pandas as pd

PROJECT_ROOT = Path.cwd()
if not (PROJECT_ROOT / "README.md").exists():
    PROJECT_ROOT = PROJECT_ROOT.parent  # jump from Notebooks/ to repo root

src = PROJECT_ROOT / "Data" / "Processed" / "uk_electricity_pbi.csv"
print(src)
print("exists:", src.exists(), "size_MB:", round(src.stat().st_size/1024/1024, 1))

c:\DataAnalystTraining\Energy-Portfolio\Data\Processed\uk_electricity_pbi.csv
exists: True size_MB: 223.3


In [6]:
df = pd.read_csv(src, parse_dates=["dtm"])
df = df.sort_values("dtm").set_index("dtm")

minute = df.resample("1min").mean(numeric_only=True)
minute["out_of_band"] = (minute["f"] < 49.8) | (minute["f"] > 50.2)

out = PROJECT_ROOT / "Data" / "Processed" / "uk_electricity_pbi_1min.csv"
minute.reset_index().to_csv(out, index=False, float_format="%.4f")

print(out)
print("size_MB:", round(out.stat().st_size/1024/1024, 1))


c:\DataAnalystTraining\Energy-Portfolio\Data\Processed\uk_electricity_pbi_1min.csv
size_MB: 2.1


In [14]:
import pandas as pd
from pathlib import Path

PROJECT_ROOT = Path.cwd()
if not (PROJECT_ROOT / "README.md").exists():
    for parent in Path.cwd().parents:
        if (parent / "README.md").exists():
            PROJECT_ROOT = parent
            break

raw_path = PROJECT_ROOT / "Data" / "Raw" / "uk_electricity.csv"
processed_dir = PROJECT_ROOT / "Data" / "Processed"
processed_dir.mkdir(parents=True, exist_ok=True)

raw_path

WindowsPath('c:/DataAnalystTraining/Energy-Portfolio/Data/Raw/uk_electricity.csv')

In [15]:
df = pd.read_csv(raw_path)

df.head(), df.shape

(                   dtm       f
 0  2025-11-01 00:00:01  49.964
 1  2025-11-01 00:00:02  49.967
 2  2025-11-01 00:00:03  49.964
 3  2025-11-01 00:00:04  49.965
 4  2025-11-01 00:00:05  49.969,
 (2591999, 2))

In [16]:
dt_col = "dtm"
metric_col = "f"

df[dt_col] = pd.to_datetime(df[dt_col], errors="coerce")
df[metric_col] = pd.to_numeric(df[metric_col], errors="coerce")

df = df.dropna(subset=[dt_col, metric_col]).sort_values(dt_col)

df.dtypes, df[[dt_col, metric_col]].head()

(dtm    datetime64[ns]
 f             float64
 dtype: object,
                   dtm       f
 0 2025-11-01 00:00:01  49.964
 1 2025-11-01 00:00:02  49.967
 2 2025-11-01 00:00:03  49.964
 3 2025-11-01 00:00:04  49.965
 4 2025-11-01 00:00:05  49.969)

In [17]:
df["minute"] = df[dt_col].dt.floor("min")

df = df.set_index(dt_col)
df["rolling_5min"] = df[metric_col].rolling("5min").mean()

df["deviation_from_50"] = df[metric_col] - 50.0

df["out_of_band"] = (df[metric_col] < 49.9) | (df[metric_col] > 50.1)

df = df.reset_index()

df[["dtm","f","minute","rolling_5min","deviation_from_50","out_of_band"]].head()

Unnamed: 0,dtm,f,minute,rolling_5min,deviation_from_50,out_of_band
0,2025-11-01 00:00:01,49.964,2025-11-01,49.964,-0.036,False
1,2025-11-01 00:00:02,49.967,2025-11-01,49.9655,-0.033,False
2,2025-11-01 00:00:03,49.964,2025-11-01,49.965,-0.036,False
3,2025-11-01 00:00:04,49.965,2025-11-01,49.965,-0.035,False
4,2025-11-01 00:00:05,49.969,2025-11-01,49.9658,-0.031,False


In [18]:
summary = {
    "rows": len(df),
    "start": df["dtm"].min(),
    "end": df["dtm"].max(),
    "avg_f": df["f"].mean(),
    "min_f": df["f"].min(),
    "max_f": df["f"].max(),
    "out_of_band_count": int(df["out_of_band"].sum())
}

summary

{'rows': 2591999,
 'start': Timestamp('2025-11-01 00:00:01'),
 'end': Timestamp('2025-11-30 23:59:59'),
 'avg_f': np.float64(49.999767024601475),
 'min_f': np.float64(49.753),
 'max_f': np.float64(50.285),
 'out_of_band_count': 645555}

In [19]:
out_path = processed_dir / "uk_electricity_pbi.csv"
df.to_csv(out_path, index=False)

out_path

WindowsPath('c:/DataAnalystTraining/Energy-Portfolio/Data/Processed/uk_electricity_pbi.csv')

In [20]:
sample_path = processed_dir / "uk_electricity_pbi_sample_50k.csv"
df.head(50_000).to_csv(sample_path, index=False)

sample_path

WindowsPath('c:/DataAnalystTraining/Energy-Portfolio/Data/Processed/uk_electricity_pbi_sample_50k.csv')

In [1]:
import pandas as pd

src = "Data/Processed/uk_electricity_pbi.csv"
df = pd.read_csv(src, parse_dates=["dtm"])

df = df.sort_values("dtm").set_index("dtm")

minute = df.resample("1min").mean(numeric_only=True)

# recreate out_of_band (adjust thresholds if yours differ)
minute["out_of_band"] = (minute["f"] < 49.8) | (minute["f"] > 50.2)

out = "Data/Processed/uk_electricity_pbi_1min.csv"
minute.reset_index().to_csv(out, index=False, float_format="%.4f")

out

FileNotFoundError: [Errno 2] No such file or directory: 'Data/Processed/uk_electricity_pbi.csv'

In [2]:
from pathlib import Path
Path.cwd()

WindowsPath('c:/DataAnalystTraining/Energy-Portfolio/Notebooks')

In [3]:
from pathlib import Path
import pandas as pd

PROJECT_ROOT = Path.cwd()
if not (PROJECT_ROOT / "README.md").exists():
    for p in Path.cwd().parents:
        if (p / "README.md").exists():
            PROJECT_ROOT = p
            break

src = PROJECT_ROOT / "Data" / "Processed" / "uk_electricity_pbi.csv"
df = pd.read_csv(src, parse_dates=["dtm"])
df.head()

Unnamed: 0,dtm,f,minute,rolling_5min,deviation_from_50,out_of_band
0,2025-11-01 00:00:01,49.964,2025-11-01 00:00:00,49.964,-0.036,False
1,2025-11-01 00:00:02,49.967,2025-11-01 00:00:00,49.9655,-0.033,False
2,2025-11-01 00:00:03,49.964,2025-11-01 00:00:00,49.965,-0.036,False
3,2025-11-01 00:00:04,49.965,2025-11-01 00:00:00,49.965,-0.035,False
4,2025-11-01 00:00:05,49.969,2025-11-01 00:00:00,49.9658,-0.031,False


In [4]:
from pathlib import Path
import pandas as pd

# load
df = pd.read_csv(src, parse_dates=["dtm"])
df = df.sort_values("dtm").set_index("dtm")

# 1-minute aggregation (mean)
minute = df.resample("1min").mean(numeric_only=True)

# re-create out_of_band if needed (adjust thresholds to yours)
if "f" in minute.columns:
    minute["out_of_band"] = (minute["f"] < 49.8) | (minute["f"] > 50.2)

out = PROJECT_ROOT / "Data" / "Processed" / "uk_electricity_pbi_1min.csv"
minute.reset_index().to_csv(out, index=False, float_format="%.4f")
out

WindowsPath('c:/DataAnalystTraining/Energy-Portfolio/Data/Processed/uk_electricity_pbi_1min.csv')