# Day-Averaged Yearly files

The average (mean) energy consumed per day is calculated for each season, and the results are then appended together.

In [7]:
import polars as pl
import glob
import re
from pathlib import Path
import os
from datetime import datetime

In [8]:
# One entry per calendar day, stepping daily from 1 Jan 2024 to 31 Dec 2024,
# stored as a polars Date series (the time-of-day part is dropped by the cast).
timestamp_range = pl.datetime_range(
    start=datetime(2024, 1, 1),
    end=datetime(2024, 12, 31),
    interval="1d",
    eager=True,
).cast(pl.Date)

In [9]:
def day_avg_preprocess(df, timestamp_range):
    """Return the mean of 'bruttolastgang_kwh' for each calendar date.

    Args:
        df: polars DataFrame with a string column 'zeitpunkt' (parsed using
            the format "%Y-%m-%dT%H:%M:%S%z") and a column
            'bruttolastgang_kwh'.
        timestamp_range: collection of dates exposing ``to_list()`` (the
            notebook passes a polars Date series); rows whose date is not
            in it are discarded.

    Returns:
        DataFrame with the columns 'date' and 'day_avg_bruttolastgang_kwh',
        one row per remaining date, sorted by 'date'.
    """
    # Build the parse expression once and reuse it for both derived columns.
    # NOTE(review): with strict=False, unparseable timestamps presumably
    # become null instead of raising, and would then be dropped by the
    # date filter below -- confirm against the polars docs.
    parsed = pl.col("zeitpunkt").str.strptime(
        pl.Datetime, format="%Y-%m-%dT%H:%M:%S%z", strict=False
    )

    # NOTE(review): because of %z the parsed values may be normalised to UTC,
    # in which case 'date' is the UTC calendar day rather than the local one
    # -- confirm this is the intended day boundary.
    with_dates = df.with_columns(
        parsed.alias("zeitpunkt_dt"),
        parsed.dt.date().alias("date"),
    )

    in_requested_range = pl.col("date").is_in(timestamp_range.to_list())
    daily_mean = pl.col("bruttolastgang_kwh").mean().alias("day_avg_bruttolastgang_kwh")

    return (
        with_dates.filter(in_requested_range)
        .group_by("date")
        .agg(daily_mean)
        .sort("date")
    )

In [10]:
# Destination folder for the per-day averages (created if it does not exist)
output_dir = Path("/Users/jansigrist/Documents/SP/Customer_Segmentation_Lantern/DATA/SM_DATA_day_avg")
output_dir.mkdir(parents=True, exist_ok=True)

# All cleaned CSV files whose name contains "CH"
pattern = "/Users/jansigrist/Documents/SP/Customer_Segmentation_Lantern/DATA/SM_DATA_cleaned/*CH*.csv"
file_paths = glob.glob(pattern)

for file_path in file_paths:
    # The output keeps the input's file name, only the folder changes
    filename = Path(file_path).name
    output_file = output_dir / filename

    df = pl.read_csv(file_path)
    day_avg = day_avg_preprocess(df, timestamp_range)
    day_avg.write_csv(output_file)

# Normalizing

In [11]:
def normalize(df):
    """Scale 'day_avg_bruttolastgang_kwh' by its maximum, in place of the original column.

    Args:
        df: polars DataFrame with the columns 'day_avg_bruttolastgang_kwh'
            and 'date'.

    Returns:
        The DataFrame sorted by 'date', with 'day_avg_bruttolastgang_kwh'
        divided by the column maximum. When that maximum is exactly 0 the
        values are left as they are, to avoid dividing by zero.
    """
    column = "day_avg_bruttolastgang_kwh"
    peak = df.select(pl.col(column).max()).item()

    # Only a maximum of exactly 0 skips the division; every other value
    # (including a missing maximum) goes through it, as before.
    if peak != 0:
        df = df.with_columns((pl.col(column) / peak).alias(column))

    return df.sort("date")


In [12]:
# Read the per-day averages written by the day-average step and store a
# max-normalized copy of each file in a separate folder.

# Source folder: the day-average CSVs (columns 'date' and
# 'day_avg_bruttolastgang_kwh', which normalize() expects)
input_dir = Path("/Users/jansigrist/Documents/SP/Customer_Segmentation_Lantern/DATA/SM_DATA_day_avg")

# Destination folder for the normalized data (created if it does not exist)
output_dir = Path("/Users/jansigrist/Documents/SP/Customer_Segmentation_Lantern/DATA/SM_DATA_day_avg_normalized")
output_dir.mkdir(parents=True, exist_ok=True)

# The glob segment must be relative: joining a Path with a segment that starts
# with "/" discards the left-hand side, so the previous pattern collapsed to
# "/*CH*.csv" and searched the filesystem root. The pattern must also point at
# the input folder, not at the (initially empty) output folder.
pattern = str(input_dir / "*CH*.csv")
file_paths = glob.glob(pattern)

for file_path in file_paths:
    df = pl.read_csv(file_path)
    normalized_df = normalize(df)

    # Keep the original file name; only the folder changes
    original_file_name = Path(file_path).name
    output_file = output_dir / original_file_name
    normalized_df.write_csv(output_file)