In [None]:
import pickle
import pandas as pd
import matplotlib.pyplot as plt

# Read the two pre-processed dataframes back from their pickle files
data_path = "../data_processed/"

prepared_frames = []
for pickle_name in ("df_prepared.pkl", "merged_df_prepared.pkl"):
    with open(data_path + pickle_name, "rb") as f:
        prepared_frames.append(pickle.load(f))
df, merged_df = prepared_frames

# Quick sanity check of what was loaded
print("DataFrames loaded successfully:")
for label, frame in (("df", df), ("merged_df", merged_df)):
    print(f"  {label} shape: {frame.shape}")

# Descriptives

# a) KPIs

## KPI 1: Utilization Rate

In [None]:
def calculate_utilizationrate(df, date):
    """Return the utilization rate (in percent) of all stations on one calendar day.

    A station counts as utilized while a vehicle is connected, i.e. between
    ``connectionTime`` and ``disconnectTime``. Every session is clipped to the
    window [00:00, 24:00) of ``date``: a session that runs past midnight only
    contributes the hours that fall on this day, and a session that started on
    an earlier day but is still connected contributes its remaining hours.

    Parameters
    ----------
    df : pandas.DataFrame
        Session table with the columns ``stationID``, ``connectionTime`` and
        ``disconnectTime`` (datetime-like, timezone-naive or timezone-aware).
    date : str, datetime.date or datetime-like
        The calendar day to evaluate (anything ``pd.to_datetime`` accepts).

    Returns
    -------
    float
        ``occupied station-hours / (number of stations * 24) * 100``. The
        number of stations is the count of distinct ``stationID`` values in
        the whole of ``df``. Returns 0.0 when ``df`` contains no station.
    """
    day_start = pd.Timestamp(pd.to_datetime(date).date())
    day_end = day_start + pd.Timedelta(days=1)

    total_stations = df['stationID'].nunique()
    if total_stations == 0:
        return 0.0
    total_available_time = total_stations * 24

    # Work on local wall-clock times so the day window can be compared with
    # both timezone-naive and timezone-aware columns.
    sessions = pd.DataFrame({
        'stationID': df['stationID'].to_numpy(),
        'start': df['connectionTime'].dt.tz_localize(None).to_numpy(),
        'end': df['disconnectTime'].dt.tz_localize(None).to_numpy(),
    })

    # Keep only sessions that overlap the requested day (rows with missing
    # timestamps compare as False and drop out here).
    sessions = sessions[(sessions['start'] < day_end) & (sessions['end'] > day_start)]

    # Clip every session to the day window before measuring its length
    clipped_start = sessions['start'].where(sessions['start'] >= day_start, day_start)
    clipped_end = sessions['end'].where(sessions['end'] <= day_end, day_end)
    occupied_hours = ((clipped_end - clipped_start) / pd.Timedelta(hours=1)).clip(lower=0)

    # Overlapping records on one station must not push it above 24 h per day
    utilized_time_station = occupied_hours.groupby(sessions['stationID']).sum().clip(upper=24)
    total_utilized_time = utilized_time_station.sum()

    return float(total_utilized_time / total_available_time * 100)


def calculate_utilizationrate_per_day(df):
    """Build a table with the utilization rate of every day that has a connection.

    The days are the distinct calendar dates found in ``connectionTime``; each
    one is evaluated with ``calculate_utilizationrate``. The result is a
    DataFrame with the columns ``date`` and ``utilization_rate``, ordered by
    date ascending.
    """
    observed_days = sorted(df['connectionTime'].dt.date.unique())
    daily_records = [
        {'date': day, 'utilization_rate': calculate_utilizationrate(df, day)}
        for day in observed_days
    ]
    return pd.DataFrame(daily_records)

# Daily utilization table, indexed by calendar date
daily_utilization_df = calculate_utilizationrate_per_day(df).set_index('date')
print(daily_utilization_df)



In [None]:
# Daily utilization with 'date' kept as a regular column (used as x-axis below
# and extended with further columns in the following cells)
util_df = calculate_utilizationrate_per_day(df)

fig, ax = plt.subplots(figsize=(12, 6))

# One marker per day, connected by a thin line
line_kwargs = dict(marker='o', linewidth=1.5, color='tab:blue')
ax.plot(util_df['date'], util_df['utilization_rate'], **line_kwargs)

ax.set(
    xlabel='Date',
    ylabel='Utilization Rate (%)',
    title='Daily Utilization Rate of Charging Stations',
)

plt.show()

The plot shows several trends, which we will look at in a moment.
There are also large differences in utilization between individual days, which could be related to weekdays versus weekends. To check this, let's compare weekdays and weekends.

In [None]:
# Derive the weekday name, its short form and a weekend flag for every date;
# the is_weekend column separates weekdays from weekends below
util_dates = pd.to_datetime(util_df['date'])
util_df['day_of_week'] = util_dates.dt.day_name()
util_df['day_name_short'] = util_dates.dt.strftime('%a')
util_df['is_weekend'] = util_df['day_of_week'].isin(['Saturday', 'Sunday'])

# Calculate statistics for each day of the week
print("=" * 60)
print("UTILIZATION RATE BY DAY OF WEEK")
print("=" * 60)
by_day = util_df.groupby('day_of_week')['utilization_rate'].agg(['mean', 'std', 'count'])
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
by_day = by_day.reindex(day_order)
print(by_day)

print("\n" + "=" * 60)
print("WEEKDAY vs WEEKEND")
print("=" * 60)
by_weekend = util_df.groupby('is_weekend')['utilization_rate'].agg(['mean', 'std', 'count'])
# Label by key instead of by position so the names stay correct (and no
# length-mismatch error is raised) if only one of the two groups is present
by_weekend = by_weekend.rename(index={False: 'Weekday', True: 'Weekend'}).rename_axis(None)
print(by_weekend)

# Relative difference between the weekday and the weekend average
weekday_rates = util_df.loc[~util_df['is_weekend'], 'utilization_rate']
weekend_rates = util_df.loc[util_df['is_weekend'], 'utilization_rate']
weekday_mean = weekday_rates.mean()
weekend_mean = weekend_rates.mean()
diff_pct = ((weekday_mean - weekend_mean) / weekend_mean) * 100
print(f"\nWeekday average: {weekday_mean:.2f}%")
print(f"Weekend average: {weekend_mean:.2f}%")
print(f"Difference: {diff_pct:.1f}% higher on weekdays")


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Box plot for each day of week (ordered categorical keeps Monday..Sunday order)
util_df_sorted = util_df.copy()
util_df_sorted['day_of_week'] = pd.Categorical(util_df_sorted['day_of_week'], categories=day_order, ordered=True)
util_df_sorted.boxplot(column='utilization_rate', by='day_of_week', ax=ax1)
ax1.set_xlabel('Day of Week')
ax1.set_ylabel('Utilization Rate (%)')
ax1.set_title('Utilization Rate by Day of Week')
fig.suptitle('')  # Remove the automatic "Boxplot grouped by ..." figure title
plt.sca(ax1)
plt.xticks(rotation=45)

# Box plot for Weekday vs Weekend
ax2.boxplot([weekday_rates, weekend_rates],
            tick_labels=['Weekday', 'Weekend'])
ax2.set_ylabel('Utilization Rate (%)')
ax2.set_title('Utilization Rate: Weekday vs Weekend')
ax2.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

Looking at the average utilization on weekdays compared to weekends, it becomes clear that utilization on weekdays is definitely higher than on weekends. To be exact, it is 513% higher. The boxplot also gives us a nice visual of this relationship.

### Rolling averages and confidence bands
To get a better understanding of trends and the typical utilization compared to extreme days, let's look at the rolling averages and confidence bands.

In [None]:
# Make sure the x-axis column is a real datetime column
util_df["date"] = pd.to_datetime(util_df["date"])

window = 30  # size of the rolling window
k = 2        # band half-width in rolling standard deviations

# Rolling mean and standard deviation (at least 10 observations required)
# NOTE(review): an integer window counts rows, not calendar days; it spans 30
# days only if util_df holds one row per consecutive day - confirm.
rolling_util = util_df["utilization_rate"].rolling(window, min_periods=10)
roll_mean = rolling_util.mean()
roll_std = rolling_util.std()

# Band of mean ± k·std (≈95% of days if the values are roughly normal)
upper = roll_mean + k * roll_std
lower = roll_mean - k * roll_std

fig, ax = plt.subplots(figsize=(10, 4))
dates = util_df["date"]

# Raw daily values in the background
ax.plot(dates, util_df["utilization_rate"],
        color="tab:blue", alpha=0.3, label="Daily utilization")

# Smoothed trend
ax.plot(dates, roll_mean,
        color="tab:blue", linewidth=2, label=f"{window}-day rolling mean")

# Shaded variability band around the trend
ax.fill_between(dates, lower, upper,
                color="tab:blue", alpha=0.15, label=f"±{k}·rolling std")

ax.set(
    xlabel="Date",
    ylabel="Utilization rate (%)",
    title="Daily utilization with rolling average and band",
)
ax.grid(True, linestyle="--", alpha=0.4)
ax.legend()
fig.autofmt_xdate()
plt.tight_layout()
plt.show()

We see different trends. There is a rise in the fourth quarter of 2018, a stable high phase in 2019 and a sharp drop in early 2020. This is probably connected to the start of the Covid-19 pandemic. We can also see a slow recovery afterwards.

### Let's look at whether there are trends across the seasons.

In [None]:
# Map a month number to its meteorological season
def get_season(month):
    """Return 'Winter', 'Spring', 'Summer' or 'Fall' for a month number.

    December-February is winter, March-May spring and June-August summer;
    every other value (9, 10, 11 and anything unexpected) falls back to 'Fall'.
    """
    months_by_season = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season_name, season_months in months_by_season:
        if month in season_months:
            return season_name
    return 'Fall'  # 9, 10, 11

# Tag every day with its month and the corresponding season
util_df['month'] = pd.to_datetime(util_df['date']).dt.month
util_df['season'] = util_df['month'].map(get_season)

# Summary statistics per season, shown in calendar order
separator = "=" * 60
print(separator)
print("UTILIZATION RATE BY SEASON")
print(separator)
season_order = ['Winter', 'Spring', 'Summer', 'Fall']
by_season = (
    util_df.groupby('season')['utilization_rate']
    .agg(['mean', 'median', 'std', 'count'])
    .reindex(season_order)
)
print(by_season)


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: distribution per season (ordered categorical fixes the box order)
util_df_season = util_df.assign(
    season=pd.Categorical(util_df['season'], categories=season_order, ordered=True)
)
util_df_season.boxplot(column='utilization_rate', by='season', ax=ax1)
ax1.set_xlabel('Season')
ax1.set_ylabel('Utilization Rate (%)')
ax1.set_title('Utilization Rate by Season')
fig.suptitle('')  # Remove default title

# Right panel: mean utilization per season as bars
season_means = util_df.groupby('season')['utilization_rate'].mean().reindex(season_order)
colors = ['#87CEEB', '#90EE90', '#FFD700', '#FF8C00']  # Winter, Spring, Summer, Fall
ax2.bar(season_order, season_means, color=colors, alpha=0.7, edgecolor='black')
ax2.set_xlabel('Season')
ax2.set_ylabel('Average Utilization Rate (%)')
ax2.set_title('Average Utilization Rate by Season')
ax2.grid(axis='y', alpha=0.3)

# Write the mean value slightly above each bar
for bar_position, value in enumerate(season_means):
    ax2.text(bar_position, value + 0.5, f'{value:.1f}%', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# Time series with one scatter layer (and colour) per season
fig, ax = plt.subplots(figsize=(14, 6))

for season, color in zip(season_order, colors):
    in_season = util_df['season'] == season
    ax.scatter(util_df.loc[in_season, 'date'], util_df.loc[in_season, 'utilization_rate'],
               label=season, color=color, alpha=0.6, s=20)

ax.set_xlabel('Date')
ax.set_ylabel('Utilization Rate (%)')
ax.set_title('Daily Utilization Rate colored by Season')
ax.legend()
ax.grid(True, linestyle='--', alpha=0.4)
fig.autofmt_xdate()
plt.tight_layout()
plt.show()

# Descriptive comparison of the seasonal averages
# (only means are compared here; no significance test is carried out)
print("\n" + "=" * 60)
print("SEASON COMPARISON")
print("=" * 60)

mean_by_season = {
    name: util_df.loc[util_df['season'] == name, 'utilization_rate'].mean()
    for name in ('Winter', 'Spring', 'Summer', 'Fall')
}
winter_mean = mean_by_season['Winter']
spring_mean = mean_by_season['Spring']
summer_mean = mean_by_season['Summer']
fall_mean = mean_by_season['Fall']

print(f"Winter: {winter_mean:.2f}%")
print(f"Spring: {spring_mean:.2f}%")
print(f"Summer: {summer_mean:.2f}%")
print(f"Fall:   {fall_mean:.2f}%")

# Busiest and quietest season according to the by_season summary table
season_avg = by_season['mean']
highest_season = season_avg.idxmax()
lowest_season = season_avg.idxmin()
diff = season_avg.loc[highest_season] - season_avg.loc[lowest_season]
print(f"\nHighest: {highest_season} ({season_avg.loc[highest_season]:.2f}%)")
print(f"Lowest:  {lowest_season} ({season_avg.loc[lowest_season]:.2f}%)")
print(f"Difference: {diff:.2f} percentage points")

The statistics and the visuals show that the utilization in winter, spring and summer is almost the same, with only slight differences. In fall, however, the utilization is considerably higher than in the other seasons. The boxplot confirms that the median and upper quartiles are also clearly higher in fall, meaning that typical days, and not just extremes, are busier in that season.

### utilization per hour

In [None]:
def calculate_hourly_utilization(df, end_col='doneChargingTime'):
    """Compute the average station utilization for every hour of the day.

    Each session marks its station as occupied for every clock hour it touches
    between ``connectionTime`` and ``df[end_col]`` (a partially covered hour
    counts as a full hour). A station-hour is counted only once, even if
    several sessions of the same station fall into it, so the rate cannot
    exceed 100% through double counting.

    Parameters
    ----------
    df : pandas.DataFrame
        Session table with the columns ``stationID``, ``connectionTime`` and
        the column named by ``end_col``.
    end_col : str, default 'doneChargingTime'
        Column that marks the end of the occupied period. Pass
        ``'disconnectTime'`` to measure plug occupancy instead of charging
        time.

    Returns
    -------
    pandas.DataFrame
        One row per hour of day that has any usage, with the columns
        ``hour``, ``stationID`` (number of occupied station-hours),
        ``total_available`` (stations * days in the observed range) and
        ``utilization_rate`` (percent). Empty if no session produces a
        station-hour.
    """
    one_hour = pd.Timedelta(hours=1)

    # Collect unique (clock hour, station) pairs; the set removes duplicates
    # caused by several sessions of one station within the same hour.
    occupied = set()
    for start, end, station in zip(df['connectionTime'], df[end_col], df['stationID']):
        if pd.isna(start) or pd.isna(end):
            continue  # sessions without a usable start/end cannot be placed
        current = start.replace(minute=0, second=0, microsecond=0)
        while current < end:
            occupied.add((current, station))
            current += one_hour

    if not occupied:
        return pd.DataFrame(columns=['hour', 'stationID', 'total_available', 'utilization_rate'])

    usage_df = pd.DataFrame(
        [(hour_start.hour, station) for hour_start, station in occupied],
        columns=['hour', 'stationID'],
    )

    total_stations = df['stationID'].nunique()
    total_days = (df['connectionTime'].max().date() - df['connectionTime'].min().date()).days + 1

    # Number of occupied station-hours per hour of day
    hourly_stats = usage_df.groupby('hour').agg({
        'stationID': 'count'
    }).reset_index()

    # Available station-hours per hour of day = total_stations * total_days
    hourly_stats['total_available'] = total_stations * total_days
    hourly_stats['utilization_rate'] = (hourly_stats['stationID'] / hourly_stats['total_available']) * 100

    return hourly_stats




In [None]:
hourly_util = calculate_hourly_utilization(df)
print(hourly_util)

# Bar chart of the utilization rate per hour of day
fig, ax = plt.subplots(figsize=(12, 6))

bar_style = dict(color='tab:blue', alpha=0.7, edgecolor='black')
ax.bar(hourly_util['hour'], hourly_util['utilization_rate'], **bar_style)

ax.set_xlabel('Hour of Day', fontsize=12)
ax.set_ylabel('Utilization Rate (%)', fontsize=12)
ax.set_title('Average Utilization Rate by Hour of Day', fontsize=14, fontweight='bold')

# One tick per hour, labelled as HH:00
hours_of_day = range(0, 24)
ax.set_xticks(hours_of_day)
ax.set_xticklabels([f'{h:02d}:00' for h in hours_of_day], rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)

# Mark the hour with the highest utilization
peak_row = hourly_util['utilization_rate'].idxmax()
peak_hour = hourly_util.loc[peak_row, 'hour']
ax.axvline(peak_hour, color='red', linestyle='--', alpha=0.5, label=f'Peak hour: {peak_hour:02d}:00')
ax.legend()

plt.tight_layout()
plt.show()


Utilization starts very low during the night and rises steeply from about 06:00, reaching a maximum around 09:00, which is marked by the red dashed line. After 09:00 the utilization gradually declines over the day, with a slower tail into the evening. There is no second strong peak comparable to the morning spike. The plot provides us with a critical window between 08:00 and 11:00 for operators. If users report waiting times or congestion, this is the period where measures like adding capacity or time limits would have the most impact.

## KPI 2: Energy delivered per Hour

In [None]:
def hourlyEnergyDelivered(df):
    """Sum the delivered energy per calendar year and hour of day.

    The energy of a session (``kWhDelivered``) is assumed to be delivered at a
    uniform rate between ``connectionTime`` and ``disconnectTime``. Each clock
    hour the session touches receives the share of the energy that matches the
    time the session actually overlaps that hour, so the shares of one session
    add up to its ``kWhDelivered`` (a partially covered hour gets only its
    proportional part, not a full hour's worth).

    Parameters
    ----------
    df : pandas.DataFrame
        Session table with the columns ``connectionTime``, ``disconnectTime``
        and ``kWhDelivered``.

    Returns
    -------
    pandas.DataFrame
        One row per (``year``, ``hour``) combination with the column
        ``energy_delivered`` holding the summed energy in kWh. Empty if no
        session has a positive duration.
    """
    one_hour = pd.Timedelta(hours=1)
    shares = []

    for start, end, energy in zip(df['connectionTime'], df['disconnectTime'], df['kWhDelivered']):
        if pd.isna(start) or pd.isna(end):
            continue  # cannot place a session without both timestamps

        session_seconds = (end - start).total_seconds()
        if session_seconds <= 0:
            continue  # zero or negative duration: nothing to distribute

        # Walk over every clock hour the session touches
        current = start.replace(minute=0, second=0, microsecond=0)
        while current < end:
            next_hour = current + one_hour
            # Part of this clock hour that lies inside the session
            overlap_seconds = (min(end, next_hour) - max(start, current)).total_seconds()
            shares.append({
                'year': current.year,
                'hour': current.hour,
                'energy_delivered': energy * overlap_seconds / session_seconds
            })
            current = next_hour

    if not shares:
        return pd.DataFrame(columns=['year', 'hour', 'energy_delivered'])

    energy_df = pd.DataFrame(shares)

    # Total energy per calendar year and hour of day
    hourly_stats = energy_df.groupby(['year', 'hour']).agg({
        'energy_delivered': 'sum'
    }).reset_index()

    return hourly_stats

In [None]:
# Show the aggregated table: one row per (year, hour of day) with the summed energy
print(hourlyEnergyDelivered(df))

In [None]:
# Energy totals per (year, hour of day) that are plotted below
hourly_energy = hourlyEnergyDelivered(df)

fig, ax = plt.subplots(figsize=(12, 6))

colors = ['#87CEEB', '#90EE90', '#FFD700', '#FF8C00']  # Colors for different years (cycled if needed)
line_styles = ['-', '--', '-.', ':']

# One line per year so the daily profiles can be compared
for idx, year in enumerate(sorted(hourly_energy['year'].unique())):
    year_data = hourly_energy[hourly_energy['year'] == year]
    ax.plot(
        year_data['hour'],
        year_data['energy_delivered'],
        marker='o',
        label=str(year),
        color=colors[idx % len(colors)],
        linestyle=line_styles[idx % len(line_styles)],
        linewidth=2
    )
ax.set_title('Hourly Energy Delivered by Year')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Total Energy Delivered (kWh)')
ax.set_xticks(range(0, 24))  # one tick per hour, as in the hourly utilization plot
ax.legend()
ax.grid(True, linestyle='--', alpha=0.4)

# Finish the figure the same way as the other plotting cells
plt.tight_layout()
plt.show()

# b) Site Characteristics