# Preprocess the EV data
Goal: Preprocess the EV data from [1] for easier use in the optimization.

## Input
Before running the code below, download the dataset "Dataset 1_EV charging reports.csv" from [https://data.mendeley.com/datasets/jbks2rcwyj/1](https://data.mendeley.com/datasets/jbks2rcwyj/1) and save it in the folder "input".
From [1]: 'The CSV file “Dataset 1” describes 6,878 individual charging sessions, registered by 97 user IDs from December 2018 to January 2020. The charging reports include plug-in time, plug-out time and charged energy per charging session. Each charging session is connected to a user ID, charger ID and address. The charger IDs are either private or shared, since the charge points (CPs) are either located on the residents private parking spaces, or on shared parking areas available for all residents registered as users. ...'

## Output
Dataframe with hourly information for each user ID/vehicle.

## Sources
[1] Sørensen, Å. L., Lindberg, K. B., Sartori, I., & Andresen, I. (2021). Residential electric vehicle charging datasets from apartment buildings. Data in Brief, 36, 107105.

In [None]:
# IMPORTS
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path

In [None]:
# SETTINGS
# Paths
path_ev_orig = Path("input/Dataset 1_EV charging reports.csv")  # original EV data from https://data.mendeley.com/datasets/jbks2rcwyj/1
path_output = Path("input/preprocessed")  # output of this script is input to the optimization

# Investigated time window
start_date = "2019-01-01"  # only consider charging sessions from this date onwards; format: YYYY-MM-DD (heat pump and household data starts on 01.01.2019)
end_date = "2020-01-01"  # only consider charging sessions until this date; format: YYYY-MM-DD (end_date is not included, EV data ends on 31.01.2020, local time)

# Time zones
tz_data = "Europe/Berlin"  # time zone of input data; from [1]: "In the data provided with this article, Central European Time (CET) zone is used, which is GMT +1. Daylight saving time (DST) applies."
tz_output = "UTC"  # time zone of output data

# Hourly date range from start_date to end_date (end_date excluded); each time stamp indicates the start of a time interval.
# The lowercase alias "h" is used because the uppercase "H" is deprecated since pandas 2.2; both produce the same range.
daterange = pd.date_range(start_date, end_date, freq="h", inclusive="left", tz=tz_output)
print(f"Daterange: {daterange[0]} - {daterange[-1]}")

In [None]:
# DATA TYPES AND REMOVAL OF DATA THAT IS NOT RELEVANT
# Read in the data (semicolon-separated; numbers use a decimal comma and are therefore read as strings)
ev = pd.read_csv(path_ev_orig, sep=";")

# Drop columns that are not needed for the hourly profiles
list_drop = ["month_plugin", "weekdays_plugin", "Plugin_category", "Duration_category", "Shared_ID"]
ev = ev.drop(columns=list_drop)

# Convert El_kWh and duration hours to float (replace the decimal comma by a decimal point first)
for num_col in ["El_kWh", "Duration_hours"]:
    ev[num_col] = ev[num_col].str.replace(",", ".", regex=False).astype(float)

# Drop rows with no end plugout time
# From [1]: "If the plug-out time is too early, compared to energy charged and maximum 11 kW charging power available, the plug-out time is removed (set to NA), since this indicates that the value is incorrect (relevant for 34 charging sessions)."
ev = ev.dropna(subset=["End_plugout"])

# Only keep private charging sessions; then drop column "User_type"
ev = ev[ev["User_type"] != "Shared"]
assert list(ev["User_type"].unique()) == ["Private"], "Error: User_type should only contain 'Private'."
ev = ev.drop(columns="User_type")

# Format plug-in and plug-out time; change time zone to UTC
# From [1]: "In the data provided with this article, Central European Time (CET) zone is used, which is GMT +1. Daylight saving time (DST) applies." (It is a Norwegian dataset.)
# NOTE: tz_localize is called with its defaults, i.e. it raises if a local time stamp is ambiguous or
# non-existent because of a DST switch.
for col in ["Start_plugin", "End_plugout"]:
    local_time = pd.to_datetime(ev[col], format="%d.%m.%Y %H:%M")
    ev[col] = local_time.dt.tz_localize(tz_data).dt.tz_convert(tz_output)

# Only consider charging sessions that start AND end on a (UTC) day of the defined time window
days_in_window = np.unique(daterange.date)
starts_in_window = ev["Start_plugin"].dt.date.isin(days_in_window)
ends_in_window = ev["End_plugout"].dt.date.isin(days_in_window)
# Explicit copy: new columns are added below, and assigning to a boolean-mask slice would raise a
# SettingWithCopyWarning
ev = ev[starts_in_window & ends_in_window].copy()

# Add a new column with kWh/min if charging was distributed equally over entire duration
ev["kWh/min"] = ev["El_kWh"] / (ev["Duration_hours"] * 60)

# Add a new column with power in kW if charging was distributed equally over entire duration
ev["kW"] = ev["El_kWh"] / ev["Duration_hours"]

# Show the first two rows
ev.head(2)

In [None]:
# Question: is there exactly one charger per garage, or can one garage host several chargers?
# The sessions of a single day in garage "AdO1" serve as an example.
in_garage = ev["Garage_ID"].eq("AdO1")
on_example_day = ev["Start_plugin"].dt.date == datetime.date(2019, 2, 28)
ev.loc[in_garage & on_example_day]
# --> there are 3 different user IDs charging at the same time in garage "AdO1",
#     so there must be multiple chargers in one garage

In [None]:
# Collect every user ID that occurs in the filtered sessions, in sorted order
distinct_ids = ev["User_ID"].drop_duplicates()
unique_users_ = sorted(distinct_ids)
print("Number of unique users: ", len(unique_users_))

In [None]:
# Plot the charging processes over time for each user (one subplot row per user)
# squeeze=False makes plt.subplots always return a 2D array of axes, so the loop also works if only a
# single user is left (otherwise a bare Axes object is returned, which cannot be indexed)
fig, axs = plt.subplots(figsize=(15, 20), nrows=len(unique_users_), sharex=True, sharey=True, squeeze=False)
axs = axs.ravel()  # single column --> flatten to a 1D array of axes
for ax, u in zip(axs, unique_users_):
    # Hourly series that is NaN everywhere except for the hours in which the EV is plugged in
    ts = pd.Series(np.nan, index=daterange)
    df_u = ev[ev["User_ID"] == u]
    for plug_in, plug_out in zip(df_u["Start_plugin"], df_u["End_plugout"]):
        # Mark all hours from the plug-in hour to the plug-out hour (both included)
        s = plug_in.replace(minute=0)
        e = plug_out.replace(minute=0)
        ts.loc[s:e] = 1
    ts.plot(ax=ax, linewidth=5, label=u)
    ax.legend(loc="center left", bbox_to_anchor=(1, 0.5), handlelength=0.2)
    ax.set_ylim(0.9, 1.1)
    ax.set_yticks([])  # remove y-ticks
    ax.set_xticks([], minor=True)  # remove minor x-ticks

In [None]:
# The plot shows that many users only joined/switched to a private charging station towards the end of the time period.
# --> only consider users that have at least one charging process in the first <n_weeks> weeks of the investigated date range.
n_weeks = 12

# The days of the first <n_weeks> weeks are the same for every user --> compute them only once
first_days = np.unique(daterange.date)[:n_weeks * 7]

# IDs of all users with at least one plug-in on one of these days (single pass over all sessions)
starts_early = ev["Start_plugin"].dt.date.isin(first_days)
early_users = set(ev.loc[starts_early, "User_ID"])

# Keep the (sorted) order of unique_users_
unique_users = [u for u in unique_users_ if u in early_users]
print("Number of unique users left: ", len(unique_users))
print("Corresponding user IDs: ", unique_users)

In [None]:
# COMPUTE HOURLY DATA FOR EACH USER
# Initialize dataframe with zeros; columns are a MultiIndex (user ID, quantity)
col_per_user = ["kWh",  # kWh charged in this hour
                "share_of_hour",  # how long is the EV plugged-in in this hour; e.g.: 0.5 means 30 minutes
                "start",  # indicates if a charging process starts in this hour; 1 if yes, 0 if no
                "hours_until_end",  # how many hours does the charging process last (in the same row where "start" is 1); e.g., if charging process starts at 10:10 and ends at 12:30, then "hours_until_end" is 2
                ]
columns = pd.MultiIndex.from_product([unique_users, col_per_user])
ev_hourly = pd.DataFrame(0.0, index=daterange, columns=columns)

hourly_index = ev_hourly.index
one_hour = pd.Timedelta(hours=1)

# Iterate over all user IDs
list_same_hour = []  # (start hour, user) of charging processes that were merged (edge cases below)
list_overlapping = []  # (user, plug-in time) of charging processes that were skipped because they overlap
for u in unique_users:
    idx_last_start = None  # start HOUR of last charging process
    ts_last_end = None  # end TIMESTAMP of last charging process

    # Get all charging processes of one user.
    # The overlap check and the edge-case handling below compare each process with the previous one,
    # i.e. they require chronological order --> sort explicitly (stable, so ties keep their file order)
    df_u = ev[ev["User_ID"] == u].sort_values("Start_plugin", kind="stable")

    # Iterate over all charging processes of one user
    for plug_in, plug_out, kwh_per_min in zip(df_u["Start_plugin"], df_u["End_plugout"], df_u["kWh/min"]):
        # Check for overlapping charging processes for the same user ID; example: AdO1 on 17.01.2020
        # --> only the first charging process is considered
        if ts_last_end is not None and plug_in < ts_last_end:
            list_overlapping.append((u, plug_in))
            continue

        # Define start and end hour (time stamps are given with minute resolution)
        s = plug_in.replace(minute=0)
        e = plug_out.replace(minute=0)

        # Only visit the hours from s (plug-in hour) to e (plug-out hour), both included; the EV is
        # plugged-in in exactly these time intervals (no scan over the whole year per charging process)
        for idx in hourly_index[(hourly_index >= s) & (hourly_index <= e)]:
            # If the charging process starts in this hour, set "start" to 1 and compute "hours_until_end"
            # Note: there can be edge cases where
            # 1) two charging processes start in the same hour (first one very short)
            # 2) one charging process ends and another one starts in the same hour
            # in these cases, the 2 charging processes are considered as one; value "hours_until_end" is the sum of the two charging processes
            if idx == s:
                hours_until_end = (e - s) / one_hour
                if idx_last_start is None:
                    no_edge_case = True
                else:
                    last_duration = float(ev_hourly.loc[idx_last_start, (u, "hours_until_end")])
                    no_edge_case = idx > idx_last_start + datetime.timedelta(hours=last_duration)

                if no_edge_case:
                    ev_hourly.loc[idx, (u, "start")] = 1
                    ev_hourly.loc[idx, (u, "hours_until_end")] = hours_until_end
                    idx_last_start = idx
                else:  # edge case --> consider the two charging processes as one
                    ev_hourly.loc[idx_last_start, (u, "hours_until_end")] += hours_until_end
                    list_same_hour.append((idx_last_start, u))

            # Compute the number of minutes for which the EV is plugged-in in the current hour
            if s == e:  # charging process starts and ends in this hour
                minutes = plug_out.minute - plug_in.minute
            elif idx == s:  # charging process starts in this hour, but ends in another hour
                minutes = 60 - plug_in.minute
            elif idx == e:  # charging process ends in this hour, but has started in another hour
                minutes = plug_out.minute
            else:  # charging process starts and ends in another hour
                minutes = 60

            # Add the charged energy and share of hour to the dataframe
            # Note: values are added to the current value in the dataframe as there can be multiple charging processes in the same hour
            ev_hourly.loc[idx, (u, "kWh")] += minutes * kwh_per_min
            ev_hourly.loc[idx, (u, "share_of_hour")] += minutes / 60

        # Update last end time (only reached for charging processes that were not skipped)
        ts_last_end = plug_out

# print("List of indices to check for edge cases: ", list_same_hour)
# print("List of overlapping charging processes: ", list_overlapping)

# Filter for rows where the first user is charging and show the first 5 rows
ev_hourly[ev_hourly.loc[:, pd.IndexSlice[unique_users[0], "kWh"]] > 0].head(5)

In [None]:
# Scale the energy columns of all users from kWh to Wh and rename the second column level accordingly
kwh_columns = pd.IndexSlice[:, "kWh"]
energy_in_wh = ev_hourly.loc[:, kwh_columns] * 1000
ev_hourly.loc[:, kwh_columns] = energy_in_wh
ev_hourly.rename(columns={"kWh": "Wh"}, level=1, inplace=True)

# Show the first 5 rows in which the first user is charging
first_user_wh = ev_hourly.loc[:, pd.IndexSlice[unique_users[0], "Wh"]]
ev_hourly[first_user_wh > 0].head(5)

In [None]:
# SAVE THE PROCESSED DATA
# Create the output folder first; to_csv does not create missing directories
path_output.mkdir(parents=True, exist_ok=True)
ev_hourly.to_csv(path_output / "Hourly_EV_Charging.csv", sep=";")

#### How to filter multi-index

In [None]:
# All columns of one specific user (here: the first one); both column levels are kept
first_user_columns = pd.IndexSlice[unique_users[0], :]
ev_hourly.loc[:, first_user_columns].head(2)

In [None]:
# Only the "Wh" column of every user; the second column level is dropped so that columns are plain user IDs
wh_all_users = ev_hourly.loc[:, (slice(None), "Wh")]
wh_all_users.droplevel(level=1, axis=1).head(2)

#### Maximum charging for each user

In [None]:
# Largest hourly "Wh" value of each user
for u in unique_users:
    peak_value = ev_hourly[(u, "Wh")].max()
    print(f"Max. charging power for user {u} in W: {peak_value}")