In [0]:
import io
import os

import pandas as pd
from azure.storage.blob import BlobServiceClient

In [0]:
def get_data_from_blob_storage(
    folder_name: str,
    blob_name: str,
    connection_string: str,
    container_name: str = "source-data",
    csv_sep: str = ";",
) -> pd.DataFrame:
    """
    Download a file from Azure Blob Storage and return its content as a DataFrame.

    The blob is read fully into memory and parsed from there, so nothing is
    written to (or left behind in) the local working directory.

    Args:
        folder_name (str): Name of the folder inside the container.
        blob_name (str): Name of the file inside that folder. Its extension
            selects the parser: ``.parquet`` or ``.csv`` (case-insensitive).
        connection_string (str): Connection string of the storage account.
        container_name (str): Container to read from. Defaults to "source-data".
        csv_sep (str): Field separator used for ``.csv`` files. Defaults to ";".

    Returns:
        pd.DataFrame: Content of the file as a DataFrame.

    Raises:
        ValueError: If ``blob_name`` ends in neither ``.parquet`` nor ``.csv``.
    """
    # Validate the format before touching the network so an unsupported file
    # fails fast and is never downloaded.
    lowered_name = blob_name.lower()
    if lowered_name.endswith(".parquet"):
        is_parquet = True
    elif lowered_name.endswith(".csv"):
        is_parquet = False
    else:
        raise ValueError("Unsupported file format. Only .parquet and .csv are supported.")

    # Establish connection to the Azure Blob Storage account
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)

    # Access the container
    container_client = blob_service_client.get_container_client(container_name)

    # Build the full path to the blob within the folder
    blob_path = f"{folder_name}/{blob_name}"
    blob_client = container_client.get_blob_client(blob_path)

    # Parse straight from an in-memory buffer: a temp file named after the blob
    # could overwrite a local file, breaks when blob_name contains a sub-path,
    # and would be left on disk whenever parsing raises.
    buffer = io.BytesIO(blob_client.download_blob().readall())

    if is_parquet:
        return pd.read_parquet(buffer)
    return pd.read_csv(buffer, sep=csv_sep)


# Maps the column names found in the source file to the names used after preparation.
_COLUMN_RENAMES = {
    "datum": "datetime",
    "Überschuss": "feed_in:kWh",
    "Produktion": "production:kWh",
    "Eigenverbrauch": "self-consumption:kWh",
    "PLZ": "zip_code",
    "Ort": "city",
    "PanelPeakLeistung": "panel_peak_power:kWp",
    "Ausrichtung": "orientation",
    "Anstellwinkel": "tilt:deg",
    "Ausrichtung_Grad": "orientation:deg",
    "Installierte, nominale Speicherkapazität (kWh)": "battery_capacity:kWh",
    "Kategorie": "category",
}


def prepare_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Rename the source columns and drop the "date" column.

    The input frame is left untouched; a new DataFrame is returned. The
    function is safe to call on an already prepared frame (e.g. when a
    notebook cell is re-run): source columns that are absent are skipped by
    ``rename``, and a missing "date" column is ignored instead of raising.

    Args:
        df (pd.DataFrame): Frame with the original source column names.

    Returns:
        pd.DataFrame: New frame with renamed columns and without "date".
    """
    renamed = df.rename(columns=_COLUMN_RENAMES)
    # errors="ignore" keeps a second run from failing with KeyError once
    # "date" has already been removed.
    return renamed.drop(columns=["date"], errors="ignore")

In [0]:
# Read the storage account connection string from the environment so the secret
# never has to be pasted into (and saved with) the notebook. Falls back to the
# previous empty placeholder when the variable is not set.
connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING", "")

In [0]:
# Load the data file from the "Innovation_Days" folder in blob storage.
df_PV = get_data_from_blob_storage(
    folder_name="Innovation_Days",
    blob_name="data_inno_days.parquet",
    connection_string=connection_string,
)
# Rename the columns and drop the "date" column.
df_PV = prepare_data(df_PV)