In [1]:
import json
import os
import pandas as pd
import requests

In [2]:
directory = "../data"

if not os.path.exists(directory):
    os.makedirs(directory)

# Define file URLs
file_links = {
    "SCADA_Data.csv": "https://zenodo.org/records/8192149/files/Aventa_AV7_IET_OST_SCADA.csv?download=1",
    "WT_Metadata.json": "https://zenodo.org/records/8192149/files/Aventa_AV_7_IET_OST_WT_metadata.json?download=1",
    "SCADA_Channels_Metadata.csv": "https://zenodo.org/records/8192149/files/SCADA_Channels_Metadata.csv?download=1"
}

# Download and save each file
for filename, url in file_links.items():
    response = requests.get(url)
    if response.status_code == 200:
        filepath = os.path.join(directory, filename)
        with open(filepath, "wb") as file:
            file.write(response.content)
        print(f"{filename} downloaded successfully!")
    else:
        print(f"Failed to download {filename}. Status Code: {response.status_code}")


SCADA_Data.csv downloaded successfully!
WT_Metadata.json downloaded successfully!
SCADA_Channels_Metadata.csv downloaded successfully!


In [3]:
# Load SCADA Time Series Data
scada_df = pd.read_csv("../data/SCADA_Data.csv")
print("SCADA Data Overview:")
print(scada_df.info())
print(scada_df.head())


SCADA Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39715978 entries, 0 to 39715977
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Datetime              object 
 1   RotorSpeed            float64
 2   GeneratorSpeed        float64
 3   GeneratorTemperature  float64
 4   WindSpeed             float64
 5   PowerOutput           float64
 6   SpeiseSpannung        float64
 7   StatusAnlage          float64
 8   MaxWindHeute          float64
 9   offsetWindDirection   float64
 10  PitchDeg              float64
dtypes: float64(10), object(1)
memory usage: 3.3+ GB
None
                  Datetime  RotorSpeed  GeneratorSpeed  GeneratorTemperature  \
0  2021-12-31 23:00:00.647         0.0             0.0                  11.6   
1  2021-12-31 23:00:01.647         0.0             0.0                  11.6   
2  2021-12-31 23:00:02.647         0.0             0.0                  11.6   
3  2021-12-31 23:00:03.650     

In [4]:
# Load SCADA Channels Metadata
scada_meta_df = pd.read_csv("../data/SCADA_Channels_Metadata.csv")
print("\nSCADA Channels Metadata Overview:")
print(scada_meta_df.info())
print(scada_meta_df.head())



SCADA Channels Metadata Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 11 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Internal Name                             11 non-null     object 
 1   IEC 61400-25 guideline name               11 non-null     object 
 2   ENTR Alliance name                        8 non-null      object 
 3   Description                               11 non-null     object 
 4   Notes                                     11 non-null     object 
 5   Reliable Measurement                      10 non-null     object 
 6   Units same as ENTR                        10 non-null     object 
 7   Unit (if different from ENTR)             2 non-null      object 
 8   RAW Signal (Bool)                         10 non-null     object 
 9   Unit conversion constant (if RAW signal)  0 non-null      float64
 10  Frequ

In [5]:
# Load Wind Turbine Metadata (JSON)
with open("../data/WT_Metadata.json", "r") as file:
    wt_metadata = json.load(file)

print("\nWind Turbine Metadata:")
print(json.dumps(wt_metadata, indent=4))  # Pretty-print JSON data



Wind Turbine Metadata:
{
    "name": "IET-OST Aventa Research Wind Turbibe",
    "comments": "Aventa AV-7 Turbine model owned and operated by IET OST",
    "assembly": {
        "drivetrain": "belt_drive",
        "rotor_orientation": "Upwind",
        "number_of_blades": 3,
        "hub_height": 18.0,
        "rotor_diameter": 12.9,
        "rated_power": 6200.0
    },
    "components": {
        "blade": {
            "system_mass": 117.0
        },
        "hub": {
            "elastic_properties_mb": {
                "system_mass": 119,
                "system_inertia": "None"
            }
        },
        "nacelle": {
            "drivetrain": {
                "gear_ratio": 12.0
            },
            "generator": {
                "generator_type": "PMSG"
            }
        },
        "tower": {
            "system_mass": 14600
        },
        "foundation": {
            "height": 0
        }
    },
    "materials": [
        {
            "name": "concrete",
    

In [6]:
# Check for missing values
print(scada_df.isnull().sum())


Datetime                0
RotorSpeed              0
GeneratorSpeed          0
GeneratorTemperature    0
WindSpeed               1
PowerOutput             1
SpeiseSpannung          0
StatusAnlage            0
MaxWindHeute            2
offsetWindDirection     0
PitchDeg                1
dtype: int64


In [7]:
# Check for duplicate rows
print(f"Duplicate rows: {scada_df.duplicated().sum()}")


Duplicate rows: 0


In [8]:
# Check for unrealistic values
print(scada_df.describe())


         RotorSpeed  GeneratorSpeed  GeneratorTemperature     WindSpeed  \
count  3.971598e+07    3.971598e+07          3.971598e+07  3.971598e+07   
mean   2.067616e+01    2.444918e+02          2.871725e+01  2.457526e+00   
std    2.260638e+01    2.773169e+02          1.532112e+01  2.324517e+00   
min    0.000000e+00    0.000000e+00         -9.510000e+01  0.000000e+00   
25%    0.000000e+00    0.000000e+00          1.740000e+01  1.000000e+00   
50%    1.060000e+01    0.000000e+00          2.680000e+01  1.900000e+00   
75%    3.790000e+01    4.700000e+02          3.820000e+01  3.200000e+00   
max    3.040000e+02    8.020000e+02          1.073000e+02  4.100000e+01   

        PowerOutput  SpeiseSpannung  StatusAnlage  MaxWindHeute  \
count  3.971598e+07    3.971598e+07  3.971598e+07  3.971598e+07   
mean   1.051422e+00    2.780072e+01  1.001604e+01  7.265652e+00   
std    1.957080e+00    2.477404e-02  4.129103e+01  3.019539e+01   
min   -2.200000e-01    0.000000e+00  0.000000e+00  0.000

In [9]:
# Check for gaps in timestamps
scada_df["Datetime"] = pd.to_datetime(scada_df["Datetime"])
scada_df = scada_df.sort_values(by="Datetime")
scada_df["Time_Gap"] = scada_df["Datetime"].diff().dt.total_seconds()
print(scada_df[["Datetime", "Time_Gap"]].describe())


                            Datetime      Time_Gap
count                       39715978  3.971598e+07
mean   2022-10-09 10:25:25.027151360  1.231212e+00
min       2021-12-31 23:00:00.647000  2.001000e-03
25%    2022-05-03 04:27:30.703500032  9.980000e-01
50%    2022-10-14 19:15:05.150000128  1.002000e+00
75%    2023-03-22 07:45:13.121999872  1.005000e+00
max       2023-07-20 21:59:59.577999  2.526915e+06
std                              NaN  4.314126e+02


In [10]:
# Convert Datetime to proper format
scada_df["Datetime"] = pd.to_datetime(scada_df["Datetime"])

# Step 1: Handle Missing Values (Interpolation)
scada_df.interpolate(method="linear", inplace=True)

# Step 2: Clip/Filter Unrealistic Values
# Define realistic thresholds based on known physical constraints
scada_df["GeneratorTemperature"] = scada_df["GeneratorTemperature"].clip(lower=-30, upper=100)  # Fix sensor errors
scada_df["WindSpeed"] = scada_df["WindSpeed"].clip(lower=0, upper=60)  # Wind speeds above 60m/s are rare
scada_df["PowerOutput"] = scada_df["PowerOutput"].clip(lower=0)  # Negative power is unrealistic
scada_df["MaxWindHeute"] = scada_df["MaxWindHeute"].clip(lower=0, upper=100)  # Remove extreme values

# Step 3: Check for Time Gaps and Resample
scada_df = scada_df.sort_values(by="Datetime")
scada_df.set_index("Datetime", inplace=True)

# Resample to 1-second intervals and interpolate missing time points
scada_df = scada_df.resample("1s").interpolate()


In [11]:
# Check for missing values
print(scada_df.isnull().sum())


RotorSpeed              340
GeneratorSpeed          340
GeneratorTemperature    340
WindSpeed               340
PowerOutput             340
SpeiseSpannung          340
StatusAnlage            340
MaxWindHeute            340
offsetWindDirection     340
PitchDeg                340
Time_Gap                340
dtype: int64


In [12]:
# Get indices where any column has null values
null_dates = scada_df[scada_df.isnull().any(axis=1)].index

if not null_dates.empty:
    # Convert Index to Series
    null_dates_series = pd.Series(null_dates)

    # Detect gaps in consecutive dates
    gaps = null_dates_series.diff().gt(pd.Timedelta(days=1))

    # Assign group numbers to consecutive periods
    groups = gaps.cumsum()

    # Group by the detected groups and find min/max for each period
    date_ranges = null_dates_series.groupby(groups).agg(["min", "max"])

    print("Null value date ranges:")
    print(date_ranges)
else:
    print("No null values found.")


Null value date ranges:
                         min                 max
Datetime                                        
0        2021-12-31 23:00:00 2021-12-31 23:05:39


In [13]:
# Filter out missing values
scada_df = scada_df.dropna()

# Save the cleaned dataset
scada_df.to_parquet("../data/scada_df.parquet")

In [14]:
# Compute correlation matrix
scada_df.corr()


Unnamed: 0,RotorSpeed,GeneratorSpeed,GeneratorTemperature,WindSpeed,PowerOutput,SpeiseSpannung,StatusAnlage,MaxWindHeute,offsetWindDirection,PitchDeg,Time_Gap
RotorSpeed,1.0,0.995752,0.743425,0.601629,0.875971,-0.008064,0.024035,0.417882,0.037283,-0.629321,0.205437
GeneratorSpeed,0.995752,1.0,0.74172,0.551851,0.859592,-0.006947,0.010861,0.383183,0.03537,-0.667786,0.205549
GeneratorTemperature,0.743425,0.74172,1.0,0.521667,0.722567,-0.045568,-0.019094,0.468056,0.025447,-0.497436,0.061478
WindSpeed,0.601629,0.551851,0.521667,1.0,0.6461,-0.027396,0.166309,0.68529,0.000986,0.018895,0.121791
PowerOutput,0.875971,0.859592,0.722567,0.6461,1.0,-0.013826,0.011095,0.451572,0.021784,-0.390635,0.227337
SpeiseSpannung,-0.008064,-0.006947,-0.045568,-0.027396,-0.013826,1.0,-0.003266,-0.030997,-0.002038,-0.01868,0.001607
StatusAnlage,0.024035,0.010861,-0.019094,0.166309,0.011095,-0.003266,1.0,0.202803,0.007587,0.228166,0.255982
MaxWindHeute,0.417882,0.383183,0.468056,0.68529,0.451572,-0.030997,0.202803,1.0,0.020806,0.050895,0.143493
offsetWindDirection,0.037283,0.03537,0.025447,0.000986,0.021784,-0.002038,0.007587,0.020806,1.0,0.039583,0.010487
PitchDeg,-0.629321,-0.667786,-0.497436,0.018895,-0.390635,-0.01868,0.228166,0.050895,0.039583,1.0,-0.122595


In [15]:
# Check for unrealistic values
print(scada_df.describe())


         RotorSpeed  GeneratorSpeed  GeneratorTemperature     WindSpeed  \
count  4.889846e+07    4.889846e+07          4.889846e+07  4.889846e+07   
mean   2.108908e+01    2.503695e+02          2.921964e+01  2.524658e+00   
std    2.194295e+01    2.680847e+02          1.503987e+01  2.155790e+00   
min    0.000000e+00    0.000000e+00         -5.000000e-01  0.000000e+00   
25%    0.000000e+00    0.000000e+00          1.797773e+01  1.076549e+00   
50%    1.445758e+01    1.673869e+02          2.843609e+01  2.052662e+00   
75%    3.790036e+01    4.685992e+02          3.830000e+01  3.351751e+00   
max    6.840000e+01    7.910000e+02          8.240000e+01  2.630000e+01   

        PowerOutput  SpeiseSpannung  StatusAnlage  MaxWindHeute  \
count  4.889846e+07    4.889846e+07  4.889846e+07  4.889846e+07   
mean   1.116589e+00    2.780074e+01  1.008520e+01  7.435837e+00   
std    1.893313e+00    1.819512e-02  3.006374e+00  5.002596e+00   
min    0.000000e+00    2.410000e+01  0.000000e+00  0.000