In [1]:
# Import all required libraries

import pandas as pd
import numpy as np
import os

In [2]:
# Column names presumably meant to be left out of the feature set
# (timestamp and identifier columns) — TODO confirm.
# NOTE(review): not referenced by any cell visible in this part of the notebook.
exclude_columns = ["time_stamp", "asset_id", "id"]


def load_datasets(farm):
    """Read the event metadata table of one wind farm.

    Parameters
    ----------
    farm : str
        Farm identifier; it is inserted into the ``Wind Farm {farm}``
        directory name below ``../data/care_to_compare`` (relative to the
        current working directory).

    Returns
    -------
    pandas.DataFrame
        Contents of that farm's semicolon-separated ``event_info.csv``.
    """
    csv_path = f"../data/care_to_compare/Wind Farm {farm}/event_info.csv"
    return pd.read_csv(csv_path, sep=';')


def load_df_and_annotate_anomalies(farm, event_id, exclude_suffixes=("_max", "_min", "_std")):
    """Load one event dataset, label its event window and clean it.

    The dataset ``datasets/{event_id}.csv`` and the farm's ``event_info.csv``
    are read from ``../data/care_to_compare/Wind Farm {farm}`` (relative to
    the current working directory, semicolon-separated).

    Parameters
    ----------
    farm : str
        Farm identifier used in the ``Wind Farm {farm}`` directory name.
    event_id : int
        Value matched against the ``event_id`` column of ``event_info.csv``;
        also the file stem of the dataset CSV.
    exclude_suffixes : iterable of str, optional
        Columns whose name ends with one of these suffixes are removed.
        Defaults to ``("_max", "_min", "_std")``.

    Returns
    -------
    pandas.DataFrame
        The dataset with an added integer ``label`` column (1 for rows whose
        ``id`` lies in ``[event_start_id, event_end_id]`` when the event's
        ``event_label`` is ``"anomaly"``, otherwise 0), without the excluded
        suffix columns, and without rows that contain NaN or +/-inf in any
        remaining column. The original row index is preserved.

    Raises
    ------
    IndexError
        If ``event_id`` has no row in ``event_info.csv``.
    """
    path = f"../data/care_to_compare/Wind Farm {farm}/datasets/{event_id}.csv"
    df = pd.read_csv(path, delimiter=';')

    event_info = pd.read_csv(f"../data/care_to_compare/Wind Farm {farm}/event_info.csv", delimiter=';')

    # Look up the metadata row for this event; when several rows match, the
    # first one is used.
    metadata = event_info[event_info['event_id'] == event_id]
    if metadata.empty:
        # IndexError is what the unguarded positional lookup raised before;
        # the type is kept and only the message is made actionable.
        raise IndexError(
            f"event_id {event_id!r} not found in event_info.csv of Wind Farm {farm}"
        )
    event = metadata.iloc[0]

    label_value = 1 if event["event_label"] == "anomaly" else 0

    # Rows inside the inclusive [event_start_id, event_end_id] window receive
    # the event's label; every other row stays 0.
    in_event = df['id'].between(event["event_start_id"], event["event_end_id"])
    df['label'] = 0
    df.loc[in_event, 'label'] = label_value

    # Remove the excluded suffix columns in a single pass.
    suffixes = tuple(exclude_suffixes)
    kept_columns = [col for col in df.columns if not col.endswith(suffixes)]

    # Non-inplace chain: the column selection yields a new frame, so in-place
    # replace/dropna on it would trigger SettingWithCopyWarning on pandas
    # versions without copy-on-write. Infinite values become NaN first so
    # that dropna removes those rows as well.
    return df[kept_columns].replace([np.inf, -np.inf], np.nan).dropna()

In [3]:
# Empty summary table; the summary cell further down appends one row to it
# via `stats.loc[len(stats)] = {...}`.
stats = pd.DataFrame(
    columns=["farm", "test_train", "datasets", "number_of_columns", "number_of_rows"])


In [5]:
# Summarise the training portion of one wind farm and record it in `stats`
# (columns: farm, test_train, datasets, number_of_columns, number_of_rows).


wind_farm = "A"
train_datasets = [25, 69, 13, 24, 3, 17, 38, 71, 14, 92, 51]

# Both counters are initialised up front so the summary row can still be
# written (with zeros) when `train_datasets` is empty, instead of failing
# with a NameError on `number_of_columns`.
number_of_columns = 0
number_of_rows = 0

for element in train_datasets:
    data = load_df_and_annotate_anomalies(wind_farm, element)

    # Keep only training rows whose status_type_id is 0 or 2.
    data = data[(data['train_test'] == 'train') & data['status_type_id'].isin([0, 2])]

    # These three columns are not counted as columns of the training data.
    data = data.drop(columns=['label', 'train_test', 'status_type_id'])

    # The column count reflects the most recently processed dataset.
    number_of_columns = len(data.columns)
    number_of_rows += len(data)

# Join them by comma
train_datasets_str = ', '.join(str(elem) for elem in train_datasets)

# Make the cell idempotent: drop a row left behind by an earlier run of this
# cell for the same farm/split, then renumber so that `len(stats)` is always
# an unused index label.
stats = stats[
    ~((stats["farm"] == wind_farm) & (stats["test_train"] == "train"))
].reset_index(drop=True)

stats.loc[len(stats)] = {
    "farm": wind_farm,
    "test_train": "train",
    "datasets": train_datasets_str,
    "number_of_columns": number_of_columns,
    "number_of_rows": number_of_rows
}


In [6]:
# Display the first rows of the summary table.
stats.head()

Unnamed: 0,farm,test_train,datasets,number_of_columns,number_of_rows
0,A,train,"25, 69, 13, 24, 3, 17, 38, 71, 14, 92, 51",57,430495


In [None]:
# Event dataset ids presumably used for the test-split summary — TODO confirm.
# NOTE(review): the last eleven ids repeat `train_datasets`; this looks
# intentional because rows are separated per file by the `train_test` column,
# but confirm.
test_datasets = [68, 22, 72, 73, 0, 26, 40, 42, 10, 45, 84, 25, 69, 13, 24, 3, 17, 38, 71, 14, 92, 51]
