In [0]:
import os
import sys
import pandas as pd
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [0]:
# Define the path one level up
parent_directory = os.path.join(os.getcwd(), '../../Workspace/Users/iaaph@energinet.dk/')
# Add this path to the sys.path list
sys.path.append(f"{parent_directory}")
from utils.Data_splitting import create_split_indices, split_data_with_indices

from performance_metrics.discriminative_score import calculate_discriminative_scores
from performance_metrics.discriminative_LSTM_score import build_and_train_lstm
from performance_metrics.evaluation_measures import calculate_metrics
from performance_metrics.pearson_corr import calculate_pearson_correlations
from performance_metrics.Prediction_models import CNN_GRU_regression_2

In [0]:
from utils.sequencer import RollingWindow
from utils.normalizer import Scaling
from utils.preprocessor import Preprocessor

from utils.Data_splitting import create_split_indices, split_data_with_indices

from sklearn.model_selection import train_test_split


# df_ = pd.read_csv(f"{parent_directory}/Data/wert_stl.csv")
df_ = pd.read_csv(f"{parent_directory}/Data/combined.csv")

attributes = ['Wert', 'temp'] #,'deseasonalized_wert','24seasonality','168seasonality','2920seasonality'] 
feature_n = len(attributes)
seq_length = 24*7
time_col = 'Zeitpunkt'

### Initialize ###
# Initialize the Scaling class
scaler = Scaling(value_cols=attributes)

# Initialize sequencer
sequencer = RollingWindow(seq_number=seq_length, time_col=time_col, value_cols=attributes)

# Initialize preprossing
preprocessor = Preprocessor(data=df_, normalizer=scaler, sequencer=sequencer)

processed_data = preprocessor.preprocess()

print('shape of real data: ', processed_data.shape)

real_model = CNN_GRU_regression_2(timesteps=167, features_per_timestep=5, units=32)

# Initialize the model
synth_model = CNN_GRU_regression_2(timesteps=167, features_per_timestep=5, units=32)

from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Create split indices based on the total number of sequences
total_sequences = processed_data.shape[0]
target_col_indices = [0]

train_indices, validation_indices, test_indices = create_split_indices(
  total_samples=total_sequences,
  test_size=0.10,
  validation_size=0.15
)

    # Split the data using the indices
X_real_train, X_real_validation, X_real_test, y_real_train, y_real_validation, y_real_test = split_data_with_indices(
  processed_data=processed_data,
  train_indices=train_indices,
  validation_indices=validation_indices,
  test_indices=test_indices,
  target_col_indices=target_col_indices
)

for n in range(2,4):

    # if n == 1:
    #     print('skipping due to no data')
    #     continue

    # Initialize a list to store each part
    parts = []
    for m in range(1,11):
        synth_file = f"{parent_directory}/results/results_classic_168_with_temp/synthetic_data_part_{m}_{n}.csv"
        part = np.loadtxt(synth_file, delimiter=',')
        parts.append(part)

        # Concatenate all parts back into a single array
        synth_flat = np.vstack(parts) 

    synthetic_data = synth_flat.reshape((18602, 168, 2))

    X_synth_train, X_synth_validation, X_synth_test, y_synth_train, y_synth_validation, y_synth_test = split_data_with_indices(
        processed_data=synthetic_data,
        train_indices=train_indices,
        validation_indices=validation_indices,
        test_indices=test_indices,
        target_col_indices=target_col_indices
    )

    # Define the path to the CSV file
    csv_file_path = f'{parent_directory}/results/results_model4_{n}.csv'
  
    for _ in range(5):

        discriminative_score = calculate_discriminative_scores(processed_data, synthetic_data, target_col_indices)
        print(discriminative_score)

        lstm_model, d_score = build_and_train_lstm(processed_data, synthetic_data)
        print(lstm_model)

        # Train the model on real
        train_epoch = 500

        history = real_model.fit(
            X_real_train, y_real_train,
            validation_data=(X_real_validation, y_real_validation),
            epochs=train_epoch,  # Adjust the number of epochs as needed
            batch_size=128,  # Adjust the batch size as needed
            callbacks=[early_stopping]
            )

        # Train the model
        synth_history = synth_model.fit(
            X_synth_train, y_synth_train,
            validation_data=(X_real_validation, y_real_validation),
            epochs=train_epoch,  # Adjust the number of epochs as needed
            batch_size=128,  # Adjust the batch size as needed
            callbacks=[early_stopping]
        )

        # Generate predictions for CNN_GRU_regression
        print('for real')
        r_pred = real_model.predict(X_real_test)
        print('for synth')
        s_pred = synth_model.predict(X_real_test)

        print('compared synthetic only wert')
        r2_scores_and_mae = calculate_metrics(y_real_test, r_pred, s_pred)
        print(r2_scores_and_mae)


        n = len(target_col_indices)
        # Keep original unpacking when certain variables have consistent lengths
        R2_Score_Real_1 = r2_scores_and_mae['R2 Score Real']
        R2_Score_Synth_1 = r2_scores_and_mae['R2 Score Synth']
        MAE_Real_1 = r2_scores_and_mae['MAE Real']
        MAE_Synth_1 = r2_scores_and_mae['MAE Synth']

        # Check if the CSV file exists
        if os.path.exists(csv_file_path):
            # CSV file exists, read it into a DataFrame
            print(f"CSV file '{csv_file_path}' already exists, reading data.")
            existing_df = pd.read_csv(csv_file_path)
            num_rows = len(existing_df)
            print(f"The DataFrame has {num_rows} rows.")
        else:
            # CSV file does not exist, create a new DataFrame
            print(f"CSV file '{csv_file_path}' does not exist, creating new DataFrame.")
            num_rows = 0
            existing_df = pd.DataFrame()

        # Define the data for the new row
        new_row_data = {'index': num_rows,
                        'Discriminate Score 1': d_score,
                        'R2 Score Real 1': R2_Score_Real_1,
                        'R2 Score Synthetic 1': R2_Score_Synth_1,
                        'MAE Real 1': MAE_Real_1,
                        'MAE Synthetic 1': MAE_Synth_1,
                        }

        # Append the new row to the existing DataFrame
        updated_df = pd.concat([existing_df, pd.DataFrame(new_row_data, index=[0])], ignore_index=True)

        # Write the updated DataFrame back to the CSV file
        updated_df.to_csv(csv_file_path, index=False)

        print("New row added to the CSV file.")

shape of real data:  (18602, 168, 5)
{0: 0.47787872271798726}
Epoch 1/20


Epoch 8/500
Epoch 9/500
Epoch 10/500
for real
for synth
compared synthetic only wert
Feature 0 contains zero or negative values, MSLE cannot be computed for this feature.
     Feature  R2 Score Real  R2 Score Synth  ...  MAE Synth  MSLE Real  MSLE Synth
0  Feature_0         0.8288        0.557164  ...   0.057761        NaN         NaN

[1 rows x 7 columns]
CSV file '/databricks/driver/../../Workspace/Users/iaaph@energinet.dk//results/results_model4_2.csv' already exists, reading data.
The DataFrame has 4 rows.
New row added to the CSV file.


In [0]:
target_col_indices = [0]

# df_ = pd.read_csv(f"{parent_directory}/Data/wert_stl.csv")
df_ = pd.read_csv(f"{parent_directory}/Data/combined.csv")

attributes = ['Wert', 'temp'] #,'deseasonalized_wert','24seasonality','168seasonality','2920seasonality'] 
feature_n = len(attributes)
seq_length = 24*7
time_col = 'Zeitpunkt'

### Initialize ###
# Initialize the Scaling class
scaler = Scaling(value_cols=attributes)

# Initialize sequencer
sequencer = RollingWindow(seq_number=seq_length, time_col=time_col, value_cols=attributes)

# Initialize preprossing
preprocessor = Preprocessor(data=df_, normalizer=scaler, sequencer=sequencer)

processed_data = preprocessor.preprocess()

total_sequences = processed_data.shape[0]

train_indices, validation_indices, test_indices = create_split_indices(
    total_samples=total_sequences,
    test_size=0.10,
    validation_size=0.15
    )

# Split the data using the indices
X_real_train, X_real_validation, X_real_test, y_real_train, y_real_validation, y_real_test = split_data_with_indices(              
    processed_data=processed_data,
    train_indices=train_indices,
    validation_indices=validation_indices,
    test_indices=test_indices,
    target_col_indices=target_col_indices
    )

real_model = CNN_GRU_regression_2(timesteps=167, features_per_timestep=2, units=32)

# Initialize the model
synth_model = CNN_GRU_regression_2(timesteps=167, features_per_timestep=2, units=32)

from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

for n in range(2,4):
    
    # Define the path to the CSV file
    #define path to results
    result_file_path = os.path.join(parent_directory, f'results')
    csv_file_path = os.path.join(result_file_path, f'results_model3_{n}.csv')
    print(csv_file_path)

    synth_file = f"results/results_classic_168_with_temp/synthetic_data_flattened_{n}_2.csv"
    synthetic_path = os.path.join(parent_directory, synth_file)

    synth_flat = np.genfromtxt(synthetic_path)
    
    synthetic_data = synth_flat.reshape((18602, 168, 2))

    X_synth_train, X_synth_validation, X_synth_test, y_synth_train, y_synth_validation, y_synth_test = split_data_with_indices(
        processed_data=synthetic_data,
        train_indices=train_indices,
        validation_indices=validation_indices,
        test_indices=test_indices,
        target_col_indices=target_col_indices
    )

    for _ in range(5):

        model, d_score = build_and_train_lstm(processed_data, synthetic_data)
        print(model)

        # Train the model on real
        train_epoch = 500

        history = real_model.fit(
            X_real_train, y_real_train,
            validation_data=(X_real_validation, y_real_validation),
            epochs=train_epoch,  # Adjust the number of epochs as needed
            batch_size=128,  # Adjust the batch size as needed
            callbacks=[early_stopping]
            )

        # Train the model
        synth_history = synth_model.fit(
            X_synth_train, y_synth_train,
            validation_data=(X_real_validation, y_real_validation),
            epochs=train_epoch,  # Adjust the number of epochs as needed
            batch_size=128,  # Adjust the batch size as needed
            callbacks=[early_stopping]
        )

        # Generate predictions for CNN_GRU_regression
        print('for real')
        r_pred = real_model.predict(X_real_test)
        print('for synth')
        s_pred = synth_model.predict(X_real_test)

        print('compared synthetic only wert')
        r2_scores_and_mae = calculate_metrics(y_real_test, r_pred, s_pred)
        print(r2_scores_and_mae)

        n = len(target_col_indices)
        # Keep original unpacking when certain variables have consistent lengths
        R2_Score_Real_1 = r2_scores_and_mae['R2 Score Real']
        R2_Score_Synth_1 = r2_scores_and_mae['R2 Score Synth']
        MAE_Real_1 = r2_scores_and_mae['MAE Real']
        MAE_Synth_1 = r2_scores_and_mae['MAE Synth']

        # Check if the CSV file exists
        if os.path.exists(csv_file_path):
            # CSV file exists, read it into a DataFrame
            print(f"CSV file '{csv_file_path}' already exists, reading data.")
            existing_df = pd.read_csv(csv_file_path)
            num_rows = len(existing_df)
            print(f"The DataFrame has {num_rows} rows.")
        else:
            # CSV file does not exist, create a new DataFrame
            print(f"CSV file '{csv_file_path}' does not exist, creating new DataFrame.")
            num_rows = 0
            existing_df = pd.DataFrame()

        # Define the data for the new row
        new_row_data = {'index': num_rows,
                        'Discriminate Score 1': d_score,
                        'R2 Score Real 1': R2_Score_Real_1,
                        'R2 Score Synthetic 1': R2_Score_Synth_1,
                        'MAE Real 1': MAE_Real_1,
                        'MAE Synthetic 1': MAE_Synth_1,
                        }

        # Append the new row to the existing DataFrame
        updated_df = pd.concat([existing_df, pd.DataFrame(new_row_data, index=[0])], ignore_index=True)

        # Write the updated DataFrame back to the CSV file
        updated_df.to_csv(csv_file_path, index=False)

        print("New row added to the CSV file.")

/databricks/driver/../../Workspace/Users/iaaph@energinet.dk/results/results_model3_2.csv
Epoch 1/20


Epoch 4/500
Epoch 5/500
Epoch 6/500
for real
for synth
compared synthetic only wert
Feature 0 contains zero or negative values, MSLE cannot be computed for this feature.
     Feature  R2 Score Real  R2 Score Synth  ...  MAE Synth  MSLE Real  MSLE Synth
0  Feature_0        0.84703        0.594675  ...   0.052509        NaN         NaN

[1 rows x 7 columns]
CSV file '/databricks/driver/../../Workspace/Users/iaaph@energinet.dk/results/results_model3_3.csv' already exists, reading data.
The DataFrame has 4 rows.
New row added to the CSV file.


In [0]:
from utils.sequencer import RollingWindow
from utils.normalizer import Scaling
from utils.preprocessor import Preprocessor
from utils.Data_splitting import create_split_indices, split_data_with_indices

from databricks.feature_store import FeatureStoreClient
# Initialize the feature store client
fs = FeatureStoreClient()

# We can then load the feature tabel with the following command
spark_df2 = fs.read_table("stl_wert")
spark_df2

# Assuming you have a Spark DataFrame called spark_df
df_ = spark_df2.toPandas()

target_col_indices = [0]

attributes = ['Wert'] # ['deseasonalized_wert', '24seasonality',  '168seasonality', '2920seasonality']
feature_n = len(attributes)
seq_length = 24
time_col = 'Zeitpunkt'

### Initialize ###
# Initialize the Scaling class
scaler = Scaling(value_cols=attributes)

# Initialize sequencer
sequencer = RollingWindow(seq_number=seq_length, time_col=time_col, value_cols=attributes)

# Initialize preprossing
preprocessor = Preprocessor(data=df_, normalizer=scaler, sequencer=sequencer)

processed_data = preprocessor.preprocess()

total_sequences = processed_data.shape[0]

train_indices, validation_indices, test_indices = create_split_indices(
    total_samples=total_sequences,
    test_size=0.10,
    validation_size=0.15
    )

# Split the data using the indices
X_real_train, X_real_validation, X_real_test, y_real_train, y_real_validation, y_real_test = split_data_with_indices(              
    processed_data=processed_data,
    train_indices=train_indices,
    validation_indices=validation_indices,
    test_indices=test_indices,
    target_col_indices=target_col_indices
    )

real_model = CNN_GRU_regression_2(timesteps=23, features_per_timestep=1, units=32)

# Initialize the model
synth_model = CNN_GRU_regression_2(timesteps=23, features_per_timestep=1, units=32)

from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

for n in range(12):

    if n < 3:
        print("Skipping due to missing data")
        continue 
    
    # Define the path to the CSV file
    #define path to results
    result_file_path = os.path.join(parent_directory, f'results')
    csv_file_path = os.path.join(result_file_path, f'results_model1_{n}.csv')
    print(csv_file_path)

    synth_file = f"results/results_classic_24/synthetic_data_flattened_{n}.csv"
    synthetic_path = os.path.join(parent_directory, synth_file)

    synth_flat = np.genfromtxt(synthetic_path)
    
    synthetic_data = synth_flat.reshape((18746, 24, 1))

    X_synth_train, X_synth_validation, X_synth_test, y_synth_train, y_synth_validation, y_synth_test = split_data_with_indices(
        processed_data=synthetic_data,
        train_indices=train_indices,
        validation_indices=validation_indices,
        test_indices=test_indices,
        target_col_indices=target_col_indices
    )

    for _ in range(5):

        model, d_score = build_and_train_lstm(processed_data, synthetic_data)
        print(model)

        # Train the model on real
        train_epoch = 500

        history = real_model.fit(
            X_real_train, y_real_train,
            validation_data=(X_real_validation, y_real_validation),
            epochs=train_epoch,  # Adjust the number of epochs as needed
            batch_size=128,  # Adjust the batch size as needed
            callbacks=[early_stopping]
            )

        # Train the model
        synth_history = synth_model.fit(
            X_synth_train, y_synth_train,
            validation_data=(X_real_validation, y_real_validation),
            epochs=train_epoch,  # Adjust the number of epochs as needed
            batch_size=128,  # Adjust the batch size as needed
            callbacks=[early_stopping]
        )

        # Generate predictions for CNN_GRU_regression
        print('for real')
        r_pred = real_model.predict(X_real_test)
        print('for synth')
        s_pred = synth_model.predict(X_real_test)

        print('compared synthetic only wert')
        r2_scores_and_mae = calculate_metrics(y_real_test, r_pred, s_pred)
        print(r2_scores_and_mae)

        n = len(target_col_indices)
        # Keep original unpacking when certain variables have consistent lengths
        R2_Score_Real_1 = r2_scores_and_mae['R2 Score Real']
        R2_Score_Synth_1 = r2_scores_and_mae['R2 Score Synth']
        MAE_Real_1 = r2_scores_and_mae['MAE Real']
        MAE_Synth_1 = r2_scores_and_mae['MAE Synth']

        # Check if the CSV file exists
        if os.path.exists(csv_file_path):
            # CSV file exists, read it into a DataFrame
            print(f"CSV file '{csv_file_path}' already exists, reading data.")
            existing_df = pd.read_csv(csv_file_path)
            num_rows = len(existing_df)
            print(f"The DataFrame has {num_rows} rows.")
        else:
            # CSV file does not exist, create a new DataFrame
            print(f"CSV file '{csv_file_path}' does not exist, creating new DataFrame.")
            num_rows = 0
            existing_df = pd.DataFrame()

        # Define the data for the new row
        new_row_data = {'index': num_rows,
                        'Discriminate Score 1': d_score,
                        'R2 Score Real 1': R2_Score_Real_1,
                        'R2 Score Synthetic 1': R2_Score_Synth_1,
                        'MAE Real 1': MAE_Real_1,
                        'MAE Synthetic 1': MAE_Synth_1,
                        }

        # Append the new row to the existing DataFrame
        updated_df = pd.concat([existing_df, pd.DataFrame(new_row_data, index=[0])], ignore_index=True)

        # Write the updated DataFrame back to the CSV file
        updated_df.to_csv(csv_file_path, index=False)

        print("New row added to the CSV file.")

Skipping due to missing data
Skipping due to missing data
Skipping due to missing data
/databricks/driver/../../Workspace/Users/iaaph@energinet.dk/results/results_model1_3.csv
Epoch 1/20
Epoch 2/20


Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
for real
for synth
compared synthetic only wert
Feature 0 contains zero or negative values, MSLE cannot be computed for this feature.
     Feature  R2 Score Real  R2 Score Synth  ...  MAE Synth  MSLE Real  MSLE Synth
0  Feature_0       0.147212        0.078203  ...   0.089391        NaN         NaN

[1 rows x 7 columns]
CSV file '/databricks/driver/../../Workspace/Users/iaaph@energinet.dk/results/results_model1_11.csv' already exists, reading data.
The DataFrame has 4 rows.
New row added to the CSV file.
