In [3]:
import numpy as np
import pandas as pd
from loguru import logger
from pathlib import Path
import hjson
import os

from phd_visualizations.utils.units import unit_conversion
from phd_visualizations.utils import rename_signal_ids_to_var_ids

In [5]:
data_path = Path(f'{os.getenv("HOME")}/Nextcloud/Juanmi_MED_PSA/EURECAT/data')

filename_process_data = '20230505_solarMED.csv'
filename_process_data2 = '20231030_MED.csv'

paths = [data_path / filename_process_data, data_path / filename_process_data2]
sample_rate_key: str = '60s'



In [6]:
# Load varibles configuration

with open( Path("../data") / "variables_config.hjson") as f:
    vars_config = hjson.load(f)

In [16]:
def process_dataframe(df: pd.DataFrame) -> pd.DataFrame:

    df = df.copy()

    df.index = df.index.round('s')
    df = df.tz_localize('UTC')
    if 'time.1' in df.columns:
        df.drop(columns='time.1', inplace=True)

    # Select columns of 'object' data type
    object_columns = df.select_dtypes(include=['object'])

    # Print the column names
    logger.debug(object_columns.columns)  # %%

    if not object_columns.empty:
        logger.warning(f"Columns with object data type that will be deleted: {object_columns.columns}")
        
    # Delete columns with 'object' data type
    df = df.select_dtypes(exclude=['object'])

    return df

In [17]:
index_cols = ['time', 'TimeStamp']

if not isinstance(paths, list):
    paths = [paths]

for path in paths:
    if isinstance(path, str):
        path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Path {path} does not exist.")

# Read reference dataframe
df = None
for index_col in index_cols:
    try:
        df = pd.read_csv(paths[0], parse_dates=True, index_col=index_col)
        break
    except Exception as e:
        pass
        # logger.error(f'Error while reading data from {paths[0]} with index_col={index_col}: {e}')

if df is None:
    raise RuntimeError(f'Failed to read data from CSV file with any of the provided index columns: {index_cols}')

df = process_dataframe(df)


[32m2024-04-02 12:39:17.140[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mprocess_dataframe[0m:[36m14[0m - [34m[1mIndex(['Date', 'Time'], dtype='object')[0m


In [18]:
# Read additional dataframes and concatenate them
for idx, path in enumerate(paths[1:]):
    df_aux = None
    for index_col in index_cols:
        try:
            df_aux = pd.read_csv(path, parse_dates=True, index_col=index_col)
            df_aux.index.names = ['time']
            break
        except Exception as e:
            logger.error(f'Error while reading data from {path} with index_col={index_col}: {e}')

    if df_aux is None:
        logger.error(f'Failed to read data from CSV file with any of the provided index columns for the {idx+2} path: {index_cols}')

    df_aux = process_dataframe(df_aux)

    # Find the common columns in both dataframes and drop them from the second
    common_columns = df.columns.intersection(df_aux.columns)
    df_aux = df_aux.drop(columns=common_columns)
    df = pd.concat([df, df_aux], axis=1)

# Preprocessing
# Sample every `sample_rate` seconds to reduce the size of the dataframe
df = df.resample(sample_rate_key).mean()

# Rename columns from signal_id to var_id
df = rename_signal_ids_to_var_ids(df, vars_config)

# Convert units to model units
df = unit_conversion(df, vars_config, input_unit_key='units_scada', output_unit_key='units_model')

# Filter out nans until first value in Tts
logger.warning(f"Removing {df['Tts_h_t'].isna().sum()} NaNs from the dataframe")
df = df[df['Tts_h_t'].notna()]

[32m2024-04-02 12:39:23.706[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mprocess_dataframe[0m:[36m14[0m - [34m[1mIndex([], dtype='object')[0m
[32m2024-04-02 12:39:23.954[0m | [34m[1mDEBUG   [0m | [36mphd_visualizations.utils.units[0m:[36munit_conversion[0m:[36m552[0m - [34m[1mUpdated Tamb to C from C[0m
[32m2024-04-02 12:39:23.963[0m | [34m[1mDEBUG   [0m | [36mphd_visualizations.utils.units[0m:[36munit_conversion[0m:[36m552[0m - [34m[1mUpdated Tmed_s_out to C from C[0m
[32m2024-04-02 12:39:23.972[0m | [34m[1mDEBUG   [0m | [36mphd_visualizations.utils.units[0m:[36munit_conversion[0m:[36m552[0m - [34m[1mUpdated Tsf_in to C from C[0m
[32m2024-04-02 12:39:23.981[0m | [34m[1mDEBUG   [0m | [36mphd_visualizations.utils.units[0m:[36munit_conversion[0m:[36m552[0m - [34m[1mUpdated Tsf_out to C from C[0m
[32m2024-04-02 12:39:23.990[0m | [34m[1mDEBUG   [0m | [36mphd_visualizations.utils.units[0m:[36munit_conversion[0m:

# Compare columns in two dataframes

In [23]:

df1 = pd.read_csv(data_path / '20231030_solarMED.csv', parse_dates=True, index_col='TimeStamp')
df2 = pd.read_csv(data_path / '20230505_solarMED.csv', parse_dates=True, index_col='TimeStamp')

# Find columns that are in df1 but not in df2
columns_df1_not_df2 = df1.columns.difference(df2.columns)

logger.info(f"Columns in df1 but not in df2: {columns_df1_not_df2}")
logger.info(f"Columns 1: {df1.columns}")
logger.info(f"Columns 2: {df2.columns}")

[32m2024-04-02 13:04:30.275[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mColumns in df1 but not in df2: Index(['CT-DES-001', 'FT-DES-003', 'FT-DES-005', 'RE-SF-001', 'SW2TC1',
       'TE-DES-015', 'TT-AQU-107a'],
      dtype='object')[0m
[32m2024-04-02 13:04:30.276[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mColumns 1: Index(['FT-AQU-100', 'FT-AQU-101', 'TT-AQU-106', 'TT-AQU-107a', 'TT-AQU-109',
       'CT-DES-001', 'FT-DES-003', 'FT-DES-005', 'SW2TC1', 'TE-DES-015',
       'TT-DES-030', 'FT-SF-001', 'FT-SF-002', 'RE-SF-001', 'TT-SF-001',
       'TT-SF-002', 'TT-SF-003', 'TT-SF-004', 'TT-SF-005', 'TT-SF-006',
       'TT-SF-007', 'TT-SF-008', 'TT-SF-009', 'TT-SF-010'],
      dtype='object')[0m
[32m2024-04-02 13:04:30.277[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mColumns 2: Index(['Row', 'Date', 'Time', 'FT-AQU-100_1', 'FT-AQU-101_1', 'SC-AQU-P102_1',
       'TT-AQU-106_1', 'TT-AQU

In [24]:
df3 = pd.read_csv(Path('../data/calibration') / '20230707_20230710_datos_tanques.csv', parse_dates=True, index_col='TimeStamp')

logger.info(list(df3.columns))


[32m2024-04-02 13:06:33.066[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1m['TT-SF-004', 'TT-SF-005', 'TT-SF-006', 'TT-SF-001', 'TT-SF-002', 'TT-SF-003', 'TT-DES-030', 'FT-SF-001', 'FT-AQU-100', 'TT-SF-008', 'TT-AQU-109', 'FT-AQU-101', 'TT-AQU-106', 'TT-SF-007', 'ZT-AQU-TCV102'][0m


# Nueva traza solarMED 
añadiendo datos necesarios para calibración de cada componente

In [25]:
traza_original = [
    'FT-AQU-100', 'FT-AQU-101', 'TT-AQU-106', 'TT-AQU-107a', 'TT-AQU-109',
    'CT-DES-001', 'FT-DES-003', 'FT-DES-005', 'SW2TC1', 'TE-DES-015',
    'TT-DES-030', 'FT-SF-001', 'FT-SF-002', 'RE-SF-001', 'TT-SF-001',
    'TT-SF-002', 'TT-SF-003', 'TT-SF-004', 'TT-SF-005', 'TT-SF-006',
    'TT-SF-007', 'TT-SF-008', 'TT-SF-009', 'TT-SF-010'
]

additional_signals = [
    'TT-SF-018', 'TT-SF-019', 'FT-SF-005', 'TT-SF-024', 'TT-SF-025',
    'FT-SF-006', 'TT-SF-030', 'TT-SF-031', 'FT-SF-007', 'TT-SF-036',
    'TT-SF-037', 'FT-SF-008',  'FT-SF-002', 'TT-SF-009', 'TT-SF-010',
     'TT-DES-030', 'RE-SF-001', 'TT-SF-010', 'TT-SF-009', 'TT-SF-007',
    'TT-SF-008', 'TT-DES-030', 'FT-SF-002', 'FT-SF-001', 'TT-SF-010', 
    'TT-SF-009', 'TT-SF-007', 'TT-SF-008', 'TT-DES-030', 'FT-SF-002',
    'FT-SF-001', 'TT-SF-010',  'TT-SF-009', 'TT-SF-007', 'TT-SF-008',
    'TT-DES-030', 'FT-SF-002', 'FT-SF-001', 'UK-SF-P001-fq'
]

# Convert lists to sets for easier comparison
first_set = set(traza_original)
second_set = set(additional_signals)

# Find the elements in the second set that are not in the first set
difference = second_set - first_set

# Print the elements that are not present in the first list
for element in difference:
    print(element)

FT-SF-005
TT-SF-018
FT-SF-007
TT-SF-019
FT-SF-006
TT-SF-031
TT-SF-036
UK-SF-P001-fq
TT-SF-025
TT-SF-037
FT-SF-008
TT-SF-030
TT-SF-024
