# CSV 2 H5

Let's transform CSV sensor data to H5 format.

In [4]:
# Auxiliary class:
import datetime as dt
import numpy as np
import os
import tables as tb

class DataUtils:
    """Utilities to convert per-sensor CSV radiation data into one HDF5 file.

    Both ``transform_*`` methods write ``<base_folder>/<farm_name>/sensors_data.h5``
    (overwriting any existing file) with two groups:

    * ``/info`` -- arrays ``sensor_names``, ``sensor_latitudes`` and
      ``sensor_longitudes``.
    * ``/data`` -- one table per sensor, with the ``SensorData`` row layout.
    """

    class SensorData(tb.IsDescription):
        """Class to define the structure of the table in the HDF5 file."""

        timestamp = tb.Time64Col(pos=0)     # Value returned by datetime.timestamp()
        radiation = tb.Float32Col(pos=1)    # Radiation, W/m2

    def __init__(self):
        """Create the helper; it holds no state."""
        pass

    @staticmethod
    def _create_layout(h5_file, sensor_names: list[str], sensor_latitudes: list[float], sensor_longitudes: list[float]):
        """Create the ``/info`` and ``/data`` groups and return the ``/data`` group."""
        info_group = h5_file.create_group('/', 'info', 'Basic information')
        data_group = h5_file.create_group('/', 'data', 'Sensor data')
        # Arrays with the sensor names, latitudes and longitudes
        h5_file.create_array(info_group, 'sensor_names', sensor_names)
        h5_file.create_array(info_group, 'sensor_latitudes', np.array(sensor_latitudes))
        h5_file.create_array(info_group, 'sensor_longitudes', np.array(sensor_longitudes))
        return data_group

    @staticmethod
    def _append_csv_rows(table, lines, timestamp_format: str, stop_at_blank_line: bool = False) -> None:
        """Append one table row per CSV line (``timestamp,radiation,...``).

        Args:
            table: Table created with the ``SensorData`` description.
            lines: Iterable of text lines; only the first two comma-separated
                fields of each line are used.
            timestamp_format: ``strptime`` format of the first field.
            stop_at_blank_line: When true, stop at the first line equal to ``'\\n'``.

        Raises:
            ValueError: If a timestamp or a radiation value cannot be parsed.
            IndexError: If a line has fewer than two comma-separated fields.
        """
        row = table.row
        for line in lines:
            if stop_at_blank_line and line == '\n':
                break
            parts: list = line.split(',')
            # NOTE: the parsed datetime is naive, so .timestamp() interprets it
            # in the local timezone of the machine running the conversion.
            row['timestamp'] = dt.datetime.strptime(parts[0], timestamp_format).timestamp()
            row['radiation'] = float(parts[1])
            row.append()

    def transform_oahu_data_to_h5(self, base_folder: str, data_center_name: str, farm_name: str, sensor_names: list[str], sensor_latitudes: list[float], sensor_longitudes: list[float]) -> None:
        """Convert the Oahu CSV files of every sensor into ``sensors_data.h5``.

        For each sensor, every regular file found in
        ``<base_folder>/<data_center_name>/<farm_name>/<sensor_name>`` is read in
        sorted path order; the first line of each file is skipped as a header.

        Args:
            base_folder: Root folder of the input data.
            data_center_name: Folder under ``base_folder`` holding the farm data.
            farm_name: Farm folder name; also the output folder under ``base_folder``.
            sensor_names: Sensor folder names, also used as table names.
            sensor_latitudes: Latitudes stored in ``/info/sensor_latitudes``.
            sensor_longitudes: Longitudes stored in ``/info/sensor_longitudes``.
        """
        # The context manager closes the HDF5 file even if a CSV is missing or malformed.
        with tb.open_file(f'{base_folder}/{farm_name}/sensors_data.h5', 'w') as h5_file:
            data_group = self._create_layout(h5_file, sensor_names, sensor_latitudes, sensor_longitudes)
            for sensor_name in sensor_names:
                sensor_folder = f'{base_folder}/{data_center_name}/{farm_name}/{sensor_name}'
                sensor_file_paths: list = []
                for file_name in os.listdir(sensor_folder):
                    file_path: str = os.path.join(sensor_folder, file_name)
                    if os.path.isfile(file_path):
                        sensor_file_paths.append(file_path)
                        print(f'Storing file path {file_path} ...')
                sensor_file_paths.sort()
                # Now, we create the table for the sensor
                table = h5_file.create_table(data_group, sensor_name, DataUtils.SensorData, f'Sensor {sensor_name} data')
                # ... and add all the CSV data to the table
                for sensor_file_path in sensor_file_paths:
                    print(f'Processing file {sensor_file_path} ...')
                    with open(sensor_file_path, 'r') as reader:
                        reader.readline()  # Skip the header line
                        # The '-10:00' suffix is matched as literal text, not as a UTC offset.
                        self._append_csv_rows(table, reader, '%Y-%m-%d %H:%M:%S-10:00')
                table.flush()

    def transform_pvgis_data_to_h5(self, base_folder: str, farm_name: str, sensor_names: list[str], sensor_latitudes: list[float], sensor_longitudes: list[float]) -> None:
        """Convert the PVGIS CSV file of every sensor into ``sensors_data.h5``.

        Each sensor is read from ``<base_folder>/<farm_name>/<sensor_name>.csv``.
        The first 9 lines are skipped and reading stops at the first empty line.

        Args:
            base_folder: Root folder of the input data.
            farm_name: Farm folder name, holding both the CSV files and the output.
            sensor_names: CSV base names, also used as table names.
            sensor_latitudes: Latitudes stored in ``/info/sensor_latitudes``.
            sensor_longitudes: Longitudes stored in ``/info/sensor_longitudes``.
        """
        # The context manager closes the HDF5 file even if a CSV is missing or malformed.
        with tb.open_file(f'{base_folder}/{farm_name}/sensors_data.h5', 'w') as h5_file:
            data_group = self._create_layout(h5_file, sensor_names, sensor_latitudes, sensor_longitudes)
            for sensor_name in sensor_names:
                sensor_file_path = f'{base_folder}/{farm_name}/{sensor_name}.csv'
                # Now, we create the table for the sensor
                table = h5_file.create_table(data_group, sensor_name, DataUtils.SensorData, f'Sensor {sensor_name} data')
                # ... and add all the CSV data to the table
                print(f'Processing file {sensor_file_path} ...')
                with open(sensor_file_path, 'r') as reader:
                    # We jump the first 9 lines
                    for _ in range(9):
                        reader.readline()
                    self._append_csv_rows(table, reader, '%Y%m%d:%H%M', stop_at_blank_line=True)
                table.flush()

In [2]:
# Sensor names in the format sensor01, sensor02, ..., sensor18.
sensor_names = [f"sensor{index:02d}" for index in range(1, 19)]
# Three latitude values, each one paired with the same six longitudes below.
sensor_latitudes: list[float] = [36.875] * 6 + [36.874] * 6 + [36.873] * 6
sensor_longitudes: list[float] = [-2.594, -2.593, -2.592, -2.591, -2.590, -2.589] * 3
# Location of the input data and name of the farm to convert.
farm_name = 'Almeria'
base_folder = '../data/input'

Let's transform the data:

In [5]:
data_utils = DataUtils()
data_utils.transform_pvgis_data_to_h5(base_folder, farm_name, sensor_names, sensor_latitudes, sensor_longitudes)
# Sanity check: print the first rows (at most 10) of the sensor01 table.
# The context manager closes the file even if reading or printing fails.
with tb.open_file(f'{base_folder}/{farm_name}/sensors_data.h5', 'r') as h5_file:
    table = h5_file.root.data.sensor01
    # Slicing (instead of indexing 0..9) does not fail on tables with fewer than 10 rows.
    for record in table[:10]:
        print(dt.datetime.fromtimestamp(record['timestamp']).strftime('%Y-%m-%d %H:%M:%S'))
        print(record['radiation'])

Processing file ../data/input/Almeria/sensor01.csv ...
Processing file ../data/input/Almeria/sensor02.csv ...
Processing file ../data/input/Almeria/sensor03.csv ...
Processing file ../data/input/Almeria/sensor04.csv ...
Processing file ../data/input/Almeria/sensor05.csv ...
Processing file ../data/input/Almeria/sensor06.csv ...
Processing file ../data/input/Almeria/sensor07.csv ...
Processing file ../data/input/Almeria/sensor08.csv ...
Processing file ../data/input/Almeria/sensor09.csv ...
Processing file ../data/input/Almeria/sensor10.csv ...
Processing file ../data/input/Almeria/sensor11.csv ...
Processing file ../data/input/Almeria/sensor12.csv ...
Processing file ../data/input/Almeria/sensor13.csv ...
Processing file ../data/input/Almeria/sensor14.csv ...
Processing file ../data/input/Almeria/sensor15.csv ...
Processing file ../data/input/Almeria/sensor16.csv ...
Processing file ../data/input/Almeria/sensor17.csv ...
Processing file ../data/input/Almeria/sensor18.csv ...
2019-01-01