# CSV 2 H5

Let's transform CSV sensor data to H5 format.

In [1]:
# Auxiliary class:
import datetime as dt
import numpy as np
import os
import tables as tb

class DataUtils:
    """Utilities that convert raw CSV sensor data into HDF5 files (PyTables).

    Every generated ``sensors_data.h5`` file has the same layout:

    * ``/info/sensor_names``, ``/info/sensor_latitudes`` and
      ``/info/sensor_longitudes``: one array each.
    * ``/data/<sensor_name>``: one table per sensor, with the columns declared
      in :class:`DataUtils.SensorData`.

    NOTE: timestamps are parsed as naive ``datetime`` objects and converted
    with ``datetime.timestamp()``, which interprets naive values in the local
    time zone of the machine running the conversion. Read them back with
    ``datetime.fromtimestamp()`` on a machine using the same time zone.
    """

    class SensorData(tb.IsDescription):
        """Row layout of each per-sensor table in the HDF5 file."""

        timestamp = tb.Time64Col(pos=0)     # Value returned by datetime.timestamp()
        radiation = tb.Float32Col(pos=1)    # Radiation, W/m2

    def __init__(self):
        # Oahu sensor metadata.
        self.oahu_sensor_names = ["ap1","ap3","ap4","ap5","ap6","ap7","dh1","dh2","dh3","dh4","dh5","dh6","dh7","dh8","dh9","dh10","dh11"]
        self.oahu_sensor_latitudes = [21.31276, 21.31281, 21.31141, 21.30983, 21.30812, 21.31478, 21.31533, 21.31451, 21.31236, 21.31303, 21.31357, 21.31179, 21.31418, 21.31034, 21.31268, 21.31183, 21.31042]
        self.oahu_sensor_longitudes = [-158.08389, -158.08163, -158.07947, -158.08249, -158.07935, -158.07785, -158.087, -158.08534, -158.08463, -158.08505, -158.08424, -158.08678, -158.08685, -158.08675, -158.08688, -158.08554, -158.0853]
        # Sensor name -> 0-based index of the comma-separated field that holds
        # that sensor's reading in each line of an Oahu file.
        self.oahu_sensor_fields = {"ap1": 15, "ap3": 16, "ap4": 18, "ap5": 17, "ap6": 13, "ap7": 19, 
                                   "dh1": 11, "dh2": 10, "dh3": 4, "dh4": 5, "dh5": 6, "dh6": 20, "dh7": 21, 
                                   "dh8": 22, "dh9": 9, "dh10": 7, "dh11": 8}

    @staticmethod
    def _create_layout(h5_file, sensor_names, sensor_latitudes, sensor_longitudes):
        """Create the ``/info`` and ``/data`` groups, fill ``/info`` and return the ``/data`` group."""
        info_group = h5_file.create_group('/', 'info', 'Basic information')
        data_group = h5_file.create_group('/', 'data', 'Sensor data')
        h5_file.create_array(info_group, 'sensor_names', sensor_names)
        h5_file.create_array(info_group, 'sensor_latitudes', np.array(sensor_latitudes))
        h5_file.create_array(info_group, 'sensor_longitudes', np.array(sensor_longitudes))
        return data_group

    def transform_oahu_data_to_h5(self, input_folder: str, output_folder: str):
        """Convert every file of ``input_folder`` into ``{output_folder}/sensors_data.h5``.

        Files are processed in sorted path order. Each non-blank line is split
        on commas: the first four fields are parsed as second, year, day of
        year and hour+minute, and the reading of each sensor is taken from the
        field given by ``self.oahu_sensor_fields``.

        Args:
            input_folder: Folder whose regular files are all read as input.
            output_folder: Existing folder where ``sensors_data.h5`` is written
                (an existing file is overwritten).

        Raises:
            IndexError, ValueError: If a non-blank line does not have the
                expected fields. Input and output files are closed in that case.
        """
        # List the inputs before touching the output, so that an invalid input
        # folder does not truncate an existing HDF5 file.
        file_paths: list = []
        for file_name in os.listdir(input_folder):
            file_path: str = os.path.join(input_folder, file_name)
            if os.path.isfile(file_path):
                file_paths.append(file_path)
                print(f'Storing file path {file_path} ...')
        file_paths.sort()

        # The context manager closes the HDF5 file even if parsing fails.
        with tb.open_file(f'{output_folder}/sensors_data.h5', 'w') as h5_file:
            data_group = self._create_layout(h5_file, self.oahu_sensor_names,
                                             self.oahu_sensor_latitudes,
                                             self.oahu_sensor_longitudes)
            # One table per sensor, plus its row buffer used to append records.
            sensor_tables = {}
            sensor_rows = {}
            for sensor_name in self.oahu_sensor_names:
                sensor_tables[sensor_name] = h5_file.create_table(data_group, sensor_name, DataUtils.SensorData, f'Sensor {sensor_name} data')
                sensor_rows[sensor_name] = sensor_tables[sensor_name].row

            for file_path in file_paths:
                print(f'Processing file {file_path} ...')
                with open(file_path, 'r') as reader:
                    counter = 0
                    for line in reader:
                        # Blank lines (e.g. a trailing newline) carry no data.
                        if not line.strip():
                            continue
                        parts: list = line.split(',')
                        # Timestamp fields: second,year,day_of_year,hour+minute
                        date_string = f'{parts[0]},{parts[1]},{parts[2]},{parts[3]}'
                        timestamp = dt.datetime.strptime(date_string, '%S,%Y,%j,%H%M')
                        # Echo only the first 20 timestamps of each file as a sanity check.
                        if counter < 20:
                            print(f'Processing timestamp {timestamp}...')
                            counter += 1
                        timestamp = timestamp.timestamp()
                        for sensor_name in self.oahu_sensor_names:
                            sensor_data = sensor_rows[sensor_name]
                            sensor_data['timestamp'] = timestamp
                            sensor_data['radiation'] = float(parts[self.oahu_sensor_fields[sensor_name]])
                            sensor_data.append()
            for sensor_name in self.oahu_sensor_names:
                sensor_tables[sensor_name].flush()

    def transform_pvgis_data_to_h5(self, base_folder: str, farm_name: str, sensor_names: list[str], sensor_latitudes: list[float], sensor_longitudes: list[float]):
        """Convert ``{base_folder}/{farm_name}/<sensor>.csv`` files into one HDF5 file.

        The output is ``{base_folder}/{farm_name}/sensors_data.h5`` (an
        existing file is overwritten). For each CSV file the first 9 lines are
        skipped, then lines are read until the first blank line. The first
        field of each line is parsed with the format ``%Y%m%d:%H%M`` and the
        second field is stored as the radiation value.

        Args:
            base_folder: Folder that contains the farm folder.
            farm_name: Name of the folder holding the CSV files.
            sensor_names: Sensor names; each must have a ``<name>.csv`` file.
            sensor_latitudes: Latitudes stored in ``/info/sensor_latitudes``.
            sensor_longitudes: Longitudes stored in ``/info/sensor_longitudes``.

        Raises:
            OSError: If a sensor CSV file cannot be opened.
            IndexError, ValueError: If a data line does not have the expected
                fields. Input and output files are closed in that case.
        """
        # The context manager closes the HDF5 file even if parsing fails.
        with tb.open_file(f'{base_folder}/{farm_name}/sensors_data.h5', 'w') as h5_file:
            data_group = self._create_layout(h5_file, sensor_names, sensor_latitudes, sensor_longitudes)
            for sensor_name in sensor_names:
                sensor_file_path = f'{base_folder}/{farm_name}/{sensor_name}.csv'
                table = h5_file.create_table(data_group, sensor_name, DataUtils.SensorData, f'Sensor {sensor_name} data')
                sensor_data = table.row
                print(f'Processing file {sensor_file_path} ...')
                with open(sensor_file_path, 'r') as reader:
                    # Skip the first 9 lines (presumably the PVGIS header - confirm
                    # against the export format).
                    for _ in range(9):
                        reader.readline()
                    for line in reader:
                        # The data section ends at the first blank line;
                        # whatever follows is not parsed.
                        if not line.strip():
                            break
                        parts: list = line.split(',')
                        sensor_data['timestamp'] = dt.datetime.strptime(parts[0], '%Y%m%d:%H%M').timestamp()
                        sensor_data['radiation'] = float(parts[1])
                        sensor_data.append()
                table.flush()

## Oahu to H5

In [2]:
# Convert every file of the 201006 folder into a single sensors_data.h5 file,
# written to the parent folder.
oahu_input_folder = '/home/jlrisco/Descargas/201006'
oahu_output_folder = '/home/jlrisco/Descargas'
data_utils = DataUtils()
data_utils.transform_oahu_data_to_h5(input_folder=oahu_input_folder, output_folder=oahu_output_folder)

Storing file path /home/jlrisco/Descargas/201006/20100619.txt ...
Storing file path /home/jlrisco/Descargas/201006/20100624.txt ...
Storing file path /home/jlrisco/Descargas/201006/20100622.txt ...
Storing file path /home/jlrisco/Descargas/201006/20100626.txt ...
Storing file path /home/jlrisco/Descargas/201006/20100602.txt ...
Storing file path /home/jlrisco/Descargas/201006/20100604.txt ...
Storing file path /home/jlrisco/Descargas/201006/20100628.txt ...
Storing file path /home/jlrisco/Descargas/201006/20100620.txt ...
Storing file path /home/jlrisco/Descargas/201006/20100608.txt ...
Storing file path /home/jlrisco/Descargas/201006/20100614.txt ...
Storing file path /home/jlrisco/Descargas/201006/20100609.txt ...
Storing file path /home/jlrisco/Descargas/201006/20100629.txt ...
Storing file path /home/jlrisco/Descargas/201006/20100627.txt ...
Storing file path /home/jlrisco/Descargas/201006/20100607.txt ...
Storing file path /home/jlrisco/Descargas/201006/20100613.txt ...
Storing fi

In [2]:
# Sensors are named sensor01 ... sensor18 (two-digit, zero-padded index).
sensor_names = [f"sensor{index:02d}" for index in range(1, 19)]

# The 18 sensors take 3 distinct latitudes and 6 distinct longitudes: the
# coordinates are listed latitude by latitude, cycling through the longitudes.
row_latitudes = (36.875, 36.874, 36.873)
column_longitudes = (-2.594, -2.593, -2.592, -2.591, -2.590, -2.589)
sensor_latitudes: list[float] = [latitude for latitude in row_latitudes for _ in column_longitudes]
sensor_longitudes: list[float] = [longitude for _ in row_latitudes for longitude in column_longitudes]

# Folder that contains the farm folder, and the farm folder name.
output_folder = '../data/input'
farm_name = 'Almeria'

Let's transform the data:

In [5]:
data_utils = DataUtils()
data_utils.transform_pvgis_data_to_h5(output_folder, farm_name, sensor_names, sensor_latitudes, sensor_longitudes)
# Sanity check: print the first rows (at most 10) of the sensor01 table.
# The context manager closes the file even if reading or printing fails.
with tb.open_file(f'{output_folder}/{farm_name}/sensors_data.h5', 'r') as h5_file:
    table = h5_file.root.data.sensor01
    # Slicing reads the rows once and tolerates tables with fewer than 10 rows.
    for row in table[:10]:
        print(dt.datetime.fromtimestamp(row['timestamp']).strftime('%Y-%m-%d %H:%M:%S'))
        print(row['radiation'])

Processing file ../data/input/Almeria/sensor01.csv ...
Processing file ../data/input/Almeria/sensor02.csv ...
Processing file ../data/input/Almeria/sensor03.csv ...
Processing file ../data/input/Almeria/sensor04.csv ...
Processing file ../data/input/Almeria/sensor05.csv ...
Processing file ../data/input/Almeria/sensor06.csv ...
Processing file ../data/input/Almeria/sensor07.csv ...
Processing file ../data/input/Almeria/sensor08.csv ...
Processing file ../data/input/Almeria/sensor09.csv ...
Processing file ../data/input/Almeria/sensor10.csv ...
Processing file ../data/input/Almeria/sensor11.csv ...
Processing file ../data/input/Almeria/sensor12.csv ...
Processing file ../data/input/Almeria/sensor13.csv ...
Processing file ../data/input/Almeria/sensor14.csv ...
Processing file ../data/input/Almeria/sensor15.csv ...
Processing file ../data/input/Almeria/sensor16.csv ...
Processing file ../data/input/Almeria/sensor17.csv ...
Processing file ../data/input/Almeria/sensor18.csv ...
2019-01-01