In [1]:
# !pip install h5py

In [2]:
import pandas as pd
from collections import defaultdict
import h5py
import numpy as np

### Requirement
Allow a model to be consumed as a REST API. The input is a variable-length array or set of arrays; the output is a single value or a set of values.

We need this to be easily deployable, and we need the ability to change models easily.

#### Summary:

* Perform inference with a Keras/TF model
* Load a Keras/TF model from a path
* Expose the model as a RESTful service
* The input will be an array or arrays of data
* This should be defined per model on import
* The output is the next sequence or sequences
* The response should be JSON with this data

In [3]:
from concurrent.futures import ThreadPoolExecutor
import os

def read_multiple_csv(files):
    """
    Reads multiple CSV files concurrently and combines them into a single DataFrame.

    Parameters:
        files (list): A list of file paths to the CSV files.

    Returns:
        pandas.DataFrame: The rows of every non-empty CSV file, concatenated in
        the order the paths were given and re-indexed from 0. If no file
        contributes any rows, an empty DataFrame is returned (carrying the
        columns of the files that were read, if any) instead of raising.

    Raises:
        FileNotFoundError: If a file in the list does not exist.

    Example:
        files = ['data1.csv', 'data2.csv', 'data3.csv']
        combined_data = read_multiple_csv(files)
    """

    def _read_csv(file):
        # Checked inside the worker so the error names the offending path.
        if not os.path.exists(file):
            raise FileNotFoundError(f"File not found: {file}")
        return pd.read_csv(file)

    with ThreadPoolExecutor() as executor:
        # All reads are submitted up front and run concurrently; map() hands the
        # results back in input order (not completion order), which keeps the
        # row order deterministic. A worker's exception is re-raised here.
        frames = list(executor.map(_read_csv, files))

    non_empty = [df for df in frames if len(df)]
    if non_empty:
        return pd.concat(non_empty, ignore_index=True)
    if frames:
        # Only header-only files were read: keep their column layout.
        return pd.concat(frames, ignore_index=True)
    # Nothing to read at all; pd.concat([]) would raise ValueError.
    return pd.DataFrame()

In [4]:
# Load the eight CSV files BTCUSDT-1m-data_1.csv ... BTCUSDT-1m-data_8.csv
# (paths relative to the working directory) into one combined DataFrame.
data_df = read_multiple_csv([f"BTCUSDT-1m-data_{i}.csv" for i in range(1,9)])
data_df

Unnamed: 0,timestamp,open,high,low,close,volume,close_time,quote_av,trades,tb_base_av,tb_quote_av,ignore
0,2017-08-17 04:00:00,4261.48,4261.48,4261.48,4261.48,1.775183,1502942459999,7.564907e+03,3,0.075183,3.203909e+02,7960.540180
1,2017-08-17 04:01:00,4261.48,4261.48,4261.48,4261.48,0.000000,1502942519999,0.000000e+00,0,0.000000,0.000000e+00,7960.452630
2,2017-08-17 04:02:00,4280.56,4280.56,4280.56,4280.56,0.261074,1502942579999,1.117543e+03,2,0.261074,1.117543e+03,7960.375295
3,2017-08-17 04:03:00,4261.48,4261.48,4261.48,4261.48,0.012008,1502942639999,5.117185e+01,3,0.012008,5.117185e+01,7960.375295
4,2017-08-17 04:04:00,4261.48,4261.48,4261.48,4261.48,0.140796,1502942699999,5.999993e+02,1,0.140796,5.999993e+02,7960.375295
...,...,...,...,...,...,...,...,...,...,...,...,...
2755836,2022-11-18 18:49:00,16574.96,16579.62,16572.01,16576.72,60.414710,1668797399999,1.001404e+06,2016,27.835590,4.613993e+05,0.000000
2755837,2022-11-18 18:50:00,16576.72,16586.54,16575.81,16586.10,114.207770,1668797459999,1.893602e+06,2503,62.700730,1.039658e+06,0.000000
2755838,2022-11-18 18:51:00,16585.44,16590.00,16581.50,16581.97,101.938210,1668797519999,1.690762e+06,2533,43.835610,7.270968e+05,0.000000
2755839,2022-11-18 18:52:00,16581.97,16582.48,16571.88,16573.01,141.041360,1668797579999,2.338037e+06,2862,37.730090,6.254966e+05,0.000000


#### Close Price and Volume are the inputs for this model. The input length is 23 for each.

**NOTE**: The input data passed to the package/function will arrive directly as arrays.

In [7]:
# Keep only the two model inputs, as an (n_rows, 2) array: column 0 = close, column 1 = volume.
arr = np.array(data_df.loc[:, ['close', 'volume']])
arr

array([[4.2614800e+03, 1.7751830e+00],
       [4.2614800e+03, 0.0000000e+00],
       [4.2805600e+03, 2.6107400e-01],
       ...,
       [1.6581970e+04, 1.0193821e+02],
       [1.6573010e+04, 1.4104136e+02],
       [1.6574550e+04, 8.5867230e+01]])

In [8]:
# Number of rows in the close/volume array.
arr_length = len(arr)
arr_length

2755841

In [9]:
# Rows per model input window (the notes above state an input length of 23).
size = 23

In [10]:
# How many full windows of `size` rows fit, and how many rows are left over.
chunks = arr_length // size
remaining = arr_length % size
(chunks, remaining)

(119819, 4)

In [11]:
# arr[-remaining:]

In [12]:
# np.array_split(arr[0:-remaining], chunks)#[0].shape

In [13]:
# Cut `arr` into consecutive windows of `size` rows; a final, shorter window
# holds any leftover rows. Plain slicing is used rather than np.array_split on
# arr[0:-remaining] / arr[-remaining:], because when the row count is an exact
# multiple of `size` (remaining == 0) those slices turn into arr[0:0] (empty)
# and arr[-0:] (the whole array), and array_split rejects 0 sections when
# there are fewer than `size` rows.
input_arr = [arr[start:start + size] for start in range(0, len(arr), size)]

In [14]:
len(input_arr)

119820

In [15]:
# Preview only the first two windows; echoing every window would flood the cell output.
input_arr[:2]

[array([[4.261480e+03, 1.775183e+00],
        [4.261480e+03, 0.000000e+00],
        [4.280560e+03, 2.610740e-01],
        [4.261480e+03, 1.200800e-02],
        [4.261480e+03, 1.407960e-01],
        [4.261480e+03, 0.000000e+00],
        [4.261480e+03, 0.000000e+00],
        [4.261480e+03, 0.000000e+00],
        [4.261480e+03, 0.000000e+00],
        [4.261480e+03, 0.000000e+00],
        [4.261480e+03, 0.000000e+00],
        [4.261480e+03, 0.000000e+00],
        [4.261480e+03, 0.000000e+00],
        [4.261480e+03, 0.000000e+00],
        [4.261480e+03, 0.000000e+00],
        [4.261480e+03, 0.000000e+00],
        [4.261480e+03, 0.000000e+00],
        [4.264880e+03, 7.545500e-02],
        [4.264880e+03, 0.000000e+00],
        [4.261480e+03, 4.092110e-01],
        [4.264880e+03, 1.483014e+00],
        [4.266290e+03, 8.455560e-01],
        [4.266290e+03, 0.000000e+00]]),
 array([[4.266290e+03, 0.000000e+00],
        [4.266290e+03, 0.000000e+00],
        [4.266290e+03, 4.674350e-01],
        [4

In [16]:
import json
from json import JSONEncoder

class NumpyArrayEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to native Python equivalents.

    Pass it as ``cls=NumpyArrayEncoder`` to ``json.dump`` / ``json.dumps``.
    NumPy booleans, integers and floats become ``bool`` / ``int`` / ``float``,
    and arrays become (nested) lists. Anything else is deferred to the base
    class, which raises ``TypeError`` for unsupported objects.
    """

    def default(self, obj):
        # np.bool_ is not a subclass of np.integer, so it needs its own branch;
        # without it a NumPy boolean fails with "not JSON serializable".
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

In [17]:
# Serialise the list of windows to disk; NumpyArrayEncoder turns each ndarray
# into nested lists so the standard json module can write it.
with open('output_json.json', 'w') as json_file:
    json.dump(input_arr, json_file, cls=NumpyArrayEncoder)

# print("Printing JSON serialized NumPy array")
# print(encodedNumpyData)

### Reading the HDF5 (`.h5`) file

In [18]:
# def divide_chunks(l, n):     
#     for i in range(0, len(l), n):
#         yield l[i:i + n]

In [19]:
# divide_chunks(arr, 23)

In [20]:
# list(divide_chunks(arr, 23))[0]


In [21]:
def traverse_datasets(hdf_file):
    """Yield the path of every dataset found in an open HDF5 file.

    Groups are descended into recursively, in the order their keys are
    reported; entries that are neither a dataset nor a group are skipped.
    Each yielded path is '/'-separated and starts with '/'.
    """

    def _walk(group, parent_path=''):
        for name in group.keys():
            child = group[name]
            child_path = f'{parent_path}/{name}'
            if isinstance(child, h5py.Dataset):
                # Leaf: report where it lives.
                yield child_path
            elif isinstance(child, h5py.Group):
                # Container: recurse, carrying the path built so far.
                yield from _walk(child, child_path)

    yield from _walk(hdf_file)

In [22]:
# Read every dataset once: keep its values in `data` (keyed by HDF5 path) and
# record a [path, shape, dtype] row in `data_info`.
data = {}
data_info = []
with h5py.File('example_23_x_2.h5', 'r') as f:
    for dset in traverse_datasets(f):
        node = f[dset]
        data_info.append([dset, node.shape, node.dtype])
        data[dset] = node[()]  # [()] reads the dataset's full contents

In [23]:
# Tabulate the path, shape and dtype recorded for each dataset in the file.
data_info_df = pd.DataFrame(data_info, columns = ['Path', 'Shape', 'DataType'])
data_info_df

Unnamed: 0,Path,Shape,DataType
0,/model_weights/dense/dense/bias:0,"(2,)",float32
1,/model_weights/dense/dense/kernel:0,"(111, 2)",float32
2,/model_weights/lstm/lstm/lstm_cell/bias:0,"(444,)",float32
3,/model_weights/lstm/lstm/lstm_cell/kernel:0,"(2, 444)",float32
4,/model_weights/lstm/lstm/lstm_cell/recurrent_k...,"(111, 444)",float32
...,...,...,...
56,/optimizer_weights/Adam/lstm_5/lstm_cell_5/bia...,"(444,)",float32
57,/optimizer_weights/Adam/lstm_5/lstm_cell_5/ker...,"(111, 444)",float32
58,/optimizer_weights/Adam/lstm_5/lstm_cell_5/ker...,"(111, 444)",float32
59,/optimizer_weights/Adam/lstm_5/lstm_cell_5/rec...,"(111, 444)",float32


In [24]:
# Look up one stored array by its HDF5 path (per the path, the dense layer's bias).
v = data['/model_weights/dense/dense/bias:0']
v

array([0.25209793, 0.01043009], dtype=float32)