# chatgpt test hdf5

In [1]:
import os
import time
import numpy as np
import pandas as pd
from pandas import HDFStore
import logging

log = logging.getLogger(__name__)
# 假设 SafeHDFStore 已经定义好

def load_hdf_db(
    fname,
    table="all",
    code_l=None,
    timelimit=True,
    index=False,
    limit_time=300,
    dratio_limit=0.5,
    MultiIndex=False,
    showtable=False,
):
    """
    精简版 HDF5 读取函数，路径处理跨平台
    """
    t0 = time.time()
    df, dd = None, None

    if not os.path.exists(fname):
        log.error("HDF5 file not found: %s", fname)
        return None

    # 读取表
    with SafeHDFStore(fname, mode="r") as store:
        if store is None:
            return None
        keys = list(store.keys())
        if showtable:
            print(f"fname: {fname}, keys: {keys}")

        if "/" + table not in keys:
            log.error("%s not found in %s", table, fname)
            return None

        dd = store[table]

    if dd is None or len(dd) == 0:
        log.warning("Empty table %s in %s", table, fname)
        return None

    # --- 按 code_l 过滤 ---
    if code_l is not None:
        if not MultiIndex:
            if index:
                code_l = [str(1000000 - int(x)) if x.startswith("0") else x for x in code_l]
            dif_co = list(set(dd.index) & set(code_l))

            if len(code_l) > 0:
                dratio = (len(code_l) - len(dif_co)) / float(len(code_l))
            else:
                dratio = 0.0

            log.info("find all:%s missing:%s dratio:%.2f",
                     len(code_l), len(code_l) - len(dif_co), dratio)

            # 时间限制
            if timelimit and "timel" in dd.columns:
                dd = dd.loc[dif_co]
                o_time = [time.time() - t for t in dd[dd.timel != 0].timel.tolist()]
                if len(o_time) > 0:
                    l_time = np.mean(o_time)
                    return_hdf_status = l_time < limit_time
                    log.info("return_hdf_status:%s mean_time:%.2f limit:%.2f",
                             return_hdf_status, l_time, limit_time)
                    if return_hdf_status:
                        df = dd
            else:
                df = dd.loc[dif_co]

            if dratio > dratio_limit:
                log.warning("Too many codes missing: %.2f > %.2f",
                            dratio, dratio_limit)
                return None
        else:
            # 多层索引
            df = dd.loc[dd.index.isin(code_l, level="code")]
    else:
        df = dd

    # --- 统一清理 ---
    if df is not None and len(df) > 0:
        df = df.fillna(0)
        df = df[~df.index.duplicated(keep="last")]

        # MultiIndex 去重逻辑
        if MultiIndex and "volume" in df.columns:
            count_before = len(df)
            df = df.drop_duplicates()
            dratio = len(df) / float(count_before)
            log.debug("MultiIndex drop_duplicates: before=%d after=%d ratio=%.2f",
                      count_before, len(df), dratio)

    log.info("load_hdf_time: %.2f sec", time.time() - t0)
    return df

In [11]:

def write_hdf_with_code(fname, table, df, index=True, complib='zlib', complevel=9):
    """
    写入 HDF5 并把 code 加入 data_columns
    """
    if df is None or len(df) == 0:
        print("Empty DataFrame, skip write")
        return

    # 如果是 MultiIndex，确保 code 是列
    if isinstance(df.index, pd.MultiIndex) and 'code' in df.index.names:
        df_reset = df.reset_index(level='code')
    elif 'code' not in df.columns:
        # 单索引，假设 index 是 code
        df_reset = df.copy()
        df_reset['code'] = df_reset.index
    else:
        df_reset = df.copy()

    # 确保 code 在 data_columns
    df_reset.to_hdf(
        fname,
        key=table,
        mode='a',
        format='table',          # table 格式才能用 data_columns
        data_columns=['code'],   # 指定 code 可筛选
        complevel=complevel,
        complib=complib,
        index=index
    )
    print(f"Wrote table {table} to {fname}, rows: {len(df_reset)}")

# --- 示例 ---
basedir = os.path.join("G:",os.sep )
fname = os.path.join(basedir, "sina_MultiIndex_data.h5")
table = 'sina_MultiIndex'

# 读取数据
df = load_hdf_db(fname, table=table, MultiIndex=True)

fname_data_col = os.path.join(basedir, "sina_MultiIndex_data_columns.h5") 
# 写入 HDF5，带 code data_column
write_hdf_with_code(fname_data_col, table, df)


HDF5 file not found: G:\sina_MultiIndex_data.h5


Empty DataFrame, skip write


In [3]:
import pandas as pd
import time
import os

basedir = "G:" + os.sep  # 或者 basedir = os.path.join("G:", "")

# config_ini = os.path.join(basedir, "h5config.txt")
# temp_file = os.path.join(basedir, "tmpfile.h5")
# 示例文件路径
# fname = os.path.join(basedir, "test.h5")
# table = "all"

# MultiIndex 示例路径
# multi_fname = os.path.join(basedir, "sina_MultiIndex.h5")

fname_normal = os.path.join(basedir, "sina_MultiIndex_data.h5")             # 原始 HDF5
fname_data_col = os.path.join(basedir, "sina_MultiIndex_data_columns.h5")  # 写入了 data_columns 的 HDF5
table = "all"
test_code = "002258"


fname_normal = os.path.join(basedir, "sina_data.h5")             # 原始 HDF5
fname_data_col = os.path.join(basedir, "sina_data_columns.h5")  # 写入了 data_columns 的 HDF5
# table = "all_30"
table = "all"

test_code = "002258"
# -------- 全表读取 --------
t0 = time.time()
with pd.HDFStore(fname_normal, mode="r") as store:
    df_all = store[table]
t1 = time.time()
print(f"[全表读取] rows: {len(df_all)}, time: {t1-t0:.4f} s")

# 筛选指定 code
t0 = time.time()
df_code = df_all.loc[df_all.index.get_level_values("code") == test_code]
t1 = time.time()
print(f"[全表筛选 code={test_code}] rows: {len(df_code)}, time: {t1-t0:.4f} s")


# -------- select 查询（data_columns） --------
t0 = time.time()
with pd.HDFStore(fname_data_col, mode="r") as store:
    df_select = store.select(table, where=f'code="{test_code}"')
t1 = time.time()
print(f"[select code={test_code}] rows: {len(df_select)}, time: {t1-t0:.4f} s")


[全表读取] rows: 5201, time: 0.0928 s


KeyError: 'Requested level (code) does not match index name (None)'

In [15]:
 dd = df_all.loc[test_code]

In [13]:
# with pd.HDFStore(r"G:\sina_MultiIndex_data_columns.h5") as store:
    # print(store.get_storer('all_30').data_columns

['code']


In [17]:
round((dd.close - dd.llastp) / dd.llastp *100,1)

-2.3

In [26]:
round(dd.turnover/100/10000/100,1),dd

(1.6,
 name               利尔化学
 open              12.45
 llastp            12.51
 now               12.22
 trade               0.0
 high              12.49
 low               12.17
 buy               12.21
 sell              12.22
 volume         13458640
 turnover    164729444.2
 b1_v              84100
 b1                12.21
 b2_v             127000
 b2                 12.2
 b3_v             121800
 b3                12.19
 b4_v             216900
 b4                12.18
 b5_v             143920
 b5                12.17
 a1_v              23700
 a1                12.22
 a2_v              23100
 a2                12.23
 a3_v              18300
 a3                12.24
 a4_v              80500
 a4                12.25
 a5_v              13700
 a5                12.26
 dt           2025-09-15
 ticktime       15:00:00
 b1_vv                 0
 close             12.22
 nvol           13458640
 code             002258
 Name: 002258, dtype: object)

In [42]:
for code, row in df_all.iterrows():
    print(code,row.name,row.close,row.llastp,row['name'],row)
    break

603768 603768 12.8 12.8 常青股份 name              常青股份
open              12.8
llastp            12.8
now               12.8
trade              0.0
high             13.03
low              12.78
buy               12.8
sell             12.81
volume         3045600
turnover    39205795.0
b1_v               400
b1                12.8
b2_v              7700
b2               12.79
b3_v              4900
b3               12.78
b4_v              1100
b4               12.77
b5_v              1600
b5               12.76
a1_v               200
a1               12.81
a2_v              2500
a2               12.83
a3_v               100
a3               12.84
a4_v              4500
a4               12.85
a5_v              4500
a5               12.86
dt          2025-09-15
ticktime      15:00:02
b1_vv                0
close             12.8
nvol           3045600
code            603768
Name: 603768, dtype: object


In [5]:
import h5py
import pandas as pd

def read_hdf_h5py(fname, table='all'):
    with h5py.File(fname, 'r') as f:
        if table not in f:
            raise KeyError(f"Table '{table}' not found in {fname}")

        obj = f[table]

        # 如果是 group，尝试遍历 fields
        if isinstance(obj, h5py.Group):
            # PyTables Table 通常存储在 'data' 或 'table' 下
            # 这里直接尝试按字段读取
            data_dict = {}
            for key in obj.keys():
                data_dict[key] = obj[key][:]
            df = pd.DataFrame(data_dict)
        else:
            # 普通 dataset
            data = obj[:]
            if hasattr(data, 'dtype') and data.dtype.names is not None:
                df = pd.DataFrame.from_records(data)
            else:
                df = pd.DataFrame(data)
    return df

fname = r'G:\sina_data.h5'
df = read_hdf_h5py(fname, table='all')
print(df.shape)
print(df.head())


TypeError: Accessing a group is done with bytes or str,  not <class 'slice'>

In [8]:
import h5py
import pandas as pd

def read_hdf_h5py(fname, table='all'):
    """
    使用 h5py 读取 HDF5 文件，返回 pandas DataFrame
    fname: HDF5 文件路径
    table: 数据集名称，通常是 '/all'
    """
    with h5py.File(fname, 'r') as f:
        # 检查 table 是否存在
        if table not in f:
            raise KeyError(f"Table '{table}' not found in {fname}")
        
        dset = f[table]
        # 如果是表格形式，直接转换为 DataFrame
        # h5py 数据集通常是 numpy structured array
        print('dset:',dset)
        print(f'list(dset.keys()): {list(dset.keys())}')
        data = dset[:]
        if hasattr(data, 'dtype') and data.dtype.names is not None:
            df = pd.DataFrame.from_records(data)
        else:
            # 如果是普通数组，生成默认列名
            df = pd.DataFrame(data)
    return df

# 使用示例
fname = r'G:\sina_data.h5'
df = read_hdf_h5py(fname, table='all')
print(df.shape)
print(df.head())


dset: <HDF5 group "/all" (2 members)>
list(dset.keys()): ['_i_table', 'table']


TypeError: Accessing a group is done with bytes or str,  not <class 'slice'>

In [11]:
import h5py
import os

basedir = "G:" + os.sep 
fname = os.path.join(basedir, "sina_data.h5")             # 原始 HDF5
# table = "all_30"
table = "all"
test_code = "002258"
import h5py
import pandas as pd

fname = r"G:\sina_data.h5"

import h5py
import pandas as pd

with h5py.File(fname, "r") as f:
    data = f["all"]["table"][:]  # fixed 格式可以直接读取
    df = pd.DataFrame(data)
    print(f'df: {df}')



OSError: Can't read data (can't open directory)

In [13]:
df = pd.read_hdf(fname, key='all')

In [15]:
print(df.loc[test_code])

name                       利尔化学
open                       12.2
llastp                    12.22
now                       12.25
trade                       0.0
high                      12.36
low                       12.04
buy                       12.24
sell                      12.25
volume                 13639443
turnover            166071488.0
b1_v                      49600
b1                        12.24
b2_v                      79900
b2                        12.23
b3_v                      16400
b3                        12.22
b4_v                      10000
b4                        12.21
b5_v                      36300
b5                         12.2
a1_v                      48000
a1                        12.25
a2_v                      51400
a2                        12.26
a3_v                      51800
a3                        12.27
a4_v                      42200
a4                        12.28
a5_v                      43700
a5                        12.29
dt      