# 初始化

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import panel as pn
# Initialise Panel's notebook integration before any widget is created.
pn.extension()

# Add the parent directory to the import path; presumably this is what makes
# the project-local `util` package below importable from this notebook.
sys.path.append(os.path.abspath('../'))
# NOTE(review): looks like this points Numba at the Windows CUDA driver DLL.
# The path is machine-specific — confirm before running on another host.
os.environ["NUMBA_CUDA_DRIVER"] = "C:\\Windows\\System32\\nvcuda.dll"
from util.data_utils import load_csv_folder
from util.viz_utils import line_chart

In [None]:
# Load the data
data_dir = '../../data/sat1'  # adjust the path to match the local layout
# presumably a mapping of file name -> DataFrame (it is used via .keys() and
# [] below); verify against util.data_utils.load_csv_folder
dataframes = load_csv_folder(data_dir)
file = list(dataframes.keys())[0]  # use the first file for the demo
df = dataframes[file]
print(f'加载数据文件: {file}, 行数: {len(df)}')
df.head()

# 降维

In [4]:
# 分箱算法
def group_by_bins(data, bins):
    """
    Assign every row of *data* to one of *bins* bins spread over its index.

    The index range [min, max] is mapped linearly onto 0 .. bins - 1 and
    rounded to the nearest integer (ties go to the even integer), so the
    first and the last bin cover half the span of the interior ones.

    Parameters:
        data: DataFrame whose index supports subtraction and division of the
            resulting differences (numeric or datetime-like index).
        bins: Total number of bins; must be at least 1.

    Returns:
        A copy of *data* with an additional integer ``group_idx`` column.
        Empty input, or input whose index holds a single distinct value,
        gets ``group_idx == 0`` for every row.

    Raises:
        ValueError: If *bins* is smaller than 1.
    """
    if bins < 1:
        raise ValueError(f"bins must be >= 1, got {bins}")

    output = data.copy()

    # Default covers the degenerate cases where no span can be computed.
    group_idx = np.zeros(len(data), dtype=np.int64)

    if len(data) > 0:
        # Determine the covered range.
        tmin = data.index.min()
        tmax = data.index.max()

        # A zero span would turn the division below into 0/0 (NaN), which
        # cannot be cast to an integer bin id.
        if tmin != tmax:
            fraction = np.asarray((data.index - tmin) / (tmax - tmin), dtype=float)
            # Map the relative position onto integer bin coordinates.
            group_idx = np.rint(fraction * (bins - 1)).astype(np.int64)

    output['group_idx'] = group_idx

    return output

# def group_by_timedelta(data, timedelta):
#     output=data.copy()
#     return output

def group_by_time(data, freq='min'):
    """
    Tag every row with the start of the time bucket it falls into.

    Parameters:
        data: DataFrame with a datetime-like index.
        freq: Bucket width as a pandas frequency string such as 'min',
            'h' or 'd'.

    Returns:
        A copy of *data* with an additional ``group_idx`` column holding the
        index value floored to *freq*.
    """
    # Floor via a Series so the .dt accessor can be used on the index values.
    bucket_start = pd.Series(data.index).dt.floor(freq)

    tagged = data.copy()
    # Assign the raw values so they are placed by position, not aligned by label.
    tagged['group_idx'] = bucket_start.values
    return tagged



In [5]:
def m4(data):
    """
    Downsample grouped time-series data with the M4 algorithm.

    For every group the rows holding the earliest index, the latest index,
    the smallest value and the largest value are kept, which preserves the
    visual envelope of a line chart drawn from the data.

    Parameters:
        data: Grouped DataFrame: it carries a ``group_idx`` column and its
            first column holds the values. Groups with more than 4 rows are
            the ones that actually get reduced.

    Returns:
        The selected rows in their original order, each row at most once,
        without the ``group_idx`` column.
    """
    value_col = data.columns[0]

    # Work on row positions instead of index labels: a label lookup would
    # return every row sharing a duplicated label.
    frame = data.reset_index(drop=True)
    times = pd.Series(data.index)

    keep = set()  # a set drops rows that are e.g. both first and minimum
    for _, group in frame.groupby('group_idx'):
        group_times = times.loc[group.index]
        keep.add(group_times.idxmin())
        keep.add(group_times.idxmax())

        values = group[value_col]
        # An all-NaN group has no value extremes; first/last still mark it.
        if values.notna().any():
            keep.add(values.idxmin())
            keep.add(values.idxmax())

    # Sorting the positions restores the original row order, so a line drawn
    # through the result never runs backwards in time.
    result = data.iloc[sorted(keep)].drop(columns=['group_idx'])

    print(f"原始数据点数: {len(data)}, 处理后数据点数: {len(result)}")

    return result

def maxmin(data):
    """
    Downsample grouped time-series data to its per-group value extremes.

    For every group the row with the smallest value and the row with the
    largest value are kept.

    Parameters:
        data: Grouped DataFrame: it carries a ``group_idx`` column and its
            first column holds the values.

    Returns:
        The selected rows in their original order, each row at most once,
        without the ``group_idx`` column.
    """
    value_col = data.columns[0]

    # Work on row positions instead of index labels: a label lookup would
    # return every row sharing a duplicated label.
    frame = data.reset_index(drop=True)

    keep = set()  # a set drops the duplicate when min and max coincide
    for _, group in frame.groupby('group_idx'):
        values = group[value_col]
        # An all-NaN group has no extremes to report.
        if values.notna().any():
            keep.add(values.idxmin())
            keep.add(values.idxmax())

    # Sorting the positions restores the original row order.
    result = data.iloc[sorted(keep)].drop(columns=['group_idx'])

    print(f"原始数据点数: {len(data)}, 处理后数据点数: {len(result)}")

    return result

def mean(data):
    """Placeholder for a mean-based reducer; not implemented, returns None."""
    return None


In [None]:
# Bucket width shared by both group-based reducers.
freq = '10min'

# Reduced data per input file, keyed like `dataframes`.
m4ed = {}
maxmined = {}
for file, df in dataframes.items():
    # Group once and feed the same grouping to both reducers.
    grouped = group_by_time(df, freq)
    m4ed[file] = m4(grouped)
    maxmined[file] = maxmin(grouped)

In [None]:
import lttbc

def lttb(data, threshold=1000):
    """
    Downsample a single-column DataFrame with LTTB via the lttbc library.

    Parameters:
        data: DataFrame indexed by timestamp; the first column holds the
            values.
        threshold: Number of points requested from the downsampler.

    Returns:
        DataFrame with the same column name, indexed by the original index
        values of the selected rows.
    """
    n_rows = len(data)
    col = data.columns[0]

    # Row numbers serve as x, so the returned x values map straight back to
    # row positions. lttbc.downsample takes one-dimensional x and y.
    sampled_x, sampled_y = lttbc.downsample(
        np.arange(n_rows), data[col].values, threshold
    )

    # The sampled x values come back as floats: round to integer positions
    # and clamp them into the valid range.
    positions = np.clip(np.round(sampled_x).astype(int), 0, n_rows - 1)

    result = pd.DataFrame({col: sampled_y}, index=data.index[positions])

    print(f"原始数据点数: {len(data)}, 处理后数据点数: {len(result)}")

    return result

# LTTB-reduced data per input file, keyed like `dataframes`.
# NOTE(review): 104758 is a hard-coded target point count, presumably chosen
# to match the size of another reducer's output — confirm.
lttbed = {}
for file, df in dataframes.items():
    lttbed[file] = lttb(df, 104758)

# 可视化

In [None]:
import panel as pn
import hvplot.pandas

# Dropdown for picking which loaded file to plot; starts on the first one.
channel_selector = pn.widgets.Select(name="选择数据")
channel_selector.options = list(dataframes.keys())
channel_selector.value = list(dataframes.keys())[0]

def plot_line(dfs, channel, name):
    """
    Build a 1200x300 hvplot line chart for one entry of a dataset mapping.

    Parameters:
        dfs: Mapping from channel key to DataFrame.
        channel: Key of the DataFrame to plot; the same key is used as the
            column name when computing the y-range bounds.
        name: Title shown above the chart.

    Returns:
        The line plot, with pan/zoom limited to the data extent.
    """
    df = dfs[channel].astype(np.float32)

    line = df.hvplot.line(title=name, width=1200, height=300, downsample=True)

    # Limit the viewable extent: x to the index range, y to the value range
    # padded by one unit on each side.
    x_bounds = (df.index.min(), df.index.max())
    y_bounds = (df[channel].min() - 1, df[channel].max() + 1)

    return line.opts(
        backend_opts={
            "x_range.bounds": x_bounds,
            "y_range.bounds": y_bounds,
        }
    )

# Datasets that can be compared, keyed by the label offered in the selector.
data_list = {'raw':dataframes,'m4':m4ed,'maxmin':maxmined,'lttb':lttbed}

# Multi-select for choosing which of the datasets above to display.
data_selector = pn.widgets.MultiSelect(name="选择数据")
data_selector.options = list(data_list.keys())

@pn.depends(channel_selector.param.value, data_selector.param.value)
def chart(channel,data):
    """
    Render one line chart per selected dataset, stacked vertically.

    Parameters:
        channel: Current value of the channel selector.
        data: Current value of the dataset multi-select (keys of data_list).

    Returns:
        A pn.Column holding the charts; empty when either value is None.
    """
    if channel is None or data is None:
        return pn.Column()
    return pn.Column(*(plot_line(data_list[key], channel, key) for key in data))


# Layout: both selectors side by side, the reactive charts underneath.
panel = pn.Column(pn.Row(channel_selector,data_selector), chart)
panel  # last expression of the cell, so the notebook displays the layout

# 性能评估