In [89]:
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from scipy.stats import rankdata
from dtaidistance import dtw
from sklearn.metrics.pairwise import cosine_similarity

In [90]:
def create_time_index(timeend: np.datetime64, window_size: int) -> pd.DatetimeIndex:
    freq = '10min'
    return pd.date_range(end=timeend, periods=window_size, freq=freq)

def safe_nansum(arr, axis=None):
    result = np.nansum(arr, axis=axis)
    # 判斷原始陣列沿 axis 全部是 NaN 的位置
    all_nan = np.isnan(arr).all(axis=axis)
    
    # 將那些位置改為 np.nan
    if np.isscalar(result):
        return np.nan if all_nan else result
    result = result.astype('float64')  # 確保能裝 NaN
    result[all_nan] = np.nan
    return result

In [91]:
a_tdf = pd.DataFrame()
for i in range(0,40):
    try:
        df = pd.read_csv(f'../data/Raw_Data/Gogoro/台北市大安區_臺大二活停車場站A ({i:02d}).csv',index_col=0)
        df.index = pd.to_datetime(df.index)
        df.index = df.index.floor('min')
        df = df[~df.index.duplicated()]
        a_tdf = pd.concat([a_tdf,df])
    except:
        continue
a_tdf = a_tdf[~a_tdf.index.duplicated()]
a_tdf.sort_index(inplace=True)

b_tdf = pd.DataFrame()
for i in range(0,37):
    try:
        df = pd.read_csv(f'../data/Raw_Data/Gogoro/台北市大安區_臺大二活停車場站B ({i:02d}).csv',index_col=0)
        df.index = pd.to_datetime(df.index)
        df.index = df.index.floor('min')
        df = df[~df.index.duplicated()]
        b_tdf = pd.concat([b_tdf,df])
    except:
        continue
b_tdf = b_tdf[~b_tdf.index.duplicated()]
b_tdf.sort_index(inplace=True)

tdf = pd.concat([a_tdf,b_tdf],axis=1).dropna().sum(axis=1)

In [92]:
# 假設 df 已經處理好 index 是 datetime 且只保留到分鐘
start = tdf.index.min()
end = tdf.index.max()

# 產生每分鐘的完整時間序列
full_index = pd.date_range(start=start, end=end, freq='1min')

# 將原始 df 補上缺的時間，空值保持為 NaN
tdf_filled = tdf.reindex(full_index)

tdf_filled = tdf_filled.resample('h').mean()
tdf_filled.name = 'raw_data'

In [None]:
def generate_valid_subsequences(arr, min_len=1, max_len=None):
    if max_len is None:
        max_len = len(arr)

    subsequences = []
    N = len(arr)
    
    for i in range(N):
        for j in range(i + min_len, min(i + max_len, N) + 1):
            subseq = arr[i:j]
            if not np.isnan(subseq[0]) and not np.isnan(subseq[-1]):
                subsequences.append(subseq)
    
    return subsequences

# 範例資料
arr = tdf_filled.values

window_size = 24
samples = generate_valid_subsequences(arr, min_len=2, max_len=window_size)

In [94]:
from collections import defaultdict
import numpy as np

non_seen_list = []
seen_in_group = defaultdict(set)  # 用來記錄每個 key 下已出現的 trimmed（tuple 形式）

for i,row in enumerate(samples):
    not_nan_indices = np.where(~np.isnan(row))[0]
    if len(not_nan_indices) == 0:
        continue  # 跳過全 NaN 的 row

    # 去掉頭尾 NaN
    start = not_nan_indices[0]
    end = not_nan_indices[-1] + 1
    trimmed = row[start:end]

    # 觀測的位置（相對於 trimmed）
    observed = tuple(np.where(~np.isnan(trimmed))[0])

    # 把 trimmed 轉成 hashable 的 tuple 來比對是否出現過
    trimmed_key = tuple(trimmed)

    if trimmed_key not in seen_in_group[observed]:
        non_seen_list.append(trimmed)
        seen_in_group[observed].add(trimmed_key)

In [95]:
grouped_samples = defaultdict(list)

for i,row in enumerate(non_seen_list):
    row[row<0] = np.nan
    not_nan_indices = np.where(~np.isnan(row))[0]
    if len(not_nan_indices) == 0:
        continue  # 跳過全 NaN 的 row
    
    # 去掉頭尾 NaN
    start = not_nan_indices[0]
    end = not_nan_indices[-1] + 1
    trimmed = row[start:end]
    # 觀測的位置（相對於 trimmed）
    observed = tuple(np.where(~np.isnan(trimmed))[0])
    grouped_samples[observed].append(trimmed)
    
sorted_items = sorted(grouped_samples.items(), key=lambda x: len(x[0]), reverse=True)
grouped_samples = dict(sorted_items)

In [None]:
seed = np.random.normal(loc=0, scale=1, size=window_size)

In [None]:
from collections import defaultdict
import numpy as np

grouped_samples_copy = defaultdict(list)

# 主迴圈
while grouped_samples:
    print(f"剩餘群組數：{len(grouped_samples)}, 最長序列長度: {len(list(grouped_samples.keys())[0])}")

    # 取第一個 key（不需轉成 list）
    current_key = next(iter(grouped_samples))
    current_group = grouped_samples[current_key]

    # 若樣本數夠多，移到 copy，並移除原 key
    if len(current_group) > 30:
        grouped_samples_copy[current_key] = current_group
        del grouped_samples[current_key]
        continue

    # 不夠多，進行單點擴增
    all_versions = []
    for arr in current_group:
        valid_indices = np.where(~np.isnan(arr))[0]
        for idx in valid_indices:
            arr_copy = arr.copy()
            arr_copy[idx] = np.nan
            all_versions.append(arr_copy)

    # 重新分群
    for row in all_versions:
        observed = tuple(np.where(~np.isnan(row))[0])
        grouped_samples.setdefault(observed, []).append(row)

    # 移除已處理的 key
    del grouped_samples[current_key]

    # 可選：排序 key（但不轉成普通 dict）
    grouped_samples = defaultdict(
        list,
        dict(sorted(grouped_samples.items(), key=lambda x: len(x[0]), reverse=True))
    )
    


In [96]:
def fast_dtw_distance(ref, compare_arr):
    if len(compare_arr.shape) == 2:
        distances = np.empty(compare_arr.shape[0])
        for i in range(compare_arr.shape[0]):
            mask = ~np.isnan(compare_arr[i])
            distances[i] = dtw.distance(ref[mask], compare_arr[i][mask])
    else:
        mask = ~np.isnan(compare_arr)
        distances = dtw.distance(ref[mask], compare_arr[mask])
    return distances

In [99]:
tmp = []
for i in range(len(list(grouped_samples.keys()))):
    key = list(grouped_samples.keys())[i]

    p_weighted = defaultdict(float)
    history = np.array(grouped_samples[key])

    print(pd.DataFrame(history).dropna(axis=1).shape)
    

    tmp.append(history.shape[0])
    distances = fast_dtw_distance(seed, history)
    # weights = 1/distances**2
    # for row, weight in zip(history, weights):
    #     key = tuple(row[~np.isnan(row)])  # 觀測的變數組合
    #     p_weighted[key] += weight
    # total = sum(p_weighted.values())
    # for k in p_weighted:
    #     p_weighted[k] /= total

(406, 24)
(426, 23)


IndexError: boolean index did not match indexed array along axis 0; size of axis is 24 but size of corresponding boolean axis is 23

In [98]:
np.unique(tmp,return_counts=True)

(array([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,   11,
          12,   13,   14,   15,   16,   17,   18,   19,   20,   21,   22,
          23,   24,   25,   26,   27,   28,   29,   30,   31,   32,   33,
          34,   35,   36,   37,   38,   39,   40,   41,   42,   43,   44,
          45,   46,   48,   49,   50,   52,   53,   54,   55,   56,   58,
          59,   61,   62,   63,   64,   65,   66,   67,   69,   70,   71,
          75,   76,   79,   80,   84,   88,   90,   95,   99,  101,  106,
         110,  119,  137,  138,  202,  406,  426,  448,  472,  497,  525,
         555,  590,  630,  676,  728,  782,  841,  903,  968, 1042, 1128,
        1224, 1331, 1412, 1451, 1588, 1716]),
 array([6726,  731,  314,  239,  142,   48,   41,   22,   12,    9,    9,
           7,    4,   12,    7,   11,   13,   13,    8,   21,    9,   12,
           7,   10,    7,    8,    6,    7,    6,    4,    9,    3,    3,
           8,    6,    3,    5,    4,    2,    5,    4,    1,    2

In [None]:
import numpy as np
import pandas as pd
from collections import Counter

# 模擬四組樣本
# 先驗：完整四維樣本（40筆）
prior_samples = pd.DataFrame(np.random.randint(0, 2, size=(40, 4)), columns=["x1", "x2", "x3", "x4"])

# 觀測一：p(x1,x2,x3) (100筆)
obs_123 = pd.DataFrame(np.random.randint(0, 2, size=(100, 3)), columns=["x1", "x2", "x3"])

# 觀測二：p(x2,x3,x4) (80筆)
obs_234 = pd.DataFrame(np.random.randint(0, 2, size=(80, 3)), columns=["x2", "x3", "x4"])

# 觀測三：p(x1,x3,x4) (60筆)
obs_134 = pd.DataFrame(np.random.randint(0, 2, size=(60, 3)), columns=["x1", "x3", "x4"])

# 統計出觀測分布（作為後驗修正依據）
def empirical_prob(samples, cols):
    count = Counter([tuple(row) for row in samples[cols].values])
    total = sum(count.values())
    return {k: v / total for k, v in count.items()}

p_obs_123 = empirical_prob(obs_123, ["x1", "x2", "x3"])
p_obs_234 = empirical_prob(obs_234, ["x2", "x3", "x4"])
p_obs_134 = empirical_prob(obs_134, ["x1", "x3", "x4"])

# 初始樣本統計分布（先驗）
prior_list = [tuple(row) for row in prior_samples[["x1", "x2", "x3", "x4"]].values]

# 先驗邊際（p123）
prior_123 = Counter([(x1, x2, x3) for (x1, x2, x3, x4) in prior_list])
prior_234 = Counter([(x2, x3, x4) for (x1, x2, x3, x4) in prior_list])
prior_134 = Counter([(x1, x3, x4) for (x1, x2, x3, x4) in prior_list])
total_prior = len(prior_list)

# 對每一筆樣本做連乘權重更新（依序套用三次貝氏加權）
weights = []
for sample in prior_list:
    x1, x2, x3, x4 = sample

    # 取得對應子分布機率
    p1 = p_obs_123.get((x1, x2, x3), 1e-6) / (prior_123.get((x1, x2, x3), 1e-6) / total_prior)
    p2 = p_obs_234.get((x2, x3, x4), 1e-6) / (prior_234.get((x2, x3, x4), 1e-6) / total_prior)
    p3 = p_obs_134.get((x1, x3, x4), 1e-6) / (prior_134.get((x1, x3, x4), 1e-6) / total_prior)

    weights.append(p1 * p2 * p3)

# 權重正規化
weights = np.array(weights)
weights /= weights.sum()

# 統計後驗分布
posterior = {}
for i, sample in enumerate(prior_list):
    posterior[sample] = posterior.get(sample, 0) + weights[i]

# 整理為 DataFrame
df_post = pd.DataFrame([(*k, v) for k, v in posterior.items()], columns=["x1", "x2", "x3", "x4", "prob"])
df_post = df_post.sort_values("prob", ascending=False).reset_index(drop=True)
