In [109]:
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from scipy.stats import rankdata
from dtaidistance import dtw
from sklearn.metrics.pairwise import cosine_similarity

In [110]:


def create_time_index(timeend: np.datetime64, window_size: int) -> pd.DatetimeIndex:
    freq = '10min'
    return pd.date_range(end=timeend, periods=window_size, freq=freq)

def safe_nansum(arr, axis=None):
    result = np.nansum(arr, axis=axis)
    # 判斷原始陣列沿 axis 全部是 NaN 的位置
    all_nan = np.isnan(arr).all(axis=axis)
    
    # 將那些位置改為 np.nan
    if np.isscalar(result):
        return np.nan if all_nan else result
    result = result.astype('float64')  # 確保能裝 NaN
    result[all_nan] = np.nan
    return result

In [111]:
a_tdf = pd.DataFrame()
for i in range(0,40):
    try:
        df = pd.read_csv(f'../data/Raw_Data/Gogoro/台北市大安區_臺大二活停車場站A ({i:02d}).csv',index_col=0)
        df.index = pd.to_datetime(df.index)
        df.index = df.index.floor('min')
        df = df[~df.index.duplicated()]
        a_tdf = pd.concat([a_tdf,df])
    except:
        continue
a_tdf = a_tdf[~a_tdf.index.duplicated()]
a_tdf.sort_index(inplace=True)

b_tdf = pd.DataFrame()
for i in range(0,37):
    try:
        df = pd.read_csv(f'../data/Raw_Data/Gogoro/台北市大安區_臺大二活停車場站B ({i:02d}).csv',index_col=0)
        df.index = pd.to_datetime(df.index)
        df.index = df.index.floor('min')
        df = df[~df.index.duplicated()]
        b_tdf = pd.concat([b_tdf,df])
    except:
        continue
b_tdf = b_tdf[~b_tdf.index.duplicated()]
b_tdf.sort_index(inplace=True)

tdf = pd.concat([a_tdf,b_tdf],axis=1).dropna().sum(axis=1)

In [112]:
# 假設 df 已經處理好 index 是 datetime 且只保留到分鐘
start = tdf.index.min()
end = tdf.index.max()

# 產生每分鐘的完整時間序列
full_index = pd.date_range(start=start, end=end, freq='1min')

# 將原始 df 補上缺的時間，空值保持為 NaN
tdf_filled = tdf.reindex(full_index)

tdf_filled = tdf_filled.resample('20min').mean()
tdf_filled.name = 'raw_data'

In [113]:
# 轉為 xarray DataArray
da = xr.DataArray(tdf_filled, dims='time')

# 定義滑動視窗參數
window_size = 6 * 3    # 72 個點（12 小時 * 每10分鐘）
step_size = 1         # 每 1 小時（6 個點）滑動一次

# 建立滑動視窗
rolling_windows = da.rolling(time=window_size, center=False).construct('window')

# 下採樣：每 step_size 個點保留一個視窗
sampled_windows = rolling_windows.isel(time=slice(0, None, step_size))

# 將結果轉為 numpy 陣列 (samples, time steps)
samples = sampled_windows.transpose('time', 'window').values

# 篩選掉含 nan 的視窗
valid_mask = np.isnan(samples).sum(axis=1) < window_size-2
samples = samples[valid_mask]
index = sampled_windows.transpose('time', 'window')['time'].values[valid_mask]

In [114]:
def fast_dtw_distance(ref, compare_arr):
    if len(compare_arr.shape) == 2:
        distances = np.empty(compare_arr.shape[0])
        for i in range(compare_arr.shape[0]):
            mask = ~np.isnan(compare_arr[i])
            distances[i] = dtw.distance(ref[mask], compare_arr[i][mask])
    else:
        mask = ~np.isnan(compare_arr)
        distances = dtw.distance(ref[mask], compare_arr[mask])
    return distances

In [115]:
seed = np.random.normal(loc=0, scale=1, size=window_size)

In [116]:
from collections import defaultdict

grouped_samples = defaultdict(list)

for row in samples:
    observed = tuple(np.where(~np.isnan(row))[0])
    grouped_samples[observed].append(row)

In [138]:
np.unique(tmp, return_counts=True)

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 63]),
 array([6851,  314,   51,    8,    7,    5,    7,    3,    1,    1]))

In [142]:
tmp = []
for i in range(len(list(grouped_samples.keys()))):
    key = list(grouped_samples.keys())[i]

    p_weighted = defaultdict(float)
    history = np.array(grouped_samples[key])
    print(pd.DataFrame(history).dropna(axis=1).shape)
    tmp.append(history.shape[0])
    distances = fast_dtw_distance(seed, history)
    weights = 1/distances**2
    for row, weight in zip(history, weights):
        key = tuple(row[~np.isnan(row)])  # 觀測的變數組合
        p_weighted[key] += weight
    total = sum(p_weighted.values())
    for k in p_weighted:
        p_weighted[k] /= total

(3, 3)
(3, 4)
(3, 5)
(2, 6)
(2, 7)
(1, 8)
(1, 9)
(2, 10)
(2, 11)
(1, 12)
(1, 13)
(1, 14)
(1, 15)
(1, 15)
(1, 16)
(1, 16)
(1, 16)
(7, 17)
(3, 16)
(5, 16)
(5, 16)
(3, 15)
(2, 14)
(1, 14)
(1, 14)
(1, 13)
(1, 13)
(1, 12)
(1, 12)
(1, 12)
(1, 12)
(1, 13)
(1, 13)
(1, 12)
(1, 12)
(1, 11)
(1, 12)
(1, 12)
(1, 12)
(1, 13)
(1, 14)
(1, 14)
(1, 14)
(1, 15)
(1, 14)
(2, 15)
(2, 15)
(2, 15)
(2, 14)
(2, 14)
(1, 14)
(2, 15)
(2, 15)
(3, 16)
(2, 16)
(1, 16)
(1, 16)
(2, 16)
(2, 16)
(1, 16)
(1, 16)
(1, 16)
(7, 17)
(9, 17)
(6, 17)
(8, 17)
(8, 17)
(2, 16)
(2, 16)
(2, 16)
(2, 15)
(1, 15)
(2, 14)
(1, 13)
(1, 12)
(1, 11)
(1, 11)
(1, 11)
(1, 11)
(1, 11)
(1, 11)
(1, 11)
(1, 11)
(1, 10)
(1, 11)
(1, 11)
(1, 11)
(1, 10)
(1, 11)
(1, 11)
(1, 12)
(1, 12)
(1, 12)
(2, 12)
(2, 12)
(2, 12)
(2, 11)
(2, 11)
(2, 11)
(2, 10)
(1, 9)
(1, 9)
(1, 8)
(1, 8)
(1, 7)
(1, 7)
(1, 6)
(1, 5)
(1, 4)
(1, 4)
(2, 4)
(1, 4)
(2, 3)
(7, 3)
(5, 4)
(4, 5)
(3, 6)
(1, 6)
(1, 7)
(1, 8)
(1, 9)
(1, 9)
(1, 9)
(1, 9)
(1, 10)
(1, 10)
(1, 10)
(1, 10)
(1, 10)

In [None]:
import numpy as np
import pandas as pd
from collections import Counter

# 模擬四組樣本
# 先驗：完整四維樣本（40筆）
prior_samples = pd.DataFrame(np.random.randint(0, 2, size=(40, 4)), columns=["x1", "x2", "x3", "x4"])

# 觀測一：p(x1,x2,x3) (100筆)
obs_123 = pd.DataFrame(np.random.randint(0, 2, size=(100, 3)), columns=["x1", "x2", "x3"])

# 觀測二：p(x2,x3,x4) (80筆)
obs_234 = pd.DataFrame(np.random.randint(0, 2, size=(80, 3)), columns=["x2", "x3", "x4"])

# 觀測三：p(x1,x3,x4) (60筆)
obs_134 = pd.DataFrame(np.random.randint(0, 2, size=(60, 3)), columns=["x1", "x3", "x4"])

# 統計出觀測分布（作為後驗修正依據）
def empirical_prob(samples, cols):
    count = Counter([tuple(row) for row in samples[cols].values])
    total = sum(count.values())
    return {k: v / total for k, v in count.items()}

p_obs_123 = empirical_prob(obs_123, ["x1", "x2", "x3"])
p_obs_234 = empirical_prob(obs_234, ["x2", "x3", "x4"])
p_obs_134 = empirical_prob(obs_134, ["x1", "x3", "x4"])

# 初始樣本統計分布（先驗）
prior_list = [tuple(row) for row in prior_samples[["x1", "x2", "x3", "x4"]].values]

# 先驗邊際（p123）
prior_123 = Counter([(x1, x2, x3) for (x1, x2, x3, x4) in prior_list])
prior_234 = Counter([(x2, x3, x4) for (x1, x2, x3, x4) in prior_list])
prior_134 = Counter([(x1, x3, x4) for (x1, x2, x3, x4) in prior_list])
total_prior = len(prior_list)

# 對每一筆樣本做連乘權重更新（依序套用三次貝氏加權）
weights = []
for sample in prior_list:
    x1, x2, x3, x4 = sample

    # 取得對應子分布機率
    p1 = p_obs_123.get((x1, x2, x3), 1e-6) / (prior_123.get((x1, x2, x3), 1e-6) / total_prior)
    p2 = p_obs_234.get((x2, x3, x4), 1e-6) / (prior_234.get((x2, x3, x4), 1e-6) / total_prior)
    p3 = p_obs_134.get((x1, x3, x4), 1e-6) / (prior_134.get((x1, x3, x4), 1e-6) / total_prior)

    weights.append(p1 * p2 * p3)

# 權重正規化
weights = np.array(weights)
weights /= weights.sum()

# 統計後驗分布
posterior = {}
for i, sample in enumerate(prior_list):
    posterior[sample] = posterior.get(sample, 0) + weights[i]

# 整理為 DataFrame
df_post = pd.DataFrame([(*k, v) for k, v in posterior.items()], columns=["x1", "x2", "x3", "x4", "prob"])
df_post = df_post.sort_values("prob", ascending=False).reset_index(drop=True)
