# Spatial Analysis V3

- 討論：youbike 目前都負面的，但是什麼情況下他是有幫助的
- 納入時間軸，某些時段不要開放youbike (拆成離峰和尖峰時段)
- 篩選到**市區道路**後再帶入模型
- 特徵考慮：速差、youbike空間滯後、時間軸

In [None]:
import os

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
analyze_path = os.path.join(parent_dir, "utils")

os.chdir(analyze_path)

import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from utils import get_grid, read_data, read_taiwan_specific
from utils_macro import LocalMoranAnalysis, GetisOrdGiAnalysis

combined_data = read_data()
taiwan, grid_filter = read_taiwan_specific()

In [None]:
hex_grid = pd.read_csv("../ComputedDataV2/Grid/hex_gridV1.csv")
from shapely import wkt
hex_grid['geometry'] = hex_grid['geometry'].apply(wkt.loads)
hex_grid = gpd.GeoDataFrame(hex_grid, geometry='geometry', crs='EPSG:3826')

youbike架設區：台北市、新北市、桃園市、新竹市、新竹縣、苗栗縣、台中市、嘉義市、嘉義縣、台南市、高雄市、屏東縣、台東縣、花蓮縣以及新竹科學園區

In [None]:
import geopandas as gpd

# https://www.youbike.com.tw/region/ntpc/stations/list/
# 看要以縣市或區域為單位篩選
youbike_counties = [
    '臺北市', '台北市', '新北市', '桃園市', 
    '新竹市', '新竹縣', '苗栗縣', 
    '臺中市', '台中市', '嘉義市', '嘉義縣', 
    '臺南市', '台南市', '高雄市', '屏東縣', 
    '臺東縣', '台東縣', '花蓮縣'
]

target_towns = [
    # 台北
    '大安區','大同區', '士林區', '文山區', '中正區', '中山區',
    '內湖區','北投區','松山區','南港區','信義區','萬華區','臺大公館校區',
    # 新北
    '八里區', '三芝區', '三重區', '三峽區', '土城區', '中和區', '五股區',
    '永和區', '石門區', '石碇區', '平溪區', '汐止區', '金山區', '林口區',
    '坪林區', '板橋區', '泰山區', '烏來區', '貢寮區', '淡水區', '深坑區',
    '萬里區', '瑞芳區', '新店區', '新莊區', '樹林區', '雙溪區', '蘆洲區', '鶯歌區',
    # 桃園
    '八德區','大園區','大溪區','中壢區','平鎮區','桃園區',
    '復興區', '新屋區','楊梅區','龍潭區','龜山區','蘆竹區', '觀音區',
    # 新竹縣
    '竹北市', '竹東鎮', '湖口鄉', '新豐鄉', '新埔鎮','芎林鄉','寶山鄉',
    # 新竹市
    '東區', '北區', '香山區',
    # 新竹科學園區
    '新竹科學園區',
    # 苗栗縣
    '三義鄉', '三灣鄉', '公館鄉', '竹南鎮', '南庄鄉', '後龍鎮',
    '苑裡鎮', '苗栗市', '通霄鎮', '造橋鄉', '銅鑼鄉', '頭份市', '頭屋鄉',
    # 台中市
    '中區', '東區', '西區', '南區', '北區', '西屯區',
    '南屯區', '北屯區', '豐原區', '大里區', '太平區',
    '清水區', '沙鹿區', '大甲區', '東勢區', '梧棲區',
    '烏日區', '神岡區', '大肚區', '大雅區', '后里區',
    '霧峰區', '潭子區', '龍井區', '外埔區', '和平區',
    '石岡區', '大安區', '新社區',
    # 嘉義縣
    '太保市', '朴子市', '民雄鄉', '中埔鄉', '水上鄉',
    # 嘉義市
    '西區', '東區',
    # 台南市
    '七股區', '下營區', '大內區', '山上區', '中西區',
    '仁德區', '六甲區', '北門區', '北區', '左鎮區',
    '永康區', '玉井區', '白河區', '安平區', '安定區',
    '安南區', '西港區', '佳里區', '官田區', '東山區',
    '東區', '南化區','南區', '後壁區', '柳營區',
    '將軍區', '麻豆區', '善化區', '新化區', '新市區',
    '新營區', '楠西區', '學甲區', '龍崎區', '歸仁區', '關廟區', '鹽水區',
    # 高雄
    '新興區', '苓雅區', '三民區', '鹽埕區', '前金區', '鳳山區',
    '左營區', '前鎮區', '鼓山區', '楠梓區', '旗津區', '小港區',
    '梓官區', '仁武區', '林園區', '岡山區', '茄萣區', '鳥松區', '大寮區',
    '橋頭區', '湖內區', '大社區', '彌陀區','路竹區', '阿蓮區',
    '大樹區', '永安區', '燕巢區', '旗山區', '美濃區', '甲仙區',
    # 屏東
    '屏東市', '東港鎮', '恆春鎮', '潮州鎮',
    '內埔鄉', '竹田鄉', '車城鄉', '佳冬鄉', '來義鄉',
    '枋寮鄉', '林邊鄉', '南州鄉', '崁頂鄉', '新園鄉', '麟洛鄉',
    # 台東
    '臺東市', '卑南鄉', '鹿野鄉', '關山鎮',
    '池上鄉', '成功鎮', '東河鄉',
    # 花蓮
    '光復鄉'
]


hex_grid_centroid = hex_grid.copy()
hex_grid_centroid['geometry'] = hex_grid_centroid.geometry.centroid

In [None]:
joined = gpd.sjoin(
    hex_grid_centroid, 
    taiwan[['COUNTYNAME', 'TOWNNAME', 'geometry']], 
    how='left', 
    predicate='intersects'
)

joined['geometry'] = hex_grid['geometry']
joined = joined.set_geometry('geometry') 
joined = joined[~joined.index.duplicated(keep='first')]

In [None]:
hex_grid_youbike = joined[joined['TOWNNAME'].isin(target_towns)].copy()

In [None]:
len(hex_grid_youbike['TOWNNAME'].value_counts()) == len(target_towns)
# len(hex_grid_youbike['TOWNNAME'].value_counts()) == len(target_towns)
# 台大公館位於大安區與部分中正區所以已經包含
# 為了補足竹科的範圍則新增了寶山鄉
set(target_towns) - set(hex_grid_youbike['TOWNNAME'].value_counts().index)

In [None]:
hex_grid_youbike

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from esda.getisord import G_Local
from libpysal.weights import Queen, DistanceBand, KNN
import matplotlib.colors as mcolors

class GetisOrdGiAnalysis:
    def __init__(self, grid, taiwan):
        self.grid = grid
        self.taiwan = taiwan

    def calculate_gi(self, best_distance, adjacency=None):
        """
        best_distance: when adjacency is 'knn', it indicates the number of neighbors k, if adjacency is 'distance', it indicates the distance threshold
        grid: GeoDataFrame with 'num_accidents' column
        adjacency: 'knn', 'queen', or 'distance'

        Returns: GeoDataFrame with 'GiZScore' column added
        """
        self.grid = self.grid.reset_index(drop=True)

        self.grid['centroid'] = self.grid.geometry.centroid
        coords = np.vstack((self.grid['centroid'].x, self.grid['centroid'].y)).T
        # coords = np.array(list(zip(centroids.x, centroids.y)))

        if adjacency=='knn':
            w = KNN.from_array(coords, k=best_distance)
        elif adjacency=='queen':
            w = Queen.from_dataframe(self.grid)
        else:
            w = DistanceBand(coords, threshold=best_distance, binary=True, silence_warnings=True)

        if w.islands:
            self.grid = self.grid.drop(index=w.islands)
            return self.calculate_gi(best_distance, adjacency) 

        y = self.grid['num_accidents'].astype(np.float64).values
        g_local = G_Local(y, w, transform='R', star=True)
        self.grid['GiZScore'] = g_local.Zs
        self.grid['GiPValue'] = g_local.p_sim

        self.grid['hotspot'] = 'Not Significant'
        self.grid.loc[(self.grid['GiPValue'] < 0.01) & (self.grid['GiZScore'] > 0), 'hotspot'] = 'Hotspot 99%'
        self.grid.loc[(self.grid['GiPValue'] < 0.05) & (self.grid['GiZScore'] > 0) & (self.grid['hotspot'] == 'Not Significant'), 'hotspot'] = 'Hotspot 95%'
        self.grid.loc[(self.grid['GiPValue'] < 0.10) & (self.grid['GiZScore'] > 0) & (self.grid['hotspot'] == 'Not Significant'), 'hotspot'] = 'Hotspot 90%'
        self.grid.loc[(self.grid['GiPValue'] < 0.01) & (self.grid['GiZScore'] < 0), 'hotspot'] = 'Coldspot 99%'
        self.grid.loc[(self.grid['GiPValue'] < 0.05) & (self.grid['GiZScore'] < 0) & (self.grid['hotspot'] == 'Not Significant'), 'hotspot'] = 'Coldspot 95%'
        self.grid.loc[(self.grid['GiPValue'] < 0.10) & (self.grid['GiZScore'] < 0) & (self.grid['hotspot'] == 'Not Significant'), 'hotspot'] = 'Coldspot 90%'

        return self.grid

    def plot_gi_map(self):
        color_dict = {
            'Hotspot 99%': '#800026',
            'Hotspot 95%': '#FC4E2A',
            'Hotspot 90%': '#FD8D3C',
            'Not Significant': '#d9d9d9',
            'Coldspot 90%': '#6baed6',
            'Coldspot 95%': '#3182bd',
            'Coldspot 99%': '#08519c'
        }

        categories = [
            'Hotspot 99%', 'Hotspot 95%', 'Hotspot 90%', 
            'Not Significant', 
            'Coldspot 90%', 'Coldspot 95%', 'Coldspot 99%'
        ]

        grid = self.grid.to_crs(epsg=4326)

        fig, ax = plt.subplots(figsize=(10, 12))
        self.taiwan.to_crs(epsg=4326).plot(ax=ax, color='white', edgecolor='black', linewidth=0.5)

        grid.plot(
            column='hotspot', 
            categorical=True, 
            cmap=mcolors.ListedColormap([color_dict[cat] for cat in categories if cat in grid['hotspot'].unique()]),
            legend=True, 
            edgecolor='white',
            linewidth=0.01, 
            alpha=0.8,
            ax=ax,
            categories=categories,
            legend_kwds={
                'bbox_to_anchor': (1.05, 1),
                'loc': 'upper left',
                'frameon': False
            }
        )

        plt.title('Hotspot Analysis (Getis-Ord Gi*) - 90%, 95%, 99% Confidence Levels')
        plt.axis('off')
        plt.show()


In [None]:
go = GetisOrdGiAnalysis(hex_grid_youbike, taiwan)
go.calculate_gi(best_distance=6, adjacency='knn')
go.plot_gi_map()

In [None]:
go.grid.to_csv('../ComputedDataV7/Grid/grid_data_區級篩選.csv')

In [120]:
import ast
from shapely import wkt
grid_filter = pd.read_csv('../ComputedDataV7/Grid/grid_data_區級篩選.csv')
grid_filter['accident_indices'] = grid_filter['accident_indices'].apply(ast.literal_eval)
grid_filter = grid_filter[grid_filter['num_accidents'] > 0]
grid_filter

Unnamed: 0.1,Unnamed: 0,geometry,num_accidents,accident_indices,index_right,COUNTYNAME,TOWNNAME,centroid,GiZScore,GiPValue,hotspot
73,73,POLYGON ((151342.4826740732 2554495.4605936077...,1.0,[23624],347.0,臺南市,七股區,POINT (151240.0278324099 2554496.136306807),-0.173213,0.001,Coldspot 99%
89,89,POLYGON ((151489.2226439283 2553439.4605828677...,1.0,[62700],347.0,臺南市,七股區,POINT (151386.76061068074 2553440.1350281034),-0.173213,0.001,Coldspot 99%
346,346,POLYGON ((152718.76532160613 2553431.421766879...,1.0,[223810],347.0,臺南市,七股區,POINT (152616.30353819474 2553432.0877956925),-0.173213,0.001,Coldspot 99%
782,782,"POLYGON ((153960.6140684128 2555341.635098908,...",1.0,[468309],347.0,臺南市,七股區,POINT (153858.16566619073 2555342.2930935216),-0.173213,0.001,Coldspot 99%
917,917,POLYGON ((154258.14271774096 2553805.144946863...,1.0,[349002],347.0,臺南市,七股區,POINT (154155.6838683571 2553805.800531512),-0.173213,0.001,Coldspot 99%
...,...,...,...,...,...,...,...,...,...,...,...
450865,450865,POLYGON ((350677.7336311359 2767941.9201648613...,1.0,[539920],303.0,新北市,貢寮區,POINT (350576.79699343245 2767941.17733529),-0.173213,0.001,Coldspot 99%
450886,450886,POLYGON ((350980.54368653294 2767944.153064321...,1.0,[521212],303.0,新北市,貢寮區,POINT (350879.60698927194 2767943.407998718),-0.149802,0.359,Not Significant
450893,450893,"POLYGON ((351135.496103809 2767465.610973455, ...",2.0,"[624010, 648156]",303.0,新北市,貢寮區,POINT (351034.5558350338 2767464.8648845125),-0.149802,0.001,Coldspot 99%
450899,450899,POLYGON ((351290.45861353533 2766987.070545973...,1.0,[359176],303.0,新北市,貢寮區,POINT (351189.51477376075 2766986.323433991),-0.149802,0.359,Not Significant


In [77]:
select_group = [
    '路面狀況-路面鋪裝名稱', '路面狀況-路面狀態名稱', '路面狀況-路面缺陷名稱',
    '道路障礙-障礙物名稱', '道路障礙-視距品質名稱', '道路障礙-視距名稱',
    '號誌-號誌種類名稱', '號誌-號誌動作名稱',
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    '車輛撞擊部位大類別名稱-最初',
    '事故類型及型態大類別名稱', '車道劃分設施-分向設施大類別名稱',
    '道路型態大類別名稱',
    '速限-第1當事者', '道路類別-第1當事者-名稱',
    ]

def extract_features(grid, combined_data, select_group, rows):

    indices = grid['accident_indices'].iloc[rows] # return list of original data index
    sample = combined_data.iloc[indices]
    sample = sample[select_group]

    cat_cols = sample.select_dtypes(include='object').columns
    num_cols = sample.select_dtypes(include='number').columns

    cat_features = []
    num_features = []
    # for categorical features
    if len(cat_cols) > 0:
        for col in cat_cols:
            vc = sample[col].value_counts(normalize=True)
            vc.index = [f"{col}_{v}" for v in vc.index]
            cat_features.append(vc)
        cat_features = pd.concat(cat_features)
    else:
        cat_features = pd.Series(dtype='float64')
    # for numerical features
    if len(num_cols) > 0:
        num_features = sample[num_cols].mean()
        num_features.index = [f"{col}_mean" for col in num_features.index]
    else:
        num_features = pd.Series(dtype='float64')

    all_features = pd.concat([cat_features, num_features])
    all_features_df = all_features.to_frame().T

    return all_features_df

In [78]:
# from utils_model import extract_features

all_features_list = []

for rows in range(grid_filter.shape[0]):
    features = extract_features(grid_filter, combined_data, select_group, rows)
    all_features_list.append(features)

all_features_df = pd.concat(all_features_list, ignore_index=True)

mean_cols = [c for c in all_features_df.columns if '_mean' in c]
prop_cols = [c for c in all_features_df.columns if c not in mean_cols]

all_features_df[prop_cols] = all_features_df[prop_cols].fillna(0)
for col in mean_cols:
    all_features_df[col] = all_features_df[col].fillna(all_features_df[col].mean())

all_features_df['hotspot'] = grid_filter['hotspot'].values

## 增加空間滯後以及速差到最終模型輸入

In [None]:
import libpysal

youbike = pd.read_csv('../ComputedData/Youbike/full_youbike.csv')
youbike_gdf = gpd.GeoDataFrame(youbike, geometry=gpd.points_from_xy(youbike['PositionLon'], youbike['PositionLat']), crs='EPSG:4326')
grid_filter['geometry'] = grid_filter['geometry'].apply(wkt.loads)
grid_filter_gdf = gpd.GeoDataFrame(grid_filter, geometry='geometry')

def calculate_counts(grid, facility_df, name):
    gdf_fac = gpd.GeoDataFrame(
        facility_df, 
        geometry=gpd.points_from_xy(facility_df['PositionLon'], facility_df['PositionLat']), crs=4326).to_crs(epsg=3826)

    join = gpd.sjoin(gdf_fac, grid[['geometry']], how='inner', predicate='intersects')

    return join.groupby('index_right').size().rename(f'num_{name}')

w = libpysal.weights.KNN.from_dataframe(grid_filter_gdf, k=6)

print("Facilities")
grid_filter_gdf = grid_filter_gdf.join(calculate_counts(grid_filter_gdf, youbike_gdf, 'youbike'), how='left').fillna({'num_youbike': 0})

print("Spatial Lag")
grid_filter_gdf['lag_num_youbike'] = libpysal.weights.lag_spatial(w, grid_filter_gdf['num_youbike'])

Facilities
Spatial Lag


 There are 12 disconnected components.
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:3826
Right CRS: None

  join = gpd.sjoin(gdf_fac, grid[['geometry']], how='inner', predicate='intersects')


In [123]:
# read shape
speed_diff = gpd.read_file('../../ST-RTA-GIS/CalculatedData/pairs_annot_all_cities.shp')
speed_diff_gdf = gpd.GeoDataFrame(speed_diff, geometry='geometry')

joined_speed = gpd.sjoin(
    speed_diff_gdf[['geometry']], 
    grid_filter_gdf[['geometry']], 
    how='inner', 
    predicate='intersects'
)

speed_counts = joined_speed.groupby('index_right').size().rename('speed_diff')
grid_filter_gdf = grid_filter_gdf.join(speed_counts, how='left')
grid_filter_gdf['speed_diff'] = grid_filter_gdf['speed_diff'].fillna(0)
grid_filter_gdf['speed_diff'].value_counts()

all_features_df[['num_youbike', 'lag_num_youbike', 'speed_diff']] = grid_filter_gdf[['num_youbike', 'lag_num_youbike', 'speed_diff']].values

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:3826
Right CRS: None

  joined_speed = gpd.sjoin(


In [126]:
all_features_df.to_csv('../ComputedDataV7/ForModel/features.csv', index=False)