這個檔案以 Notion 中的 V3 為中心進行分析，並新增目前認為最正確的模型。

In [13]:
import os

# Build the path of the "utils" directory that sits next to the current
# working directory (i.e. <parent of cwd>/utils).
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
analyze_path = os.path.join(parent_dir, "utils")

# Make that directory the process-wide working directory.
# NOTE(review): the `from utils import ...` lines and the '../'-relative data
# paths used later presumably depend on this working directory — confirm
# before replacing it with a sys.path-based import.
os.chdir(analyze_path)

import libpysal
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from utils import get_grid, read_data, read_taiwan_specific
from utils_macro import LocalMoranAnalysis, GetisOrdGiAnalysis
from utils_model import extract_features

from shapely import wkt
from libpysal.weights import Queen

# Load the shared inputs through the project helpers imported from `utils`.
# `combined_data` is later handed to `extract_features` together with the grid.
# NOTE(review): the exact contents of `taiwan` and `grid_filter` are defined by
# `read_taiwan_specific`, which is not visible here — check the helper.
combined_data = read_data()
taiwan, grid_filter = read_taiwan_specific()

In [14]:
def _as_wgs84_points(table):
    """Wrap ``table`` as a GeoDataFrame of points.

    The points are built from the ``PositionLon`` / ``PositionLat`` columns
    and tagged as EPSG:4326.
    """
    points = gpd.points_from_xy(table['PositionLon'], table['PositionLat'])
    return gpd.GeoDataFrame(table, geometry=points, crs='EPSG:4326')


# Facility tables stored as CSV files.
mrt = pd.read_csv('../ComputedData/MRT/full_mrt.csv')
parking_lot = pd.read_csv('../ComputedData/Parkinglot/full_parkinglot.csv')
youbike = pd.read_csv('../ComputedData/Youbike/full_youbike.csv')

# The same tables as point layers.
mrt_gdf = _as_wgs84_points(mrt)
parking_lot_gdf = _as_wgs84_points(parking_lot)
youbike_gdf = _as_wgs84_points(youbike)

# Shapefile layers; bus stops are the 'bus_stop' feature class of the OSM
# transport layer (copied so later edits do not touch the parent frame).
speed_diff_gdf = gpd.read_file('../../ST-RTA-GIS/CalculatedData/pairs_annot_all_cities.shp')
osm_transport_gdf = gpd.read_file('../../ST-RTA-GIS/Data/road_new.shp/gis_osm_transport_free_1.shp')
is_bus_stop = osm_transport_gdf['fclass'] == 'bus_stop'
bus_stop_gdf = osm_transport_gdf[is_bus_stop].copy()

In [15]:
# Load the grid table (one row per grid cell). Later cells read its
# 'geometry' (WKT text), 'num_accidents' and 'accident_indices' columns.
hex_grid_full = pd.read_csv("../ComputedDataV7/Grid/grid_data_區級篩選.csv")
# Alternative inputs, currently disabled: the "num_peak" and "num_off_peak"
# variants of the same grid file.
# hex_grid_peak = pd.read_csv("../ComputedDataV7/Grid/grid_data_區級篩選_num_peak.csv")
# hex_grid_offpeak = pd.read_csv("../ComputedDataV7/Grid/grid_data_區級篩選_num_off_peak.csv")

In [16]:
# Work on a copy of the loaded table whose WKT strings are parsed into
# shapely geometries, leaving `hex_grid_full` untouched.
hex_grid = hex_grid_full.assign(geometry=hex_grid_full['geometry'].apply(wkt.loads))

# Promote to a GeoDataFrame and label the coordinates as EPSG:3826
# (set_crs only tags the CRS; it does not reproject).
grid_gdf = gpd.GeoDataFrame(hex_grid, geometry='geometry').set_crs(epsg=3826, allow_override=True)

## 所有設施 & lags

In [17]:
def calculate_counts(grid, facility_df, name):
    """Count how many facility points fall on each grid cell.

    Parameters
    ----------
    grid : GeoDataFrame
        Grid cells; only its ``geometry`` column is used for the spatial join.
        Assumed to be in EPSG:3826, the CRS the points are projected to —
        TODO confirm at the call site.
    facility_df : DataFrame
        Table with ``PositionLon`` / ``PositionLat`` columns, interpreted as
        EPSG:4326 coordinates.
    name : str
        Suffix of the returned Series name (``num_<name>``).

    Returns
    -------
    Series
        Number of intersecting facility points, indexed by the matched grid
        index (``index_right``). Cells without any match are absent, so the
        caller has to fill them.
    """
    lonlat_points = gpd.points_from_xy(facility_df['PositionLon'], facility_df['PositionLat'])
    facilities = gpd.GeoDataFrame(facility_df, geometry=lonlat_points, crs=4326)
    facilities_projected = facilities.to_crs(epsg=3826)

    matched = gpd.sjoin(facilities_projected, grid[['geometry']], how='inner', predicate='intersects')
    per_cell = matched.groupby('index_right').size()

    return per_cell.rename(f'num_{name}')

print("Facilities")
# Attach one count column per facility type; cells without a match get 0.
for facility_name, facility_points in (
    ('mrt', mrt_gdf),
    ('parking', parking_lot_gdf),
    ('youbike', youbike_gdf),
):
    facility_counts = calculate_counts(grid_gdf, facility_points, facility_name)
    grid_gdf = grid_gdf.join(facility_counts, how='left').fillna({f'num_{facility_name}': 0})

print("Spatial Lag")
# Queen weights built on the grid index; `w` is reused by later lag computations.
w = Queen.from_dataframe(grid_gdf, use_index=True)
for facility_name in ('mrt', 'parking', 'youbike'):
    grid_gdf[f'lag_num_{facility_name}'] = libpysal.weights.lag_spatial(w, grid_gdf[f'num_{facility_name}'])

def calculate_gdf_counts(grid, gdf_fac, name):
    """Count how many features of an existing GeoDataFrame intersect each grid cell.

    Parameters
    ----------
    grid : GeoDataFrame
        Grid cells; only its ``geometry`` column is used for the spatial join.
        Assumed to be in EPSG:3826 — TODO confirm at the call site.
    gdf_fac : GeoDataFrame
        Feature layer with a CRS set. It is reprojected to EPSG:3826 unless
        its CRS string is already exactly ``"EPSG:3826"``.
    name : str
        Suffix of the returned Series name (``num_<name>``).

    Returns
    -------
    Series
        Number of intersecting features, indexed by the matched grid index
        (``index_right``). Cells without any match are absent.

    Raises
    ------
    ValueError
        If ``gdf_fac`` has no CRS. A layer without a CRS cannot be
        reprojected, so this is reported up front with the layer name
        instead of failing inside ``to_crs``.
    """
    if gdf_fac.crs is None:
        raise ValueError(
            f"Cannot count '{name}' features: the input GeoDataFrame has no CRS. "
            "Assign one with set_crs(...) before calling calculate_gdf_counts."
        )

    # Conservative check: anything not spelled exactly "EPSG:3826" is reprojected.
    if gdf_fac.crs.to_string() != "EPSG:3826":
        gdf_fac = gdf_fac.to_crs(epsg=3826)

    join = gpd.sjoin(gdf_fac, grid[['geometry']], how='inner', predicate='intersects')

    return join.groupby('index_right').size().rename(f'num_{name}')

print("Facilities (New SHP)")
# Attach one count column per shapefile layer; cells without a match get 0.
for layer_name, layer_gdf in (
    ('speed_diff', speed_diff_gdf),
    ('bus_stop', bus_stop_gdf),
):
    layer_counts = calculate_gdf_counts(grid_gdf, layer_gdf, layer_name)
    grid_gdf = grid_gdf.join(layer_counts, how='left').fillna({f'num_{layer_name}': 0})

print("Spatial Lag (New SHP)")
# Reuses the weights object `w` built for the earlier facility lags.
for layer_name in ('speed_diff', 'bus_stop'):
    grid_gdf[f'lag_num_{layer_name}'] = libpysal.weights.lag_spatial(w, grid_gdf[f'num_{layer_name}'])

Facilities
Spatial Lag


 There are 759 disconnected components.
 There are 35 islands with ids: 14819, 39742, 47224, 55249, 64684, 101938, 110219, 113385, 113386, 147426, 162411, 174776, 191728, 201476, 245333, 255384, 272283, 283803, 289525, 324425, 332090, 346395, 353258, 353344, 357045, 357046, 361494, 369304, 398951, 398952, 416152, 416153, 434639, 439103, 443576.


Facilities (New SHP)
Spatial Lag (New SHP)


In [119]:
# Select the facility count / lag columns plus 'hotspot'.
# NOTE: this is a bare expression that is not the last statement of the cell,
# so the notebook does not display it; it only fails (KeyError) if one of
# these columns is missing from grid_gdf.
grid_gdf[['num_mrt', 'lag_num_mrt',
          'num_parking', 'lag_num_parking',
          'num_youbike', 'lag_num_youbike',
          'num_speed_diff', 'lag_num_speed_diff',
          'num_bus_stop', 'lag_num_bus_stop',
          'hotspot']]

import ast

def _parse_accident_indices(raw):
    """Turn a string-encoded Python literal into its value; pass anything else through."""
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    return raw


# Values read from CSV arrive as text; already-parsed values are left alone,
# so running this cell again is harmless.
grid_gdf['accident_indices'] = grid_gdf['accident_indices'].apply(_parse_accident_indices)

# Keep only the grid cells whose accident list is non-empty.
has_accidents = grid_gdf['accident_indices'].str.len() > 0
grid_filter_with_accidents = grid_gdf.loc[has_accidents]

In [None]:
# Column names passed to `extract_features` as the groups to build features from.
# English glosses of the names are given per line; the strings themselves are
# runtime keys and must stay exactly as written.
select_group = [
    # road surface: pavement type, surface state, surface defect
    '路面狀況-路面鋪裝名稱', '路面狀況-路面狀態名稱', '路面狀況-路面缺陷名稱',
    # road obstacles: obstacle, sight-distance quality, sight distance
    '道路障礙-障礙物名稱', '道路障礙-視距品質名稱', '道路障礙-視距名稱',
    # traffic signal: signal type, signal operation
    '號誌-號誌種類名稱', '號誌-號誌動作名稱',
    # lane division: between fast/general lanes
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    # lane division: between fast and slow lanes, road edge line
    '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    # accident type/pattern (major category), directional divider (major category)
    '事故類型及型態大類別名稱', '車道劃分設施-分向設施大類別名稱',
    # road form (major category)
    '道路型態大類別名稱',
    # speed limit (first party), road class (first party)
    '速限-第1當事者', '道路類別-第1當事者-名稱',
    ]

In [None]:
# Build one feature frame per accident-bearing grid cell (addressed by its
# positional row number), collecting them in a list.
all_features_list = []

for rows in range(grid_filter_with_accidents.shape[0]):
    features = extract_features(grid_filter_with_accidents, combined_data, select_group, rows)
    all_features_list.append(features)

# Concatenate once, after the loop. Doing this inside the loop rebuilt the
# whole frame on every iteration (quadratic work) for the same final result.
all_features_df = pd.concat(all_features_list, ignore_index=True)
# Missing feature values become 0.
all_features_df = all_features_df.fillna(0)

### Fixing

In [120]:
# Facility / lag columns (plus 'hotspot') stripped from the previously saved
# table before the facility columns are merged back in from the grid.
# NOTE(review): 'hotspot' is dropped here but is not in the column list of the
# later merge, so it does not return to the final table — confirm this is intended.
cols = [
    'num_mrt', 'lag_num_mrt', 'num_parking', 'lag_num_parking',
    'num_youbike', 'lag_num_youbike', 'num_speed_diff', 'lag_num_speed_diff',
    'num_bus_stop', 'lag_num_bus_stop', 'hotspot',
]

final_full = pd.read_csv("../ComputedDataV7/ForModel/final_data_full.csv")
# The notebook writes its result back to this same CSV, so on a re-run the file
# already contains COUNTYNAME / TOWNNAME. They are merged in again later, and
# leaving them here would yield suffixed duplicates (COUNTYNAME_x / COUNTYNAME_y).
# errors='ignore' keeps the first run, where they are absent, working.
all_features_df = final_full.drop(columns=cols + ['COUNTYNAME', 'TOWNNAME'], errors='ignore')

In [123]:
# Rows of the raw grid table that recorded at least one accident.
city = hex_grid_full[hex_grid_full['num_accidents'] > 0]

# Expose the grid index as an explicit 'grid_id' column, used as the merge key.
# Guarded so that running this cell a second time does not insert another
# 'index' column and end up with two 'grid_id' columns.
if 'grid_id' not in grid_filter_with_accidents.columns:
    grid_filter_with_accidents = grid_filter_with_accidents.reset_index().rename(columns={'index': 'grid_id'})

In [124]:
# Grid-side columns to attach to the feature table, plus the join key.
grid_side_columns = [
    'num_mrt', 'lag_num_mrt', 'num_parking', 'lag_num_parking',
    'num_youbike', 'lag_num_youbike', 'num_speed_diff', 'lag_num_speed_diff',
    'num_bus_stop', 'lag_num_bus_stop', 'COUNTYNAME', 'TOWNNAME', 'grid_id',
]

# Inner join on 'grid_id': only rows present on both sides are kept.
final_data = all_features_df.merge(
    grid_filter_with_accidents[grid_side_columns],
    on='grid_id',
    how='inner',
)
final_data

Unnamed: 0,路面狀況-路面鋪裝名稱_柏油,路面狀況-路面狀態名稱_乾燥,路面狀況-路面缺陷名稱_無缺陷,道路障礙-障礙物名稱_無障礙物,道路障礙-視距品質名稱_無遮蔽物,道路障礙-視距名稱_良好,號誌-號誌種類名稱_無號誌,號誌-號誌動作名稱_無號誌,車道劃分設施-分道設施-快車道或一般車道間名稱_未繪設車道線,車道劃分設施-分道設施-快慢車道間名稱_未繪設快慢車道分隔線,...,num_parking,lag_num_parking,num_youbike,lag_num_youbike,num_speed_diff,lag_num_speed_diff,num_bus_stop,lag_num_bus_stop,COUNTYNAME,TOWNNAME
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,臺南市,七股區
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,臺南市,七股區
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,臺南市,七股區
3,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,臺南市,七股區
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,臺南市,七股區
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69872,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,新北市,貢寮區
69873,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,新北市,貢寮區
69874,1.0,1.0,0.5,1.0,1.0,1.0,0.5,0.5,1.0,1.0,...,0.0,0.0,0.0,0.0,4.0,2.0,0.0,4.0,新北市,貢寮區
69875,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,新北市,貢寮區


In [125]:
# Write the merged table to disk without the DataFrame index.
# NOTE: this is the same path the "Fixing" cell loads with pd.read_csv, so the
# file that served as input is overwritten by this call.
final_data.to_csv("../ComputedDataV7/ForModel/final_data_full.csv", index=False)