このファイルについて
- about
    - 関越道（road_code 1800）および東北道（road_code 1040）を経路に含む検索クエリを抽出し、その件数を計算する

In [1]:
import os
import time
import tqdm
import datetime as dt
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cudf

import warnings
warnings.simplefilter('ignore')

In [3]:
# Road to analyse; must be one of the keys of ROAD_CODE_BY_NAME.
TARGET_ROAD = 'kannetsu'

# Road name -> road_code.
ROAD_CODE_BY_NAME = {
    'kannetsu': '1800',
    'touhoku': '1040',
    'tateyama': '1130',
}

# Fail fast on an unknown road name instead of silently leaving
# TARGET_ROAD_CODE undefined (which only surfaces later as a NameError).
if TARGET_ROAD not in ROAD_CODE_BY_NAME:
    raise ValueError(
        f'unknown TARGET_ROAD {TARGET_ROAD!r}; expected one of {sorted(ROAD_CODE_BY_NAME)}'
    )
TARGET_ROAD_CODE = ROAD_CODE_BY_NAME[TARGET_ROAD]

# Root directory of the preprocessed input data
DATA_DIR = '../../Input_processed_data'

# IC master and road (IC network) CSV files
IC_CSV = f'{DATA_DIR}/road_master/ic_preprocessed.csv'
IC_NET_CSV = f'{DATA_DIR}/road_master/220303-doronet_ic.csv'
IC_NET_SUB_CSV = f'{DATA_DIR}/road_master/tateyama_kannetsu_icnet.csv'


# Search log CSV files
def SEARCH_LOG_DIR(month):
    '''Return the directory holding the search logs of ``month`` (a 'YYYYMM' string).'''
    return f'{DATA_DIR}/search_records/csv{month}'


def SEARCH_LOG_CSV(date):
    '''Return the path of the search log CSV for ``date`` (a 'YYYYMMDD' string).'''
    # The first six characters of the date select the monthly directory.
    return f'{SEARCH_LOG_DIR(date[:6])}/record_{date}.csv'

## preparing

In [4]:
# The CSV files below were already preprocessed inside the project module.
# Every *_code column is read as a string.
df_ic = pd.read_csv(IC_CSV, dtype={'ic_code': str})
df_icnet, sub_icnet = (
    pd.read_csv(csv_path, dtype={'start_code': str, 'end_code': str, 'road_code': str})
    for csv_path in (IC_NET_CSV, IC_NET_SUB_CSV)
)

# Lookup tables between IC code and IC name.
# NOTE(review): name2code keeps only the last code for a repeated IC name — confirm names are unique.
code2name = {code: name for code, name in zip(df_ic['ic_code'], df_ic['ic_name'])}
name2code = dict(zip(code2name.values(), code2name.keys()))

# Directed graph of IC-to-IC segments; each edge carries distance, road_code and direction.
ic_graph = nx.from_pandas_edgelist(
    df_icnet,
    'start_code',
    'end_code',
    edge_attr=['distance', 'road_code', 'direction'],
    create_using=nx.DiGraph(),
)

In [5]:
# Build the per-segment speed-limit table and a (start_code, end_code) -> limit map.
df_limits = sub_icnet[['start_code', 'end_code', 'start_name', 'end_name', 'road_code', 'limit']]

limit_dict = dict(
    zip(
        zip(df_limits['start_code'], df_limits['end_code']),
        df_limits['limit'],
    )
)

In [6]:
# cuDF copies of the IC network table and the speed-limit table.
# NOTE(review): neither name is referenced in the cells visible here — confirm they are still needed.
cudf_icnet = cudf.from_pandas(df_icnet)
cudf_limits = cudf.from_pandas(df_limits)

In [7]:
def str2time(time_str, format='%H:%M'):
    '''
    Convert a time-of-day string into a ``datetime.time`` object.

    Parameters
    ----------
    time_str: str
        Time of day as text, e.g. '07:00'.
    format: str
        ``strptime`` format that ``time_str`` follows.

    Returns
    -------
    time: datetime.time
        The parsed time of day (the date part produced by ``strptime`` is dropped).
    '''
    return dt.datetime.strptime(time_str, format).time()

In [8]:
def get_route(src_name, target_name):
    '''
    Look up the precomputed shortest path between two ICs given by name.

    IC names are translated to IC codes with ``name2code`` and the path is
    read from ``PATH_DICT`` (both are module-level objects).

    Parameters
    ----------
    src_name: str
        Name of the departure IC.
    target_name: str
        Name of the destination IC.

    Returns
    -------
    path: List[str] or None
        IC codes along the route, or None when either name is unknown,
        the departure IC has no entry in ``PATH_DICT``, or no path to the
        destination is stored for it.
    '''
    # Either IC name is missing from the IC master
    if src_name not in name2code or target_name not in name2code:
        return None

    src = name2code[src_name]
    target = name2code[target_name]

    # Only the "no such node / no such path" cases are mapped to None.
    # Any other problem (e.g. PATH_DICT not being built yet) is raised
    # instead of being hidden by a bare ``except``.
    return PATH_DICT.get(src, {}).get(target)


def get_route_with_time(src_name, target_name, departure_time='12:00', arrival_time=None, spec_date=None):
    '''
    Get the route from ``src_name`` to ``target_name`` together with the
    expected time at which each IC on the route is passed.

    Parameters
    ----------
    src_name: str
        Name of the departure IC.
    target_name: str
        Name of the destination IC.
    departure_time: str or datetime.time
        Departure time ('%H:%M' when given as a string). Used only when
        ``arrival_time`` is not given.
    arrival_time: str or datetime.time
        Arrival time ('%H:%M' when given as a string). When given, the times
        are computed backwards from the destination.
    spec_date: str or datetime.date
        Date the given time refers to ('%Y-%m-%d' when given as a string).
        Defaults to today's date, evaluated at call time.

    Returns
    -------
    path: List[str]
        IC codes along the route.
    time_list: List[datetime.datetime]
        Expected passing time of each IC in ``path`` (same order as ``path``).

    ``(None, None)`` is returned when no route is found or when a time string
    cannot be parsed.
    '''
    # Segments without an entry in limit_dict (roads other than the
    # Kan-Etsu / Tateyama expressways) are assumed to be driven at 80 km/h
    DEFAULT_SPEED = 80

    path = get_route(src_name, target_name)

    # No route available
    if path is None:
        return (None, None)

    try:
        if isinstance(departure_time, str):
            departure_time = str2time(departure_time)
        if isinstance(arrival_time, str):
            arrival_time = str2time(arrival_time)
    except (ValueError, TypeError):
        return (None, None)

    # Resolve the default here: a ``dt.date.today()`` default in the signature
    # is evaluated once at definition time and goes stale in a long-lived kernel.
    if spec_date is None:
        spec_date = dt.date.today()
    elif isinstance(spec_date, str):
        spec_date = dt.datetime.strptime(spec_date, '%Y-%m-%d').date()

    # Cumulative travel time from the departure IC to each IC on the path
    elapsed = dt.timedelta()
    elapsed_time_list = [elapsed]

    for s, t in zip(path, path[1:]):
        dist = ic_graph[s][t]['distance']
        limit_speed = limit_dict.get((s, t), DEFAULT_SPEED)

        # Travel time of the segment s -> t
        elapsed += dt.timedelta(hours=dist / limit_speed)
        elapsed_time_list.append(elapsed)

    if arrival_time is not None:
        spec_datetime = dt.datetime.combine(spec_date, arrival_time)
        # Time at IC i = arrival time - remaining travel time from i to the goal.
        # Reversing the cumulative list is only right when the segment
        # durations are symmetric, so compute the remaining time explicitly.
        total = elapsed_time_list[-1]
        time_list = [spec_datetime - (total - td) for td in elapsed_time_list]
    else:
        spec_datetime = dt.datetime.combine(spec_date, departure_time)
        time_list = [spec_datetime + td for td in elapsed_time_list]

    return path, time_list

In [9]:
# Shortest paths (using the 'distance' edge attribute as weight) between every pair of
# nodes in ic_graph. get_route reads a path from it as PATH_DICT[src_code][target_code].
PATH_DICT = dict(nx.all_pairs_dijkstra_path(ic_graph, weight='distance'))

In [10]:
def get_log(date):
    '''
    Load the search log of one day as a cuDF DataFrame.

    Parameters
    ----------
    date: str
        Target day, passed to ``SEARCH_LOG_CSV`` to build the CSV path.

    Returns
    -------
    df: cudf.DataFrame or None
        The log read from the CSV, or None when the file does not exist.
    '''
    csv_path = SEARCH_LOG_CSV(date)
    if os.path.exists(csv_path):
        # These columns are read as strings
        str_columns = ('start_code', 'end_code', 'via1_code', 'via2_code', 'via3_code', 'order')
        log_pd = pd.read_csv(csv_path, dtype=dict.fromkeys(str_columns, str))
        return cudf.from_pandas(log_pd)
    return None


def get_simple_log(date):
    '''
    Load the search log of one day, reduced to the query time and the IC names.

    Parameters
    ----------
    date: str
        Target day, forwarded to ``get_log``.

    Returns
    -------
    df: DataFrame or None
        The 'date', 'start_name' and 'end_name' columns of the day's log,
        or None when ``get_log`` finds no log for the day.
    '''
    SIMPLE_SEARCH_QUERY_COL_LIST = ['date', 'start_name', 'end_name']

    df = get_log(date)
    # get_log returns None for a missing CSV; propagate that instead of
    # failing with AttributeError on ``None.loc``.
    if df is None:
        return None
    return df.loc[:, SIMPLE_SEARCH_QUERY_COL_LIST]

In [17]:
# Sanity check: load the full search log of one day and show its shape and first rows.
df = get_log('20230901')
print(df.shape)
df.head()

(373484, 19)


Unnamed: 0,date,start_code,start_name,end_code,end_name,via1_code,via1_name,via2_code,via2_name,via3_code,via3_name,spec_day,spec_time,spec_type,order,car_type,use_nexco,use_urban,use_local
0,2023/09/01 00:00:00,6001006,宝町,1400091,那珂,,,,,,,2023-09-01,07:00,1,2,3,1,1,1
1,2023/09/01 00:00:00,1461080,太田桐生,1040011,浦和（東京方面）,,,,,,,2023-08-31,08:00,1,3,2,1,1,1
2,2023/09/01 00:00:00,1072023,大和まほろばスマート,1612046,須崎東,,,,,,,2023-10-21,15:00,1,2,2,1,1,1
3,2023/09/01 00:00:01,1461080,太田桐生,212B011,大宮,,,,,,,2023-09-01,00:00,1,2,2,1,1,1
4,2023/09/01 00:00:01,1800076,高崎,1040241,仙台宮城,,,,,,,2023-08-31,23:50,1,2,2,1,1,1


In [18]:
# Sanity check: same day, reduced to the date / start_name / end_name columns.
# NOTE: this rebinds `df`, replacing the full log loaded in the previous cell.
df = get_simple_log('20230901')
print(df.shape)
df.head()

(373484, 3)


Unnamed: 0,date,start_name,end_name
0,2023/09/01 00:00:00,宝町,那珂
1,2023/09/01 00:00:00,太田桐生,浦和（東京方面）
2,2023/09/01 00:00:00,大和まほろばスマート,須崎東
3,2023/09/01 00:00:01,太田桐生,大宮
4,2023/09/01 00:00:01,高崎,仙台宮城


## main

In [40]:
def _path_uses_road(path, target_road_code_set):
    '''Return True if any consecutive IC pair of ``path`` is an ic_graph edge whose road_code is in the set.'''
    return any(
        ic_graph[start_code][end_code]['road_code'] in target_road_code_set
        for start_code, end_code in zip(path, path[1:])
    )


def extract_road_related_queries(df_log: pd.DataFrame, target_road_code_set: set):
    '''
    Keep only the search queries whose shortest route uses one of the target roads.

    For every row the route between 'start_name' and 'end_name' is obtained
    with ``get_route``; the row is kept when at least one segment of that
    route has a ``road_code`` contained in ``target_road_code_set``.

    Parameters
    ----------
    df_log: DataFrame
        Search log with (at least) the columns 'start_name' and 'end_name'.
    target_road_code_set: set
        Road codes of interest.

    Returns
    -------
    related_df_log: DataFrame
        The matching rows of ``df_log`` in their original order, with a
        fresh 0..n-1 index.
    '''
    # Select the two name columns by label: positional access (record[[2, 4]])
    # silently reads the wrong fields as soon as the column order changes.
    od_pairs = df_log[['start_name', 'end_name']].to_numpy()

    # Many queries share the same origin/destination, so decide once per pair.
    related_by_od = {}
    query_indices = []

    for i_query, (start_name, end_name) in enumerate(od_pairs):
        od_key = (start_name, end_name)
        is_related = related_by_od.get(od_key)
        if is_related is None:
            path = get_route(start_name, end_name)
            # path is None when the shortest path cannot be calculated
            is_related = path is not None and _path_uses_road(path, target_road_code_set)
            related_by_od[od_key] = is_related

        if is_related:
            query_indices.append(i_query)

    related_df_log = df_log.iloc[query_indices].reset_index(drop=True)
    return related_df_log

In [32]:
# Analysis periods as (first day, last day) pairs in 'YYYYMMDD' form: one block per
# calendar quarter, from 2021Q2 (April-June 2021) through 2023Q3 (July-September 2023).
PERIOD_BLOCKS = [
    (quarter.start_time.strftime('%Y%m%d'), quarter.end_time.strftime('%Y%m%d'))
    for quarter in pd.period_range('2021Q2', '2023Q3', freq='Q')
]

In [42]:
# Road codes whose queries are extracted: '1800' (kannetsu) and '1040' (touhoku),
# matching the road-name/code pairs of the configuration cell.
# NOTE(review): hard-coded rather than derived from TARGET_ROAD_CODE — confirm both roads are intended.
target_road_code_set = {'1800', '1040'}

In [43]:
# Walk through the period blocks day by day and extract the queries whose
# route uses one of the target roads.
for start_date, end_date in PERIOD_BLOCKS:
    date_list = [d.strftime('%Y%m%d') for d in pd.date_range(start_date, end_date, freq='1D')]

    for target_date in date_list:
        print('='*20, target_date, '='*20)

        df_log = get_log(target_date)
        # get_log returns None when no CSV exists for the day; skip the day
        # instead of crashing inside extract_road_related_queries.
        if df_log is None:
            print(f'{target_date} | no search log found, skipped')
            continue

        s = time.time()
        df_related_log = extract_road_related_queries(df_log, target_road_code_set)

        print(f'{target_date} | # of related queries: {len(df_related_log)} ({time.time() - s:.2f} [sec])')
        # NOTE(review): trial run — stop after the first day that was processed.
        # Remove this break (and the one below) to process every day of every block.
        break
    break

20210401 | # of related queries: 64545 (14.59 [sec])


In [44]:
# Shapes of the full log and of the extracted subset left over from the loop above.
df_log.shape, df_related_log.shape

((217648, 19), (64545, 19))

In [45]:
# First rows of the extracted (target-road related) queries.
df_related_log.head()

Unnamed: 0,date,start_code,start_name,end_code,end_name,via1_code,via1_name,via2_code,via2_name,via3_code,via3_name,spec_day,spec_time,spec_type,order,car_type,use_nexco,use_urban,use_local
0,2021/04/01 00:00:01,6016021,扇大橋,1461150,宇都宮上三川,,,,,,,2021-03-31,06:00,1,2,2,1,1,1
1,2021/04/01 00:00:06,214K106,五霞,1010066,駒門ＰＡ,,,,,,,2021-04-01,00:00,1,2,2,1,1,1
2,2021/04/01 00:00:06,6016021,扇大橋,1461150,宇都宮上三川,,,,,,,2021-03-31,07:00,1,2,2,1,1,1
3,2021/04/01 00:00:10,1040263,三本木ＰＡ,1010046,大井松田,,,,,,,2021-04-01,00:00,1,2,2,1,1,1
4,2021/04/01 00:00:14,1040366,盛岡,5057020,筑穂,,,,,,,2021-04-01,00:00,1,2,2,1,1,1
