# このファイルについて
- about: ドラぷら検索ログの簡略化
    - カラムの限定
    - コンテスト対象道路（関越道・東北道）を通過しない検索の除外
- author: 松永

In [1]:
import os
import time
import pickle
from typing import Dict, List, Optional, Set
import networkx as nx
import numpy as np
import pandas as pd
import cudf

import warnings
warnings.simplefilter('ignore')

In [2]:
# Root directory of the pre-processed input data.
DATA_DIR = '../../Input_processed_data'

# IC / road network master CSVs.
IC_NET_CSV = f'{DATA_DIR}/road_master/220303-doronet_ic.csv'
IC_SUBNET_CSV = f'{DATA_DIR}/road_master/icnet_sub.csv'


# Search-log CSV locations. These are real functions rather than lambdas
# bound to a name, so they carry a docstring and a meaningful __name__.
def SEARCH_LOG_DIR(month: str) -> str:
    """Return the directory holding the search-log CSVs of `month` ('YYYYMM')."""
    return f'{DATA_DIR}/search_records/csv{month}'


def SEARCH_LOG_CSV(date: str) -> str:
    """Return the search-log CSV path for `date` ('YYYYMMDD').

    The first six characters of `date` select the monthly directory.
    """
    return f'{SEARCH_LOG_DIR(date[:6])}/record_{date}.csv'

# 準備

In [3]:
# These tables are already pre-processed inside the module.
# The code columns are read as text so pandas does not infer a numeric dtype.
_CODE_COLUMNS_AS_STR = {col: str for col in ('start_code', 'end_code', 'road_code')}
df_icnet = pd.read_csv(IC_NET_CSV, dtype=_CODE_COLUMNS_AS_STR)
sub_icnet = pd.read_csv(IC_SUBNET_CSV, dtype=_CODE_COLUMNS_AS_STR)

In [4]:
# Directed IC graph: one edge per (start_code -> end_code) row of df_icnet,
# carrying that row's distance, road_code and direction as edge attributes.
ic_graph = nx.from_pandas_edgelist(
    df_icnet,
    source='start_code',
    target='end_code',
    edge_attr=['distance', 'road_code', 'direction'],
    create_using=nx.DiGraph(),
)

In [5]:
# Every IC code that appears as a node of the graph, kept as a set for
# fast membership tests (iterating a networkx graph yields its nodes).
ic_nodes_set: set = set(ic_graph)

# 検索ログを簡略化

In [6]:
def get_log(date: str) -> cudf.DataFrame:
    """Load one day's search log as a cudf.DataFrame.

    Parameters
    --------------
    date: target day as 'YYYYMMDD'; the file is located via SEARCH_LOG_CSV

    Returns
    --------------
    The day's log, or an empty cudf.DataFrame (no columns) when the CSV for
    that day does not exist.
    """
    csv_path = SEARCH_LOG_CSV(date)
    if not os.path.exists(csv_path):
        return cudf.DataFrame()

    # Columns that must be read as text rather than with an inferred dtype.
    text_columns = (
        'start_code',
        'end_code',
        'via1_code',
        'via2_code',
        'via3_code',
        'order',
        'car_type',
    )
    log_pdf = pd.read_csv(csv_path, dtype=dict.fromkeys(text_columns, str))
    return cudf.from_pandas(log_pdf)

## 列を選択

In [7]:
def simplify_search_log(df: cudf.DataFrame) -> cudf.DataFrame:
    """Reduce a raw search log to the records and columns used downstream.

    Keeps only records whose start and end IC are both in `ic_nodes_set`,
    renames `date` to `datetime`, merges `spec_day` and `spec_time` into a
    single `spec_datetime` column, casts the code/type columns to category
    and parses both timestamp columns.

    Parameters
    --------------
    df: raw search log; it is indexed by the columns date, start_code,
        end_code, spec_day, spec_time, spec_type and car_type

    Returns
    --------------
    Frame with the columns datetime, start_code, end_code, spec_datetime,
    spec_type, car_type, in that order.
    """
    # Restrict records: both endpoints must be known ICs.
    start_known = df['start_code'].isin(ic_nodes_set)
    end_known = df['end_code'].isin(ic_nodes_set)
    kept = df.loc[start_known & end_known]

    # Restrict columns.
    raw_columns = ['date', 'start_code', 'end_code', 'spec_day', 'spec_time', 'spec_type', 'car_type']
    kept = kept.loc[:, raw_columns].rename(columns={'date': 'datetime'})

    # Merge the specified day and time into one column.
    spec_text = kept['spec_day'] + ' ' + kept['spec_time']
    kept = kept.assign(spec_datetime=spec_text)
    kept = kept.drop(['spec_day', 'spec_time'], axis=1)

    # Type conversion.
    category_columns = ('start_code', 'end_code', 'spec_type', 'car_type')
    kept = kept.astype({name: 'category' for name in category_columns})
    kept = kept.assign(
        datetime=cudf.to_datetime(kept['datetime']),
        spec_datetime=cudf.to_datetime(kept['spec_datetime']),
    )

    # Reorder columns.
    ordered_columns = ['datetime', 'start_code', 'end_code', 'spec_datetime', 'spec_type', 'car_type']
    return kept.loc[:, ordered_columns]

In [8]:
# Try the simplification on a single day (2024-02-01) and inspect the result:
# print the (rows, columns) shape, then display the first rows.
df_simple = simplify_search_log(get_log('20240201'))
print(df_simple.shape)
df_simple.head()

(372388, 6)


Unnamed: 0,datetime,start_code,end_code,spec_datetime,spec_type,car_type
0,2024-02-01 00:00:00,5013001,1720081,2024-02-01 00:00:00,1,2
1,2024-02-01 00:00:00,1040141,1030091,2024-02-01 00:00:00,1,2
2,2024-02-01 00:00:01,1720031,1073006,2024-01-31 11:50:00,1,2
3,2024-02-01 00:00:01,214K071,1040086,2024-02-01 00:00:00,1,2
4,2024-02-01 00:00:02,1010031,1120071,2024-01-31 00:00:00,1,4


## 関越道・東北道を通過するレコードのみを抜き出す

### 経路検索用プログラム

In [9]:
def __get_route(
    src: str, dest: str, route_dict: Dict[str, Dict[str, List[str]]]
) -> List[str]:
    """Look up the precomputed route from `src` to `dest`.

    Parameters
    --------------
    src: origin IC code
    dest: destination IC code
    route_dict: mapping origin -> destination -> list of IC codes on the route

    Returns
    --------------
    The stored route (the very list object held by `route_dict`), or an empty
    list when either IC is not in `ic_nodes_set` or no route is stored for
    the pair.
    """
    if src not in ic_nodes_set or dest not in ic_nodes_set:
        return []
    try:
        return route_dict[src][dest]
    except KeyError:
        # No route stored for this pair. Only KeyError is caught so that
        # KeyboardInterrupt and genuine bugs are not silently turned into [].
        return []

In [10]:
def get_route(
    src: str,
    dest: str,
    route_dict: Dict[str, Dict[str, List[str]]],
    excluded_ic_set: Optional[Set[str]] = None,
) -> List[str]:
    '''
    Get the route from the origin to the destination on ic_graph.

    Parameters
    --------------
    src: origin IC code
    dest: destination IC code
    route_dict: precomputed routes, mapping origin -> destination -> list of
        IC codes on the route
    excluded_ic_set: IC codes to remove from the returned route; None or an
        empty set (the default) removes nothing

    Returns
    --------------
    List of IC codes on the route, or an empty list when no route is found.
    When nothing is excluded, the list is the object returned by the lookup
    itself, so callers should not mutate it.
    '''
    path = __get_route(src, dest, route_dict=route_dict)

    # The default is None rather than a shared mutable set() instance.
    if excluded_ic_set:
        path = [ic for ic in path if ic not in excluded_ic_set]
    return path

In [11]:
fname = './route_dict.pkl'

if not os.path.exists(fname):
    # No cached route map yet: compute the distance-weighted shortest paths
    # between all IC pairs and save them as a binary pickle.
    print('Calculating IC Routes...')
    route_dict = dict(nx.all_pairs_dijkstra_path(ic_graph, weight='distance'))

    with open(fname, 'wb') as f:
        pickle.dump(route_dict, f)
else:
    # A route map already exists, so reuse it.
    # NOTE: pickle.load executes arbitrary code from the file; only load a
    # cache file that this notebook produced itself.
    with open(fname, 'rb') as f:
        print('Loading IC Routes...')
        route_dict = pickle.load(f)
print('Finished.')

Loading IC Routes...
Finished.


### 対象道路のみを抽出

In [12]:
# road_code values of the roads to keep. Per the file header these are the
# contest target roads (Kan-Etsu and Tohoku expressways); which code belongs
# to which road is not shown here - TODO confirm against the road master.
target_road_code_set = {
    '1040',
    '1800',
}

In [13]:
def _uses_target_road(path: List[str], target_road_code_set: set) -> bool:
    """Return True when any consecutive IC pair of `path` is an edge of a target road."""
    return any(
        ic_graph[ic_from][ic_to]['road_code'] in target_road_code_set
        for ic_from, ic_to in zip(path, path[1:])
    )


def extract_road_related_queries(df_log: cudf.DataFrame, target_road_code_set: set) -> cudf.DataFrame:
    """Keep only the queries whose route runs over one of the target roads.

    For every record the precomputed route between its start and end IC is
    looked up in the global `route_dict`; the record is kept when at least one
    edge of that route has a `road_code` in `target_road_code_set`. Records
    without a route (empty path) are dropped.

    Parameters
    --------------
    df_log: simplified search log; the start and end IC codes are read by
        position from columns 1 and 2
    target_road_code_set: road codes that make a query relevant

    Returns
    --------------
    The matching rows of `df_log`, in their original order, with a fresh
    0..n-1 index.
    """
    # Many queries share the same origin/destination pair, so the verdict is
    # computed once per pair and reused.
    verdict_by_pair: Dict[tuple, bool] = {}
    query_indices = []

    for i_query, record in enumerate(df_log.to_numpy()):
        od_pair = (record[1], record[2])
        is_related = verdict_by_pair.get(od_pair)
        if is_related is None:
            path = get_route(od_pair[0], od_pair[1], route_dict)
            # An empty path (no route available) has no edges and yields False.
            is_related = _uses_target_road(path, target_road_code_set)
            verdict_by_pair[od_pair] = is_related
        if is_related:
            query_indices.append(i_query)

    return df_log.iloc[query_indices].reset_index(drop=True)

In [14]:
# Periods to process, as (first day, last day) pairs in 'YYYYMMDD' form.
# Each pair is expanded day by day with pd.date_range in the processing loop,
# so both ends are included. The commented-out periods are presumably ones
# that were already processed - TODO confirm.
PERIOD_BLOCKS = [
    # ('20210401', '20210630'),
    # ('20210701', '20210930'),
    # ('20211001', '20211231'),
    # ('20220101', '20220331'),
    # ('20220401', '20220630'),
    # ('20220701', '20220930'),
    # ('20221001', '20221231'),
    # ('20230101', '20230331'),
    # ('20230401', '20230630'),
    # ('20230701', '20230930'),
    # ('20231001', '20231231'),
    # ('20240101', '20240131'),
    # ('20240201', '20240331'),
    # ('20240401', '20240506'),
    ('20240507', '20240831'),
]

In [15]:
def OUTPUT_FILE(date: str) -> str:
    """Return the output CSV path of the simplified log for `date` ('YYYYMMDD').

    Files are grouped into one directory per month, chosen by the first six
    characters of `date`, under DATA_DIR/simple_search_records.
    """
    return f'{DATA_DIR}/simple_search_records/csv{date[:6]}/record_{date}.csv'

In [None]:
# For every day of every period: load the raw log, simplify it, keep the
# queries routed over a target road, and write them to the per-day output CSV.
for start_date, end_date in PERIOD_BLOCKS:
    print('='*40, f'{start_date} -> {end_date}', '='*40)
    # One 'YYYYMMDD' string per calendar day of the block (both ends included).
    date_list = [d.strftime('%Y%m%d') for d in pd.date_range(start_date, end_date, freq='1D')]

    for target_date in date_list:
        t_start = time.time()

        df_raw = get_log(target_date)
        if len(df_raw.columns) == 0:
            # get_log returns a column-less frame when the day's CSV is
            # missing; simplify_search_log would fail on it with a KeyError,
            # so skip the day explicitly instead of aborting the whole run.
            print(f'{target_date} | no search log found, skipped')
            continue

        df_simple = simplify_search_log(df_raw)
        del df_raw  # release the raw log before the route matching
        df_related_log = extract_road_related_queries(df_simple, target_road_code_set)

        fname_out = OUTPUT_FILE(target_date)
        # The reported time covers loading and filtering, not the CSV write below.
        print(f'{target_date} | # of related queries: {len(df_related_log)} ({time.time() - t_start:.2f} [sec])')

        os.makedirs(os.path.dirname(fname_out), exist_ok=True)
        df_related_log.to_pandas().to_csv(fname_out, index=False)

20240507 | # of related queries: 191888 (36.13 [sec])
20240508 | # of related queries: 161177 (28.92 [sec])
20240509 | # of related queries: 169666 (28.96 [sec])
20240510 | # of related queries: 172653 (28.61 [sec])
20240511 | # of related queries: 145759 (23.43 [sec])
20240512 | # of related queries: 140923 (23.20 [sec])
20240513 | # of related queries: 181106 (29.91 [sec])
20240514 | # of related queries: 163654 (27.64 [sec])


In [17]:
# Re-run both steps for a single day (2024-05-07) so the intermediate frames
# can be inspected in the cells below.
df_simple = simplify_search_log(get_log('20240507'))
df_related_log = extract_road_related_queries(df_simple, target_road_code_set) 

KeyboardInterrupt: 

In [19]:
# Compare the (rows, columns) shapes before and after the target-road filter.
df_simple.shape, df_related_log.shape

((346484, 6), (112648, 6))

In [20]:
# Display the first rows of the simplified log.
df_simple.head()

Unnamed: 0,datetime,start_code,end_code,spec_datetime,spec_type,car_type
0,2024-01-31 00:00:00,5045040,1810086,2024-02-23 12:00:00,1,2
1,2024-01-31 00:00:00,5022005,1011131,2024-01-31 00:00:00,1,2
2,2024-01-31 00:00:00,1010266,1010181,2024-01-29 15:00:00,1,1
3,2024-01-31 00:00:00,1040251,5037010,2024-01-31 00:00:00,1,2
4,2024-01-31 00:00:01,1040241,1040266,2024-01-30 07:00:00,1,2


In [21]:
# Display the first rows of the subset that passes over a target road.
df_related_log.head()

Unnamed: 0,datetime,start_code,end_code,spec_datetime,spec_type,car_type
0,2024-01-31 00:00:00,5045040,1810086,2024-02-23 12:00:00,1,2
1,2024-01-31 00:00:00,1040251,5037010,2024-01-31 00:00:00,1,2
2,2024-01-31 00:00:01,1040241,1040266,2024-01-30 07:00:00,1,2
3,2024-01-31 00:00:02,5046020,1040368,2024-01-31 00:00:00,1,2
4,2024-01-31 00:00:02,1800076,5002041,2024-01-31 00:00:00,1,2
