In [1]:
import os
import time
import tqdm
import random
import pickle
import datetime as dt
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cudf

import warnings
warnings.simplefilter('ignore')

In [2]:
# Root directory of the preprocessed input data.
DATA_DIR = '../../Input_processed_data'

# Interchange (IC) master and road-network CSV files.
IC_CSV = f'{DATA_DIR}/road_master/ic_preprocessed.csv'
IC_NET_CSV = f'{DATA_DIR}/road_master/220303-doronet_ic.csv'
IC_NET_SUB_CSV = f'{DATA_DIR}/road_master/tateyama_kannetsu_icnet.csv'


# Search-log CSV locations.
def SEARCH_LOG_DIR(month):
    """Return the directory that holds the search-log CSVs for `month` (e.g. '202309')."""
    return f'{DATA_DIR}/search_records/csv{month}'


def SEARCH_LOG_CSV(date):
    """Return the search-log CSV path for `date` (e.g. '20230901').

    The first six characters of `date` select the monthly directory.
    """
    return f'{SEARCH_LOG_DIR(date[:6])}/record_{date}.csv'

# 準備

In [3]:
# These CSVs are already preprocessed (inside the module); code columns are read as strings.
df_ic = pd.read_csv(IC_CSV, dtype={'ic_code': str})
df_icnet = pd.read_csv(
    IC_NET_CSV,
    dtype=dict.fromkeys(['start_code', 'end_code', 'road_code'], str),
)
sub_icnet = pd.read_csv(
    IC_NET_SUB_CSV,
    dtype=dict.fromkeys(['start_code', 'end_code', 'road_code'], str),
)

# Lookup tables between IC codes and IC names (a repeated key keeps its last value).
code2name = {code: name for code, name in zip(df_ic['ic_code'], df_ic['ic_name'])}
name2code = dict((name, code) for code, name in code2name.items())

# Directed IC graph built from the start_code -> end_code rows of the network table.
ic_graph = nx.from_pandas_edgelist(
    df_icnet,
    source='start_code',
    target='end_code',
    edge_attr=['distance', 'road_code', 'direction'],
    create_using=nx.DiGraph(),
)

In [4]:
# Build a table and a lookup map holding the speed limit of each section.
df_limits = sub_icnet[['start_code', 'end_code', 'start_name', 'end_name', 'road_code', 'limit']]

# (start_code, end_code) -> limit
limit_dict = {
    (start, end): limit
    for start, end, limit in df_limits[['start_code', 'end_code', 'limit']].itertuples(index=False, name=None)
}

In [5]:
# Convert the pandas network and speed-limit tables into cuDF DataFrames.
cudf_icnet = cudf.from_pandas(df_icnet)
cudf_limits = cudf.from_pandas(df_limits)

# 経路表をDict形式で作成

In [8]:
df_icnet.head()

Unnamed: 0,rosen_code,road_code,road_name,direction,start_code,end_code,billing_No,start_name,end_name,start_lat,start_lng,end_lat,end_lng,distance,start_degree,end_degree
0,109,1010,【E1】東名高速道路,0,1010001,1010004,1,東京,東京本線,35.62455,139.626661,35.5911,139.576578,5.86,4,4
1,109,1010,【E1】東名高速道路,0,1010004,1010006,6,東京本線,東名川崎,35.5911,139.576578,35.58415,139.573103,0.83,4,4
2,109,1010,【E1】東名高速道路,0,1010006,1010016,11,東名川崎,横浜青葉,35.58415,139.573103,35.54295,139.540811,5.43,4,4
3,109,1010,【E1】東名高速道路,0,1010016,1010018,16,横浜青葉,横浜青葉ＪＣＴ,35.54295,139.540811,35.54295,139.540811,0.0,4,6
4,109,1010,【E1】東名高速道路,0,1010018,1010011,21,横浜青葉ＪＣＴ,港北ＰＡ,35.54295,139.540811,35.53066,139.533192,1.53,6,4


In [9]:
# All-pairs shortest paths over the IC graph, weighted by the edge attribute 'distance'.
# `ic_graph` is reused from the data-loading cell; this cell previously rebuilt it with
# an identical nx.from_pandas_edgelist call, which was redundant work.
# route_dict[start_code][end_code] is the node sequence of the shortest route.
route_dict = dict(nx.all_pairs_dijkstra_path(ic_graph, weight='distance'))

In [10]:
# Save the route dictionary in binary (pickle) form -- currently disabled.
# fname_out = './route_dict.pkl'
# with open(fname_out, 'wb') as f:
#     pickle.dump(route_dict, f)

In [6]:
! du -h ./route_dict.pkl

du: cannot access './route_dict.pkl': No such file or directory


In [None]:
# Load the route dictionary from its binary (pickle) cache when that file exists.
# NOTE(review): pickle.load can execute arbitrary code; only load files written by this notebook.
fname_in = './route_dict.pkl'
if os.path.exists(fname_in):
    with open(fname_in, 'rb') as f:
        route_dict = pickle.load(f)
else:
    # The cell that writes this file is commented out, so the cache may be absent;
    # keep whatever route_dict is already in memory instead of raising FileNotFoundError.
    print(f'{fname_in} not found; keeping the in-memory route_dict')

In [38]:
# View of the network table restricted to the key columns, relabelled with Japanese headers.
tmp = df_icnet[
    ['road_code', 'road_name', 'direction', 'start_name', 'start_code', 'end_name', 'end_code', 'distance']
].rename(columns={
    'road_code': '路線コード',
    'road_name': '路線名',
    'direction': '方向',
    'start_name': '発IC名称',
    'start_code': '発ICコード',
    'end_name': '着IC名称',
    'end_code': '着ICコード',
    'distance': '距離',
})
tmp.head()

Unnamed: 0,路線コード,路線名,方向,発IC名称,発ICコード,着IC名称,着ICコード,距離
0,1010,【E1】東名高速道路,0,東京,1010001,東京本線,1010004,5.86
1,1010,【E1】東名高速道路,0,東京本線,1010004,東名川崎,1010006,0.83
2,1010,【E1】東名高速道路,0,東名川崎,1010006,横浜青葉,1010016,5.43
3,1010,【E1】東名高速道路,0,横浜青葉,1010016,横浜青葉ＪＣＴ,1010018,0.0
4,1010,【E1】東名高速道路,0,横浜青葉ＪＣＴ,1010018,港北ＰＡ,1010011,1.53


# 経路表をpandas.DataFrameの形式で作成

In [None]:
# Flatten the nested route dictionary into a long table with one row per
# non-missing (start_code, end_code) entry.
df_route = (
    pd.DataFrame.from_dict(route_dict, orient='index')
    .stack()
    .reset_index()
)
df_route.columns = ['start_code', 'end_code', 'route']

# Store both code columns as categoricals.
df_route = df_route.astype(dict.fromkeys(['start_code', 'end_code'], 'category'))
df_route.info()

In [None]:
df_route.head()

In [None]:
# df_route.to_csv('./route_table.csv', index=False)
# df_route.to_pickle('./route_table.pkl')

In [None]:
! du -h ./route_table.*

In [None]:
# Reload the route table from its pickle cache when the file exists.
# NOTE(review): read_pickle unpickles the file; only load files written by this notebook.
route_table_pkl = './route_table.pkl'
if os.path.exists(route_table_pkl):
    df_route = pd.read_pickle(route_table_pkl)
# Otherwise keep the df_route already built in memory: the export cell is commented
# out, so the cache file may not exist and read_pickle would raise FileNotFoundError.
df_route.head()

## インデックスを張る

In [None]:
# Index the route table by (start_code, end_code) so a pair can be looked up with .loc.
df_route_indexed = df_route.set_index(['start_code', 'end_code'])
df_route_indexed.info()

In [None]:
df_route_indexed.head()

In [None]:
# df_route_indexed.to_pickle('./route_table_indexed.pkl')

In [None]:
! du -h ./route_table_indexed.*

In [None]:
# Reload the indexed route table from its pickle cache when the file exists.
# NOTE(review): read_pickle unpickles the file; only load files written by this notebook.
route_table_indexed_pkl = './route_table_indexed.pkl'
if os.path.exists(route_table_indexed_pkl):
    df_route_indexed = pd.read_pickle(route_table_indexed_pkl)
# Otherwise keep the df_route_indexed already built in memory: the export cell is
# commented out, so the cache file may not exist and read_pickle would raise.
df_route_indexed.head()

# 経路の検索速度を比較
- 検索にかかる時間: 辞書 < テーブル（インデックス） < テーブル（辞書が最速）
- 辞書はテーブル（インデックス）のおよそ1000倍速い
- テーブル（インデックス）はテーブルのおよそ10倍速い

In [7]:
# Sample start/end IC codes used as the lookup key in the timing cells.
start_code = "1040191"
end_code = "1440096"

In [None]:
%timeit route_dict[start_code][end_code]

In [None]:
%timeit df_route.loc[(df_route.start_code == start_code) & (df_route.end_code == end_code)]

In [None]:
%timeit df_route_indexed.loc[(start_code, end_code)]

# 検索ログを簡略化

In [8]:
def get_log(date):
    """Load one day's search log as a cuDF DataFrame.

    Parameters
    ----------
    date : str
        Date string handed to SEARCH_LOG_CSV to build the file path
        (e.g. '20230901').

    Returns
    -------
    cudf.DataFrame or None
        The log with the IC-code columns and 'order' read as strings,
        or None when the CSV file does not exist.
    """
    csv_path = SEARCH_LOG_CSV(date)
    if os.path.exists(csv_path):
        # Read these columns as strings rather than letting pandas infer a dtype.
        string_columns = ['start_code', 'end_code', 'via1_code', 'via2_code', 'via3_code', 'order']
        log_pd = pd.read_csv(csv_path, dtype=dict.fromkeys(string_columns, str))
        return cudf.from_pandas(log_pd)
    return None

In [9]:
# IC codes present in the road-network graph; used to filter the search log.
nodes_set = set(ic_graph.nodes)

def simplify_search_log(df):
    """Reduce a raw search-log frame to the columns used downstream.

    Keeps only records whose start and end IC codes are both nodes of
    ``ic_graph``, merges the requested day and time into one timestamp
    column, and converts the code/type columns to categoricals.

    Every step returns a new frame; nothing is renamed or dropped with
    ``inplace=True`` on a slice of the caller's frame.

    Parameters
    ----------
    df : cudf.DataFrame
        Raw search log. Must contain the columns 'date', 'start_code',
        'end_code', 'spec_day', 'spec_time', 'spec_type' and 'car_type'.

    Returns
    -------
    cudf.DataFrame
        Columns, in order: 'datetime', 'start_code', 'end_code',
        'spec_datetime', 'spec_type', 'car_type'.
    """
    # Restrict records to those whose both endpoints exist in the network.
    in_network = df['start_code'].isin(nodes_set) & df['end_code'].isin(nodes_set)
    df_res = df.loc[in_network]

    # Restrict columns; the log's 'date' column becomes 'datetime'.
    df_res = df_res.loc[
        :, ['date', 'start_code', 'end_code', 'spec_day', 'spec_time', 'spec_type', 'car_type']
    ].rename(columns={'date': 'datetime'})

    # Combine the specified day and time into a single column.
    df_res = df_res.assign(spec_datetime=df_res['spec_day'] + ' ' + df_res['spec_time'])
    df_res = df_res.drop(columns=['spec_day', 'spec_time'])

    # Type conversion.
    df_res = df_res.astype({
        'start_code': 'category',
        'end_code': 'category',
        'spec_type': 'category',
        'car_type': 'category',
    })
    df_res = df_res.assign(
        datetime=cudf.to_datetime(df_res['datetime']),
        spec_datetime=cudf.to_datetime(df_res['spec_datetime'])
    )

    # Reorder columns.
    return df_res.loc[:, ['datetime', 'start_code', 'end_code', 'spec_datetime', 'spec_type', 'car_type']]

In [10]:
df = get_log('20230901')
print(df.shape)
df.head()

(373484, 19)


Unnamed: 0,date,start_code,start_name,end_code,end_name,via1_code,via1_name,via2_code,via2_name,via3_code,via3_name,spec_day,spec_time,spec_type,order,car_type,use_nexco,use_urban,use_local
0,2023/09/01 00:00:00,6001006,宝町,1400091,那珂,,,,,,,2023-09-01,07:00,1,2,3,1,1,1
1,2023/09/01 00:00:00,1461080,太田桐生,1040011,浦和（東京方面）,,,,,,,2023-08-31,08:00,1,3,2,1,1,1
2,2023/09/01 00:00:00,1072023,大和まほろばスマート,1612046,須崎東,,,,,,,2023-10-21,15:00,1,2,2,1,1,1
3,2023/09/01 00:00:01,1461080,太田桐生,212B011,大宮,,,,,,,2023-09-01,00:00,1,2,2,1,1,1
4,2023/09/01 00:00:01,1800076,高崎,1040241,仙台宮城,,,,,,,2023-08-31,23:50,1,2,2,1,1,1


In [None]:
# NOTE(review): this rebinds `df` to the simplified frame, so re-running the cell passes
# a frame without the 'date'/'spec_day'/'spec_time' columns that simplify_search_log selects.
df = simplify_search_log(df)
df.head()

Unnamed: 0,datetime,start_code,end_code,spec_datetime,spec_type,car_type
0,2023-09-01 00:00:00,6001006,1400091,2023-09-01 07:00:00,1,3
1,2023-09-01 00:00:00,1461080,1040011,2023-08-31 08:00:00,1,2
2,2023-09-01 00:00:00,1072023,1612046,2023-10-21 15:00:00,1,2
3,2023-09-01 00:00:01,1461080,212B011,2023-09-01 00:00:00,1,2
4,2023-09-01 00:00:01,1800076,1040241,2023-08-31 23:50:00,1,2


In [None]:
df.to_pandas().to_csv('./tmp2.csv', index=False)

In [None]:
! du -h ./*

24K	./1_handle_raw.ipynb
136K	./2_specified_search_count.ipynb
136K	./3_unspecified_search_count.ipynb
128K	./4_resample.ipynb
52K	./dorapura_format.ipynb
28K	./expressway_route.ipynb
728K	./ipynb_checkpoints
12G	./search_count
34M	./tmp.csv
22M	./tmp2.csv


In [35]:
# Relabel the columns with Japanese headers.
# NOTE(review): this assigns to df.columns in place, so `df` keeps these names afterwards.
df.columns = ['検索日時', '出発IC', '到着IC', '指定日時', '指定種別', '車両種別']
df.head()

Unnamed: 0,検索日時,出発IC,到着IC,指定日時,指定種別,車両種別
0,2023-09-01 00:00:00,6001006,1400091,2023-09-01 07:00:00,1,3
1,2023-09-01 00:00:00,1461080,1040011,2023-08-31 08:00:00,1,2
2,2023-09-01 00:00:00,1072023,1612046,2023-10-21 15:00:00,1,2
3,2023-09-01 00:00:01,1461080,212B011,2023-09-01 00:00:00,1,2
4,2023-09-01 00:00:01,1800076,1040241,2023-08-31 23:50:00,1,2


In [36]:
df.to_pandas()

Unnamed: 0,検索日時,出発IC,到着IC,指定日時,指定種別,車両種別
0,2023-09-01 00:00:00,6001006,1400091,2023-09-01 07:00:00,1,3
1,2023-09-01 00:00:00,1461080,1040011,2023-08-31 08:00:00,1,2
2,2023-09-01 00:00:00,1072023,1612046,2023-10-21 15:00:00,1,2
3,2023-09-01 00:00:01,1461080,212B011,2023-09-01 00:00:00,1,2
4,2023-09-01 00:00:01,1800076,1040241,2023-08-31 23:50:00,1,2
...,...,...,...,...,...,...
373479,2023-09-01 23:55:24,1810046,6026071,2024-02-05 06:40:00,1,2
373480,2023-09-01 23:56:19,5020011,5020001,2024-02-19 15:00:00,1,4
373481,2023-09-01 23:58:31,1010181,7019011,2023-12-25 00:00:00,1,2
373482,2023-09-01 23:58:36,1010181,7019011,2023-12-25 00:00:00,1,2


In [15]:
df.info()

<class 'cudf.core.dataframe.DataFrame'>
Int64Index: 493645 entries, 0 to 494075
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype
---  ------         --------------   -----
 0   datetime       493645 non-null  datetime64[ns]
 1   start_code     493645 non-null  category
 2   end_code       493645 non-null  category
 3   spec_datetime  493645 non-null  datetime64[ns]
 4   spec_type      493645 non-null  category
 5   car_type       493645 non-null  category
dtypes: category(4), datetime64[ns](2)
memory usage: 14.2 MB


# 検索ログと経路表をマージ

In [None]:
df = get_log('20230501')
df = simplify_search_log(df)
df.head()

In [None]:
# df_route = pd.read_pickle('./route_table.pkl')
df_route.head()

In [None]:
df.info()

In [None]:
# Left-join each search record with its route on (start_code, end_code).
# The merged frame is only displayed here; it is not assigned to a variable.
pd.merge(df.to_pandas(), df_route, how='left', on=['start_code', 'end_code'])

# 検索ログの扱い
- **所要時間の計算**を改良するアプローチ
    - 車両種別によって速度を変化させる
        - https://www.driveplaza.com/search/division/
            - 1: 軽自動車等
            - 2: 普通車
            - 3: 中型車
            - 4: 大型車
            - 5: 特大車
    - 過去の交通データを参考にする
        - 1年前のある日時・ある区間はこの平均速度で車両が通過していた、という情報
        - 厳密にやるには<u>予測対象道路（関越道 + 東北道）以外の交通データが必要になってしまう</u>
    - <u>結局1時間単位で丸めてしまえば多少の計算方法の違いでは影響出にくい？</u>
    
- **検索ログの集計方法**を改良するアプローチ
    - 検索した日時と指定日時の時間差によって重みを変える
        - 7日前の検索はあまり参考にせず、昨日の検索を重視する
        - 昨日の朝の検索よりも、昨日の夜の検索を特に重視する
    - 短時間で重複した検索は除外し、その中で最新の検索のみ参照する
        - 出発時刻を変えながら何度か経路を検索してみる人もいそう
        - <u>重複した検索をどう検出するかが問題</u>
            - 出発地・到着地の完全一致
            - 出発と到着のICを少しずらすパターンもある

In [None]:
df = get_log('20230501')
df = simplify_search_log(df)
df.head()

In [None]:
df['car_type'].value_counts()