In [1]:
# run.pyとpredict.pyをipynb上で実行できるようにマージする

In [2]:
!pip install lightGBM



In [3]:
import sys
import argparse
import os
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score
import warnings
warnings.simplefilter('ignore')
import lightgbm
from sklearn.preprocessing import LabelEncoder
from sample_submit.src.predictor import ScoringService



In [4]:
def expand_datetime(df):
    if 'datetime' in df.columns:
        df['year'] = df['datetime'].dt.year
        df['month'] = df['datetime'].dt.month
        df['day'] = df['datetime'].dt.day
        df['hour'] = df['datetime'].dt.hour
    if 'date' in df.columns:
        df['year'] = df['date'].dt.year
        df['month'] = df['date'].dt.month
        df['day'] = df['date'].dt.day
    return df

In [5]:
def make_dataset(traffic, ic_master, search_spec, search_unspec):
    # 欠損値の除外
    traffic = traffic[traffic['speed'].isnull()==False]
    ic_master.dropna(inplace=True)
    search_spec.dropna(inplace=True)
    search_unspec.dropna(inplace=True)
    
    # datetimeからdateを作成
    traffic['date'] = traffic['datetime'].apply(lambda x: x.split()[0])

    # データのマージ
    traffic = traffic.merge(ic_master, on=['start_code', 'end_code'], how='left')
    traffic = traffic.merge(search_spec, on=['datetime', 'start_code', 'end_code'], how='left')
    traffic = traffic.merge(search_unspec, on=['date', 'start_code', 'end_code'], how='left')
    traffic.sort_values(['date', 'start_code', 'end_code'], inplace=True)
    traffic.reset_index(drop=True, inplace=True)
    traffic.drop(columns='date', inplace=True)

    # データ型の変更
    traffic['datetime'] = pd.to_datetime(traffic['datetime'])
    traffic = expand_datetime(traffic)
    return traffic

In [6]:
# 推論モデルの読みこみ
import pandas as pd
import pickle
def get_model(model_path, inference_df, inference_log):
    """Get model method

    Args:
        model_path (str): Path to the trained model directory.
        inference_df: Past data not subject to prediction.
        inference_log: Past log data that is not subject to prediction.

    Returns:
        bool: The return value. True for success.
    """

    # カレントディレクトリにあるモデルデータの読み込み
    gbm = pickle.load(open(model_path, 'rb'))


    model = gbm[0]
    data = inference_df
    log_paths = inference_log

    return model, data, log_paths

def predict(model, data, log_paths, input, input_log):
    """Predict method

    Args:
        input: meta data of the sample you want to make inference from (DataFrame)
        input_log: meta data of the sample you want to make inference from (DataFrame)

    Returns:
        prediction: Inference for the given input. Return columns must be ['datetime', 'start_code', 'end_code', 'KP'](DataFrame).
    """
    result = data.copy()
    inference_data = data.copy()
    inference_data = inference_data.drop("datetime",axis =1 )
    print(inference_data.columns)
    predict = model.predict(inference_data)
    result['prediction'] = predict
    prediction = result[['datetime', 'start_code', 'end_code', 'KP', 'prediction']]

    return prediction

In [7]:
# parse the arguments
exec_path = '/home/hayato/Kaggle/estimate_traffic/implement/sample_submit/src'
data_dir = 'train/'
start_date = '2023-06-12'
end_date = '2023-06-14'
print('\nstart date: {}, end date:{}'.format(start_date, end_date))

# load the input data
print('\nLoading Dataset...')
traffic = pd.read_csv(os.path.join(data_dir, 'train.csv'))
search_spec = pd.read_csv(os.path.join(data_dir, 'search_specified.csv'))
search_unspec = pd.read_csv(os.path.join(data_dir, 'search_unspecified.csv'))
ic_master = pd.read_csv(os.path.join(data_dir, 'road_local.csv'))

# ファイルサイズ大きく使用していないので一旦コメントアウト
log_pathes = glob.glob(f"{data_dir}/search_raw_log/*.csv")



start date: 2023-06-12, end date:2023-06-14

Loading Dataset...


In [8]:
inference_search_spec = search_spec[search_spec['datetime'] < start_date]
inference_search_unspec = search_unspec[search_unspec['date'] < start_date]
test_search_spec = search_spec[(search_spec["datetime"] >= start_date+' 00:00:00') & (search_spec['datetime']<=end_date+' 23:00:00')]
test_search_unspec = search_unspec[(search_unspec["date"] >= start_date+' 00:00:00') & (search_unspec['date']<=end_date+' 23:00:00')]

# 当日の検索数を使用できるように変更(search_spec, search_unspec)
test_search_spec['datetime'] = pd.to_datetime(test_search_spec['datetime'])
test_search_unspec['date'] = pd.to_datetime(test_search_unspec['date'])
test_search_spec['datetime'] -= pd.to_timedelta(1, 'd')
test_search_unspec['date'] -= pd.to_timedelta(1, 'd')
test_search_spec['datetime'] = test_search_spec['datetime'].astype('str')
test_search_unspec['date'] = test_search_unspec['date'].astype('str')

train = traffic[traffic['datetime'] < start_date]
valid = traffic[(traffic['datetime']>=start_date+' 00:00:00') & (traffic['datetime']<=end_date+' 23:00:00')]
train.describe()


Unnamed: 0,start_code,end_code,KP,OCC,allCars,speed,is_congestion
count,5456880.0,5456880.0,5456880.0,5256342.0,5456880.0,5456880.0,5456880.0
mean,1397125.0,1394713.0,69.16629,3.658999,1174.274,91.70073,0.003613237
std,378408.8,378031.8,40.47841,2.694772,929.8233,7.499249,0.06000152
min,1040013.0,1040013.0,2.26,0.0,0.0,0.9868421,0.0
25%,1040061.0,1040061.0,34.97,1.916667,452.0,88.10456,0.0
50%,1040121.0,1040116.0,67.125,3.083333,899.0,92.0006,0.0
75%,1800051.0,1800051.0,100.261,4.666667,1711.0,95.83648,0.0
max,1800106.0,1800111.0,168.082,93.0,5101.0,120.0,1.0


In [9]:
#inference_dfの作成
train = make_dataset(train, ic_master, inference_search_spec, inference_search_unspec)
train['section'] = train['start_code'].astype(str) + '_' + train['KP'].astype(str) + '_' + train['end_code'].astype(str)
train['dayofweek'] = train['datetime'].dt.weekday

cat_cols = ['road_code', 'start_code', 'end_code', 'section', 'direction', 'hour', 'dayofweek']
num_cols = ['datetime','year', 'month', 'day', 'search_specified', 'search_unspecified', 'KP', 'start_KP', 'end_KP', 'limit_speed']
feature_cols = cat_cols + num_cols

train_base = train[feature_cols]



In [10]:
train_base.head()


Unnamed: 0,road_code,start_code,end_code,section,direction,hour,dayofweek,datetime,year,month,day,search_specified,search_unspecified,KP,start_KP,end_KP,limit_speed
0,1040,1040013,1040016,1040013_5.47_1040016,下り,0,3,2021-04-08 00:00:00,2021,4,8,27.0,4711.0,5.47,4.8,10.5,100.0
1,1040,1040013,1040016,1040013_5.47_1040016,下り,1,3,2021-04-08 01:00:00,2021,4,8,8.0,4711.0,5.47,4.8,10.5,100.0
2,1040,1040013,1040016,1040013_5.47_1040016,下り,2,3,2021-04-08 02:00:00,2021,4,8,15.0,4711.0,5.47,4.8,10.5,100.0
3,1040,1040013,1040016,1040013_5.47_1040016,下り,3,3,2021-04-08 03:00:00,2021,4,8,21.0,4711.0,5.47,4.8,10.5,100.0
4,1040,1040013,1040016,1040013_5.47_1040016,下り,4,3,2021-04-08 04:00:00,2021,4,8,14.0,4711.0,5.47,4.8,10.5,100.0


In [11]:
train.loc[:,~train.columns.duplicated()]
print(train.columns)


Index(['datetime', 'start_code', 'end_code', 'KP', 'OCC', 'allCars', 'speed',
       'is_congestion', 'start_name', 'end_name', 'road_code', 'direction',
       'limit_speed', 'start_KP', 'end_KP', 'start_pref_code', 'end_pref_code',
       'start_lat', 'end_lat', 'start_lng', 'end_lng', 'start_degree',
       'end_degree', 'search_specified', 'search_unspecified', 'year', 'month',
       'day', 'hour', 'section', 'dayofweek'],
      dtype='object')


In [12]:
# ========================================
# カテゴリ変数の処理
# ========================================
le_dict = {}
for c in tqdm(cat_cols):
    le = LabelEncoder()
    # print(train[c])
    train[c] = le.fit_transform(train[c])
    le_dict[c] = le

100%|██████████| 7/7 [00:03<00:00,  2.15it/s]


In [13]:
#学習用のログデータの抽出
train_log_pathes = [path for path in log_pathes if path.split("/")[-1][:-4].replace("_", "-") < start_date]
print('Done')

# change the working directory
os.chdir(exec_path)
cwd = os.getcwd()
print('\nMoved to {}'.format(cwd))
model_path = os.path.join('..', 'model',"trained_model.pkl")
# sys.path.append(cwd)



#日付順にソート(groupbyで時系列順が一致するための対策)
valid["datetime"] = pd.to_datetime(valid["datetime"])
valid = valid.sort_values(by="datetime").reset_index(drop=True)
valid["datetime"] = valid["datetime"].astype(str)

#input_dfの作成
valid = make_dataset(valid, ic_master, test_search_spec, test_search_unspec)

# validの特長量前処理
valid['section'] = valid['start_code'].astype(str) + '_' + valid['KP'].astype(str) + '_' + valid['end_code'].astype(str)
valid['dayofweek'] = valid['datetime'].dt.weekday

cat_cols = ['road_code', 'start_code', 'end_code', 'section', 'direction', 'hour', 'dayofweek']
num_cols = ['datetime','year', 'month', 'day', 'search_specified', 'search_unspecified', 'KP', 'start_KP', 'end_KP', 'limit_speed']
feature_cols = cat_cols + num_cols
print(valid.columns)
valid_correct = valid["is_congestion"]

valid = valid[feature_cols]
# validの特長量前処理

print('\nLoading the model...', end = '\r')
# model, data, log_paths = get_model(model_path, train, train_log_pathes)
model, data, log_paths = ScoringService.get_model(model_path, valid, train_log_pathes)

valid.describe()

Done

Moved to /home/hayato/Kaggle/estimate_traffic/implement/sample_submit/src
Index(['datetime', 'start_code', 'end_code', 'KP', 'OCC', 'allCars', 'speed',
       'is_congestion', 'start_name', 'end_name', 'road_code', 'direction',
       'limit_speed', 'start_KP', 'end_KP', 'start_pref_code', 'end_pref_code',
       'start_lat', 'end_lat', 'start_lng', 'end_lng', 'start_degree',
       'end_degree', 'search_specified', 'search_unspecified', 'year', 'month',
       'day', 'hour', 'section', 'dayofweek'],
      dtype='object')

gbm type:::<class 'list'>
gbm:::[LGBMClassifier(importance_type='gain', metric='auc', n_estimators=100000,
               objective='binary', random_state=42, verbose=-1), LGBMClassifier(importance_type='gain', metric='auc', n_estimators=100000,
               objective='binary', random_state=42, verbose=-1)]


Unnamed: 0,road_code,start_code,end_code,section,direction,hour,dayofweek,datetime,year,month,day,search_specified,search_unspecified,KP,start_KP,end_KP,limit_speed
count,20592.0,20592.0,20592.0,20592.0,20592.0,20592.0,20592.0,20592,20592.0,20592.0,20592.0,13728.0,13728.0,20592.0,20592.0,20592.0,20592.0
mean,0.482517,21.692308,21.118881,142.5,0.454545,11.5,1.0,2023-06-13 11:30:00,2023.0,6.0,13.0,33.212704,3687.784965,69.16629,69.432867,68.622378,98.111888
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-06-12 00:00:00,2023.0,6.0,12.0,0.0,1157.0,2.26,0.8,0.8,80.0
25%,0.0,11.0,11.0,71.0,0.0,5.75,0.0,2023-06-12 17:45:00,2023.0,6.0,12.0,12.0,2639.0,34.97,33.4,33.4,100.0
50%,0.0,21.0,20.0,142.5,0.0,11.5,1.0,2023-06-13 11:30:00,2023.0,6.0,13.0,23.0,3442.0,67.125,63.7,63.7,100.0
75%,1.0,32.0,31.0,214.0,1.0,17.25,2.0,2023-06-14 05:15:00,2023.0,6.0,14.0,41.0,4836.0,100.261,103.0,103.0,100.0
max,1.0,44.0,44.0,285.0,1.0,23.0,2.0,2023-06-14 23:00:00,2023.0,6.0,14.0,231.0,6331.0,168.082,169.7,160.5,100.0
std,0.499706,12.846215,12.436145,82.562589,0.497942,6.922355,0.816516,,0.0,0.0,0.816516,34.052055,1320.052196,40.479389,41.274293,39.982324,5.847985


In [14]:
# このセルを実行しただけでリスタートできるようにインポートも追加
from sample_submit.src.predictor import ScoringService
# 推論フェーズ
predictions = pd.DataFrame()
for d, input_df in tqdm(valid.groupby(valid['datetime'].dt.date)):
    print("valid.groupby(valid['datetime'].dt.date){}:::".format(valid.groupby(valid['datetime'].dt.date)))
    print("valid['datetime'].dt.date{}:::".format(valid['datetime'].dt.date))
    input_df = input_df.reset_index(drop=True)
    datetime_str = d.strftime('%Y/%m/%d').replace('/', '_')
    input_log = pd.read_csv(f"../../{data_dir}search_raw_log/{datetime_str}.csv")
    # prediction = predict(model, data, log_paths, input_df, input_log)
    prediction, prediction_raw_data = ScoringService.predict(model, data, log_paths, input_df, input_log)
    print(prediction.describe())
    if type(prediction)!= pd.DataFrame:
        print('Invalid data type in the prediction. Must be pandas.DataFrame')
    elif set(prediction.columns) != set(['datetime', 'start_code', 'end_code', 'KP', 'prediction']):
        print('Invalid columns name: {},  Excepted name: {}'.format(prediction.columns, {'datetime', 'start_code', 'end_code', 'prediction'}))
    elif prediction[prediction["datetime"].dt.date != d].shape[0] > 0:
        notmatch_datetime = prediction[prediction["datetime"].dt.date != d]["datetime"].dt.date.astype(str).unique()
        # print('datetime does not match: {},  Excepted datetime: {}'.format(notmatch_datetime, d))
    # predictions = pd.concat([predictions, prediction])
    predictions = prediction
# 推論フェーズ


 33%|███▎      | 1/3 [00:00<00:00,  5.68it/s]

valid.groupby(valid['datetime'].dt.date)<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f65c507e7f0>:::
valid['datetime'].dt.date0        2023-06-12
1        2023-06-12
2        2023-06-12
3        2023-06-12
4        2023-06-12
            ...    
20587    2023-06-14
20588    2023-06-14
20589    2023-06-14
20590    2023-06-14
20591    2023-06-14
Name: datetime, Length: 20592, dtype: object:::
       road_code  start_code  end_code  section  direction  hour  dayofweek   
0              0           0         1        2          1     0          0  \
1              0           0         1        1          1     0          0   
2              0           0         1        0          1     0          0   
3              0           0         1        0          1     1          0   
4              0           0         1        2          1     1          0   
...          ...         ...       ...      ...        ...   ...        ...   
20587          1          44        44 

 67%|██████▋   | 2/3 [00:00<00:00,  5.02it/s]

       road_code  start_code  end_code  section  direction  hour  dayofweek   
0              0           0         1        2          1     0          0  \
1              0           0         1        1          1     0          0   
2              0           0         1        0          1     0          0   
3              0           0         1        0          1     1          0   
4              0           0         1        2          1     1          0   
...          ...         ...       ...      ...        ...   ...        ...   
20587          1          44        44      284          1    22          2   
20588          1          44        44      285          1    22          2   
20589          1          44        44      284          1    23          2   
20590          1          44        44      283          1    23          2   
20591          1          44        44      285          1    23          2   

       year  month  day  search_specified  search_u

100%|██████████| 3/3 [00:00<00:00,  5.10it/s]

                  datetime    start_code      end_code            KP   
count                20592  20592.000000  20592.000000  20592.000000  \
mean   2023-06-13 11:30:00     21.692308     21.118881     69.166290   
min    2023-06-12 00:00:00      0.000000      0.000000      2.260000   
25%    2023-06-12 17:45:00     11.000000     11.000000     34.970000   
50%    2023-06-13 11:30:00     21.000000     20.000000     67.125000   
75%    2023-06-14 05:15:00     32.000000     31.000000    100.261000   
max    2023-06-14 23:00:00     44.000000     44.000000    168.082000   
std                    NaN     12.846215     12.436145     40.479389   

       prediction  
count     20592.0  
mean          0.0  
min           0.0  
25%           0.0  
50%           0.0  
75%           0.0  
max           0.0  
std           0.0  





In [15]:
prediction_raw_data.describe()

Unnamed: 0,0
count,20592.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.0


In [16]:
prediction.describe()

Unnamed: 0,datetime,start_code,end_code,KP,prediction
count,20592,20592.0,20592.0,20592.0,20592.0
mean,2023-06-13 11:30:00,21.692308,21.118881,69.16629,0.0
min,2023-06-12 00:00:00,0.0,0.0,2.26,0.0
25%,2023-06-12 17:45:00,11.0,11.0,34.97,0.0
50%,2023-06-13 11:30:00,21.0,20.0,67.125,0.0
75%,2023-06-14 05:15:00,32.0,31.0,100.261,0.0
max,2023-06-14 23:00:00,44.0,44.0,168.082,0.0
std,,12.846215,12.436145,40.479389,0.0


In [17]:
predictions.describe()

Unnamed: 0,datetime,start_code,end_code,KP,prediction
count,20592,20592.0,20592.0,20592.0,20592.0
mean,2023-06-13 11:30:00,21.692308,21.118881,69.16629,0.0
min,2023-06-12 00:00:00,0.0,0.0,2.26,0.0
25%,2023-06-12 17:45:00,11.0,11.0,34.97,0.0
50%,2023-06-13 11:30:00,21.0,20.0,67.125,0.0
75%,2023-06-14 05:15:00,32.0,31.0,100.261,0.0
max,2023-06-14 23:00:00,44.0,44.0,168.082,0.0
std,,12.846215,12.436145,40.479389,0.0


In [18]:
valid["is_congestion"] = valid_correct
results = valid[['datetime', 'start_code', 'end_code', 'KP']]
results.describe()

Unnamed: 0,datetime,start_code,end_code,KP
count,20592,20592.0,20592.0,20592.0
mean,2023-06-13 11:30:00,21.692308,21.118881,69.16629
min,2023-06-12 00:00:00,0.0,0.0,2.26
25%,2023-06-12 17:45:00,11.0,11.0,34.97
50%,2023-06-13 11:30:00,21.0,20.0,67.125
75%,2023-06-14 05:15:00,32.0,31.0,100.261
max,2023-06-14 23:00:00,44.0,44.0,168.082
std,,12.846215,12.436145,40.479389


In [19]:
# スコア算出用に調整
results = pd.merge(results, predictions, on=['datetime', 'start_code', 'end_code', 'KP'], how='left')
results.describe()

Unnamed: 0,datetime,start_code,end_code,KP,prediction
count,20592,20592.0,20592.0,20592.0,20592.0
mean,2023-06-13 11:30:00,21.692308,21.118881,69.16629,0.0
min,2023-06-12 00:00:00,0.0,0.0,2.26,0.0
25%,2023-06-12 17:45:00,11.0,11.0,34.97,0.0
50%,2023-06-13 11:30:00,21.0,20.0,67.125,0.0
75%,2023-06-14 05:15:00,32.0,31.0,100.261,0.0
max,2023-06-14 23:00:00,44.0,44.0,168.082,0.0
std,,12.846215,12.436145,40.479389,0.0


In [20]:
results['datetime'] = pd.to_datetime(results['datetime'])
results.describe()


Unnamed: 0,datetime,start_code,end_code,KP,prediction
count,20592,20592.0,20592.0,20592.0,20592.0
mean,2023-06-13 11:30:00,21.692308,21.118881,69.16629,0.0
min,2023-06-12 00:00:00,0.0,0.0,2.26,0.0
25%,2023-06-12 17:45:00,11.0,11.0,34.97,0.0
50%,2023-06-13 11:30:00,21.0,20.0,67.125,0.0
75%,2023-06-14 05:15:00,32.0,31.0,100.261,0.0
max,2023-06-14 23:00:00,44.0,44.0,168.082,0.0
std,,12.846215,12.436145,40.479389,0.0


In [21]:
results['datetime'] += pd.to_timedelta(1, 'd')
results.describe()



Unnamed: 0,datetime,start_code,end_code,KP,prediction
count,20592,20592.0,20592.0,20592.0,20592.0
mean,2023-06-14 11:30:00,21.692308,21.118881,69.16629,0.0
min,2023-06-13 00:00:00,0.0,0.0,2.26,0.0
25%,2023-06-13 17:45:00,11.0,11.0,34.97,0.0
50%,2023-06-14 11:30:00,21.0,20.0,67.125,0.0
75%,2023-06-15 05:15:00,32.0,31.0,100.261,0.0
max,2023-06-15 23:00:00,44.0,44.0,168.082,0.0
std,,12.846215,12.436145,40.479389,0.0


In [25]:
results = pd.merge(results, valid[['datetime', 'start_code', 'end_code', 'KP', 'is_congestion']], on=['datetime', 'start_code', 'end_code', 'KP'], how='inner')
# results["prediction"][:-1]= 1
results.describe()

Unnamed: 0,datetime,start_code,end_code,KP,prediction,is_congestion_x,is_congestion_y
count,13728,13728.0,13728.0,13728.0,13728.0,13728.0,13728.0
mean,2023-06-13 23:30:00,21.692308,21.118881,69.16629,0.999927,0.000583,0.000583
min,2023-06-13 00:00:00,0.0,0.0,2.26,0.0,0.0,0.0
25%,2023-06-13 11:45:00,11.0,11.0,34.97,1.0,0.0,0.0
50%,2023-06-13 23:30:00,21.0,20.0,67.125,1.0,0.0,0.0
75%,2023-06-14 11:15:00,32.0,31.0,100.261,1.0,0.0,0.0
max,2023-06-14 23:00:00,44.0,44.0,168.082,1.0,1.0,1.0
std,,12.84637,12.436296,40.47988,0.008535,0.024134,0.024134


In [24]:
print("null値チェック\n{}".format(results.isnull().sum()))
print("データ長チェック\n{}".format(len(results["datetime"])))

# compute F1SCORE
print('\n==================')
print('f1_score:', f1_score(results['is_congestion'], results['prediction']))
print('==================')
results.to_csv("predict.csv", index=False)
results.info()

null値チェック
datetime         0
start_code       0
end_code         0
KP               0
prediction       0
is_congestion    0
dtype: int64
データ長チェック
13728

f1_score: 0.0011649071714597743
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13728 entries, 0 to 13727
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   datetime       13728 non-null  datetime64[ns]
 1   start_code     13728 non-null  int64         
 2   end_code       13728 non-null  int64         
 3   KP             13728 non-null  float64       
 4   prediction     13728 non-null  int64         
 5   is_congestion  13728 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(4)
memory usage: 643.6 KB
