In [1]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

In [2]:
    
def make_dataset(traffic, ic_master, search_spec, search_unspec):
    # 欠損値の除外
    traffic = traffic[traffic['speed'].isnull()==False]
    ic_master.dropna(inplace=True)
    search_spec.dropna(inplace=True)
    search_unspec.dropna(inplace=True)
    
    # datetimeからdateを作成
    traffic['date'] = traffic['datetime'].apply(lambda x: x.split()[0])

    # データのマージ
    traffic = traffic.merge(ic_master, on=['start_code', 'end_code'], how='left')
    traffic = traffic.merge(search_spec, on=['datetime', 'start_code', 'end_code'], how='left')
    traffic = traffic.merge(search_unspec, on=['date', 'start_code', 'end_code'], how='left')
    traffic.sort_values(['date', 'start_code', 'end_code'], inplace=True)
    traffic.reset_index(drop=True, inplace=True)
    traffic.drop(columns='date', inplace=True)

    # データ型の変更
    traffic['datetime'] = pd.to_datetime(traffic['datetime'])

    return traffic

def expand_datetime(df):
    if 'datetime' in df.columns:
        df['year'] = df['datetime'].dt.year
        df['month'] = df['datetime'].dt.month
        df['day'] = df['datetime'].dt.day
        df['dayofyear'] = df['datetime'].dt.dayofyear
        df['weekofyear'] = df['datetime'].dt.weekofyear
        df['dayofweek'] = df['datetime'].dt.dayofweek
        df['hour'] = df['datetime'].dt.hour

    return df

In [3]:

traffic = pd.read_csv('train.csv')
search_spec = pd.read_csv('search_specified.csv')
search_unspec = pd.read_csv('search_unspecified.csv')
ic_master = pd.read_csv('road_local.csv')

# 当日の検索数を使用できるように変更(search_spec, search_unspec)
search_spec['datetime'] = pd.to_datetime(search_spec['datetime'])
search_unspec['date'] = pd.to_datetime(search_unspec['date'])
search_spec['datetime'] -= pd.to_timedelta(1, 'd')
search_unspec['date'] -= pd.to_timedelta(1, 'd')
search_spec['datetime'] = search_spec['datetime'].astype('str')
search_unspec['date'] = search_unspec['date'].astype('str')

df = make_dataset(traffic, ic_master, search_spec, search_unspec)
train_df = df 
train_df['datetime'] = pd.to_datetime(train_df['datetime'])
# 次の日の同じ時間帯のデータを参照するために、1日加算した列を作成
train_df['next_day'] = train_df['datetime'] + pd.Timedelta(days=1)


# 元のDataFrameとマージするために、一時的なDataFrameを作成
df_temp = train_df[['start_name','end_name','direction','datetime', 'is_congestion']].copy()
df_temp.rename(columns={'datetime': 'next_day', 'is_congestion': 'next_day_congestion'}, inplace=True)


# 次の日のis_congestionを含む新しい列をマージして追加
train_df = pd.merge(train_df, df_temp, on=['start_name','end_name','next_day','direction'], how='left')

# 不要になった'next_day'列を削除
train_df.drop(columns=['next_day'], inplace=True)

train_df=train_df.drop(['start_name','end_name'],axis=1)
# 'direction'の値を0と1に置き換える
replace_dict = {'下り': 0, '上り': 1} # この辞書を編集して任意の変換ルールを定義
train_df['direction'] = train_df['direction'].replace(replace_dict)
train_df = train_df[train_df['datetime'].dt.date != pd.to_datetime('2023-07-31').date()]

train_df['point'] = train_df['start_code'].astype(str) + "_" + train_df['direction'].astype(str) + "_" + train_df['end_code'].astype(str)

train_df = expand_datetime(train_df)

train_df.drop(columns=['start_lat','end_lat','start_lng','end_lng', 'start_code', 'end_code','direction'], inplace=True)

  df['weekofyear'] = df['datetime'].dt.weekofyear


In [4]:

train_df.columns

Index(['datetime', 'KP', 'OCC', 'allCars', 'speed', 'is_congestion',
       'road_code', 'limit_speed', 'start_KP', 'end_KP', 'start_pref_code',
       'end_pref_code', 'start_degree', 'end_degree', 'search_specified',
       'search_unspecified', 'next_day_congestion', 'point', 'year', 'month',
       'day', 'dayofyear', 'weekofyear', 'dayofweek', 'hour'],
      dtype='object')

In [5]:
train_df

Unnamed: 0,datetime,KP,OCC,allCars,speed,is_congestion,road_code,limit_speed,start_KP,end_KP,...,search_unspecified,next_day_congestion,point,year,month,day,dayofyear,weekofyear,dayofweek,hour
0,2021-04-08 00:00:00,5.47,2.000000,528,86.272212,0,1040,100.0,4.8,10.5,...,3417.0,0.0,1040013_0_1040016,2021,4,8,98,14,3,0
1,2021-04-08 00:00:00,5.47,2.000000,528,86.272212,0,1040,100.0,4.8,10.5,...,3417.0,0.0,1040013_0_1040016,2021,4,8,98,14,3,0
2,2021-04-08 00:00:00,5.47,2.000000,528,86.272212,0,1040,100.0,4.8,10.5,...,3417.0,0.0,1040013_0_1040016,2021,4,8,98,14,3,0
3,2021-04-08 01:00:00,5.47,2.000000,462,85.455724,0,1040,100.0,4.8,10.5,...,3417.0,0.0,1040013_0_1040016,2021,4,8,98,14,3,1
4,2021-04-08 01:00:00,5.47,2.000000,462,85.455724,0,1040,100.0,4.8,10.5,...,3417.0,0.0,1040013_0_1040016,2021,4,8,98,14,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24307195,2023-07-30 22:00:00,125.37,1.083333,268,98.327138,0,1800,80.0,120.7,125.9,...,2463.0,0.0,1800106_0_1800111,2023,7,30,211,30,6,22
24307196,2023-07-30 22:00:00,125.37,1.083333,268,98.327138,0,1800,80.0,120.7,125.9,...,2463.0,0.0,1800106_0_1800111,2023,7,30,211,30,6,22
24307197,2023-07-30 23:00:00,125.37,1.000000,228,96.528384,0,1800,80.0,120.7,125.9,...,2463.0,0.0,1800106_0_1800111,2023,7,30,211,30,6,23
24307198,2023-07-30 23:00:00,125.37,1.000000,228,96.528384,0,1800,80.0,120.7,125.9,...,2463.0,0.0,1800106_0_1800111,2023,7,30,211,30,6,23


In [6]:
X = train_df.drop(['datetime','next_day_congestion'],axis=1)
y = train_df['next_day_congestion']

In [7]:
# データセットを訓練セットとテストセットに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 欠損値処理と特徴量のスケーリングを行うパイプラインの作成
pipeline = make_pipeline(
    SimpleImputer(strategy="mean"), # 欠損値を平均値で補完
    StandardScaler(), # 特徴量のスケーリング
    XGBClassifier(eval_metric='mlogloss')
)

# パイプラインを使用して訓練データでモデルを訓練
pipeline.fit(X_train, y_train)

In [8]:
# モデルのテストセットに対する予測
predictions = pipeline.predict(X_test)

# 評価指標の計算
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)  # マルチクラスの場合は'weighted'を使用
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)

print(f"Model Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print("Confusion Matrix:")
print(conf_matrix)

# クラスごとの性能評価
print("\nClass-wise Performance Metrics:")
print(classification_report(y_test, predictions, target_names=['Class 0', 'Class 1']))

Model Accuracy: 0.9973088220774091
F1 Score: 0.43356280036368366
Precision: 0.7328747072599532
Recall: 0.30783891792191825
Confusion Matrix:
[[4843350    1825]
 [  11258    5007]]

Class-wise Performance Metrics:
              precision    recall  f1-score   support

     Class 0       1.00      1.00      1.00   4845175
     Class 1       0.73      0.31      0.43     16265

    accuracy                           1.00   4861440
   macro avg       0.87      0.65      0.72   4861440
weighted avg       1.00      1.00      1.00   4861440



In [9]:
param_distributions = {
    'xgbclassifier__learning_rate': uniform(0.01, 0.2),  # 0.01から0.2までの一様分布
    'xgbclassifier__n_estimators': randint(100, 1000),  # 100から1000までの整数
    'xgbclassifier__max_depth': randint(3, 10),  # 3から10までの整数
}


In [None]:
# RandomizedSearchCVのインスタンス化
random_search = RandomizedSearchCV(
    pipeline, param_distributions=param_distributions, n_iter=100, cv=5, verbose=1, n_jobs=-1, random_state=42
)

# チューニングの実行
random_search.fit(X_train, y_train)
print("Best Parameters:", random_search.best_params_)
best_model = random_search.best_estimator_
predictions = best_model.predict(X_test)

# 評価指標の再計算
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)

print(f"Optimized Model Accuracy: {accuracy}")
print(f"Optimized F1 Score: {f1}")
print(f"Optimized Precision: {precision}")
print(f"Optimized Recall: {recall}")


Fitting 5 folds for each of 100 candidates, totalling 500 fits
