# LightGBM・XGBoost・CatBoost の実装

In [29]:
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import numpy as np
import pandas as pd
from pathlib import Path
import pickle

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder, OneHotEncoder

import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor

In [30]:
# Create the output directories used by this notebook. Unlike a bare `mkdir`,
# this does not fail (or print "File exists") when the notebook is re-run.
Path('oof').mkdir(parents=True, exist_ok=True)
Path('models').mkdir(parents=True, exist_ok=True)

mkdir: oof: File exists
mkdir: models: File exists


In [31]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    """Central configuration: paths, CV settings and per-library hyper-parameters.

    The attributes are read as class attributes (``CFG.seed`` etc.); the class is
    never instantiated in this notebook. The parameter dictionaries are built
    while the class body executes, so they capture the values of ``seed``,
    ``boosting_type`` and ``num_boost_round`` defined above them.
    """
    # Version tag embedded in the names of saved model / encoder / OOF files.
    VER = 1
    # Directory the input CSV files are read from.
    DATA_PATH = Path('.')
    # Directory presumably intended for out-of-fold predictions (an `oof` folder is created earlier).
    OOF_DATA_PATH = Path('./oof')
    # Directory the pickled models and encoders are written to and loaded from.
    MODEL_DATA_PATH = Path('./models')
    # Methods iterated over for both training and inference.
    METHOD_LIST = ['lightgbm', 'xgboost', 'catboost']
    # Seed used for the RNGs, the KFold shuffle and each library's own seed parameter.
    seed = 42
    # Number of cross-validation folds (also the number of models averaged at inference).
    n_folds = 5
    # Name of the target column.
    target_col = 'attendance'
    # When False, the home/away_team_player1..11 columns are dropped before training.
    USE_PLAYER_FEATURES = False
    # Not referenced by the code visible in this notebook.
    metric_maximize_flag = False
    # Upper bound on boosting rounds (also used as CatBoost's `iterations`).
    num_boost_round = 50500
    # Early-stopping patience passed to all three libraries.
    early_stopping_round = 500
    # Logging period passed as `verbose_eval` / `verbose` to the training calls.
    verbose = 2000
    boosting_type = 'gbdt' # 'dart'
    # Parameters passed to lgb.train.
    lgb_params = {
        'objective': 'regression',
        'metric': 'l2',
        'boosting': boosting_type,
        'learning_rate': 0.005,
        'num_leaves': 5,
        'feature_fraction': 0.50,
        'bagging_fraction': 0.80,
        'lambda_l1': 2,
        'lambda_l2': 4,
        'n_jobs': -1,
        'min_data_in_leaf': 40,
        'bagging_freq': 10,
        'seed': seed,
    }
    # Parameters passed to xgb.train.
    # NOTE(review): XGBoost documents `learning_rate` as an alias of `eta`, yet both
    # are set here with different values (0.005 vs 0.03) — confirm which one takes
    # effect and remove the other.
    xgb_params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'learning_rate': 0.005,
        'max_depth': 4,
        'colsample_bytree': 0.50,
        'subsample': 0.80,
        'eta': 0.03,
        'gamma': 1.5,
        'lambda': 70,
        'min_child_weight': 8,
        'random_state': seed,
    }

    # Keyword arguments unpacked into CatBoostRegressor.
    cat_params = {
        'loss_function': 'RMSE',
        'learning_rate': 0.005,
        'iterations': num_boost_round,
        'depth': 4,
        'colsample_bylevel': 0.50,
        'subsample': 0.80,
        'l2_leaf_reg': 3,
        'random_seed': seed
    }


In [32]:
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed the global Python and NumPy RNGs and export PYTHONHASHSEED.

    Args:
        seed: Integer seed handed to ``random.seed`` and ``np.random.seed``; its
            string form is stored in ``os.environ['PYTHONHASHSEED']``.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # Only the environment variable is written here; it is picked up by Python
    # processes started afterwards that inherit this environment.
    os.environ['PYTHONHASHSEED'] = str(seed)
# Seed the global RNGs once with the configured seed.
seed_everything(CFG.seed)

In [33]:
# ====================================================
# LightGBM Metric
# ====================================================
def lgb_metric(y_pred, y_true):
    """RMSE evaluation function returning ``(name, value, is_higher_better)``.

    Presumably intended as a LightGBM custom ``feval``; it is not referenced by
    the training code visible in this notebook.

    Args:
        y_pred: Predicted values.
        y_true: Dataset-like object exposing ``get_label()`` with the true targets.

    Returns:
        Tuple ``('rmse', rmse_value, False)``; ``False`` marks lower as better.
    """
    labels = y_true.get_label()
    rmse = np.sqrt(mean_squared_error(labels, y_pred))
    return 'rmse', rmse, False

def xgb_metric(y_pred, y_true):
    """RMSE evaluation function returning ``(name, value)``.

    Presumably intended as an XGBoost custom metric; it is not referenced by the
    training code visible in this notebook.

    Args:
        y_pred: Predicted values.
        y_true: DMatrix-like object exposing ``get_label()`` with the true targets.

    Returns:
        Tuple ``('rmse', rmse_value)``.
    """
    labels = y_true.get_label()
    rmse = np.sqrt(mean_squared_error(labels, y_pred))
    return 'rmse', rmse

# ====================================================
# Catboost Metric
# ====================================================
class CatboostMetric(object):
    """RMSE written against CatBoost's custom-metric protocol.

    Not referenced by the training code visible in this notebook.
    """

    def get_final_error(self, error, weight):
        """Return the accumulated error unchanged (the weight is ignored)."""
        return error

    def is_max_optimal(self):
        """Return False: smaller RMSE is better."""
        return False

    def evaluate(self, approxes, target, weight):
        """Compute the RMSE between the predictions and ``target``.

        Args:
            approxes: Either a list holding one container of predictions per
                prediction dimension (the form CatBoost documents for custom
                metrics; a single dimension for regression) or a flat sequence
                of predictions.
            target: Container of true target values.
            weight: Unused.

        Returns:
            Tuple ``(rmse, 0)``; the second element is the weight slot, which
            ``get_final_error`` ignores.
        """
        # Passing the outer one-element list straight to mean_squared_error would
        # compare a (1, n) array with n targets, so unwrap the single dimension.
        # A flat sequence of numbers is still accepted as-is.
        if len(approxes) == 1 and hasattr(approxes[0], '__len__'):
            predictions = approxes[0]
        else:
            predictions = approxes
        error = np.sqrt(mean_squared_error(np.array(target), np.array(predictions)))
        return error, 0

In [34]:
def lightgbm_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    """Fit the encoders and a LightGBM booster on one fold and predict its validation split.

    Args:
        x_train: Training features of the fold.
        y_train: Training targets of the fold.
        x_valid: Validation features of the fold.
        y_valid: Validation targets of the fold.
        features: Unused here; kept so all ``*_training`` functions share a signature.
        categorical_features: Columns handed to the one-hot encoder.

    Returns:
        Tuple ``(model, valid_pred, one_hot_enc, target_enc)``. The imputer fitted
        here is not returned.
    """
    # Step 1: one-hot encode the categorical columns (fitted on the training split only).
    one_hot_enc = OneHotEncoder(cols=categorical_features, handle_unknown='indicator')
    train_onehot = one_hot_enc.fit_transform(x_train)
    valid_onehot = one_hot_enc.transform(x_valid)

    # Step 2: target-encode every column of the one-hot encoded frame.
    target_enc = TargetEncoder(cols=train_onehot.columns, handle_unknown='value', handle_missing='value')
    train_encoded = target_enc.fit_transform(train_onehot, y_train)
    valid_encoded = target_enc.transform(valid_onehot)

    # Step 3: replace any remaining missing values with the constant -1.
    imputer = SimpleImputer(strategy='constant', fill_value=-1)
    train_matrix = imputer.fit_transform(train_encoded)
    valid_matrix = imputer.transform(valid_encoded)

    # Step 4: train with early stopping monitored on the validation split.
    # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keyword arguments
    # were removed from lgb.train in LightGBM 4.0 — confirm the pinned version.
    train_set = lgb.Dataset(train_matrix, label=y_train)
    valid_set = lgb.Dataset(valid_matrix, label=y_valid)
    booster = lgb.train(
        CFG.lgb_params,
        train_set,
        num_boost_round=CFG.num_boost_round,
        valid_sets=[valid_set],
        early_stopping_rounds=CFG.early_stopping_round,
        verbose_eval=CFG.verbose,
    )

    # Step 5: predict the validation split.
    return booster, booster.predict(valid_matrix), one_hot_enc, target_enc

def xgboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    """Fit the encoders and an XGBoost booster on one fold and predict its validation split.

    Args:
        x_train: Training features of the fold.
        y_train: Training targets of the fold.
        x_valid: Validation features of the fold.
        y_valid: Validation targets of the fold.
        features: Unused here; kept so all ``*_training`` functions share a signature.
        categorical_features: Columns handed to the one-hot encoder.

    Returns:
        Tuple ``(model, valid_pred, one_hot_enc, target_enc)``. ``valid_pred`` is
        computed with the trees up to and including the best iteration when the
        booster exposes one. The imputer fitted here is not returned.
    """
    # OneHot encode categorical features
    one_hot_enc = OneHotEncoder(cols=categorical_features, handle_unknown='indicator')
    x_train = one_hot_enc.fit_transform(x_train)
    x_valid = one_hot_enc.transform(x_valid)

    # Initialize target encoder and imputer
    target_enc = TargetEncoder(cols=x_train.columns, handle_unknown='value', handle_missing='value')
    imputer = SimpleImputer(strategy='constant', fill_value=-1)

    # Apply target encoding to every column, then fill remaining gaps with -1
    x_train = target_enc.fit_transform(x_train, y_train)
    x_valid = target_enc.transform(x_valid)
    x_train = imputer.fit_transform(x_train)
    x_valid = imputer.transform(x_valid)

    # Create XGBoost datasets
    train_data = xgb.DMatrix(x_train, label=y_train)
    valid_data = xgb.DMatrix(x_valid, label=y_valid)

    # Train model; the last entry of `evals` is the one early stopping monitors
    model = xgb.train(CFG.xgb_params, train_data, num_boost_round=CFG.num_boost_round, evals=[(train_data, 'train'), (valid_data, 'eval')], early_stopping_rounds=CFG.early_stopping_round, verbose_eval=CFG.verbose)

    # Predict validation with the best iteration only. Without `iteration_range`,
    # the native Booster.predict of XGBoost releases before 2.0 uses every boosted
    # round, including those trained after the best iteration, which would make the
    # out-of-fold predictions inconsistent with inference.
    best_iteration = getattr(model, 'best_iteration', None)
    if best_iteration is None:
        valid_pred = model.predict(valid_data)
    else:
        valid_pred = model.predict(valid_data, iteration_range=(0, best_iteration + 1))
    return model, valid_pred, one_hot_enc, target_enc

def catboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    """Fit a CatBoost regressor on one fold and predict its validation split.

    The categorical columns are passed to CatBoost directly via ``cat_features``;
    no external encoder is fitted here.

    Args:
        x_train: Training features of the fold.
        y_train: Training targets of the fold.
        x_valid: Validation features of the fold.
        y_valid: Validation targets of the fold.
        features: Unused here; kept so all ``*_training`` functions share a signature.
        categorical_features: Columns declared to CatBoost as categorical.

    Returns:
        Tuple ``(model, valid_pred)``.
    """
    regressor = CatBoostRegressor(cat_features=categorical_features, **CFG.cat_params)
    # Early stopping is monitored on the validation split.
    regressor.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        early_stopping_rounds=CFG.early_stopping_round,
        verbose=CFG.verbose,
    )
    return regressor, regressor.predict(x_valid)

def gradient_boosting_model_cv_training(method: str, train_df: pd.DataFrame, features: list, categorical_features: list):
    """Run K-fold training for one method and persist models, encoders and OOF predictions.

    For every fold the trained model is pickled under ``CFG.MODEL_DATA_PATH``; for
    'lightgbm' and 'xgboost' the fitted ``(one_hot_enc, target_enc)`` pair is pickled
    next to it. After the last fold the out-of-fold RMSE is printed and the
    out-of-fold predictions are written as CSV under ``CFG.OOF_DATA_PATH``.

    Args:
        method: One of 'lightgbm', 'xgboost' or 'catboost'.
        train_df: Training frame containing 'id', ``CFG.target_col`` and ``features``.
        features: Feature columns to train on.
        categorical_features: Categorical columns forwarded to the training function.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Fail early with a clear message instead of a NameError after the first fold split.
    if method not in ('lightgbm', 'xgboost', 'catboost'):
        raise ValueError(f'Unsupported method: {method!r}')

    # Create numpy arrays to store out of folds predictions and fold assignments
    oof_predictions = np.zeros(len(train_df))
    oof_fold = np.zeros(len(train_df))
    kfold = KFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
    for fold, (train_index, valid_index) in enumerate(kfold.split(train_df, train_df[CFG.target_col])):
        print('-'*50)
        print(f'{method} training fold {fold + 1}')
        x_train = train_df[features].iloc[train_index]
        y_train = train_df[CFG.target_col].iloc[train_index]
        x_valid = train_df[features].iloc[valid_index]
        y_valid = train_df[CFG.target_col].iloc[valid_index]

        # Only lightgbm and xgboost fit external encoders that must be saved.
        fold_encoders = None
        if method == 'lightgbm':
            model, valid_pred, one_hot_enc, target_enc = lightgbm_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
            fold_encoders = (one_hot_enc, target_enc)
        elif method == 'xgboost':
            model, valid_pred, one_hot_enc, target_enc = xgboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
            fold_encoders = (one_hot_enc, target_enc)
        else:
            model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)

        # Save the fold's model (and encoders); `with` closes each file deterministically.
        artifact_tag = f'fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}'
        with open(CFG.MODEL_DATA_PATH / f'{method}_{artifact_tag}.pkl', 'wb') as model_file:
            pickle.dump(model, model_file)
        if fold_encoders is not None:
            with open(CFG.MODEL_DATA_PATH / f'{method}_encoders_{artifact_tag}.pkl', 'wb') as encoder_file:
                pickle.dump(fold_encoders, encoder_file)

        # Add to out of folds array
        oof_predictions[valid_index] = valid_pred
        oof_fold[valid_index] = fold + 1
        del x_train, x_valid, y_train, y_valid, model, valid_pred, fold_encoders
        gc.collect()

    # Compute out of folds metric
    score = np.sqrt(mean_squared_error(train_df[CFG.target_col], oof_predictions))
    print(f'{method} our out of folds CV score is {score}')
    # Store the out of folds predictions in the dedicated OOF directory
    # (previously this file was written under CFG.MODEL_DATA_PATH).
    oof_df = pd.DataFrame({'id': train_df['id'], CFG.target_col: train_df[CFG.target_col], f'{method}_prediction': oof_predictions, 'fold': oof_fold})
    oof_df.to_csv(CFG.OOF_DATA_PATH / f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv', index = False)

In [35]:
# Load every input table from CFG.DATA_PATH so that relocating the data only
# requires changing the configuration.
train_df = pd.read_csv(CFG.DATA_PATH / 'train.csv')
venue_info_df = pd.read_csv(CFG.DATA_PATH / 'venue_information.csv')
test_df = pd.read_csv(CFG.DATA_PATH / 'test.csv')
# Sentinel target value: rows with -1 are recognised as test rows when the
# combined frame is split again later.
test_df[CFG.target_col] = -1
match_reports_df = pd.read_csv(CFG.DATA_PATH / 'match_reports.csv')
holidays_in_japan_df = pd.read_csv(CFG.DATA_PATH / 'holidays_in_japan.csv')
submission_df = pd.read_csv(CFG.DATA_PATH / 'sample_submit.csv')
# Stack train and test so the joins and feature engineering are applied to both.
all_df = pd.concat([train_df, test_df])

In [36]:
# Join the match reports onto all_df using the 'id' column.
all_df = all_df.merge(match_reports_df, on='id', how='left')

# Join the venue information onto all_df using the 'venue' column.
all_df = all_df.merge(venue_info_df, on='venue', how='left')

# Reduce both date columns to calendar days (parse -> keep the date part ->
# convert back to datetime) so they can be matched in the holiday join.
all_df['match_date'] = pd.to_datetime(pd.to_datetime(all_df['match_date']).dt.date)
holidays_in_japan_df['holiday_date'] = pd.to_datetime(
    pd.to_datetime(holidays_in_japan_df['holiday_date']).dt.date
)

# Join the holiday table: rows whose match_date equals a holiday_date receive its columns.
all_df = all_df.merge(holidays_in_japan_df, left_on='match_date', right_on='holiday_date', how='left')

In [37]:
import feature_engineering as fe
# Run the project's feature-engineering step followed by its periodic-feature step.
all_df = fe.process_periodic_features(fe.apply_feature_engineering(all_df))
# Build the 'prefecture' column by applying fe.extract_prefecture to each address.
all_df["prefecture"] = all_df["address"].apply(fe.extract_prefecture)

# Remove the raw columns that are not kept as model inputs.
all_df = all_df.drop(columns=['venue', 'address', 'description', 'match_date', 'kick_off_time'])

In [38]:
if not CFG.USE_PLAYER_FEATURES:
    # Drop all 22 player columns in one call instead of copying the frame once per column.
    player_columns = [f'{side}_team_player{i}' for i in range(1, 12) for side in ('home', 'away')]
    all_df = all_df.drop(columns=player_columns)

# Split back into train and test rows using the -1 sentinel written into the
# target column of the test rows. `.copy()` makes both frames independent of
# all_df, so adding columns to them later does not write into a slice.
train_df = all_df[all_df[CFG.target_col] != -1].copy()
test_df = all_df[all_df[CFG.target_col] == -1].copy()

# Every column except the identifier and the target is used as a feature.
features = train_df.columns.drop(['id', CFG.target_col]).tolist()

# Columns of dtype 'object' (strings) or 'category' are treated as categorical.
# Only the feature columns are inspected, so a string-typed identifier can never
# end up in this list.
categorical_features = train_df[features].select_dtypes(include=['object', 'category']).columns.tolist()

In [39]:
# Run the K-fold training routine once per configured method; models, encoders
# and out-of-fold predictions are written to disk by the routine itself.
for method in CFG.METHOD_LIST:
    gradient_boosting_model_cv_training(method, train_df, features, categorical_features)

--------------------------------------------------
lightgbm training fold 1
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 936
[LightGBM] [Info] Number of data points in the train set: 2692, number of used features: 113
[LightGBM] [Info] Start training from score 18000.601783
Training until validation scores don't improve for 500 rounds
[2000]	valid_0's l2: 2.04327e+07
[4000]	valid_0's l2: 1.893e+07
[6000]	valid_0's l2: 1.79546e+07
[8000]	valid_0's l2: 1.74196e+07
[10000]	valid_0's l2: 1.71219e+07
[12000]	valid_0's l2: 1.68539e+07
[14000]	valid_0's l2: 1.66757e+07
[16000]	valid_0's l2: 1.65276e+07
Early stopping, best iteration is:
[17020]	valid_0's l2: 1.64614e+07
--------------------------------------------------
lightgbm training fold 2
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total 

In [40]:
def apply_encoders(data: pd.DataFrame, one_hot_enc: OneHotEncoder, target_enc: TargetEncoder):
    """Transform ``data`` with an already fitted one-hot encoder, then a fitted target encoder.

    The encoders receive a copy, so the caller's frame is not what gets transformed.

    Args:
        data: Raw feature frame.
        one_hot_enc: Fitted one-hot encoder, applied first.
        target_enc: Fitted target encoder, applied to the one-hot encoded output.

    Returns:
        The encoded data as returned by ``target_enc.transform``.
    """
    one_hot_encoded = one_hot_enc.transform(data.copy())
    return target_enc.transform(one_hot_encoded)

In [47]:
def lightgbm_inference(x_test: pd.DataFrame):
    """Average the predictions of the saved per-fold LightGBM models.

    Args:
        x_test: Raw (un-encoded) test features.

    Returns:
        Array with the mean prediction over ``CFG.n_folds`` models.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        artifact_tag = f'fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}'
        # NOTE: pickle.load can execute arbitrary code; only load artifacts that
        # were written by this notebook. `with` closes each file after loading.
        with open(CFG.MODEL_DATA_PATH / f'lightgbm_{artifact_tag}.pkl', 'rb') as model_file:
            model = pickle.load(model_file)
        with open(CFG.MODEL_DATA_PATH / f'lightgbm_encoders_{artifact_tag}.pkl', 'rb') as encoder_file:
            one_hot_enc, target_enc = pickle.load(encoder_file)
        # Each fold has its own fitted encoders, so encode the test set per fold.
        x_test_enc = apply_encoders(x_test, one_hot_enc, target_enc)
        test_pred += model.predict(x_test_enc)
    return test_pred / CFG.n_folds

def xgboost_inference(x_test: pd.DataFrame):
    """Average the predictions of the saved per-fold XGBoost models.

    Each model predicts with the trees up to and including its best iteration
    when the booster exposes one.

    Args:
        x_test: Raw (un-encoded) test features.

    Returns:
        Array with the mean prediction over ``CFG.n_folds`` models.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        artifact_tag = f'fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}'
        # NOTE: pickle.load can execute arbitrary code; only load artifacts that
        # were written by this notebook. `with` closes each file after loading.
        with open(CFG.MODEL_DATA_PATH / f'xgboost_{artifact_tag}.pkl', 'rb') as model_file:
            model = pickle.load(model_file)
        with open(CFG.MODEL_DATA_PATH / f'xgboost_encoders_{artifact_tag}.pkl', 'rb') as encoder_file:
            one_hot_enc, target_enc = pickle.load(encoder_file)
        # Each fold has its own fitted encoders, so encode the test set per fold.
        x_test_enc = apply_encoders(x_test, one_hot_enc, target_enc)
        dtest = xgb.DMatrix(x_test_enc)
        # `Booster.best_ntree_limit` no longer exists in XGBoost 2.0+; the
        # equivalent range is derived from `best_iteration` (0-based, so +1).
        best_iteration = getattr(model, 'best_iteration', None)
        if best_iteration is None:
            test_pred += model.predict(dtest)
        else:
            test_pred += model.predict(dtest, iteration_range=(0, best_iteration + 1))
    return test_pred / CFG.n_folds

def catboost_inference(x_test: pd.DataFrame):
    """Average the predictions of the saved per-fold CatBoost models.

    Args:
        x_test: Test features, passed to the models without external encoding.

    Returns:
        Array with the mean prediction over ``CFG.n_folds`` models.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        # NOTE: pickle.load can execute arbitrary code; only load artifacts that
        # were written by this notebook. `with` closes the file after loading.
        with open(CFG.MODEL_DATA_PATH / f'catboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb') as model_file:
            model = pickle.load(model_file)
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds

def gradient_boosting_model_inference(method: str, test_df: pd.DataFrame, features: list, categorical_features: list):
    """Dispatch to the inference routine of the requested method.

    Args:
        method: One of 'lightgbm', 'xgboost' or 'catboost'.
        test_df: Test frame; only the ``features`` columns are used.
        features: Feature columns to select from ``test_df``.
        categorical_features: Unused; kept for signature compatibility.

    Returns:
        The fold-averaged predictions returned by the method's inference routine.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    x_test = test_df[features]
    if method == 'lightgbm':
        return lightgbm_inference(x_test)
    if method == 'xgboost':
        return xgboost_inference(x_test)
    if method == 'catboost':
        return catboost_inference(x_test)
    # Previously an unknown method fell through and failed on the unassigned result variable.
    raise ValueError(f'Unsupported method: {method!r}')


# Work on an independent copy so the prediction columns are not assigned into a
# slice of all_df.
test_df = test_df.copy()

# Fold-averaged test predictions for every configured method.
for method in CFG.METHOD_LIST:
    test_df[f'{method}_pred'] = gradient_boosting_model_inference(method, test_df, features, categorical_features)

# Blend weights per method; the terms are summed in this order.
ENSEMBLE_WEIGHTS = {'lightgbm': 0.4, 'xgboost': 0.2, 'catboost': 0.4}
test_df['final_pred'] = sum(weight * test_df[f'{name}_pred'] for name, weight in ENSEMBLE_WEIGHTS.items())

# Save the result as a header-less two-column CSV (id, prediction).
test_df[['id','final_pred']].to_csv('submission.csv', index=False, header=False)


Unnamed: 0,19075,37779.411026584334
0,19076,18255.540461
1,19077,25724.500203
2,19078,17926.242928
3,19079,18044.249286
4,19080,14721.822971
...,...,...
453,20893,28927.997498
454,20894,20255.052594
455,20895,26215.179150
456,20896,28748.995518
