# LightGBM の実装

In [124]:
# ====================================================
# Library
# ====================================================
import os
import gc
import datetime
import warnings
warnings.filterwarnings('ignore')
import random
import numpy as np
import pandas as pd
from pathlib import Path
import pickle

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from category_encoders import CatBoostEncoder, OneHotEncoder, TargetEncoder

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

In [125]:
!mkdir oof
!mkdir models

mkdir: oof: File exists
mkdir: models: File exists


In [126]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    DATA_PATH = Path('.')
    OOF_DATA_PATH = Path('./oof')
    MODEL_DATA_PATH = Path('./models')
    METHOD_LIST = ['lightgbm', 'xgboost', 'catboost']
    seed = random.randint(0, 100)
    n_folds = 10
    target_col = 'attendance'
    USE_PLAYER_FEATURES = False
    num_boost_round = 1000000000
    early_stopping_round = 100
    verbose = 2000
    boosting_type = 'dart'

    lgb_params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting': boosting_type,
        'n_jobs': -1,
        'seed': seed,
        # 'num_leaves': 68,
        # 'learning_rate': 0.0917207610719146,
        # 'n_estimators': 126,
        # 'reg_alpha': 2.6106934092144427,
        # 'reg_lambda': 0.00446229217366385,
        # 'max_depth': 8,
        # 'subsample': 0.4404671460367654,
        # 'colsample_bytree': 0.23342095401809768,
        # 'min_child_samples': 24
    }

    xgb_params = {
        'objective': 'reg:squarederro',
        'metric': 'rmse',
        'boosting': boosting_type,
        'n_jobs': -1,
        'seed': seed,
        # 'learning_rate': 0.23498767612469987,
        # 'n_estimators': 204,
        # 'reg_alpha': 0.005238591899555238,
        # 'reg_lambda': 0.12160245524657458,
        # 'max_depth': 3,
        # 'subsample': 0.351775011081568,
        # 'colsample_bytree': 0.6249935770702273
    }

    cat_params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting': boosting_type,
        'n_jobs': -1,
        'seed': seed,
        # 'learning_rate': 0.061465958510475996,
        # 'n_estimators': 268,
        # 'reg_lambda': 0.0743319710342665,
        # 'max_depth': 8,
        # 'subsample': 0.9826703311621121,
        # 'colsample_bylevel': 0.6484589738878385
    }

In [127]:
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
seed_everything(CFG.seed)

In [128]:
# ====================================================
# LightGBM Metric
# ====================================================
def lgb_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'rmse', np.sqrt(mean_squared_error(y_true, y_pred)), False

def xgb_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'rmse', np.sqrt(mean_squared_error(y_true, y_pred))

# ====================================================
# Catboost Metric
# ====================================================
class CatboostMetric(object):
    def get_final_error(self, error, weight): return error
    def is_max_optimal(self): return False
    def evaluate(self, approxes, target, weight):
        error = np.sqrt(mean_squared_error(np.array(target), approxes))
        return error, 0

In [129]:
def lightgbm_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame):
    # OneHot encode categorical features
    one_hot_enc = OneHotEncoder(cols=x_train.columns, handle_unknown='indicator')
    x_train = one_hot_enc.fit_transform(x_train)
    x_valid = one_hot_enc.transform(x_valid)

    # Create LightGBM datasets
    train_data = lgb.Dataset(x_train, label=y_train)
    valid_data = lgb.Dataset(x_valid, label=y_valid)

    # Train model
    model = lgb.train(CFG.lgb_params, train_data, num_boost_round=CFG.num_boost_round, valid_sets=[valid_data], early_stopping_rounds=CFG.early_stopping_round, verbose_eval=CFG.verbose)

    # Predict validation
    valid_pred = model.predict(x_valid)
    return model, valid_pred

def xgboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame):
    # OneHot encode categorical features
    one_hot_enc = OneHotEncoder(cols=x_train.columns, handle_unknown='indicator')
    x_train = one_hot_enc.fit_transform(x_train)
    x_valid = one_hot_enc.transform(x_valid)

    # Create XGBoost datasets
    train_data = xgb.DMatrix(x_train, label=y_train)
    valid_data = xgb.DMatrix(x_valid, label=y_valid)

    # Train model
    model = xgb.train(CFG.xgb_params, train_data, num_boost_round=CFG.num_boost_round, evals=[(train_data, 'train'), (valid_data, 'eval')], early_stopping_rounds=CFG.early_stopping_round, verbose_eval=CFG.verbose)

    # Predict validation
    valid_pred = model.predict(valid_data)
    return model, valid_pred

def catboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, categorical_features: list):
    # Define model
    model = CatBoostRegressor(CFG.cat_params, cat_features=categorical_features)

    # Fit model
    model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], early_stopping_rounds=CFG.early_stopping_round, verbose=CFG.verbose)

    # Predict validation
    valid_pred = model.predict(x_valid)

    return model, valid_pred

def gradient_boosting_model_cv_training(method: str, train_df: pd.DataFrame, features: list, categorical_features: list):
    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train_df))
    oof_fold = np.zeros(len(train_df))
    kfold = KFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
    encoders = {}
    for fold, (train_index, valid_index) in enumerate(kfold.split(train_df, train_df[CFG.target_col])):
        print('-'*50)
        print(f'{method} training fold {fold + 1}')
        x_train = train_df[features].iloc[train_index]
        y_train = train_df[CFG.target_col].iloc[train_index]
        x_valid = train_df[features].iloc[valid_index]
        y_valid = train_df[CFG.target_col].iloc[valid_index]
        if method == 'lightgbm':
            model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid)
        if method == 'xgboost':
            model, valid_pred = xgboost_training(x_train, y_train, x_valid, y_valid)
        if method == 'catboost':
            model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid, categorical_features)

        # Save best model
        pickle.dump(model, open(CFG.MODEL_DATA_PATH / f'{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.boosting_type}.pkl', 'wb'))
        # Save encoders only for lightgbm and xgboost
        # if method in ['lightgbm', 'xgboost']:
        #     pickle.dump(encoders[fold], open(CFG.MODEL_DATA_PATH / f'{method}_encoders_fold{fold + 1}_seed{CFG.seed}_ver{CFG.boosting_type}.pkl', 'wb'))

        # Add to out of folds array
        oof_predictions[valid_index] = valid_pred
        oof_fold[valid_index] = fold + 1
        del x_train, x_valid, y_train, y_valid, model, valid_pred
        gc.collect()

    # Compute out of folds metric
    score = np.sqrt(mean_squared_error(train_df[CFG.target_col], oof_predictions))
    print(f'{method} our out of folds CV score is {score}')
    # Create a dataframe to store out of folds predictions
    # oof_df = pd.DataFrame({'id': train_df['id'], CFG.target_col: train_df[CFG.target_col], f'{method}_prediction': oof_predictions, 'fold': oof_fold})
    # oof_df.to_csv(CFG.MODEL_DATA_PATH / f'oof_{method}_seed{CFG.seed}_ver{CFG.boosting_type}.csv', index = False)

In [130]:
train_df = pd.read_csv(CFG.DATA_PATH / 'train.csv')
venue_info_df = pd.read_csv(CFG.DATA_PATH / 'venue_information.csv')
test_df = pd.read_csv(CFG.DATA_PATH / 'test.csv')
test_df[CFG.target_col] = -1
match_reports_df = pd.read_csv('match_reports.csv')
holidays_in_japan_df = pd.read_csv('holidays_in_japan.csv')
submission_df = pd.read_csv(CFG.DATA_PATH / 'sample_submit.csv')
all_df = pd.concat([train_df, test_df])

In [131]:
# match_reports_df を 'id' カラムで all_df と結合します
all_df = pd.merge(all_df, match_reports_df, on='id', how='left')

# venue_info_df を 'venue' カラムで all_df と結合します
all_df = pd.merge(all_df, venue_info_df, on='venue', how='left')

# holidays_in_japan_df を 'match_date' カラムで all_df と結合します
all_df['match_date'] = pd.to_datetime(all_df['match_date'])
holidays_in_japan_df['holiday_date'] = pd.to_datetime(holidays_in_japan_df['holiday_date'])
all_df['match_date'] = all_df['match_date'].dt.date
holidays_in_japan_df['holiday_date'] = holidays_in_japan_df['holiday_date'].dt.date

# もう一度 datetime 型に戻します
all_df['match_date'] = pd.to_datetime(all_df['match_date'])
holidays_in_japan_df['holiday_date'] = pd.to_datetime(holidays_in_japan_df['holiday_date'])

all_df = pd.merge(all_df, holidays_in_japan_df, left_on='match_date', right_on='holiday_date', how='left')

In [132]:
import feature_engineering as fe
all_df = fe.apply_feature_engineering(all_df)
all_df = fe.process_periodic_features(all_df)
all_df = fe.add_geographical_features(all_df, venue_info_df)

all_df = all_df.drop(['venue', 'address', 'description', 'match_date', 'kick_off_time'], axis=1)

In [133]:
all_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 106 columns):
 #    Column                         Dtype  
---   ------                         -----  
 0    id                             int64  
 1    section                        int64  
 2    round                          int64  
 3    home_team                      object 
 4    away_team                      object 
 5    weather                        int64  
 6    temperature                    float64
 7    humidity                       float64
 8    attendance                     int64  
 9    home_team_player11             object 
 10   home_team_player10             object 
 11   home_team_player9              object 
 12   home_team_player8              object 
 13   home_team_player7              object 
 14   home_team_player6              object 
 15   home_team_player5              object 
 16   home_team_player4              object 
 17   home_team_player3              

In [134]:
if not CFG.USE_PLAYER_FEATURES:
    for i in range(1, 12):
        all_df = all_df.drop([f'home_team_player{i}'],axis=1)
        all_df = all_df.drop([f'away_team_player{i}'],axis=1)

# 最後に、訓練データとテストデータに再度分割します
train_df = all_df[all_df['attendance'] != -1]
test_df = all_df[all_df['attendance'] == -1]

# 'Id'や'Target'といった特定のカラムを除外した全てのカラムを特徴量とする場合
features = train_df.columns.drop(['id', 'attendance'])

# または、データ型が 'object'（文字列）または 'category' のカラムをカテゴリカル特徴量とする場合
categorical_features = train_df.select_dtypes(include=['object', 'category']).columns.tolist()

In [135]:
# train_df.to_csv("train_all.csv", index=False)

In [136]:
all_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 84 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   id                             3825 non-null   int64  
 1   section                        3825 non-null   int64  
 2   round                          3825 non-null   int64  
 3   home_team                      3825 non-null   object 
 4   away_team                      3825 non-null   object 
 5   weather                        3825 non-null   int64  
 6   temperature                    3825 non-null   float64
 7   humidity                       3825 non-null   float64
 8   attendance                     3825 non-null   int64  
 9   home_team_score                3825 non-null   int64  
 10  away_team_score                3825 non-null   int64  
 11  capacity                       3825 non-null   int64  
 12  スカパー!                          3825 non-null   i

In [137]:
from model_tuner import tune_model

# パラメータチューニング
# tune_model(train_df[features], train_df[CFG.target_col], model_type='xgboost', n_trials=15)

In [138]:
# パラメータチューニング
# tune_model(train_df[features], train_df[CFG.target_col], model_type='lightgbm', n_trials=50)

In [139]:
# パラメータチューニング
# tune_model(train_df[features], train_df[CFG.target_col], model_type='catboost', n_trials=50)

In [140]:
for method in CFG.METHOD_LIST:
    gradient_boosting_model_cv_training(method, train_df, features, categorical_features)

--------------------------------------------------
lightgbm training fold 1
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2264
[LightGBM] [Info] Number of data points in the train set: 3029, number of used features: 1132
[LightGBM] [Info] Start training from score 17953.603169
[2000]	valid_0's rmse: 3814.51
[4000]	valid_0's rmse: 3839.95
[6000]	valid_0's rmse: 3873.17
[8000]	valid_0's rmse: 3880.78
[10000]	valid_0's rmse: 3876.17
[12000]	valid_0's rmse: 3873.46


In [None]:
def lightgbm_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(CFG.MODEL_DATA_PATH / f'lightgbm_fold{fold + 1}_seed{CFG.seed}_ver{CFG.boosting_type}.pkl', 'rb'))
        one_hot_enc = OneHotEncoder(cols=x_test.columns, handle_unknown='indicator')
        x_test_enc = one_hot_enc.fit_transform(x_test)
        test_pred += model.predict(x_test_enc)
    return test_pred / CFG.n_folds

def xgboost_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(CFG.MODEL_DATA_PATH / f'xgboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.boosting_type}.pkl', 'rb'))
        one_hot_enc = OneHotEncoder(cols=x_test.columns, handle_unknown='indicator')
        x_test_enc = one_hot_enc.fit_transform(x_test)
        test_pred += model.predict(xgb.DMatrix(x_test_enc), iteration_range=(0, model.best_ntree_limit))
    return test_pred / CFG.n_folds

def catboost_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(CFG.MODEL_DATA_PATH / f'catboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.boosting_type}.pkl', 'rb'))
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds

def gradient_boosting_model_inference(method: str, test_df: pd.DataFrame, features: list):
    x_test = test_df[features]
    if method in ['lightgbm', 'xgboost']:
        test_pred = lightgbm_inference(x_test) if method == 'lightgbm' else xgboost_inference(x_test)
    if method == 'catboost':
        test_pred = catboost_inference(x_test)
    return test_pred


for method in CFG.METHOD_LIST:
    test_df[f'{method}_pred'] = gradient_boosting_model_inference(method, test_df, features, categorical_features)

test_df['final_pred'] = 0.4 * test_df['lightgbm_pred'] + 0.2 * test_df['xgboost_pred'] + 0.4 * test_df['catboost_pred']

# 結果を保存
test_df[['id','final_pred']].to_csv(f'submissions/{CFG.n_folds}folds_submission_{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}_{CFG.seed}.csv', index=False, header=False)
