# LightGBM の実装

In [1]:
# ====================================================
# Library
# ====================================================
import os
import gc
import datetime
import warnings
warnings.filterwarnings('ignore')
import random
import numpy as np
import pandas as pd
from pathlib import Path
import pickle

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

In [2]:
!mkdir oof
!mkdir models

mkdir: oof: File exists
mkdir: models: File exists


In [27]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    DATA_PATH = Path('.')
    OOF_DATA_PATH = Path('./oof')
    MODEL_DATA_PATH = Path('./models')
    METHOD_LIST = ['catboost', 'lightgbm']
    seed = random.randint(0, 100)
    n_folds = 5
    target_col = 'attendance'
    USE_PLAYER_FEATURES = False
    num_boost_round = 50500
    early_stopping_round = 50
    verbose = 2000
    boosting_type = 'gbdt'
    lgb_params = {
        'objective':'regression',
        'metric': 'rmse',
        'boosting': boosting_type,
        'n_jobs': -1,
        'seed': seed,
        'force_col_wise':'true',
        'num_boost_round': num_boost_round
        }
    cat_params = {
        'loss_function': 'RMSE',
        'iterations': num_boost_round,
        'random_seed': seed,
        }

In [4]:
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
seed_everything(CFG.seed)

In [5]:
# ====================================================
# LightGBM Metric
# ====================================================
def lgb_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'rmse', np.sqrt(mean_squared_error(y_true, y_pred)), False

# ====================================================
# Catboost Metric
# ====================================================
class CatboostMetric(object):
    def get_final_error(self, error, weight): return error
    def is_max_optimal(self): return False
    def evaluate(self, approxes, target, weight):
        error = np.sqrt(mean_squared_error(np.array(target), approxes))
        return error, 0

In [6]:
def lightgbm_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame):
    # Apply the encoding map to the relevant columns
    categorical_cols = x_train.select_dtypes(include=['object']).columns.tolist()

    # Convert categorical columns to 'category' dtype
    for col in categorical_cols:
        x_train[col] = x_train[col].astype('category')
        x_valid[col] = x_valid[col].astype('category')

    # Create LightGBM datasets
    train_data = lgb.Dataset(x_train, label=y_train, categorical_feature=categorical_cols)
    valid_data = lgb.Dataset(x_valid, label=y_valid, categorical_feature=categorical_cols)

    # Train model
    model = lgb.train(
        params=CFG.lgb_params,
        train_set=train_data,
        num_boost_round=20000,
        valid_sets=[valid_data],
        callbacks=[lgb.early_stopping(CFG.early_stopping_round, verbose=True), lgb.log_evaluation(CFG.verbose)]
        )

    # Predict validation
    valid_pred = model.predict(x_valid)
    return model, valid_pred

def catboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, categorical_features: list):
    # Define model
    model = CatBoostRegressor(**CFG.cat_params, cat_features=categorical_features)

    # Fit model
    model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], early_stopping_rounds=CFG.early_stopping_round, verbose=CFG.verbose)

    # Predict validation
    valid_pred = model.predict(x_valid)

    return model, valid_pred

def gradient_boosting_model_cv_training(method: str, train_df: pd.DataFrame, features: list, categorical_features: list):
    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train_df))
    oof_fold = np.zeros(len(train_df))
    kfold = KFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    for fold, (train_index, valid_index) in enumerate(kfold.split(train_df, train_df[CFG.target_col])):
        print('-'*50)
        print(f'{method} training fold {fold + 1}')
        x_train = train_df[features].iloc[train_index]
        y_train = train_df[CFG.target_col].iloc[train_index]
        x_valid = train_df[features].iloc[valid_index]
        y_valid = train_df[CFG.target_col].iloc[valid_index]
        if method == 'lightgbm':
            model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid)
        if method == 'catboost':
            model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid, categorical_features)

        # Save best model
        pickle.dump(model, open(CFG.MODEL_DATA_PATH / f'{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.boosting_type}.pkl', 'wb'))
        # Save encoders only for lightgbm and xgboost
        # if method in ['lightgbm', 'xgboost']:
        #     pickle.dump(encoders[fold], open(CFG.MODEL_DATA_PATH / f'{method}_encoders_fold{fold + 1}_seed{CFG.seed}_ver{CFG.boosting_type}.pkl', 'wb'))

        # Add to out of folds array
        oof_predictions[valid_index] = valid_pred
        oof_fold[valid_index] = fold + 1
        del x_train, x_valid, y_train, y_valid, model, valid_pred
        gc.collect()

    # Compute out of folds metric
    score = np.sqrt(mean_squared_error(train_df[CFG.target_col], oof_predictions))
    print(f'{method} our out of folds CV score is {score}')
    # Create a dataframe to store out of folds predictions
    # oof_df = pd.DataFrame({'id': train_df['id'], CFG.target_col: train_df[CFG.target_col], f'{method}_prediction': oof_predictions, 'fold': oof_fold})
    # oof_df.to_csv(CFG.MODEL_DATA_PATH / f'oof_{method}_seed{CFG.seed}_ver{CFG.boosting_type}.csv', index = False)

In [7]:
train_df = pd.read_csv(CFG.DATA_PATH / 'train.csv')
venue_info_df = pd.read_csv(CFG.DATA_PATH / 'venue_information.csv')
test_df = pd.read_csv(CFG.DATA_PATH / 'test.csv')
test_df[CFG.target_col] = -1
match_reports_df = pd.read_csv('match_reports.csv')
holidays_in_japan_df = pd.read_csv('holidays_in_japan.csv')
submission_df = pd.read_csv(CFG.DATA_PATH / 'sample_submit.csv')
all_df = pd.concat([train_df, test_df])

In [8]:
# match_reports_df を 'id' カラムで all_df と結合します
all_df = pd.merge(all_df, match_reports_df, on='id', how='left')

# venue_info_df を 'venue' カラムで all_df と結合します
all_df = pd.merge(all_df, venue_info_df, on='venue', how='left')

# holidays_in_japan_df を 'match_date' カラムで all_df と結合します
all_df['match_date'] = pd.to_datetime(all_df['match_date'])
holidays_in_japan_df['holiday_date'] = pd.to_datetime(holidays_in_japan_df['holiday_date'])
all_df['match_date'] = all_df['match_date'].dt.date
holidays_in_japan_df['holiday_date'] = holidays_in_japan_df['holiday_date'].dt.date

# もう一度 datetime 型に戻します
all_df['match_date'] = pd.to_datetime(all_df['match_date'])
holidays_in_japan_df['holiday_date'] = pd.to_datetime(holidays_in_japan_df['holiday_date'])

all_df = pd.merge(all_df, holidays_in_japan_df, left_on='match_date', right_on='holiday_date', how='left')

In [9]:
import feature_engineering as fe
all_df = fe.standardize_features(all_df, 'attendance', 'id')
all_df = fe.apply_feature_engineering(all_df)
all_df = fe.process_periodic_features(all_df)
home_stadium_df = all_df[['home_team', 'venue', 'address']].drop_duplicates()
all_df = fe.add_grouped_statistics(all_df)
all_df = fe.add_geographical_features(all_df, venue_info_df, home_stadium_df)
all_df = fe.standardize_features(all_df, 'attendance', 'id')
all_df = fe.HDBSCAN_featuring(all_df)
all_df = all_df.drop(['venue', 'address', 'description', 'match_date', 'kick_off_time', 'home_team', 'away_team'], axis=1)

# 最後に、訓練データとテストデータに再度分割します
train_df = all_df[all_df['attendance'] != -1]
test_df = all_df[all_df['attendance'] == -1]

all_df = fe.standardize_features(all_df, 'attendance', 'id')
all_df = fe.compute_knn_features_and_preprocess(train_df, test_df, CFG.target_col, k=10, folds=CFG.n_folds)
all_df = fe.perform_target_encoding(['venue_prefecture', 'away_prefecture', 'venue_region'], all_df, 'attendance', n_folds=CFG.n_folds, seed=CFG.seed)
all_df = fe.standardize_features(all_df, 'attendance', 'id')

82it [00:50,  1.62it/s]


In [10]:
all_df.head()

Unnamed: 0,id,section,round,weather,temperature,humidity,attendance,home_team_score,away_team_score,capacity,...,knn_avg_dist_4,knn_avg_dist_5,knn_avg_dist_6,knn_avg_dist_7,knn_avg_dist_8,knn_avg_dist_9,knn_avg_dist_10,venue_prefecture_target_enc,away_prefecture_target_enc,venue_region_target_enc
0,9190,-1.543581,-0.485306,0.79192,-2.080526,-1.119617,20916,-0.379199,-0.22967,-0.663229,...,-0.558139,-0.592688,-0.614364,-0.643787,-0.65601,-0.652789,-0.654105,-0.380308,1.321249,-1.097246
1,9191,-1.543581,1.882141,0.79192,-1.327887,-1.765889,14277,-1.168553,0.645833,-0.981972,...,2.222533,2.201567,2.157051,2.091899,2.033226,1.973125,1.900047,-1.19787,-0.297889,0.081675
2,9192,-1.543581,1.882141,0.79192,-1.45878,-1.388897,22531,0.410154,-1.105172,1.072716,...,-0.48019,-0.418347,-0.374684,-0.322221,-0.282498,-0.255458,-0.2181,1.049552,-0.326958,0.931789
3,9193,-1.543581,1.882141,0.79192,-1.540589,-1.011905,28564,-0.379199,-0.22967,1.239993,...,0.241695,0.445034,0.724695,1.143666,1.603565,2.017364,2.420211,-0.636912,-0.54206,0.183648
4,9194,-1.543581,1.882141,0.79192,-1.295163,-1.550465,17199,1.199508,0.645833,-0.711403,...,1.861361,1.85146,1.818728,1.788677,1.763612,1.716015,1.674749,0.041983,0.616069,0.081675


In [11]:
all_df = fe.standardize_features(all_df, 'attendance', 'id')
all_df.to_csv("all_data.csv", index=False)

In [12]:
missing_rows = all_df[all_df.isna().any(axis=1)]
print(missing_rows)

Empty DataFrame
Columns: [id, section, round, weather, temperature, humidity, attendance, home_team_score, away_team_score, capacity, スカパー!, スカパー!プレミアムサービス, スカパー, e2, スカパー(スカチャンHD、スカチャン), DAZN, スカパー!(パーフェクト チョイス), スカパー光, e2スカチャン, J SPORTS(録), NHK BS1, e2スカチャンHD, BS, e2(スカチャン!), テレ玉, BS-i, BS-TBS, e2(スカチャン!HV), 静岡放送, NHK総合, holiday_flag, long_weekend_flag, discomfort_index, home_team_rank, away_team_rank, home_team_last_year_rank, away_team_last_year_rank, rank_diff, rank_diff_abs, last_year_rank_diff, last_year_rank_diff_abs, diff_score, home_team_avg_conceded_last_3, away_team_avg_conceded_last_3, home_team_scored, away_team_scored, home_team_conceded, away_team_conceded, home_team_avg_points_last_3, home_team_avg_points_last_5, away_team_avg_points_last_3, away_team_avg_points_last_5, home_team_avg_scored_last_3, home_team_avg_scored_last_5, away_team_avg_scored_last_3, away_team_avg_scored_last_5, home_team_winning_streak, away_team_winning_streak, home_team_losing_streak, away_team

In [13]:
import model_tuner
# all_df = model_tuner.select_features(all_df)

# 最後に、訓練データとテストデータに再度分割します
train_df = all_df[all_df['attendance'] != -1]
test_df = all_df[all_df['attendance'] == -1]

In [16]:
categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns.tolist()
df = train_df.drop(categorical_cols + ['id'], axis=1)

frufs = model_tuner.FRUFS(df, method="lgb")
frufs.calc_coef()
importances = frufs.get_feature_importance()

Calculating coefficient/importances: 100%|██████████| 142/142 [00:55<00:00,  2.58it/s]

['capacity', 'section', 'day', 'attendance', 'year', 'venue_prefecture_target_enc', 'temperature_max_by_venue', 'temperature', 'home_team_conceded', 'humidity', 'discomfort_index_sum_by_venue', 'home_team_scored', 'temperature_min_by_venue', 'away_team_conceded', 'day_sin', 'rank_diff', 'away_team_scored', 'away_prefecture_target_enc', 'knn_avg_dist_3', 'discomfort_index', 'distance', 'knn_avg_dist_10', 'last_year_rank_diff', 'home_team_avg_scored_last_5', 'venue_region_target_enc', 'away_team_avg_conceded_last_3', 'section_sin', 'away_team_lon', 'away_team_avg_scored_last_5', 'away_team_last_year_rank', 'temperature_var_by_venue', 'away_team_avg_scored_last_3', 'diff_score', 'home_team_avg_scored_last_3', 'home_team_rank', 'home_team_avg_conceded_last_3', 'home_team_last_year_rank', 'away_team_rank', 'knn_avg_dist_9', 'knn_avg_dist_4', 'knn_avg_dist_2', 'temperature_mean_by_venue', 'knn_avg_dist_6', 'knn_avg_dist_7', 'humidity_var_by_venue', 'knn_avg_dist_8', 'knn_avg_dist_5', 'away_t




In [22]:
# 'Id'や'Target'といった特定のカラムを除外した全てのカラムを特徴量とする場合
categorical_features = train_df.select_dtypes(include=['object', 'category']).columns.tolist()

# features = train_df.columns.drop(['id', 'attendance'])
features = importances.loc[importances['importances']>0, 'columns'].tolist() + categorical_features

In [23]:
train_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 3366 entries, 0 to 3365
Data columns (total 146 columns):
 #    Column                                 Dtype  
---   ------                                 -----  
 0    id                                     int64  
 1    section                                float64
 2    round                                  float64
 3    weather                                float64
 4    temperature                            float64
 5    humidity                               float64
 6    attendance                             int64  
 7    home_team_score                        float64
 8    away_team_score                        float64
 9    capacity                               float64
 10   スカパー!                                  float64
 11   スカパー!プレミアムサービス                         float64
 12   スカパー                                   float64
 13   e2                                     float64
 14   スカパー(スカチャンHD、スカチャン)                    floa

In [24]:
# パラメータチューニング
# CFG.lgb_params = model_tuner.tune_model(train_df[features], train_df[CFG.target_col], 'lgb', CFG.lgb_params, n_trials=15)

In [25]:
# パラメータチューニング
# CFG.cat_params = model_tuner.tune_model(train_df[features], train_df[CFG.target_col], 'cat', CFG.cat_params, n_trials=15)

In [28]:
for method in CFG.METHOD_LIST:
    gradient_boosting_model_cv_training(method, train_df, features, categorical_features)

--------------------------------------------------
catboost training fold 1
Learning rate set to 0.005438
0:	learn: 9477.0779901	test: 9215.8829860	best: 9215.8829860 (0)	total: 4.72ms	remaining: 3m 58s
2000:	learn: 157.4370629	test: 228.3327893	best: 228.3327893 (2000)	total: 6.14s	remaining: 2m 28s
4000:	learn: 91.9197008	test: 208.6242188	best: 208.5942831 (3997)	total: 12.1s	remaining: 2m 20s
6000:	learn: 63.4438605	test: 204.1209048	best: 204.1005770 (5968)	total: 18s	remaining: 2m 13s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 203.7316418
bestIteration = 6330

Shrink model to first 6331 iterations.
--------------------------------------------------
catboost training fold 2
Learning rate set to 0.005438
0:	learn: 9426.2765204	test: 9427.5304329	best: 9427.5304329 (0)	total: 2.76ms	remaining: 2m 19s
2000:	learn: 159.2963878	test: 258.7375299	best: 258.7375299 (2000)	total: 5.92s	remaining: 2m 23s
4000:	learn: 89.1319305	test: 236.2054634	best: 236.1790973 (39

In [29]:
def lightgbm_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(CFG.MODEL_DATA_PATH / f'lightgbm_fold{fold + 1}_seed{CFG.seed}_ver{CFG.boosting_type}.pkl', 'rb'))
        categorical_cols = x_test.select_dtypes(include=['object', 'category']).columns
        for col in categorical_cols:
            x_test[col] = x_test[col].astype('category')
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds

def catboost_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(CFG.MODEL_DATA_PATH / f'catboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.boosting_type}.pkl', 'rb'))
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds

def gradient_boosting_model_inference(method: str, test_df: pd.DataFrame, features: list):
    x_test = test_df[features]
    if method == 'lightgbm':
        test_pred = lightgbm_inference(x_test)
    if method == 'catboost':
        test_pred = catboost_inference(x_test)
    return test_pred


for method in CFG.METHOD_LIST:
    test_df[f'{method}_pred'] = gradient_boosting_model_inference(method, test_df, features)


# アンサンブルの結果を保存
test_df['final_pred'] = 0.5 * test_df['lightgbm_pred'] + 0.5 * test_df['catboost_pred']
test_df[['id','final_pred']].to_csv(f'submissions/EMS_{CFG.n_folds}folds_submission_{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}_{CFG.seed}.csv', index=False, header=False)

# LGBのみの結果を保存
test_df['final_pred'] = 1.0 * test_df['lightgbm_pred'] + 0.0 * test_df['catboost_pred']
test_df[['id','final_pred']].to_csv(f'submissions/OnlyLGB_{CFG.n_folds}folds_submission_{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}_{CFG.seed}.csv', index=False, header=False)

# Catのみの結果を保存
test_df['final_pred'] = 0.0 * test_df['lightgbm_pred'] + 1.0 * test_df['catboost_pred']
test_df[['id','final_pred']].to_csv(f'submissions/OnlyCat_{CFG.n_folds}folds_submission_{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}_{CFG.seed}.csv', index=False, header=False)