In [1]:
import pandas as pd
# Import necessary libraries
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.inspection import permutation_importance
import numpy as np

In [2]:
# Open data/original
df_home = pd.read_csv('./TRAIN_DATA/train_home_team_statistics_df.csv', index_col='ID')
df_away = pd.read_csv('./TRAIN_DATA/train_away_team_statistics_df.csv', index_col='ID')
df_y = pd.read_csv('./Y_train_1rknArQ.csv', index_col='ID')

In [3]:
# Merge by index df_home and df_away
df = pd.merge(df_home, df_away, left_index=True, right_index=True)

# Change column names, last _x per _home and last _y per _away
df.columns = df.columns.str.replace('_x', '_home')
df.columns = df.columns.str.replace('_y', '_away')

df = pd.merge(df, df_y, left_index=True, right_index=True)
y_cols = ['HOME_WINS', 'AWAY_WINS', 'DRAW']
df.head()

Unnamed: 0_level_0,LEAGUE_home,TEAM_NAME_home,TEAM_SHOTS_TOTAL_season_sum_home,TEAM_SHOTS_INSIDEBOX_season_sum_home,TEAM_SHOTS_OFF_TARGET_season_sum_home,TEAM_SHOTS_ON_TARGET_season_sum_home,TEAM_SHOTS_OUTSIDEBOX_season_sum_home,TEAM_PASSES_season_sum_home,TEAM_SUCCESSFUL_PASSES_season_sum_home,TEAM_SAVES_season_sum_home,...,TEAM_ATTACKS_5_last_match_std_away,TEAM_PENALTIES_5_last_match_std_away,TEAM_SUBSTITUTIONS_5_last_match_std_away,TEAM_BALL_SAFE_5_last_match_std_away,TEAM_DANGEROUS_ATTACKS_5_last_match_std_away,TEAM_INJURIES_5_last_match_std_away,TEAM_GOALS_5_last_match_std_away,HOME_WINS,DRAW,AWAY_WINS
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Ligue 1,Toulouse,3.0,2.0,5.0,2.0,1.0,2.0,2.0,5.0,...,0.0,6.0,8.0,4.0,3.0,2.0,3.0,0,0,1
1,Ligue 2,Brest,6.0,8.0,3.0,6.0,5.0,8.0,7.0,10.0,...,1.0,8.0,4.0,10.0,0.0,5.0,3.0,0,1,0
2,Serie A,Sampdoria,4.0,2.0,5.0,2.0,8.0,1.0,1.0,2.0,...,4.0,0.0,8.0,3.0,0.0,9.0,6.0,0,0,1
3,League One,Coventry City,7.0,5.0,5.0,6.0,6.0,9.0,9.0,2.0,...,2.0,0.0,5.0,6.0,3.0,,2.0,1,0,0
4,Premier League,Wolverhampton Wanderers,3.0,3.0,2.0,3.0,4.0,4.0,3.0,4.0,...,4.0,9.0,4.0,1.0,4.0,6.0,5.0,0,1,0


First, we will try all the columns and all the rows and prepare the data

In [4]:
cols_to_drop = ['LEAGUE_home', 'TEAM_NAME_home', 'LEAGUE_away', 'TEAM_NAME_away', 'HOME_WINS', 'AWAY_WINS', 
                'DRAW']

# Split the data into features and target
X = df.drop(cols_to_drop, axis=1)
# Transform three binary columns into one
y = df[y_cols].idxmax(axis=1)

# Encode classes as integers
le = LabelEncoder()
y = le.fit_transform(y)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### XGBoost and LightGBM

In [5]:
# List of models to test
models = [
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')),
    ('LightGBM', LGBMClassifier())
]

# For each model
for name, model in models:
    # Fit the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate predictions
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: %.2f%%" % (accuracy * 100.0))

XGBoost Accuracy: 45.96%
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013886 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3261
[LightGBM] [Info] Number of data points in the train set: 9842, number of used features: 280
[LightGBM] [Info] Start training from score -1.187047
[LightGBM] [Info] Start training from score -1.344311
[LightGBM] [Info] Start training from score -0.834343
LightGBM Accuracy: 47.99%


### Stacking (XGBoost and lightGMB)

In [5]:
# Definir los modelos base
base_models = [
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')),
    ('LightGBM', LGBMClassifier())
]

# Definir el meta-clasificador
meta_classifier = LogisticRegression()

# Crear el modelo de stacking
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_classifier, cv=5)

# Entrenar el modelo de stacking
stacking_model.fit(X_train, y_train)

# Hacer predicciones
y_pred_stacking = stacking_model.predict(X_test)

# Evaluar las predicciones
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
print(f"Stacking Model Accuracy: {accuracy_stacking}%")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004222 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3261
[LightGBM] [Info] Number of data points in the train set: 9842, number of used features: 280
[LightGBM] [Info] Start training from score -1.187047
[LightGBM] [Info] Start training from score -1.344311
[LightGBM] [Info] Start training from score -0.834343
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3251
[LightGBM] [Info] Number of data points in the train set: 7873, number of used features: 280
[LightGBM] [Info] Start training from score -1.187137
[LightGBM] [Info] Start training from score -1.344624
[LightGBM] [Info] Start training from score -0.834091
[LightGBM] [Info] Auto-choosing co

We optimize the hyperparameters with the GridSearchCV function

In [6]:
# Parámetros para GridSearchCV en XGBoost
params = {
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 1]
}

grid_search = GridSearchCV(estimator=XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'), 
                           param_grid=params, 
                           scoring='accuracy', 
                           cv=3)

grid_search.fit(X_train, y_train)
best_xgb = grid_search.best_estimator_
# Ahora, best_xgb podría ser uno de los estimadores en tu stack

In [7]:

# Parámetros para GridSearchCV en LightGBM
lgbm_params = {
    'num_leaves': [31, 62, 127],  # Número de hojas en un árbol completo
    'max_depth': [10, 20, 40],    # Profundidad máxima de los árboles
    'learning_rate': [0.01, 0.05, 0.1],  # Tasa de aprendizaje
    'n_estimators': [100, 200, 300]  # Número de árboles en el modelo
}

# Crear el objeto GridSearchCV
grid_search_lgbm = GridSearchCV(
    estimator=LGBMClassifier(), 
    param_grid=lgbm_params, 
    scoring='accuracy', 
    cv=3
)

# Entrenar GridSearchCV
grid_search_lgbm.fit(X_train, y_train)
best_lgbm = grid_search_lgbm.best_estimator_

# Mostrar el mejor modelo
print("Mejor modelo LightGBM:", best_lgbm)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012214 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011845 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011425 sec

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009730 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012312 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003932 sec

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003335 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012030 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing ro

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003925 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013757 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010054 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012671 sec

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010073 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003686 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327
[LightGBM] [Info] Auto-choosing ro

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003664 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004374 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013246 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526
[LightGBM] [Info] Auto-choosing co

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010944 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003740 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing ro

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003909 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004161 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start 

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004319 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010138 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526
[LightGBM] [Info] Auto-choosing co

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012905 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013263 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003806 sec

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015193 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014443 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014901 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012583 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003728 sec

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013425 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003605 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010490 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003686 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013104 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327
[LightGBM] [Info] Auto-choosing co

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014084 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006028 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013237 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013239 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012914 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012965 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010088 sec

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003650 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013007 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing co

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012950 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012660 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013046 sec

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013797 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003952 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526
[LightGBM] [Info] Auto-choosing co

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013264 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013624 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011614 sec

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003737 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015768 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing co

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004262 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004097 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start 

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004416 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014222 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013141 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012236 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005379 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327
[LightGBM] [Info] Auto-choosing co

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014189 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003926 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004154 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011527 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526
[LightGBM] [Info] Auto-choosing co

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013642 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327
[LightGBM] [Info] Auto-choosing ro

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013351 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004124 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing co

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013381 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 6562, number of used features: 280
[LightGBM] [Info] Start training from score -1.187149
[LightGBM] [Info] Start training from score -1.344217
[LightGBM] [Info] Start training from score -0.834327
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010705 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011030 sec

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014795 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344650
[LightGBM] [Info] Start training from score -0.834175
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015463 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3260
[LightGBM] [Info] Number of data points in the train set: 6561, number of used features: 280
[LightGBM] [Info] Start training from score -1.186996
[LightGBM] [Info] Start training from score -1.344065
[LightGBM] [Info] Start training from score -0.834526
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004790 sec

In [None]:
# Actualizar los modelos base con ambos modelos optimizados
base_models_updated = [
    ('XGBoost', best_xgb),
    ('LightGBM', LGBMClassifier())
]

# Crear de nuevo el modelo de stacking
stacking_model_updated = StackingClassifier(
    estimators=base_models_updated, 
    final_estimator=LogisticRegression(), 
    cv=5
)

# Entrenar el modelo de stacking actualizado
stacking_model_updated.fit(X_train, y_train)

# Hacer predicciones con el modelo de stacking actualizado
y_pred_stacking_updated = stacking_model_updated.predict(X_test)

# Evaluar las predicciones con el modelo de stacking actualizado
accuracy_stacking_updated = accuracy_score(y_test, y_pred_stacking_updated)
print(f"Updated Stacking Model Accuracy with optimized LightGBM and XGBoost: %.2f%%" % (accuracy_stacking * 100.0))

### Bagging with LightGBM

In [11]:
# Definir el modelo base
base_model = LGBMClassifier()

# Crear el objeto BaggingClassifier
bagging_model = BaggingClassifier(
    base_estimator=base_model,
    n_estimators=10,  # Número de modelos en el ensemble
    max_samples=0.8,  # Proporción de muestras a utilizar para cada modelo
    max_features=0.8,  # Proporción de características a utilizar para cada modelo
    random_state=42
)

# Entrenar el modelo de bagging
bagging_model.fit(X_train, y_train)

# Hacer predicciones
y_pred_bagging = bagging_model.predict(X_test)

# Evaluar las predicciones
accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
print(f"Bagging Model Accuracy: %.2f%%" % (accuracy_bagging * 100.0))




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2613
[LightGBM] [Info] Number of data points in the train set: 9842, number of used features: 224
[LightGBM] [Info] Start training from score -1.166535
[LightGBM] [Info] Start training from score -1.363316
[LightGBM] [Info] Start training from score -0.837607
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012133 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2614
[LightGBM] [Info] Number of data points in the train set: 9842, number of used features: 224
[LightGBM] [Info] Start training from score -1.197179
[LightGBM] [Info] Start training from score -1.338793
[LightGBM] [Info] Start training from score -0.830587
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003968 sec

Next, we are going to eliminate the most correlated variables and we tested the same models as before

### XGBoost and LightGBM

In [8]:
cols_to_drop = ['LEAGUE_home', 'TEAM_NAME_home', 'LEAGUE_away', 'TEAM_NAME_away', 'HOME_WINS', 'AWAY_WINS', 
                'DRAW', 'TEAM_SUCCESSFUL_PASSES_season_std_home','TEAM_SUCCESSFUL_PASSES_season_sum_home',
                'TEAM_SUCCESSFUL_PASSES_5_last_match_average_home','TEAM_SUCCESSFUL_PASSES_5_last_match_std_home',
                'TEAM_SUCCESSFUL_PASSES_season_average_home','TEAM_SUCCESSFUL_PASSES_5_last_match_sum_home', 
                'TEAM_SUCCESSFUL_PASSES_season_std_away','TEAM_SUCCESSFUL_PASSES_season_sum_away',
                'TEAM_SUCCESSFUL_PASSES_5_last_match_average_away','TEAM_SUCCESSFUL_PASSES_5_last_match_std_away',
                'TEAM_SUCCESSFUL_PASSES_season_average_away','TEAM_SUCCESSFUL_PASSES_5_last_match_sum_away']

# Split the data into features and target
X = df.drop(cols_to_drop, axis=1)
# Transform three binary columns into one
y = df[y_cols].idxmax(axis=1)

# Encode classes as integers
le = LabelEncoder()
y = le.fit_transform(y)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# List of models to test
models = [
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')),
    ('LightGBM', LGBMClassifier())
]

# For each model
for name, model in models:
    # Fit the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate predictions
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: %.2f%%" % (accuracy * 100.0))

XGBoost Accuracy: 46.81%
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004797 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3117
[LightGBM] [Info] Number of data points in the train set: 9842, number of used features: 268
[LightGBM] [Info] Start training from score -1.187047
[LightGBM] [Info] Start training from score -1.344311
[LightGBM] [Info] Start training from score -0.834343
LightGBM Accuracy: 46.85%


### Stacking (XGBoost and lightGMB)

In [None]:
# Definir los modelos base
base_models = [
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')),
    ('LightGBM', LGBMClassifier())
]

# Definir el meta-clasificador
meta_classifier = LogisticRegression()

# Crear el modelo de stacking
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_classifier, cv=5)

# Entrenar el modelo de stacking
stacking_model.fit(X_train, y_train)

# Hacer predicciones
y_pred_stacking = stacking_model.predict(X_test)

# Evaluar las predicciones
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
print(f"Stacking Model Accuracy: {accuracy_stacking}%")

We optimize the hyperparameters with the GridSearchCV function

In [10]:
# Parámetros para GridSearchCV en XGBoost
params = {
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 1]
}

grid_search = GridSearchCV(estimator=XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'), 
                           param_grid=params, 
                           scoring='accuracy', 
                           cv=3)

grid_search.fit(X_train, y_train)
best_xgb = grid_search.best_estimator_
# Ahora, best_xgb podría ser uno de los estimadores en tu stack

In [11]:
# Definir los modelos base
base_models = [
    ('XGBoost', best_xgb),
    ('LightGBM', LGBMClassifier())
]

# Definir el meta-clasificador
meta_classifier = LogisticRegression()

# Crear el modelo de stacking
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_classifier, cv=5)

# Entrenar el modelo de stacking
stacking_model.fit(X_train, y_train)

# Hacer predicciones
y_pred_stacking = stacking_model.predict(X_test)

# Evaluar las predicciones
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
print(f"Stacking Model Accuracy: {accuracy_stacking}%")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004524 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3117
[LightGBM] [Info] Number of data points in the train set: 9842, number of used features: 268
[LightGBM] [Info] Start training from score -1.187047
[LightGBM] [Info] Start training from score -1.344311
[LightGBM] [Info] Start training from score -0.834343
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016026 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3107
[LightGBM] [Info] Number of data points in the train set: 7873, number of used features: 268
[LightGBM] [Info] Start training from score -1.187137
[LightGBM] [Info] Start training from score -1.344624
[LightGBM] [Info] Start training from score -0.834091
[LightGBM] [Info] Auto-choosing ro

Finally, we calculate the most important features

In [12]:
# Obtener importancias de características de los modelos base
importances = np.zeros(X_train.shape[1])

result = permutation_importance(stacking_model, X_train, y_train, n_repeats=30, random_state=42, n_jobs=2)
importances += result.importances_mean

# Crear un DataFrame con las importancias de características
feature_importances1 = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})

# Ordenar las características por importancia
feature_importances1 = feature_importances1.sort_values(by='Importance', ascending=False)

In [13]:
print(feature_importances1[:8])

                                        Feature  Importance
22         TEAM_SHOTS_TOTAL_season_average_home    0.013381
165    TEAM_BALL_POSSESSION_season_average_away    0.009314
116  TEAM_SHOTS_ON_TARGET_5_last_match_std_home    0.009023
31     TEAM_BALL_POSSESSION_season_average_home    0.008579
21               TEAM_GAME_LOST_season_sum_home    0.008406
43            TEAM_GAME_WON_season_average_home    0.008200
40   TEAM_DANGEROUS_ATTACKS_season_average_home    0.007631
45           TEAM_GAME_LOST_season_average_home    0.006747


In [14]:
print(feature_importances1[260:])

                                      Feature  Importance
101   TEAM_REDCARDS_5_last_match_average_home    0.000085
7                TEAM_CORNERS_season_sum_home    0.000078
246  TEAM_GAME_LOST_5_last_match_average_away    0.000034
213        TEAM_ATTACKS_5_last_match_sum_away    0.000030
245  TEAM_GAME_DRAW_5_last_match_average_away    0.000000
111  TEAM_GAME_DRAW_5_last_match_average_home    0.000000
219          TEAM_GOALS_5_last_match_sum_away   -0.000007
112  TEAM_GAME_LOST_5_last_match_average_home   -0.000051
