In [1]:
import os
import json

import pandas as pd

from xgboost import XGBClassifier

from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

import category_encoders as ce

import warnings

warnings.filterwarnings("ignore")

In [2]:
# def evaluate_model(X_train, X_val, y_train, y_val):
#     encoder = ce.OneHotEncoder(cols=X_train.select_dtypes(include=['object', 'category']).columns, use_cat_names=True, drop_invariant=True, return_df=True)
# 
#     X_train_encoded = encoder.fit_transform(X_train, y_train)
#     X_val_encoded = encoder.transform(X_val)
#     
#     model = XGBClassifier(random_state=42, enable_categorical=True, early_stopping_rounds=100)
#     model.fit(X_train_encoded, y_train, eval_set=[(X_train_encoded, y_train), (X_val_encoded, y_val)], verbose=0)
#     y_pred = model.predict(X_val_encoded)
#     return f1_score(y_val, y_pred, average='weighted'), model


def evaluate_model(X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    best_f1 = 0.0

    model = XGBClassifier(random_state=42, enable_categorical=True)
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=0, early_stopping_rounds=25)
    
    y_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_pred, average='weighted')
            
    return f1, model

In [4]:
# X = pd.read_csv('../../data/new_features/df_.csv')
# y = pd.read_csv('../../data/new_features/y.csv')

In [5]:
for col in X.filter(like='_binned').columns:
    X[col] = X[col].astype('category')

X.filter(like='_binned').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 5 columns):
 #   Column                                         Non-Null Count  Dtype   
---  ------                                         --------------  -----   
 0   avg_home_team_rating_x_ewm_shoton_away_binned  3022 non-null   category
 1   average_rating_away_binned                     3040 non-null   category
 2   average_rating_home_binned                     3040 non-null   category
 3   ewm_possession_home_binned                     3024 non-null   category
 4   avg_away_team_rating_binned                    3040 non-null   category
dtypes: category(5)
memory usage: 16.0 KB


In [6]:
# X['ewm_shoton_diff'] = X['ewm_shoton_home'] - X['ewm_shoton_away']
# X['ewm_shoton_ratio'] = X['ewm_shoton_home'] / X['ewm_shoton_away']

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

f1, xgb = evaluate_model(X, y)
f1

0.6772014777016293

In [8]:
from sklearn.feature_selection import SelectFromModel

def evaluate_model_fe(X, y, threshold):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    # Fit and transform the training data
    selection = SelectFromModel(xgb, threshold=threshold, prefit=True)
    select_X_train = selection.transform(X_train)
    select_X_val = selection.transform(X_val)
    
    # Get the support of the selected features
    selected_features = X_train.columns[(selection.get_support())]
    
    # Convert the resulting arrays to DataFrames with column names
    select_X_train = pd.DataFrame(select_X_train, columns=selected_features)
    select_X_val = pd.DataFrame(select_X_val, columns=selected_features)
    
    select_X_train_df = select_X_train.apply(lambda col: col.astype('category') if col.dtype == 'object' else col)
    select_X_val_df = select_X_val.apply(lambda col: col.astype('category') if col.dtype == 'object' else col)

    model = XGBClassifier(random_state=42, enable_categorical=True, early_stopping_rounds=100)
    model.fit(select_X_train_df, y_train, eval_set=[(select_X_train_df, y_train), (select_X_val_df, y_val)], verbose=0)

    y_pred = model.predict(select_X_val_df)
    return f1_score(y_val, y_pred, average='weighted')

# Your existing code for getting feature importances
feature_importances = xgb.feature_importances_
thresholds = sorted(feature_importances, reverse=True)

results = []

# Iterate over thresholds
for thresh in thresholds:
    f1 = evaluate_model_fe(X, y, thresh)
    results.append({'threshold': thresh, 'f1': f1})

# Sorting and printing results
sorted_results = sorted(results, key=lambda x: x['f1'], reverse=True)

print("Top 5 feature selection results:")
for result in sorted_results[:5]:
    print(f"Threshold: {result['threshold']}, F1: {result['f1']}")

Top 5 feature selection results:
Threshold: 0.028790410608053207, F1: 0.605643326737279
Threshold: 0.03097396157681942, F1: 0.5953158536498407
Threshold: 0.0301752220839262, F1: 0.5929390412473119
Threshold: 0.0426974780857563, F1: 0.58964125710157
Threshold: 0.03765176609158516, F1: 0.5846790986204261


In [9]:
best_threshold = max(results, key=lambda x: x['f1'])['threshold']

selection = SelectFromModel(model, threshold=best_threshold, prefit=True)
selection.fit(X_train, y_train)

# Get the names of the selected features
selected_features = X_train.columns[selection.get_support()]

print("Selected features for the best threshold:")
print(list(selected_features))

NameError: name 'model' is not defined