In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold


import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

# Re-load the dataset
df = pd.read_csv('../data/transform/df_feature_eng.csv')
df_ = pd.read_csv('../data/transform/df_more_features.csv')

df.drop(['Unnamed: 0', 'player_rating_home_player_7', 'player_rating_home_player_8',
       'player_rating_home_player_9', 'player_rating_home_player_10',
       'player_rating_home_player_11', 'player_rating_away_player_7',
       'player_rating_away_player_8', 'player_rating_away_player_9',
       'player_rating_away_player_10', 'player_rating_away_player_11',], axis=1, inplace=True)
df_.drop(['Unnamed: 0', 'player_rating_home_player_7', 'player_rating_home_player_8',
       'player_rating_home_player_9', 'player_rating_home_player_10',
       'player_rating_home_player_11', 'player_rating_away_player_7',
       'player_rating_away_player_8', 'player_rating_away_player_9',
       'player_rating_away_player_10', 'player_rating_away_player_11',], axis=1, inplace=True)

In [2]:
df_.shape

(3040, 33)

In [3]:
# Separating features and target variable
X = df.drop('result_match', axis=1)
y = df['result_match']

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Filter Method: Selecting features using ANOVA F-test
k_values = range(6, X_train.shape[1], 2)  # Testing different numbers of features
filter_accuracies = []

selected_features

for k in k_values:
    # Select top k features
    selector = SelectKBest(f_classif, k=k)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Train XGBoost classifier
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train_selected, y_train)

    # Evaluate the model
    y_pred = model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)
    filter_accuracies.append(accuracy)
    selected_features = (selector.get_support())

# Plotting the results
plt.plot(k_values, filter_accuracies, marker='o', label='Filter Method (ANOVA F-test)')
plt.xlabel('Number of Features')
plt.ylabel('Accuracy')
plt.title('Filter Method Feature Selection')
plt.legend()
plt.grid(True)
plt.show()

NameError: name 'selected_features' is not defined

In [None]:
# Choose the number of features with the best accuracy
best_k = k_values[filter_accuracies.index(max(filter_accuracies))]

# Re-run the selection process for the best k
selector = SelectKBest(f_classif, k=best_k)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Retrieve the selected feature names
selected_features = X_train.columns[selector.get_support()]

print("Selected features for k =", best_k, ":\n", selected_features)

In [None]:
# Separate the target variable and features
X = df[selected_features]
y = df['result_match']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Initialize XGBoost classifier
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

rfecv = RFECV(estimator=xgb, step=1, scoring='accuracy')
# Run RFECV on the smaller subset
rfecv.fit(X_train, y_train)

In [None]:
import seaborn as sns

# Assuming rfecv is your fitted RFECV object
optimal_features = rfecv.n_features_
cv_scores = rfecv.cv_results_['mean_test_score'] 
feature_rankings = rfecv.ranking_

print(f"Optimal number of features: {optimal_features}")

# Identify and print the most important features
important_features = X.columns[rfecv.support_]
print(f"Most important features:\n{important_features}")

# Plotting the CV Scores
plt.figure(figsize=(10, 6))
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation score (accuracy)")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.title("RFECV - Optimal Number of Features")
plt.show()

# Plotting feature rankings
plt.figure(figsize=(15, 8))
sns.barplot(x=X.columns, y=feature_rankings)
plt.xlabel("Features")
plt.ylabel("Ranking")
plt.title("Feature Rankings")
plt.xticks(rotation=90)  # Rotate feature names for better readability
plt.show()

In [None]:
# Splitting the original dataset
X = df[selected_features]
y = df['result_match']
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Initialize the XGBClassifier
xgb_full = XGBClassifier(random_state=42)

# Train the model on the full feature set
xgb_full.fit(X_train_full, y_train)

# Predictions and evaluation on the full feature set
y_pred_full = xgb_full.predict(X_test_full)
accuracy_full = accuracy_score(y_test, y_pred_full)

accuracy_full

In [None]:
import xgbfir

xgbfir.saveXgbFI(xgb_full, feature_names=X_train.columns, OutputXlsxFile='fir.xlsx')

In [None]:
file_path = 'fir.xlsx'
fir = pd.read_excel(file_path)

# Extracting top features based on different metrics
top_gain_features = fir.sort_values(by='Gain', ascending=False).head(10)
top_fscore_features = fir.sort_values(by='FScore', ascending=False).head(10)
top_wfscore_features = fir.sort_values(by='wFScore', ascending=False).head(10)
top_avg_gain_features = fir.sort_values(by='Average Gain', ascending=False).head(10)
top_expected_gain_features = fir.sort_values(by='Expected Gain', ascending=False).head(10)

# Displaying the top features
top_features_summary = {
    "Top Features by Gain": top_gain_features['Interaction'].values,
    "Top Features by FScore": top_fscore_features['Interaction'].values,
    "Top Features by wFScore": top_wfscore_features['Interaction'].values,
    "Top Features by Average Gain": top_avg_gain_features['Interaction'].values,
    "Top Features by Expected Gain": top_expected_gain_features['Interaction'].values
}

top_features_summary

In [None]:
# Load the data from the "Interaction Depth 1" and "Interaction Depth 2" sheets
interaction_depth_1 = pd.read_excel(file_path, sheet_name='Interaction Depth 1')
interaction_depth_2 = pd.read_excel(file_path, sheet_name='Interaction Depth 2')

# Display the first few rows of each sheet to understand their structure
interaction_depth_1_head = interaction_depth_1.head(10)
interaction_depth_2_head = interaction_depth_2.head(10)

interaction_depth_1_head

In [None]:
interaction_depth_2_head

In [None]:
interaction_depth_2_head

In [6]:
df.shape

(3040, 179)

In [8]:
# Assuming 'result_match' is the target variable
X = df.drop(['result_match'], axis=1)
y = df['result_match']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a range for n_features_to_select
feature_range = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 179] # Adjust this based on your dataset size and computational capacity

best_score = 0
best_n_features = 0

# Iterate over the range
for n_features in feature_range:
    # Create a Decision Tree Classifier (can be replaced with another model for actual use)
    model = XGBClassifier(random_state=42)

    # Create the RFE model and select n features
    rfe = RFE(estimator=model, n_features_to_select=n_features)
    rfe = rfe.fit(X_train, y_train)

    # Transform the training and testing sets
    X_train_rfe = rfe.transform(X_train)
    X_test_rfe = rfe.transform(X_test)

    # Train the model on the reduced dataset
    model.fit(X_train_rfe, y_train)

    # Make predictions and evaluate using F1 score
    y_pred = model.predict(X_test_rfe)
    score = f1_score(y_test, y_pred, average='macro')

    # Compare and store the best score and corresponding number of features
    if score > best_score:
        best_score = score
        best_n_features = n_features

print(f"Best F1 Score: {best_score}")
print(f"Best number of features: {best_n_features}")

Best F1 Score: 0.46191665512331187
Best number of features: 50
