In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold


import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

# Re-load the dataset
df = pd.read_csv('../data/transform/df_engineered.csv')

df.drop(['Unnamed: 0',], axis=1, inplace=True)

In [2]:
df.shape

(3040, 40)

In [3]:
# Splitting the original dataset
X = df.drop('result_match', axis=True)
y = df['result_match']
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

xgb_full = XGBClassifier(random_state=42)

xgb_full.fit(X_train_full, y_train)

# Predictions and evaluation on the full feature set
y_pred_full = xgb_full.predict(X_test_full)
accuracy_full = accuracy_score(y_test, y_pred_full)

accuracy_full

0.4868421052631579

In [4]:
import xgbfir

xgbfir.saveXgbFI(xgb_full, feature_names=X_train_full.columns, OutputXlsxFile='fir.xlsx')

In [5]:
file_path = 'fir.xlsx'
fir = pd.read_excel(file_path)

# Extracting top features based on different metrics
top_gain_features = fir.sort_values(by='Gain', ascending=False).head(10)
top_fscore_features = fir.sort_values(by='FScore', ascending=False).head(10)
top_wfscore_features = fir.sort_values(by='wFScore', ascending=False).head(10)
top_avg_gain_features = fir.sort_values(by='Average Gain', ascending=False).head(10)
top_expected_gain_features = fir.sort_values(by='Expected Gain', ascending=False).head(10)

# Displaying the top features
top_features_summary = {
    "Top Features by Gain": top_gain_features['Interaction'].values,
    "Top Features by FScore": top_fscore_features['Interaction'].values,
    "Top Features by wFScore": top_wfscore_features['Interaction'].values,
    "Top Features by Average Gain": top_avg_gain_features['Interaction'].values,
    "Top Features by Expected Gain": top_expected_gain_features['Interaction'].values
}

top_features_summary

{'Top Features by Gain': array(['new_feature_4', 'combined_1', 'avg_possession_home',
        'avg_player_rating_home', 'shots_on_target_ratio_rolling',
        'ratio_home_rating_player_7_8_9_10_11', 'new_feature_6',
        'avg_possession_away', 'goals_ratio_rolling', 'diff_player_10'],
       dtype=object),
 'Top Features by FScore': array(['avg_possession_home', 'avg_player_rating_home',
        'avg_possession_away', 'new_feature_6',
        'ratio_home_rating_player_7_8_9_10_11', 'away_avg_goals_scored',
        'possession_difference_rolling',
        'shots_on_target_difference_rolling', 'diff_player_7',
        'home_avg_goals_scored'], dtype=object),
 'Top Features by wFScore': array(['away_avg_goals_scored', 'avg_player_rating_home',
        'shots_on_target_ratio_rolling', 'diff_player_10', 'diff_player_7',
        'avg_possession_home', 'diff_player_9',
        'ratio_away_rating_player_7_8_9_10_11', 'possession_ratio_rolling',
        'goals_ratio_rolling'], dtype=object

In [17]:
# Load the data from the "Interaction Depth 1" and "Interaction Depth 2" sheets
interaction_depth_1 = pd.read_excel(file_path, sheet_name='Interaction Depth 1')
interaction_depth_2 = pd.read_excel(file_path, sheet_name='Interaction Depth 2')

# Display the first few rows of each sheet to understand their structure
interaction_depth_1_head = interaction_depth_1.sort_values('Gain', ascending=False).head(20)
interaction_depth_2_head = interaction_depth_2.sort_values('Gain', ascending=False).head(20)

interaction_depth_1_head

Unnamed: 0,Interaction,Gain,FScore,wFScore,Average wFScore,Average Gain,Expected Gain,Gain Rank,FScore Rank,wFScore Rank,Avg wFScore Rank,Avg Gain Rank,Expected Gain Rank,Average Rank,Average Tree Index,Average Tree Depth
0,combined_3_alt|new_feature_4,275.750405,6,2.103756,0.350626,45.958401,169.675922,1,87,30,10,2,1,21.833333,22.666667,2.166667
1,combined_1|combined_3_alt,188.829286,9,3.727744,0.414194,20.981032,66.434534,2,60,7,4,5,3,13.5,60.444444,3.333333
2,combined_1|diff_player_10,150.508566,7,1.835898,0.262271,21.501224,82.601271,3,78,38,20,4,2,24.166667,19.0,3.0
3,combined_2|new_feature_4,143.509885,3,0.693339,0.231113,47.836628,39.716764,4,97,78,24,1,5,34.833333,37.666667,3.0
4,combined_1|new_feature_4,105.389938,4,0.791649,0.197912,26.347485,64.083905,5,93,73,34,3,4,35.333333,22.0,3.25
5,new_feature_4|new_feature_5,101.201998,7,0.710642,0.10152,14.457428,23.850253,6,79,77,71,9,13,42.5,23.714286,3.571429
6,combined_1|diff_player_7,98.541946,7,0.7458,0.106543,14.077421,36.723528,7,80,74,69,11,6,41.166667,25.142857,4.142857
7,avg_possession_home|diff_player_11,92.841571,21,2.442058,0.116288,4.421027,10.80983,8,2,17,63,43,35,28.0,35.761905,3.857143
8,diff_player_10|diff_player_7,91.342723,12,1.716711,0.143059,7.611894,24.043931,9,27,40,53,21,11,26.833333,34.416667,3.25
9,avg_possession_home|combined_3_alt,82.7967,8,0.656347,0.082043,10.349588,10.733499,10,74,82,78,15,37,49.333333,45.875,3.375


In [None]:
interaction_depth_2_head
# .iloc[10]['Interaction']

In [None]:
'avg_possession_home', 'combined_3_alt', 'new_feature_4', 
'combined_1', 'diff_player_10', 'diff_player_7'
'avg_player_rating_away', 'combined_3_alt', 'new_feature_4'
'combined_2','goals_ratio_rolling', 'new_feature_4'
'avg_player_rating_away', 'avg_player_rating_home', 'combined_3_alt'


'combined_3_alt', 'new_feature_4'
'combined_1', 'combined_3_alt'
'combined_1', 'diff_player_10'
'combined_2', 'new_feature_4'
'combined_1', 'new_feature_4'
'new_feature_4', 'new_feature_5'
'combined_1', 'diff_player_7'
'avg_possession_home', 'diff_player_11'
'diff_player_10', 'diff_player_7'
'avg_possession_home', 'combined_3_alt'
'avg_possession_home', 'diff_player_8'

In [None]:
# Pairplot for components of new features
sns.pairplot(df[['new_feature_3', 'avg_player_rating_away']])
plt.show()

In [None]:
s = pd.concat([interaction_depth_1_head['Interaction'], (interaction_depth_2_head['Interaction'])])

# Split each string by '|' and flatten the resulting lists
columns = [item for sublist in s.str.split('|') for item in sublist]

# Get unique values
unique_columns = list(set(columns))

print(unique_columns)

In [None]:
# Assuming 'result_match' is the target variable
X = df.drop('result_match', axis=True)
y = df['result_match']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Filter Method: Selecting features using ANOVA F-test
k_values = range(6, X_train.shape[1], 1)
filter_accuracies = []

best_score = 0
best_n_features = 0

for k in k_values:
    # Select top k features
    selector = SelectKBest(f_classif, k=k)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Train XGBoost classifier
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train_selected, y_train)

    # Evaluate the model
    y_pred = model.predict(X_test_selected)
    f1 = f1_score(y_test, y_pred, average='weighted')
    filter_accuracies.append(f1)
    
    # Compare and store the best score and corresponding number of features
    if f1 > best_score:
        best_score = f1

# Plotting the results
plt.plot(k_values, filter_accuracies, marker='o', label='Filter Method (ANOVA F-test)')
plt.xlabel('Number of Features')
plt.ylabel('Accuracy')
plt.title('Filter Method Feature Selection')
plt.legend()
plt.grid(True)
plt.show()

print(f"Best accuray Score: {best_score}")

In [None]:
# Choose the number of features with the best accuracy
best_k = k_values[filter_accuracies.index(max(filter_accuracies))]

# Re-run the selection process for the best k
selector = SelectKBest(f_classif, k=best_k)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Retrieve the selected feature names
selected_features = X_train.columns[selector.get_support()]

print("Selected features for k =", best_k, ":\n", selected_features)

In [None]:
# Assuming 'result_match' is the target variable
X = df[selected_features]
y = df['result_match']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Define a range for n_features_to_select
feature_range = range(6, X_train.shape[1], 2)

best_score = 0
best_n_features = 0

# Iterate over the range
for n_features in feature_range:
    model = XGBClassifier(random_state=42)

    # Create the RFE model and select n features
    rfe = RFE(estimator=model, n_features_to_select=n_features)
    rfe = rfe.fit(X_train, y_train)

    # Transform the training and testing sets
    X_train_rfe = rfe.transform(X_train)
    X_test_rfe = rfe.transform(X_test)

    # Train the model on the reduced dataset
    model.fit(X_train_rfe, y_train)

    # Make predictions and evaluate using F1 score
    y_pred = model.predict(X_test_rfe)
    score = f1_score(y_test, y_pred, average='weighted')

    # Compare and store the best score and corresponding number of features
    if score > best_score:
        best_score = score
        best_n_features = n_features

print(f"Best F1 Score: {best_score}")
print(f"Best number of features: {best_n_features}")

In [36]:
X = df.drop('result_match', axis=1)
y = df['result_match']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Initialize the XGBClassifier
xgb_full = XGBClassifier(random_state=42)

# Train the model on the full feature set
xgb_full.fit(X_train_full, y_train)

# Predictions and evaluation on the full feature set
y_pred_full = xgb_full.predict(X_test_full)
f1_score = f1_score(y_test, y_pred_full,average='weighted')

f1_score

TypeError: 'numpy.float64' object is not callable

In [18]:
X = df[selected_features]
y = df['result_match']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Initialize the XGBClassifier
xgb_full = XGBClassifier(random_state=42)

# Train the model on the full feature set
xgb_full.fit(X_train_full, y_train)

# Predictions and evaluation on the full feature set
y_pred_full = xgb_full.predict(X_test_full)
f1_score = f1_score(y_test, y_pred_full,average='weighted')

classification_report()

f1_score

0.5032894736842105