In [15]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold


import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

# Re-load the dataset
df = pd.read_csv('../data/transform/df_engineered.csv')

df.drop(['Unnamed: 0',], axis=1, inplace=True)

In [16]:
df.shape

(3040, 36)

In [17]:
# Splitting the original dataset
X = df.drop('result_match', axis=True)
y = df['result_match']
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

xgb_full = XGBClassifier(random_state=42)

xgb_full.fit(X_train_full, y_train)

# Predictions and evaluation on the full feature set
y_pred_full = xgb_full.predict(X_test_full)
accuracy_full = accuracy_score(y_test, y_pred_full)

accuracy_full

0.48355263157894735

In [None]:
from sklearn.metrics import f1_score

# Feature Importance
importances = xgb_full.feature_importances_
feature_names = X_train_full.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

# Finding the best threshold
thresholds = np.sort(np.unique(importances))  # Unique sorted importance values
best_threshold = 0
best_f1 = 0
best_selected_features = []

for thresh in thresholds:
    # Selecting features based on threshold
    selected_features = feature_importance_df[feature_importance_df['Importance'] >= thresh]['Feature']
    X_train_reduced = X_train_full[selected_features]
    X_test_reduced = X_test_full[selected_features]

    # Retrain with reduced features
    xgb_full.fit(X_train_reduced, y_train)
    y_pred = xgb_full.predict(X_test_reduced)

    # Evaluation
    f1 = f1_score(y_test, y_pred, average='weighted')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = thresh
        best_selected_features = selected_features

print(f"Best Threshold: {best_threshold}, Best F1 Score: {best_f1}")

print(f"Best selected features: {best_selected_features}")

In [None]:
import xgbfir

xgbfir.saveXgbFI(xgb_full, feature_names=X_train_full.columns, OutputXlsxFile='fir.xlsx')

In [18]:
import xgbfir

xgbfir.saveXgbFI(xgb_full, feature_names=X_train_full.columns, OutputXlsxFile='fir.xlsx')

In [19]:
file_path = 'fir.xlsx'
fir = pd.read_excel(file_path)

# Extracting top features based on different metrics
top_gain_features = fir.sort_values(by='Gain', ascending=False).head(10)
top_fscore_features = fir.sort_values(by='FScore', ascending=False).head(10)
top_wfscore_features = fir.sort_values(by='wFScore', ascending=False).head(10)
top_avg_gain_features = fir.sort_values(by='Average Gain', ascending=False).head(10)
top_expected_gain_features = fir.sort_values(by='Expected Gain', ascending=False).head(10)

# Displaying the top features
top_features_summary = {
    "Top Features by Gain": top_gain_features['Interaction'].values,
    "Top Features by FScore": top_fscore_features['Interaction'].values,
    "Top Features by wFScore": top_wfscore_features['Interaction'].values,
    "Top Features by Average Gain": top_avg_gain_features['Interaction'].values,
    "Top Features by Expected Gain": top_expected_gain_features['Interaction'].values
}

top_features_summary

{'Top Features by Gain': array(['new_feature_4', 'new_feature_99', 'shots_on_target_ratio_rolling',
        'avg_shoton_home', 'ratio_home_rating_player_7_8_9_10_11',
        'ratio_home_possession_rating', 'ratio_away_possession_rating',
        'diff_player_10', 'avg_player_rating_home', 'new_feature_5'],
       dtype=object),
 'Top Features by FScore': array(['avg_shoton_home', 'ratio_home_rating_player_7_8_9_10_11',
        'ratio_away_possession_rating', 'ratio_home_possession_rating',
        'shots_on_target_ratio_rolling', 'home_avg_goals_conceded',
        'diff_player_10', 'avg_player_rating_home', 'new_feature_5',
        'diff_player_11'], dtype=object),
 'Top Features by wFScore': array(['ratio_away_possession_rating', 'goals_ratio_rolling',
        'new_feature_99', 'home_avg_goals_scored',
        'ratio_home_rating_player_7_8_9_10_11',
        'shots_on_target_ratio_rolling', 'avg_player_rating_home',
        'new_feature_4', 'diff_player_8',
        'shots_on_target_di

In [20]:
# Load the data from the "Interaction Depth 1" and "Interaction Depth 2" sheets
interaction_depth_1 = pd.read_excel(file_path, sheet_name='Interaction Depth 1')
interaction_depth_2 = pd.read_excel(file_path, sheet_name='Interaction Depth 2')

# Display the first few rows of each sheet to understand their structure
interaction_depth_1_head = interaction_depth_1.sort_values('Gain', ascending=False).head(20)
interaction_depth_2_head = interaction_depth_2.sort_values('Gain', ascending=False).head(20)

interaction_depth_1_head

Unnamed: 0,Interaction,Gain,FScore,wFScore,Average wFScore,Average Gain,Expected Gain,Gain Rank,FScore Rank,wFScore Rank,Avg wFScore Rank,Avg Gain Rank,Expected Gain Rank,Average Rank,Average Tree Index,Average Tree Depth
0,new_feature_4|new_feature_99,447.085099,10,3.58886,0.358886,44.70851,259.942031,1,57,9,8,1,1,12.833333,26.7,1.9
1,combined_2|new_feature_4,207.520282,6,2.335708,0.389285,34.586714,60.468862,2,93,22,4,2,3,21.0,30.666667,1.666667
2,combined_1|diff_player_10,138.321366,7,1.119028,0.159861,19.760195,79.238466,3,87,65,35,4,2,32.666667,24.714286,3.857143
3,combined_1|combined_3_alt,115.845637,5,1.877496,0.375499,23.169127,35.198285,4,96,35,6,3,7,25.166667,36.8,3.6
4,new_feature_99|shots_on_target_difference_rolling,112.095857,6,2.346316,0.391053,18.682643,58.912613,5,94,21,3,5,4,22.0,27.333333,2.666667
5,diff_player_10|new_feature_99,107.245115,11,1.762888,0.160263,9.749556,36.129102,6,49,36,34,12,6,23.833333,41.181818,3.272727
6,new_feature_4|new_feature_5,102.020461,9,1.125835,0.125093,11.335607,25.600586,7,74,64,54,10,13,37.0,41.111111,3.444444
7,avg_player_rating_home|new_feature_99,88.894191,7,0.823929,0.117704,12.69917,39.027213,8,88,75,56,7,5,39.833333,35.0,4.0
8,away_avg_goals_scored|ratio_home_rating_player...,81.791846,19,3.523994,0.185473,4.304834,14.902223,9,3,10,28,44,21,19.166667,32.0,4.052632
9,combined_3_alt|new_feature_99,81.131278,6,0.550816,0.091803,13.52188,27.306642,10,95,87,72,6,10,46.666667,60.333333,3.166667


In [21]:
interaction_depth_2_head
# .iloc[10]['Interaction']

Unnamed: 0,Interaction,Gain,FScore,wFScore,Average wFScore,Average Gain,Expected Gain,Gain Rank,FScore Rank,wFScore Rank,Avg wFScore Rank,Avg Gain Rank,Expected Gain Rank,Average Rank,Average Tree Index,Average Tree Depth
0,new_feature_4|new_feature_99|shots_on_target_d...,332.484098,3,1.660925,0.553642,110.828033,183.832302,1,26,1,2,1,1,5.333333,5.0,2.0
1,new_feature_4|new_feature_4|new_feature_99,319.947866,3,0.46849,0.156163,106.649289,51.238004,2,27,29,36,4,3,16.833333,5.0,2.0
2,combined_2|home_avg_goals_conceded|new_feature_4,187.649666,2,0.245462,0.122731,93.824833,22.657209,3,51,56,44,5,8,27.833333,5.0,2.0
3,combined_2|new_feature_4|shots_on_target_ratio...,150.648195,3,0.614706,0.204902,50.216065,24.922011,4,28,18,28,11,7,16.0,38.333333,2.666667
4,combined_1|diff_player_10|shots_on_target_diff...,139.000528,2,0.815059,0.407529,69.500264,63.540024,5,52,13,12,8,2,15.333333,1.0,3.0
5,avg_shoton_home|diff_player_11|new_feature_99,135.019472,5,0.517297,0.103459,27.003894,20.991686,6,4,23,54,28,11,21.0,7.8,3.0
6,combined_1|combined_3_alt|ratio_home_possessio...,115.203795,2,0.117766,0.058883,57.601898,10.394429,7,53,81,77,10,29,42.833333,42.5,3.5
7,combined_1|diff_player_10|new_feature_99,110.492233,1,0.229898,0.229898,110.492233,25.401907,8,81,59,25,2,6,30.166667,0.0,2.0
8,combined_1|combined_3_alt|diff_player_11,107.201866,1,0.195175,0.195175,107.201866,20.923172,9,82,67,29,3,12,33.666667,0.0,2.0
9,avg_shoton_away|new_feature_4|new_feature_5,93.932923,3,0.476522,0.158841,31.310974,20.169874,10,29,27,33,23,14,22.666667,15.0,3.0


In [22]:
X_train_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2736 entries, 1173 to 860
Data columns (total 35 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   home_avg_goals_scored                 2736 non-null   float64
 1   away_avg_goals_scored                 2736 non-null   float64
 2   home_avg_goals_conceded               2736 non-null   float64
 3   away_avg_goals_conceded               2736 non-null   float64
 4   home_streak_wins                      2736 non-null   int64  
 5   away_streak_wins                      2736 non-null   int64  
 6   home_sum_points                       2736 non-null   int64  
 7   away_sum_points                       2736 non-null   int64  
 8   points_diff                           2736 non-null   int64  
 9   win_eachother_home                    2736 non-null   int64  
 10  win_eachother_away                    2736 non-null   int64  
 11  avg_shoton_home     

In [12]:
'avg_possession_home', 'combined_3_alt', 'new_feature_4', 
'combined_1', 'diff_player_10', 'diff_player_7'
'avg_player_rating_away', 'combined_3_alt', 'new_feature_4'
'combined_2','goals_ratio_rolling', 'new_feature_4'
'avg_player_rating_away', 'avg_player_rating_home', 'combined_3_alt'


'combined_3_alt', 'new_feature_4'
'combined_1', 'combined_3_alt'
'combined_1', 'diff_player_10'
'combined_2', 'new_feature_4'
'combined_1', 'new_feature_4'
'new_feature_4', 'new_feature_5'
'combined_1', 'diff_player_7'
'avg_possession_home', 'diff_player_11'
'diff_player_10', 'diff_player_7'
'avg_possession_home', 'combined_3_alt'
'avg_possession_home', 'diff_player_8'

('avg_possession_home', 'diff_player_8')

In [13]:
# Pairplot for components of new features
sns.pairplot(df[['new_feature_3', 'avg_player_rating_away']])
plt.show()

NameError: name 'sns' is not defined

In [14]:
s = pd.concat([interaction_depth_1_head['Interaction'], (interaction_depth_2_head['Interaction'])])

# Split each string by '|' and flatten the resulting lists
columns = [item for sublist in s.str.split('|') for item in sublist]

# Get unique values
unique_columns = list(set(columns))

print(unique_columns)

['diff_player_10', 'diff_player_8', 'new_feature_4', 'combined_2', 'avg_possession_home', 'avg_possession_away', 'new_feature_99', 'shots_on_target_ratio_rolling', 'shots_on_target_difference_rolling', 'ratio_home_rating_player_7_8_9_10_11', 'home_avg_goals_conceded', 'away_avg_goals_scored', 'new_feature_5', 'avg_player_rating_home', 'combined_1', 'diff_player_7', 'home_sum_points', 'combined_3_alt', 'diff_player_11']


In [35]:
# Assuming 'result_match' is the target variable
X = df.drop('result_match', axis=True)
y = df['result_match']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Filter Method: Selecting features using ANOVA F-test
k_values = range(6, X_train.shape[1], 1)
filter_accuracies = []

best_score = 0
best_n_features = 0

for k in k_values:
    # Select top k features
    selector = SelectKBest(f_classif, k=k)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Train XGBoost classifier
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train_selected, y_train)

    # Evaluate the model
    y_pred = model.predict(X_test_selected)
    f1 = f1_score(y_test, y_pred, average='weighted')
    filter_accuracies.append(f1)
    
    # Compare and store the best score and corresponding number of features
    if f1 > best_score:
        best_score = f1

# Plotting the results
plt.plot(k_values, filter_accuracies, marker='o', label='Filter Method (ANOVA F-test)')
plt.xlabel('Number of Features')
plt.ylabel('Accuracy')
plt.title('Filter Method Feature Selection')
plt.legend()
plt.grid(True)
plt.show()

print(f"Best accuray Score: {best_score}")

TypeError: 'numpy.float64' object is not callable

In [17]:
# Choose the number of features with the best accuracy
best_k = k_values[filter_accuracies.index(max(filter_accuracies))]

# Re-run the selection process for the best k
selector = SelectKBest(f_classif, k=best_k)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Retrieve the selected feature names
selected_features = X_train.columns[selector.get_support()]

print("Selected features for k =", best_k, ":\n", selected_features)

Selected features for k = 32 :
 Index(['player_rating_home_player_7', 'player_rating_home_player_8',
       'player_rating_home_player_9', 'player_rating_home_player_10',
       'player_rating_home_player_11', 'player_rating_away_player_7',
       'player_rating_away_player_8', 'player_rating_away_player_9',
       'player_rating_away_player_10', 'player_rating_away_player_11',
       'home_avg_goals_scored', 'away_avg_goals_scored',
       'home_avg_goals_conceded', 'home_sum_points', 'away_sum_points',
       'points_diff', 'avg_player_rating_home', 'avg_player_rating_away',
       'diff_player_7', 'diff_player_8', 'diff_player_9', 'diff_player_10',
       'diff_player_11', 'new_feature_2', 'new_feature_3', 'new_feature_4',
       'new_feature_5', 'goal_difference_rolling',
       'shots_on_target_difference_rolling', 'goals_ratio_rolling',
       'shots_on_target_ratio_rolling', 'cluster'],
      dtype='object')


In [18]:
# Assuming 'result_match' is the target variable
X = df[selected_features]
y = df['result_match']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Define a range for n_features_to_select
feature_range = range(6, X_train.shape[1], 2)

best_score = 0
best_n_features = 0

# Iterate over the range
for n_features in feature_range:
    model = XGBClassifier(random_state=42)

    # Create the RFE model and select n features
    rfe = RFE(estimator=model, n_features_to_select=n_features)
    rfe = rfe.fit(X_train, y_train)

    # Transform the training and testing sets
    X_train_rfe = rfe.transform(X_train)
    X_test_rfe = rfe.transform(X_test)

    # Train the model on the reduced dataset
    model.fit(X_train_rfe, y_train)

    # Make predictions and evaluate using F1 score
    y_pred = model.predict(X_test_rfe)
    score = f1_score(y_test, y_pred, average='weighted')

    # Compare and store the best score and corresponding number of features
    if score > best_score:
        best_score = score
        best_n_features = n_features

print(f"Best F1 Score: {best_score}")
print(f"Best number of features: {best_n_features}")

Best F1 Score: 0.5089454313605335
Best number of features: 10


In [36]:
X = df.drop('result_match', axis=1)
y = df['result_match']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Initialize the XGBClassifier
xgb_full = XGBClassifier(random_state=42)

# Train the model on the full feature set
xgb_full.fit(X_train_full, y_train)

# Predictions and evaluation on the full feature set
y_pred_full = xgb_full.predict(X_test_full)
f1_score = f1_score(y_test, y_pred_full,average='weighted')

f1_score

TypeError: 'numpy.float64' object is not callable

In [18]:
X = df[selected_features]
y = df['result_match']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Initialize the XGBClassifier
xgb_full = XGBClassifier(random_state=42)

# Train the model on the full feature set
xgb_full.fit(X_train_full, y_train)

# Predictions and evaluation on the full feature set
y_pred_full = xgb_full.predict(X_test_full)
f1_score = f1_score(y_test, y_pred_full,average='weighted')

classification_report()

f1_score

0.5032894736842105