# Modelling Appendix

To get the individual breakdown of all models' performance on each role, we run the code with y as a single column rather than as a whole in 02-modelling. This allows us to store all the results for each role as an individual dataframe.

### Contents

- [Traditional Keeper](#Traditional-Keeper)
- [Sweeper Keeper](#Sweeper-Keeper)
- [Ball-Playing Defender](#Ball-Playing-Defender)
- [No-Nonsense Defender](#No-Nonsense-Defender)
- [Full-Back](#Full-Back)
- [All-Action Midfielder](#All-Action-Midfielder)
- [Midfield Playmaker](#Midfield-Playmaker)
- [Traditional Winger](#Traditional-Winger)
- [Inverted Winger](#Inverted-Winger)
- [Goal Poacher](#Goal-Poacher)
- [Target Man](#Target-Man)

## Traditional Keeper

In [14]:
import pandas as pd
from pycaret.classification import setup, compare_models, predict_model, finalize_model, pull
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import joblib

# Load the CSV file into a DataFrame
df = pd.read_csv('../data/players_df_sin_reco.csv')

# Select features (columns 7 to 20, 0-based index: 6 to 19)
X = df.iloc[:, 6:20]

# Select the 11th column from the end as the target
y = df.iloc[:, -11]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an empty DataFrame to store results
traditionalkeeper_df = pd.DataFrame(columns=['model', 'train_accuracy', 'test_accuracy', 'train_f1', 'test_f1', 'TT'])

# Concatenate X_train and y_train into a DataFrame for PyCaret setup
df_target = pd.concat([X_train, y_train], axis=1)

# Setup PyCaret environment
clf = setup(data=df_target, target=y.name, session_id=42)

# Compare different models and store the results
best_model = compare_models()  # This returns the top model
compare_results = pull()  # Pulls all model results as a DataFrame

# Define the mapping from model names to PyCaret identifiers
model_mapping = {
    'K Neighbors Classifier': 'knn',
    'Random Forest Classifier': 'rf',
    'Logistic Regression': 'lr',
    'Decision Tree Classifier': 'dt',
    'Ada Boost Classifier': 'ada',
    'Gradient Boosting Classifier': 'gbc',
    'Naive Bayes': 'nb',
    'SVM - Linear Kernel': 'svm',
    'Extra Trees Classifier': 'et',
    'Light Gradient Boosting Machine': 'lightgbm',
    'Ridge Classifier': 'ridge',
    'Linear Discriminant Analysis': 'lda',
    'Extreme Gradient Boosting': 'xgboost',
    'Dummy Classifier': 'dummy'
}

# Loop over each model in the compare_results DataFrame
for idx, row in compare_results.iterrows():
    model_name = row['Model']
    testing_time = row['TT (Sec)']

    # Get the PyCaret identifier for the model
    model_identifier = model_mapping.get(model_name)

    # Instead of finalizing by the model name, select the model by identifier
    if model_identifier:
        best_model = compare_models(include=[model_identifier], n_select=1)

        # Finalize the model (train it on the entire dataset)
        final_model = finalize_model(best_model)

        # Predict on train and test sets
        predictions_train = predict_model(final_model, data=X_train)
        predictions_test = predict_model(final_model, data=X_test)

        # Calculate test accuracy and F1 score
        y_pred_test = predictions_test['prediction_label']
        test_accuracy = accuracy_score(y_test, y_pred_test)
        test_f1 = f1_score(y_test, y_pred_test, average='weighted')

        # Calculate train accuracy and F1 score
        y_pred_train = predictions_train['prediction_label']
        train_accuracy = accuracy_score(y_train, y_pred_train)
        train_f1 = f1_score(y_train, y_pred_train, average='weighted')

        # Append the metrics for each model to the results DataFrame
        new_row = pd.DataFrame({
            'model': model_name,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'train_f1': train_f1,
            'test_f1': test_f1,
            'TT': testing_time
        }, index=[0])

        traditionalkeeper_df = pd.concat([traditionalkeeper_df, new_row], ignore_index=True)


Unnamed: 0,Description,Value
0,Session id,42
1,Target,Class_Traditional Keeper
2,Target type,Binary
3,Original data shape,"(91, 15)"
4,Transformed data shape,"(91, 15)"
5,Transformed train set shape,"(63, 15)"
6,Transformed test set shape,"(28, 15)"
7,Numeric features,14
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9833,0.9,0.9,0.85,0.8667,,0.8632,0.022
dt,Decision Tree Classifier,0.9833,0.89,0.9,0.85,0.8667,,0.8632,0.014
ridge,Ridge Classifier,0.9833,0.88,0.9,0.85,0.8667,,0.8632,0.016
rf,Random Forest Classifier,0.9833,0.9,0.8,0.8,0.8,,0.8,0.095
lda,Linear Discriminant Analysis,0.9833,0.88,0.9,0.85,0.8667,,0.8632,0.015
ada,Ada Boost Classifier,0.9667,0.89,0.8,0.75,0.7667,,0.7632,0.046
gbc,Gradient Boosting Classifier,0.9667,0.89,0.8,0.75,0.7667,,0.7632,0.046
et,Extra Trees Classifier,0.9667,0.9,0.8,0.75,0.7667,,0.7632,0.072
xgboost,Extreme Gradient Boosting,0.9667,0.9,0.8,0.75,0.7667,,0.7632,0.151
lr,Logistic Regression,0.9524,0.88,0.8,0.7,0.7333,,0.7278,1.628


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9833,0.9,0.9,0.85,0.8667,,0.8632,0.025


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.9833,0.89,0.9,0.85,0.8667,,0.8632,0.018


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.9833,0.88,0.9,0.85,0.8667,,0.8632,0.016


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9833,0.9,0.8,0.8,0.8,,0.8,0.098


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9833,0.88,0.9,0.85,0.8667,,0.8632,0.015


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.9667,0.89,0.8,0.75,0.7667,,0.7632,0.052


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.9667,0.89,0.8,0.75,0.7667,,0.7632,0.047


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9667,0.9,0.8,0.75,0.7667,,0.7632,0.077


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9667,0.9,0.8,0.75,0.7667,,0.7632,0.035


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9524,0.88,0.8,0.7,0.7333,,0.7278,0.023


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.9524,0.89,0.9,0.75,0.8,,0.791,0.015


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.9524,0.9,0.7,0.65,0.6667,,0.6632,0.02


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9524,0.8433,0.6,0.6,0.6,,0.6,0.038


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.8571,0.45,0.0,0.0,0.0,,0.0,0.016


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

In [13]:
traditionalkeeper_df

Unnamed: 0,model,train_accuracy,test_accuracy,train_f1,test_f1,TT
0,K Neighbors Classifier,0.978022,1.0,0.978664,1.0,0.034
1,Decision Tree Classifier,1.0,1.0,1.0,1.0,0.016
2,Ridge Classifier,0.978022,1.0,0.978664,1.0,0.015
3,Random Forest Classifier,1.0,1.0,1.0,1.0,0.096
4,Linear Discriminant Analysis,0.978022,1.0,0.978664,1.0,0.015
5,Ada Boost Classifier,1.0,1.0,1.0,1.0,0.058
6,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,0.08
7,Extra Trees Classifier,1.0,1.0,1.0,1.0,0.083
8,Extreme Gradient Boosting,1.0,1.0,1.0,1.0,0.025
9,Logistic Regression,1.0,1.0,1.0,1.0,0.03


## Sweeper Keeper

In [15]:
import pandas as pd
from pycaret.classification import setup, compare_models, predict_model, finalize_model, pull
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import joblib

# Load the CSV file into a DataFrame
df = pd.read_csv('../data/players_df_sin_reco.csv')

# Select features (columns 7 to 20, 0-based index: 6 to 19)
X = df.iloc[:, 6:20]

# Select the 10th column from the end as the target
y = df.iloc[:, -10]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an empty DataFrame to store results
sweeperkeeper_df = pd.DataFrame(columns=['model', 'train_accuracy', 'test_accuracy', 'train_f1', 'test_f1', 'TT'])

# Concatenate X_train and y_train into a DataFrame for PyCaret setup
df_target = pd.concat([X_train, y_train], axis=1)

# Setup PyCaret environment
clf = setup(data=df_target, target=y.name, session_id=42)

# Compare different models and store the results
best_model = compare_models()  # This returns the top model
compare_results = pull()  # Pulls all model results as a DataFrame

# Define the mapping from model names to PyCaret identifiers
model_mapping = {
    'K Neighbors Classifier': 'knn',
    'Random Forest Classifier': 'rf',
    'Logistic Regression': 'lr',
    'Decision Tree Classifier': 'dt',
    'Ada Boost Classifier': 'ada',
    'Gradient Boosting Classifier': 'gbc',
    'Naive Bayes': 'nb',
    'SVM - Linear Kernel': 'svm',
    'Extra Trees Classifier': 'et',
    'Light Gradient Boosting Machine': 'lightgbm',
    'Ridge Classifier': 'ridge',
    'Linear Discriminant Analysis': 'lda',
    'Extreme Gradient Boosting': 'xgboost',
    'Dummy Classifier': 'dummy'
}

# Loop over each model in the compare_results DataFrame
for idx, row in compare_results.iterrows():
    model_name = row['Model']
    testing_time = row['TT (Sec)']

    # Get the PyCaret identifier for the model
    model_identifier = model_mapping.get(model_name)

    # Instead of finalizing by the model name, select the model by identifier
    if model_identifier:
        best_model = compare_models(include=[model_identifier], n_select=1)

        # Finalize the model (train it on the entire dataset)
        final_model = finalize_model(best_model)

        # Predict on train and test sets
        predictions_train = predict_model(final_model, data=X_train)
        predictions_test = predict_model(final_model, data=X_test)

        # Calculate test accuracy and F1 score
        y_pred_test = predictions_test['prediction_label']
        test_accuracy = accuracy_score(y_test, y_pred_test)
        test_f1 = f1_score(y_test, y_pred_test, average='weighted')

        # Calculate train accuracy and F1 score
        y_pred_train = predictions_train['prediction_label']
        train_accuracy = accuracy_score(y_train, y_pred_train)
        train_f1 = f1_score(y_train, y_pred_train, average='weighted')

        # Append the metrics for each model to the results DataFrame
        new_row = pd.DataFrame({
            'model': model_name,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'train_f1': train_f1,
            'test_f1': test_f1,
            'TT': testing_time
        }, index=[0])

        sweeperkeeper_df = pd.concat([sweeperkeeper_df, new_row], ignore_index=True)


Unnamed: 0,Description,Value
0,Session id,42
1,Target,Class_Sweeper Keeper
2,Target type,Binary
3,Original data shape,"(91, 15)"
4,Transformed data shape,"(91, 15)"
5,Transformed train set shape,"(63, 15)"
6,Transformed test set shape,"(28, 15)"
7,Numeric features,14
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9833,0.9875,0.95,1.0,0.9667,0.9571,0.9632,1.373
et,Extra Trees Classifier,0.9548,0.9875,0.85,1.0,0.9,0.8748,0.8923,0.11
nb,Naive Bayes,0.9238,0.96,0.75,1.0,0.8333,0.7908,0.8201,0.014
rf,Random Forest Classifier,0.9238,0.9875,0.75,1.0,0.8333,0.7908,0.8201,0.098
knn,K Neighbors Classifier,0.9214,0.9688,0.75,0.9,0.8,0.7748,0.7923,0.025
ridge,Ridge Classifier,0.9214,0.9475,0.8,0.95,0.85,0.7998,0.8173,0.018
lda,Linear Discriminant Analysis,0.9214,0.9475,0.8,0.95,0.85,0.7998,0.8173,0.02
xgboost,Extreme Gradient Boosting,0.9071,0.95,0.8,0.9167,0.8133,0.7574,0.7909,0.25
svm,SVM - Linear Kernel,0.8905,0.9675,0.85,0.75,0.7733,0.716,0.7399,0.013
lightgbm,Light Gradient Boosting Machine,0.8881,0.92,0.8,0.85,0.7667,0.7034,0.7453,0.037


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9833,0.9875,0.95,1.0,0.9667,0.9571,0.9632,0.027


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9548,0.9875,0.85,1.0,0.9,0.8748,0.8923,0.082


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.9238,0.96,0.75,1.0,0.8333,0.7908,0.8201,0.018


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9238,0.9875,0.75,1.0,0.8333,0.7908,0.8201,0.082


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9214,0.9688,0.75,0.9,0.8,0.7748,0.7923,0.019


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.9214,0.9475,0.8,0.95,0.85,0.7998,0.8173,0.019


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9214,0.9475,0.8,0.95,0.85,0.7998,0.8173,0.019


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9071,0.95,0.8,0.9167,0.8133,0.7574,0.7909,0.028


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.8905,0.9675,0.85,0.75,0.7733,0.716,0.7399,0.017


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8881,0.92,0.8,0.85,0.7667,0.7034,0.7453,0.033


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.869,0.9375,0.85,0.8,0.7867,0.6969,0.725,0.046


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8381,0.7737,0.7,0.7,0.6533,0.5557,0.5895,0.058


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.7905,0.7175,0.55,0.5333,0.5067,0.4017,0.4227,0.013


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.731,0.5,0.0,0.0,0.0,0.0,0.0,0.012


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

In [16]:
sweeperkeeper_df

Unnamed: 0,model,train_accuracy,test_accuracy,train_f1,test_f1,TT
0,Logistic Regression,1.0,1.0,1.0,1.0,1.373
1,Extra Trees Classifier,1.0,0.913043,1.0,0.900725,0.11
2,Naive Bayes,0.934066,0.913043,0.930769,0.900725,0.014
3,Random Forest Classifier,1.0,0.913043,1.0,0.900725,0.098
4,K Neighbors Classifier,0.989011,0.956522,0.988935,0.953974,0.025
5,Ridge Classifier,0.945055,0.913043,0.943825,0.900725,0.018
6,Linear Discriminant Analysis,0.945055,0.913043,0.943825,0.900725,0.02
7,Extreme Gradient Boosting,1.0,0.913043,1.0,0.900725,0.25
8,SVM - Linear Kernel,0.956044,0.869565,0.956044,0.861921,0.013
9,Light Gradient Boosting Machine,1.0,0.913043,1.0,0.900725,0.037


## Ball-Playing Defender

In [17]:
import pandas as pd
from pycaret.classification import setup, compare_models, predict_model, finalize_model, pull
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import joblib

# Load the CSV file into a DataFrame
df = pd.read_csv('../data/players_df_sin_reco.csv')

# Select features (columns 7 to 20, 0-based index: 6 to 19)
X = df.iloc[:, 6:20]

# Select the 9th column from the end as the target
y = df.iloc[:, -9]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an empty DataFrame to store results
ballplayingdefender_df = pd.DataFrame(columns=['model', 'train_accuracy', 'test_accuracy', 'train_f1', 'test_f1', 'TT'])

# Concatenate X_train and y_train into a DataFrame for PyCaret setup
df_target = pd.concat([X_train, y_train], axis=1)

# Setup PyCaret environment
clf = setup(data=df_target, target=y.name, session_id=42)

# Compare different models and store the results
best_model = compare_models()  # This returns the top model
compare_results = pull()  # Pulls all model results as a DataFrame

# Define the mapping from model names to PyCaret identifiers
model_mapping = {
    'K Neighbors Classifier': 'knn',
    'Random Forest Classifier': 'rf',
    'Logistic Regression': 'lr',
    'Decision Tree Classifier': 'dt',
    'Ada Boost Classifier': 'ada',
    'Gradient Boosting Classifier': 'gbc',
    'Naive Bayes': 'nb',
    'SVM - Linear Kernel': 'svm',
    'Extra Trees Classifier': 'et',
    'Light Gradient Boosting Machine': 'lightgbm',
    'Ridge Classifier': 'ridge',
    'Linear Discriminant Analysis': 'lda',
    'Extreme Gradient Boosting': 'xgboost',
    'Dummy Classifier': 'dummy'
}

# Loop over each model in the compare_results DataFrame
for idx, row in compare_results.iterrows():
    model_name = row['Model']
    testing_time = row['TT (Sec)']

    # Get the PyCaret identifier for the model
    model_identifier = model_mapping.get(model_name)

    # Instead of finalizing by the model name, select the model by identifier
    if model_identifier:
        best_model = compare_models(include=[model_identifier], n_select=1)

        # Finalize the model (train it on the entire dataset)
        final_model = finalize_model(best_model)

        # Predict on train and test sets
        predictions_train = predict_model(final_model, data=X_train)
        predictions_test = predict_model(final_model, data=X_test)

        # Calculate test accuracy and F1 score
        y_pred_test = predictions_test['prediction_label']
        test_accuracy = accuracy_score(y_test, y_pred_test)
        test_f1 = f1_score(y_test, y_pred_test, average='weighted')

        # Calculate train accuracy and F1 score
        y_pred_train = predictions_train['prediction_label']
        train_accuracy = accuracy_score(y_train, y_pred_train)
        train_f1 = f1_score(y_train, y_pred_train, average='weighted')

        # Append the metrics for each model to the results DataFrame
        new_row = pd.DataFrame({
            'model': model_name,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'train_f1': train_f1,
            'test_f1': test_f1,
            'TT': testing_time
        }, index=[0])

        ballplayingdefender_df = pd.concat([ballplayingdefender_df, new_row], ignore_index=True)


Unnamed: 0,Description,Value
0,Session id,42
1,Target,Class_Ball Playing Defender
2,Target type,Binary
3,Original data shape,"(91, 15)"
4,Transformed data shape,"(91, 15)"
5,Transformed train set shape,"(63, 15)"
6,Transformed test set shape,"(28, 15)"
7,Numeric features,14
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9833,1.0,1.0,0.9833,0.9909,0.9,0.9,0.065
ridge,Ridge Classifier,0.95,1.0,0.975,0.9667,0.9675,0.7667,0.7707,0.018
lda,Linear Discriminant Analysis,0.95,1.0,0.975,0.9667,0.9675,0.7667,0.7707,0.017
knn,K Neighbors Classifier,0.9357,0.985,0.98,0.9467,0.9596,0.7267,0.7363,0.025
rf,Random Forest Classifier,0.9357,1.0,0.98,0.95,0.9598,0.7696,0.773,0.104
lightgbm,Light Gradient Boosting Machine,0.9357,1.0,0.955,0.9633,0.9544,0.7934,0.807,0.051
lr,Logistic Regression,0.9333,1.0,0.975,0.95,0.9584,0.6667,0.6707,0.02
nb,Naive Bayes,0.9333,0.93,0.955,0.96,0.9546,0.8038,0.814,0.024
ada,Ada Boost Classifier,0.9333,0.99,0.975,0.9467,0.9564,0.7238,0.734,0.061
xgboost,Extreme Gradient Boosting,0.9214,1.0,0.955,0.9467,0.9453,0.7522,0.7715,0.03


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9833,1.0,1.0,0.9833,0.9909,0.9,0.9,0.09


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.95,1.0,0.975,0.9667,0.9675,0.7667,0.7707,0.025


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.95,1.0,0.975,0.9667,0.9675,0.7667,0.7707,0.017


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9357,0.985,0.98,0.9467,0.9596,0.7267,0.7363,0.029


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9357,1.0,0.98,0.95,0.9598,0.7696,0.773,0.095


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9357,1.0,0.955,0.9633,0.9544,0.7934,0.807,0.054


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9333,1.0,0.975,0.95,0.9584,0.6667,0.6707,0.017


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.9333,0.93,0.955,0.96,0.9546,0.8038,0.814,0.013


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.9333,0.99,0.975,0.9467,0.9564,0.7238,0.734,0.048


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9214,1.0,0.955,0.9467,0.9453,0.7522,0.7715,0.026


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.9167,1.0,0.955,0.95,0.9473,0.6238,0.634,0.018


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.8738,0.83,0.91,0.925,0.9112,0.636,0.6611,0.014


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8262,0.8838,0.89,0.8981,0.8814,0.4998,0.5214,0.047


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.7643,0.5,1.0,0.7643,0.8645,0.0,0.0,0.012


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

In [18]:
ballplayingdefender_df

Unnamed: 0,model,train_accuracy,test_accuracy,train_f1,test_f1,TT
0,Extra Trees Classifier,1.0,0.956522,1.0,0.960339,0.065
1,Ridge Classifier,0.978022,0.956522,0.977656,0.960339,0.018
2,Linear Discriminant Analysis,0.978022,0.956522,0.977656,0.960339,0.017
3,K Neighbors Classifier,0.956044,0.956522,0.954465,0.960339,0.025
4,Random Forest Classifier,1.0,0.956522,1.0,0.960339,0.104
5,Light Gradient Boosting Machine,1.0,0.956522,1.0,0.960339,0.051
6,Logistic Regression,1.0,0.956522,1.0,0.960339,0.02
7,Naive Bayes,0.956044,0.956522,0.956676,0.960339,0.024
8,Ada Boost Classifier,1.0,0.956522,1.0,0.960339,0.061
9,Extreme Gradient Boosting,1.0,0.956522,1.0,0.960339,0.03


## No-Nonsense Defender

In [19]:
import pandas as pd
from pycaret.classification import setup, compare_models, predict_model, finalize_model, pull
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import joblib

# Load the CSV file into a DataFrame
df = pd.read_csv('../data/players_df_sin_reco.csv')

# Select features (columns 7 to 20, 0-based index: 6 to 19)
X = df.iloc[:, 6:20]

# Select the 8th column from the end as the target
y = df.iloc[:, -8]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an empty DataFrame to store results
nononsensedefender_df = pd.DataFrame(columns=['model', 'train_accuracy', 'test_accuracy', 'train_f1', 'test_f1', 'TT'])

# Concatenate X_train and y_train into a DataFrame for PyCaret setup
df_target = pd.concat([X_train, y_train], axis=1)

# Setup PyCaret environment
clf = setup(data=df_target, target=y.name, session_id=42)

# Compare different models and store the results
best_model = compare_models()  # This returns the top model
compare_results = pull()  # Pulls all model results as a DataFrame

# Define the mapping from model names to PyCaret identifiers
model_mapping = {
    'K Neighbors Classifier': 'knn',
    'Random Forest Classifier': 'rf',
    'Logistic Regression': 'lr',
    'Decision Tree Classifier': 'dt',
    'Ada Boost Classifier': 'ada',
    'Gradient Boosting Classifier': 'gbc',
    'Naive Bayes': 'nb',
    'SVM - Linear Kernel': 'svm',
    'Extra Trees Classifier': 'et',
    'Light Gradient Boosting Machine': 'lightgbm',
    'Ridge Classifier': 'ridge',
    'Linear Discriminant Analysis': 'lda',
    'Extreme Gradient Boosting': 'xgboost',
    'Dummy Classifier': 'dummy'
}

# Loop over each model in the compare_results DataFrame
for idx, row in compare_results.iterrows():
    model_name = row['Model']
    testing_time = row['TT (Sec)']

    # Get the PyCaret identifier for the model
    model_identifier = model_mapping.get(model_name)

    # Instead of finalizing by the model name, select the model by identifier
    if model_identifier:
        best_model = compare_models(include=[model_identifier], n_select=1)

        # Finalize the model (train it on the entire dataset)
        final_model = finalize_model(best_model)

        # Predict on train and test sets
        predictions_train = predict_model(final_model, data=X_train)
        predictions_test = predict_model(final_model, data=X_test)

        # Calculate test accuracy and F1 score
        y_pred_test = predictions_test['prediction_label']
        test_accuracy = accuracy_score(y_test, y_pred_test)
        test_f1 = f1_score(y_test, y_pred_test, average='weighted')

        # Calculate train accuracy and F1 score
        y_pred_train = predictions_train['prediction_label']
        train_accuracy = accuracy_score(y_train, y_pred_train)
        train_f1 = f1_score(y_train, y_pred_train, average='weighted')

        # Append the metrics for each model to the results DataFrame
        new_row = pd.DataFrame({
            'model': model_name,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'train_f1': train_f1,
            'test_f1': test_f1,
            'TT': testing_time
        }, index=[0])

        nononsensedefender_df = pd.concat([nononsensedefender_df, new_row], ignore_index=True)


Unnamed: 0,Description,Value
0,Session id,42
1,Target,Class_No Nonsense Defender
2,Target type,Binary
3,Original data shape,"(91, 15)"
4,Transformed data shape,"(91, 15)"
5,Transformed train set shape,"(63, 15)"
6,Transformed test set shape,"(28, 15)"
7,Numeric features,14
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.969,1.0,0.98,0.98,0.9778,0.9267,0.9363,0.03
et,Extra Trees Classifier,0.9381,1.0,0.975,0.9433,0.9544,0.8451,0.866,0.079
lightgbm,Light Gradient Boosting Machine,0.9381,0.9575,0.975,0.9433,0.9544,0.8451,0.866,0.036
ada,Ada Boost Classifier,0.9238,0.975,0.955,0.9433,0.9433,0.8147,0.8391,0.066
xgboost,Extreme Gradient Boosting,0.9238,0.9792,0.975,0.9183,0.9437,0.8148,0.8327,0.021
knn,K Neighbors Classifier,0.9167,0.9875,0.975,0.92,0.9413,0.7952,0.8237,0.028
ridge,Ridge Classifier,0.9048,0.9875,0.955,0.9183,0.9326,0.7677,0.7891,0.017
lda,Linear Discriminant Analysis,0.9048,0.9875,0.955,0.9183,0.9326,0.7677,0.7891,0.013
rf,Random Forest Classifier,0.8905,0.99,0.93,0.9233,0.9179,0.7385,0.773,0.122
nb,Naive Bayes,0.8738,0.9238,0.905,0.925,0.9062,0.6921,0.7083,0.014


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.969,1.0,0.98,0.98,0.9778,0.9267,0.9363,0.041


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9381,1.0,0.975,0.9433,0.9544,0.8451,0.866,0.069


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9381,0.9575,0.975,0.9433,0.9544,0.8451,0.866,0.036


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.9238,0.975,0.955,0.9433,0.9433,0.8147,0.8391,0.066


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9238,0.9792,0.975,0.9183,0.9437,0.8148,0.8327,0.036


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9167,0.9875,0.975,0.92,0.9413,0.7952,0.8237,0.022


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.9048,0.9875,0.955,0.9183,0.9326,0.7677,0.7891,0.02


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9048,0.9875,0.955,0.9183,0.9326,0.7677,0.7891,0.018


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8905,0.99,0.93,0.9233,0.9179,0.7385,0.773,0.089


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.8738,0.9238,0.905,0.925,0.9062,0.6921,0.7083,0.014


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8738,0.9288,0.905,0.9233,0.9036,0.7052,0.7437,0.068


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.8548,0.9875,0.955,0.87,0.9011,0.6236,0.6613,0.025


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.7929,0.7675,0.835,0.8733,0.8397,0.5317,0.5755,0.016


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.6667,0.5,1.0,0.6667,0.7994,0.0,0.0,0.012


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

In [20]:
nononsensedefender_df

Unnamed: 0,model,train_accuracy,test_accuracy,train_f1,test_f1,TT
0,Logistic Regression,1.0,0.956522,1.0,0.954694,0.03
1,Extra Trees Classifier,1.0,0.826087,1.0,0.808924,0.079
2,Light Gradient Boosting Machine,1.0,0.826087,1.0,0.808924,0.036
3,Ada Boost Classifier,1.0,0.869565,1.0,0.846632,0.066
4,Extreme Gradient Boosting,1.0,0.869565,1.0,0.846632,0.021
5,K Neighbors Classifier,0.901099,0.782609,0.897531,0.773469,0.028
6,Ridge Classifier,0.956044,0.869565,0.955672,0.864081,0.017
7,Linear Discriminant Analysis,0.956044,0.869565,0.955672,0.864081,0.013
8,Random Forest Classifier,1.0,0.826087,1.0,0.808924,0.122
9,Naive Bayes,0.912088,0.782609,0.912717,0.773469,0.014


## Full-Back

In [21]:
import pandas as pd
from pycaret.classification import setup, compare_models, predict_model, finalize_model, pull
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import joblib

# Load the CSV file into a DataFrame
df = pd.read_csv('../data/players_df_sin_reco.csv')

# Select features (columns 7 to 20, 0-based index: 6 to 19)
X = df.iloc[:, 6:20]

# Select the 7th column from the end as the target
y = df.iloc[:, -7]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an empty DataFrame to store results
fullback_df = pd.DataFrame(columns=['model', 'train_accuracy', 'test_accuracy', 'train_f1', 'test_f1', 'TT'])

# Concatenate X_train and y_train into a DataFrame for PyCaret setup
df_target = pd.concat([X_train, y_train], axis=1)

# Setup PyCaret environment
clf = setup(data=df_target, target=y.name, session_id=42)

# Compare different models and store the results
best_model = compare_models()  # This returns the top model
compare_results = pull()  # Pulls all model results as a DataFrame

# Define the mapping from model names to PyCaret identifiers
model_mapping = {
    'K Neighbors Classifier': 'knn',
    'Random Forest Classifier': 'rf',
    'Logistic Regression': 'lr',
    'Decision Tree Classifier': 'dt',
    'Ada Boost Classifier': 'ada',
    'Gradient Boosting Classifier': 'gbc',
    'Naive Bayes': 'nb',
    'SVM - Linear Kernel': 'svm',
    'Extra Trees Classifier': 'et',
    'Light Gradient Boosting Machine': 'lightgbm',
    'Ridge Classifier': 'ridge',
    'Linear Discriminant Analysis': 'lda',
    'Extreme Gradient Boosting': 'xgboost',
    'Dummy Classifier': 'dummy'
}

# Loop over each model in the compare_results DataFrame
for idx, row in compare_results.iterrows():
    model_name = row['Model']
    testing_time = row['TT (Sec)']

    # Get the PyCaret identifier for the model
    model_identifier = model_mapping.get(model_name)

    # Instead of finalizing by the model name, select the model by identifier
    if model_identifier:
        best_model = compare_models(include=[model_identifier], n_select=1)

        # Finalize the model (train it on the entire dataset)
        final_model = finalize_model(best_model)

        # Predict on train and test sets
        predictions_train = predict_model(final_model, data=X_train)
        predictions_test = predict_model(final_model, data=X_test)

        # Calculate test accuracy and F1 score
        y_pred_test = predictions_test['prediction_label']
        test_accuracy = accuracy_score(y_test, y_pred_test)
        test_f1 = f1_score(y_test, y_pred_test, average='weighted')

        # Calculate train accuracy and F1 score
        y_pred_train = predictions_train['prediction_label']
        train_accuracy = accuracy_score(y_train, y_pred_train)
        train_f1 = f1_score(y_train, y_pred_train, average='weighted')

        # Append the metrics for each model to the results DataFrame
        new_row = pd.DataFrame({
            'model': model_name,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'train_f1': train_f1,
            'test_f1': test_f1,
            'TT': testing_time
        }, index=[0])

        fullback_df = pd.concat([fullback_df, new_row], ignore_index=True)


Unnamed: 0,Description,Value
0,Session id,42
1,Target,Class_Full Back
2,Target type,Binary
3,Original data shape,"(91, 15)"
4,Transformed data shape,"(91, 15)"
5,Transformed train set shape,"(63, 15)"
6,Transformed test set shape,"(28, 15)"
7,Numeric features,14
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9214,0.9675,0.955,0.95,0.9455,0.7951,0.8083,0.034
ridge,Ridge Classifier,0.8881,0.955,0.955,0.9033,0.9211,0.7236,0.7613,0.019
et,Extra Trees Classifier,0.8738,0.95,0.955,0.8914,0.9135,0.6648,0.6967,0.108
lda,Linear Discriminant Analysis,0.8714,0.955,0.93,0.8983,0.9072,0.6915,0.723,0.013
knn,K Neighbors Classifier,0.869,0.9938,0.955,0.8933,0.9124,0.6505,0.6702,0.022
rf,Random Forest Classifier,0.8405,0.9438,0.955,0.8648,0.8957,0.5505,0.5702,0.091
ada,Ada Boost Classifier,0.8405,0.8875,0.93,0.865,0.8894,0.5948,0.6168,0.058
nb,Naive Bayes,0.8381,0.89,0.88,0.8767,0.8719,0.6427,0.6641,0.016
xgboost,Extreme Gradient Boosting,0.8381,0.9025,0.93,0.8717,0.8894,0.5772,0.5965,0.029
lightgbm,Light Gradient Boosting Machine,0.8071,0.9125,0.905,0.8514,0.864,0.4934,0.507,0.031


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9214,0.9675,0.955,0.95,0.9455,0.7951,0.8083,0.035


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.8881,0.955,0.955,0.9033,0.9211,0.7236,0.7613,0.014


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8738,0.95,0.955,0.8914,0.9135,0.6648,0.6967,0.078


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.8714,0.955,0.93,0.8983,0.9072,0.6915,0.723,0.024


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.869,0.9938,0.955,0.8933,0.9124,0.6505,0.6702,0.032


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8405,0.9438,0.955,0.8648,0.8957,0.5505,0.5702,0.087


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.8405,0.8875,0.93,0.865,0.8894,0.5948,0.6168,0.048


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.8381,0.89,0.88,0.8767,0.8719,0.6427,0.6641,0.02


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.8381,0.9025,0.93,0.8717,0.8894,0.5772,0.5965,0.036


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8071,0.9125,0.905,0.8514,0.864,0.4934,0.507,0.033


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.8048,0.7675,0.885,0.8567,0.8606,0.5247,0.5364,0.019


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.7881,0.8088,0.88,0.8217,0.837,0.4905,0.5008,0.058


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.7214,0.9275,0.65,0.8633,0.6992,0.5114,0.5646,0.015


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.6976,0.5,1.0,0.6976,0.8209,0.0,0.0,0.015


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

In [22]:
fullback_df

Unnamed: 0,model,train_accuracy,test_accuracy,train_f1,test_f1,TT
0,Logistic Regression,1.0,0.956522,1.0,0.952704,0.034
1,Ridge Classifier,0.967033,0.913043,0.966441,0.913043,0.019
2,Extra Trees Classifier,1.0,0.913043,1.0,0.913043,0.108
3,Linear Discriminant Analysis,0.967033,0.913043,0.966441,0.913043,0.013
4,K Neighbors Classifier,0.901099,0.913043,0.899323,0.913043,0.022
5,Random Forest Classifier,1.0,0.913043,1.0,0.913043,0.091
6,Ada Boost Classifier,1.0,0.956522,1.0,0.952704,0.058
7,Naive Bayes,0.912088,0.869565,0.912088,0.87721,0.016
8,Extreme Gradient Boosting,1.0,0.913043,1.0,0.913043,0.029
9,Light Gradient Boosting Machine,1.0,0.956522,1.0,0.952704,0.031


## All-Action Midfielder

In [23]:
import pandas as pd
from pycaret.classification import setup, compare_models, predict_model, finalize_model, pull
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import joblib

# Load the CSV file into a DataFrame
df = pd.read_csv('../data/players_df_sin_reco.csv')

# Select features (columns 7 to 20, 0-based index: 6 to 19)
X = df.iloc[:, 6:20]

# Select the 6th column from the end as the target
y = df.iloc[:, -6]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an empty DataFrame to store results
allactionmidfielder_df = pd.DataFrame(columns=['model', 'train_accuracy', 'test_accuracy', 'train_f1', 'test_f1', 'TT'])

# Concatenate X_train and y_train into a DataFrame for PyCaret setup
df_target = pd.concat([X_train, y_train], axis=1)

# Setup PyCaret environment
clf = setup(data=df_target, target=y.name, session_id=42)

# Compare different models and store the results
best_model = compare_models()  # This returns the top model
compare_results = pull()  # Pulls all model results as a DataFrame

# Define the mapping from model names to PyCaret identifiers
model_mapping = {
    'K Neighbors Classifier': 'knn',
    'Random Forest Classifier': 'rf',
    'Logistic Regression': 'lr',
    'Decision Tree Classifier': 'dt',
    'Ada Boost Classifier': 'ada',
    'Gradient Boosting Classifier': 'gbc',
    'Naive Bayes': 'nb',
    'SVM - Linear Kernel': 'svm',
    'Extra Trees Classifier': 'et',
    'Light Gradient Boosting Machine': 'lightgbm',
    'Ridge Classifier': 'ridge',
    'Linear Discriminant Analysis': 'lda',
    'Extreme Gradient Boosting': 'xgboost',
    'Dummy Classifier': 'dummy'
}

# Loop over each model in the compare_results DataFrame
for idx, row in compare_results.iterrows():
    model_name = row['Model']
    testing_time = row['TT (Sec)']

    # Get the PyCaret identifier for the model
    model_identifier = model_mapping.get(model_name)

    # Instead of finalizing by the model name, select the model by identifier
    if model_identifier:
        best_model = compare_models(include=[model_identifier], n_select=1)

        # Finalize the model (train it on the entire dataset)
        final_model = finalize_model(best_model)

        # Predict on train and test sets
        predictions_train = predict_model(final_model, data=X_train)
        predictions_test = predict_model(final_model, data=X_test)

        # Calculate test accuracy and F1 score
        y_pred_test = predictions_test['prediction_label']
        test_accuracy = accuracy_score(y_test, y_pred_test)
        test_f1 = f1_score(y_test, y_pred_test, average='weighted')

        # Calculate train accuracy and F1 score
        y_pred_train = predictions_train['prediction_label']
        train_accuracy = accuracy_score(y_train, y_pred_train)
        train_f1 = f1_score(y_train, y_pred_train, average='weighted')

        # Append the metrics for each model to the results DataFrame
        new_row = pd.DataFrame({
            'model': model_name,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'train_f1': train_f1,
            'test_f1': test_f1,
            'TT': testing_time
        }, index=[0])

        allactionmidfielder_df = pd.concat([allactionmidfielder_df, new_row], ignore_index=True)


Unnamed: 0,Description,Value
0,Session id,42
1,Target,Class_All Action Midfielder
2,Target type,Binary
3,Original data shape,"(91, 15)"
4,Transformed data shape,"(91, 15)"
5,Transformed train set shape,"(63, 15)"
6,Transformed test set shape,"(28, 15)"
7,Numeric features,14
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9381,1.0,0.955,0.96,0.9524,0.8559,0.8745,0.033
knn,K Neighbors Classifier,0.9333,0.9375,1.0,0.9267,0.9578,0.8143,0.8265,0.023
ridge,Ridge Classifier,0.9048,0.9792,0.98,0.9,0.9333,0.7677,0.799,0.017
gbc,Gradient Boosting Classifier,0.9048,0.9317,0.905,0.95,0.9246,0.7916,0.798,0.054
lda,Linear Discriminant Analysis,0.9048,0.9792,0.98,0.9,0.9333,0.7677,0.799,0.013
nb,Naive Bayes,0.8905,0.9192,0.885,0.955,0.9103,0.767,0.7887,0.014
dt,Decision Tree Classifier,0.8881,0.8775,0.905,0.93,0.9135,0.7487,0.7613,0.015
xgboost,Extreme Gradient Boosting,0.8738,0.9292,0.93,0.89,0.9056,0.7077,0.7294,0.024
rf,Random Forest Classifier,0.8714,0.9667,0.95,0.87,0.9056,0.6952,0.7196,0.105
lightgbm,Light Gradient Boosting Machine,0.869,0.975,0.98,0.8733,0.9156,0.641,0.6628,0.045


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9381,1.0,0.955,0.96,0.9524,0.8559,0.8745,0.033


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9333,0.9375,1.0,0.9267,0.9578,0.8143,0.8265,0.025


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.9048,0.9792,0.98,0.9,0.9333,0.7677,0.799,0.028


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.9048,0.9317,0.905,0.95,0.9246,0.7916,0.798,0.052


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9048,0.9792,0.98,0.9,0.9333,0.7677,0.799,0.02


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.8905,0.9192,0.885,0.955,0.9103,0.767,0.7887,0.016


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.8881,0.8775,0.905,0.93,0.9135,0.7487,0.7613,0.02


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.8738,0.9292,0.93,0.89,0.9056,0.7077,0.7294,0.04


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8714,0.9667,0.95,0.87,0.9056,0.6952,0.7196,0.102


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.869,0.975,0.98,0.8733,0.9156,0.641,0.6628,0.041


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.8571,0.9667,0.91,0.895,0.8913,0.6717,0.7078,0.049


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8571,0.9417,0.93,0.87,0.8944,0.6648,0.6927,0.065


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.8548,0.9625,0.905,0.8867,0.8873,0.6772,0.7065,0.016


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.6667,0.5,1.0,0.6667,0.7994,0.0,0.0,0.012


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

In [24]:
allactionmidfielder_df

Unnamed: 0,model,train_accuracy,test_accuracy,train_f1,test_f1,TT
0,Logistic Regression,1.0,0.869565,1.0,0.875049,0.033
1,K Neighbors Classifier,0.934066,0.869565,0.932758,0.875049,0.023
2,Ridge Classifier,0.945055,0.869565,0.944268,0.875049,0.017
3,Gradient Boosting Classifier,1.0,0.913043,1.0,0.913043,0.054
4,Linear Discriminant Analysis,0.956044,0.869565,0.955641,0.875049,0.013
5,Naive Bayes,0.923077,0.826087,0.92339,0.826087,0.014
6,Decision Tree Classifier,1.0,0.782609,1.0,0.802936,0.015
7,Extreme Gradient Boosting,1.0,0.869565,1.0,0.861921,0.024
8,Random Forest Classifier,1.0,0.869565,1.0,0.861921,0.105
9,Light Gradient Boosting Machine,1.0,0.869565,1.0,0.861921,0.045


## Midfield Playmaker

In [25]:
import pandas as pd
from pycaret.classification import setup, compare_models, predict_model, finalize_model, pull
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import joblib

# Load the CSV file into a DataFrame
df = pd.read_csv('../data/players_df_sin_reco.csv')

# Select features (columns 7 to 20, 0-based index: 6 to 19)
X = df.iloc[:, 6:20]

# Select the 5thth column from the end as the target
y = df.iloc[:, -5]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an empty DataFrame to store results
midfieldplaymaker_df = pd.DataFrame(columns=['model', 'train_accuracy', 'test_accuracy', 'train_f1', 'test_f1', 'TT'])

# Concatenate X_train and y_train into a DataFrame for PyCaret setup
df_target = pd.concat([X_train, y_train], axis=1)

# Setup PyCaret environment
clf = setup(data=df_target, target=y.name, session_id=42)

# Compare different models and store the results
best_model = compare_models()  # This returns the top model
compare_results = pull()  # Pulls all model results as a DataFrame

# Define the mapping from model names to PyCaret identifiers
model_mapping = {
    'K Neighbors Classifier': 'knn',
    'Random Forest Classifier': 'rf',
    'Logistic Regression': 'lr',
    'Decision Tree Classifier': 'dt',
    'Ada Boost Classifier': 'ada',
    'Gradient Boosting Classifier': 'gbc',
    'Naive Bayes': 'nb',
    'SVM - Linear Kernel': 'svm',
    'Extra Trees Classifier': 'et',
    'Light Gradient Boosting Machine': 'lightgbm',
    'Ridge Classifier': 'ridge',
    'Linear Discriminant Analysis': 'lda',
    'Extreme Gradient Boosting': 'xgboost',
    'Dummy Classifier': 'dummy'
}

# Loop over each model in the compare_results DataFrame
for idx, row in compare_results.iterrows():
    model_name = row['Model']
    testing_time = row['TT (Sec)']

    # Get the PyCaret identifier for the model
    model_identifier = model_mapping.get(model_name)

    # Instead of finalizing by the model name, select the model by identifier
    if model_identifier:
        best_model = compare_models(include=[model_identifier], n_select=1)

        # Finalize the model (train it on the entire dataset)
        final_model = finalize_model(best_model)

        # Predict on train and test sets
        predictions_train = predict_model(final_model, data=X_train)
        predictions_test = predict_model(final_model, data=X_test)

        # Calculate test accuracy and F1 score
        y_pred_test = predictions_test['prediction_label']
        test_accuracy = accuracy_score(y_test, y_pred_test)
        test_f1 = f1_score(y_test, y_pred_test, average='weighted')

        # Calculate train accuracy and F1 score
        y_pred_train = predictions_train['prediction_label']
        train_accuracy = accuracy_score(y_train, y_pred_train)
        train_f1 = f1_score(y_train, y_pred_train, average='weighted')

        # Append the metrics for each model to the results DataFrame
        new_row = pd.DataFrame({
            'model': model_name,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'train_f1': train_f1,
            'test_f1': test_f1,
            'TT': testing_time
        }, index=[0])

        midfieldplaymaker_df = pd.concat([midfieldplaymaker_df, new_row], ignore_index=True)


Unnamed: 0,Description,Value
0,Session id,42
1,Target,Class_Midfield Playmaker
2,Target type,Binary
3,Original data shape,"(91, 15)"
4,Transformed data shape,"(91, 15)"
5,Transformed train set shape,"(63, 15)"
6,Transformed test set shape,"(28, 15)"
7,Numeric features,14
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9571,0.98,0.9633,0.9833,0.9707,0.8872,0.9021,0.026
et,Extra Trees Classifier,0.9571,0.9733,0.9833,0.9714,0.9742,0.8588,0.8645,0.085
xgboost,Extreme Gradient Boosting,0.9429,0.9633,0.9633,0.9714,0.9631,0.8284,0.8376,0.029
lightgbm,Light Gradient Boosting Machine,0.9405,0.9833,0.9833,0.9548,0.9652,0.7588,0.7645,0.034
ada,Ada Boost Classifier,0.9381,0.9483,0.9633,0.9667,0.9616,0.7748,0.7923,0.054
nb,Naive Bayes,0.9262,0.9233,0.9433,0.9667,0.9525,0.7924,0.802,0.022
lda,Linear Discriminant Analysis,0.9262,0.99,0.9633,0.9467,0.9527,0.7476,0.7591,0.015
knn,K Neighbors Classifier,0.9238,0.9533,0.9633,0.95,0.954,0.6993,0.7111,0.025
rf,Random Forest Classifier,0.9238,0.9633,0.9633,0.9514,0.9542,0.7388,0.7445,0.085
svm,SVM - Linear Kernel,0.9095,0.9633,0.9833,0.9214,0.947,0.6176,0.6291,0.016


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9571,0.98,0.9633,0.9833,0.9707,0.8872,0.9021,0.027


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9571,0.9733,0.9833,0.9714,0.9742,0.8588,0.8645,0.067


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9429,0.9633,0.9633,0.9714,0.9631,0.8284,0.8376,0.028


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9405,0.9833,0.9833,0.9548,0.9652,0.7588,0.7645,0.037


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.9381,0.9483,0.9633,0.9667,0.9616,0.7748,0.7923,0.05


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.9262,0.9233,0.9433,0.9667,0.9525,0.7924,0.802,0.016


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9262,0.99,0.9633,0.9467,0.9527,0.7476,0.7591,0.02


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9238,0.9533,0.9633,0.95,0.954,0.6993,0.7111,0.051


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9238,0.9633,0.9633,0.9514,0.9542,0.7388,0.7445,0.103


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.9095,0.9633,0.9833,0.9214,0.947,0.6176,0.6291,0.023


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.9095,0.99,0.9633,0.93,0.9436,0.6476,0.6591,0.018


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.9095,0.8267,0.9633,0.9381,0.9449,0.6284,0.6376,0.064


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.8929,0.8217,0.9433,0.9348,0.934,0.6084,0.6176,0.023


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.8119,0.5,1.0,0.8119,0.8953,0.0,0.0,0.016


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

In [26]:
midfieldplaymaker_df

Unnamed: 0,model,train_accuracy,test_accuracy,train_f1,test_f1,TT
0,Logistic Regression,1.0,1.0,1.0,1.0,0.026
1,Extra Trees Classifier,1.0,0.956522,1.0,0.949781,0.085
2,Extreme Gradient Boosting,0.989011,0.956522,0.989131,0.949781,0.029
3,Light Gradient Boosting Machine,1.0,0.956522,1.0,0.949781,0.034
4,Ada Boost Classifier,1.0,0.956522,1.0,0.949781,0.054
5,Naive Bayes,0.945055,1.0,0.946714,1.0,0.022
6,Linear Discriminant Analysis,0.989011,1.0,0.988881,1.0,0.015
7,K Neighbors Classifier,0.934066,0.956522,0.930538,0.949781,0.025
8,Random Forest Classifier,1.0,0.956522,1.0,0.949781,0.085
9,SVM - Linear Kernel,0.945055,1.0,0.947616,1.0,0.016


## Traditional Winger

In [27]:
import pandas as pd
from pycaret.classification import setup, compare_models, predict_model, finalize_model, pull
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import joblib

# Load the CSV file into a DataFrame
df = pd.read_csv('../data/players_df_sin_reco.csv')

# Select features (columns 7 to 20, 0-based index: 6 to 19)
X = df.iloc[:, 6:20]

# Select the 4th column from the end as the target
y = df.iloc[:, -4]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an empty DataFrame to store results
traditionalwinger_df = pd.DataFrame(columns=['model', 'train_accuracy', 'test_accuracy', 'train_f1', 'test_f1', 'TT'])

# Concatenate X_train and y_train into a DataFrame for PyCaret setup
df_target = pd.concat([X_train, y_train], axis=1)

# Setup PyCaret environment
clf = setup(data=df_target, target=y.name, session_id=42)

# Compare different models and store the results
best_model = compare_models()  # This returns the top model
compare_results = pull()  # Pulls all model results as a DataFrame

# Define the mapping from model names to PyCaret identifiers
model_mapping = {
    'K Neighbors Classifier': 'knn',
    'Random Forest Classifier': 'rf',
    'Logistic Regression': 'lr',
    'Decision Tree Classifier': 'dt',
    'Ada Boost Classifier': 'ada',
    'Gradient Boosting Classifier': 'gbc',
    'Naive Bayes': 'nb',
    'SVM - Linear Kernel': 'svm',
    'Extra Trees Classifier': 'et',
    'Light Gradient Boosting Machine': 'lightgbm',
    'Ridge Classifier': 'ridge',
    'Linear Discriminant Analysis': 'lda',
    'Extreme Gradient Boosting': 'xgboost',
    'Dummy Classifier': 'dummy'
}

# Loop over each model in the compare_results DataFrame
for idx, row in compare_results.iterrows():
    model_name = row['Model']
    testing_time = row['TT (Sec)']

    # Get the PyCaret identifier for the model
    model_identifier = model_mapping.get(model_name)

    # Instead of finalizing by the model name, select the model by identifier
    if model_identifier:
        best_model = compare_models(include=[model_identifier], n_select=1)

        # Finalize the model (train it on the entire dataset)
        final_model = finalize_model(best_model)

        # Predict on train and test sets
        predictions_train = predict_model(final_model, data=X_train)
        predictions_test = predict_model(final_model, data=X_test)

        # Calculate test accuracy and F1 score
        y_pred_test = predictions_test['prediction_label']
        test_accuracy = accuracy_score(y_test, y_pred_test)
        test_f1 = f1_score(y_test, y_pred_test, average='weighted')

        # Calculate train accuracy and F1 score
        y_pred_train = predictions_train['prediction_label']
        train_accuracy = accuracy_score(y_train, y_pred_train)
        train_f1 = f1_score(y_train, y_pred_train, average='weighted')

        # Append the metrics for each model to the results DataFrame
        new_row = pd.DataFrame({
            'model': model_name,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'train_f1': train_f1,
            'test_f1': test_f1,
            'TT': testing_time
        }, index=[0])

        traditionalwinger_df = pd.concat([traditionalwinger_df, new_row], ignore_index=True)


Unnamed: 0,Description,Value
0,Session id,42
1,Target,Class_Traditional Winger
2,Target type,Binary
3,Original data shape,"(91, 15)"
4,Transformed data shape,"(91, 15)"
5,Transformed train set shape,"(63, 15)"
6,Transformed test set shape,"(28, 15)"
7,Numeric features,14
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9548,1.0,0.9667,0.9417,0.9457,0.9082,0.9187,0.039
ridge,Ridge Classifier,0.8738,0.9583,0.7833,0.9167,0.8133,0.7227,0.7544,0.013
lda,Linear Discriminant Analysis,0.8738,0.9583,0.7833,0.9167,0.8133,0.7227,0.7544,0.014
xgboost,Extreme Gradient Boosting,0.8714,0.9292,0.8667,0.8333,0.83,0.7305,0.7506,0.036
ada,Ada Boost Classifier,0.8524,0.9625,0.9,0.7983,0.8195,0.7057,0.7363,0.047
gbc,Gradient Boosting Classifier,0.8262,0.9375,0.7,0.8667,0.7333,0.6145,0.6535,0.057
nb,Naive Bayes,0.8238,0.8896,0.8667,0.7733,0.7852,0.6472,0.6843,0.017
rf,Random Forest Classifier,0.8119,0.9042,0.75,0.8233,0.7538,0.6104,0.6395,0.096
et,Extra Trees Classifier,0.8095,0.9146,0.7333,0.7817,0.7229,0.5871,0.6162,0.112
dt,Decision Tree Classifier,0.8048,0.7833,0.7167,0.7833,0.7133,0.5698,0.6021,0.016


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9548,1.0,0.9667,0.9417,0.9457,0.9082,0.9187,0.04


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.8738,0.9583,0.7833,0.9167,0.8133,0.7227,0.7544,0.013


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.8738,0.9583,0.7833,0.9167,0.8133,0.7227,0.7544,0.013


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.8714,0.9292,0.8667,0.8333,0.83,0.7305,0.7506,0.025


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.8524,0.9625,0.9,0.7983,0.8195,0.7057,0.7363,0.066


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8262,0.9375,0.7,0.8667,0.7333,0.6145,0.6535,0.07


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.8238,0.8896,0.8667,0.7733,0.7852,0.6472,0.6843,0.018


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8119,0.9042,0.75,0.8233,0.7538,0.6104,0.6395,0.119


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8095,0.9146,0.7333,0.7817,0.7229,0.5871,0.6162,0.115


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.8048,0.7833,0.7167,0.7833,0.7133,0.5698,0.6021,0.016


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.8048,0.9292,0.9167,0.715,0.7795,0.6247,0.6668,0.016


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7476,0.8333,0.6667,0.6667,0.6433,0.4521,0.4742,0.033


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.7143,0.8646,0.5167,0.615,0.5162,0.346,0.3828,0.028


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.6381,0.5,0.0,0.0,0.0,0.0,0.0,0.015


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

In [28]:
traditionalwinger_df

Unnamed: 0,model,train_accuracy,test_accuracy,train_f1,test_f1,TT
0,Logistic Regression,1.0,1.0,1.0,1.0,0.039
1,Ridge Classifier,0.967033,1.0,0.96692,1.0,0.013
2,Linear Discriminant Analysis,0.967033,1.0,0.96692,1.0,0.014
3,Extreme Gradient Boosting,1.0,0.956522,1.0,0.956356,0.036
4,Ada Boost Classifier,1.0,0.956522,1.0,0.956356,0.047
5,Gradient Boosting Classifier,1.0,0.913043,1.0,0.913043,0.057
6,Naive Bayes,0.835165,0.913043,0.838123,0.91204,0.017
7,Random Forest Classifier,1.0,0.956522,1.0,0.956522,0.096
8,Extra Trees Classifier,1.0,0.913043,1.0,0.912714,0.112
9,Decision Tree Classifier,1.0,0.869565,1.0,0.869068,0.016


## Inverted Winger

In [29]:
import pandas as pd
from pycaret.classification import setup, compare_models, predict_model, finalize_model, pull
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import joblib

# Load the CSV file into a DataFrame
df = pd.read_csv('../data/players_df_sin_reco.csv')

# Select features (columns 7 to 20, 0-based index: 6 to 19)
X = df.iloc[:, 6:20]

# Select the 3rd column from the end as the target
y = df.iloc[:, -3]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an empty DataFrame to store results
invertedwinger_df = pd.DataFrame(columns=['model', 'train_accuracy', 'test_accuracy', 'train_f1', 'test_f1', 'TT'])

# Concatenate X_train and y_train into a DataFrame for PyCaret setup
df_target = pd.concat([X_train, y_train], axis=1)

# Setup PyCaret environment
clf = setup(data=df_target, target=y.name, session_id=42)

# Compare different models and store the results
best_model = compare_models()  # This returns the top model
compare_results = pull()  # Pulls all model results as a DataFrame

# Define the mapping from model names to PyCaret identifiers
model_mapping = {
    'K Neighbors Classifier': 'knn',
    'Random Forest Classifier': 'rf',
    'Logistic Regression': 'lr',
    'Decision Tree Classifier': 'dt',
    'Ada Boost Classifier': 'ada',
    'Gradient Boosting Classifier': 'gbc',
    'Naive Bayes': 'nb',
    'SVM - Linear Kernel': 'svm',
    'Extra Trees Classifier': 'et',
    'Light Gradient Boosting Machine': 'lightgbm',
    'Ridge Classifier': 'ridge',
    'Linear Discriminant Analysis': 'lda',
    'Extreme Gradient Boosting': 'xgboost',
    'Dummy Classifier': 'dummy'
}

# Loop over each model in the compare_results DataFrame
for idx, row in compare_results.iterrows():
    model_name = row['Model']
    testing_time = row['TT (Sec)']

    # Get the PyCaret identifier for the model
    model_identifier = model_mapping.get(model_name)

    # Instead of finalizing by the model name, select the model by identifier
    if model_identifier:
        best_model = compare_models(include=[model_identifier], n_select=1)

        # Finalize the model (train it on the entire dataset)
        final_model = finalize_model(best_model)

        # Predict on train and test sets
        predictions_train = predict_model(final_model, data=X_train)
        predictions_test = predict_model(final_model, data=X_test)

        # Calculate test accuracy and F1 score
        y_pred_test = predictions_test['prediction_label']
        test_accuracy = accuracy_score(y_test, y_pred_test)
        test_f1 = f1_score(y_test, y_pred_test, average='weighted')

        # Calculate train accuracy and F1 score
        y_pred_train = predictions_train['prediction_label']
        train_accuracy = accuracy_score(y_train, y_pred_train)
        train_f1 = f1_score(y_train, y_pred_train, average='weighted')

        # Append the metrics for each model to the results DataFrame
        new_row = pd.DataFrame({
            'model': model_name,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'train_f1': train_f1,
            'test_f1': test_f1,
            'TT': testing_time
        }, index=[0])

        invertedwinger_df = pd.concat([invertedwinger_df, new_row], ignore_index=True)


Unnamed: 0,Description,Value
0,Session id,42
1,Target,Class_Inverted Winger
2,Target type,Binary
3,Original data shape,"(91, 15)"
4,Transformed data shape,"(91, 15)"
5,Transformed train set shape,"(63, 15)"
6,Transformed test set shape,"(28, 15)"
7,Numeric features,14
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.969,0.9889,0.9667,0.975,0.9657,0.9387,0.9457,0.031
ridge,Ridge Classifier,0.9524,0.9889,0.9667,0.9417,0.9524,0.9053,0.9083,0.015
lda,Linear Discriminant Analysis,0.9524,0.9889,0.9667,0.9417,0.9524,0.9053,0.9083,0.015
et,Extra Trees Classifier,0.95,0.9833,0.9,1.0,0.94,0.9,0.9121,0.076
rf,Random Forest Classifier,0.919,0.9444,0.9,0.95,0.9114,0.8387,0.8578,0.089
gbc,Gradient Boosting Classifier,0.919,0.9556,0.9,0.925,0.9057,0.8362,0.8437,0.065
knn,K Neighbors Classifier,0.9167,0.9556,0.8667,0.975,0.8957,0.8333,0.8569,0.03
ada,Ada Boost Classifier,0.9048,0.9778,0.9333,0.8917,0.9038,0.8082,0.8228,0.049
xgboost,Extreme Gradient Boosting,0.8762,0.95,0.9,0.8517,0.8674,0.7545,0.7671,0.028
nb,Naive Bayes,0.869,0.9,0.8333,0.8917,0.8524,0.7362,0.7478,0.016


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.969,0.9889,0.9667,0.975,0.9657,0.9387,0.9457,0.044


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.9524,0.9889,0.9667,0.9417,0.9524,0.9053,0.9083,0.022


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9524,0.9889,0.9667,0.9417,0.9524,0.9053,0.9083,0.022


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.95,0.9833,0.9,1.0,0.94,0.9,0.9121,0.096


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.919,0.9444,0.9,0.95,0.9114,0.8387,0.8578,0.136


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.919,0.9556,0.9,0.925,0.9057,0.8362,0.8437,0.074


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9167,0.9556,0.8667,0.975,0.8957,0.8333,0.8569,0.025


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.9048,0.9778,0.9333,0.8917,0.9038,0.8082,0.8228,0.074


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.8762,0.95,0.9,0.8517,0.8674,0.7545,0.7671,0.031


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.869,0.9,0.8333,0.8917,0.8524,0.7362,0.7478,0.022


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.8548,0.9667,0.8333,0.8917,0.8238,0.7107,0.7435,0.016


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7952,0.8917,0.8,0.7433,0.7598,0.5907,0.5995,0.04


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.7762,0.775,0.7333,0.7833,0.7448,0.5474,0.5629,0.022


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.5214,0.5,0.0,0.0,0.0,0.0,0.0,0.021


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

In [30]:
invertedwinger_df

Unnamed: 0,model,train_accuracy,test_accuracy,train_f1,test_f1,TT
0,Logistic Regression,1.0,0.869565,1.0,0.870062,0.031
1,Ridge Classifier,0.978022,0.869565,0.978038,0.868548,0.015
2,Linear Discriminant Analysis,0.978022,0.869565,0.978038,0.868548,0.015
3,Extra Trees Classifier,1.0,0.869565,1.0,0.870062,0.076
4,Random Forest Classifier,1.0,0.826087,1.0,0.826087,0.089
5,Gradient Boosting Classifier,1.0,0.869565,1.0,0.869565,0.065
6,K Neighbors Classifier,0.934066,0.826087,0.934114,0.826087,0.03
7,Ada Boost Classifier,1.0,0.913043,1.0,0.913373,0.049
8,Extreme Gradient Boosting,1.0,0.869565,1.0,0.870062,0.028
9,Naive Bayes,0.879121,0.782609,0.879033,0.780914,0.016


## Goal Poacher

In [31]:
import pandas as pd
from pycaret.classification import setup, compare_models, predict_model, finalize_model, pull
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import joblib

# Load the CSV file into a DataFrame
df = pd.read_csv('../data/players_df_sin_reco.csv')

# Select features (columns 7 to 20, 0-based index: 6 to 19)
X = df.iloc[:, 6:20]

# Select the 2nd column from the end as the target
y = df.iloc[:, -2]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an empty DataFrame to store results
goalpoacher_df = pd.DataFrame(columns=['model', 'train_accuracy', 'test_accuracy', 'train_f1', 'test_f1', 'TT'])

# Concatenate X_train and y_train into a DataFrame for PyCaret setup
df_target = pd.concat([X_train, y_train], axis=1)

# Setup PyCaret environment
clf = setup(data=df_target, target=y.name, session_id=42)

# Compare different models and store the results
best_model = compare_models()  # This returns the top model
compare_results = pull()  # Pulls all model results as a DataFrame

# Define the mapping from model names to PyCaret identifiers
model_mapping = {
    'K Neighbors Classifier': 'knn',
    'Random Forest Classifier': 'rf',
    'Logistic Regression': 'lr',
    'Decision Tree Classifier': 'dt',
    'Ada Boost Classifier': 'ada',
    'Gradient Boosting Classifier': 'gbc',
    'Naive Bayes': 'nb',
    'SVM - Linear Kernel': 'svm',
    'Extra Trees Classifier': 'et',
    'Light Gradient Boosting Machine': 'lightgbm',
    'Ridge Classifier': 'ridge',
    'Linear Discriminant Analysis': 'lda',
    'Extreme Gradient Boosting': 'xgboost',
    'Dummy Classifier': 'dummy'
}

# Loop over each model in the compare_results DataFrame
for idx, row in compare_results.iterrows():
    model_name = row['Model']
    testing_time = row['TT (Sec)']

    # Get the PyCaret identifier for the model
    model_identifier = model_mapping.get(model_name)

    # Instead of finalizing by the model name, select the model by identifier
    if model_identifier:
        best_model = compare_models(include=[model_identifier], n_select=1)

        # Finalize the model (train it on the entire dataset)
        final_model = finalize_model(best_model)

        # Predict on train and test sets
        predictions_train = predict_model(final_model, data=X_train)
        predictions_test = predict_model(final_model, data=X_test)

        # Calculate test accuracy and F1 score
        y_pred_test = predictions_test['prediction_label']
        test_accuracy = accuracy_score(y_test, y_pred_test)
        test_f1 = f1_score(y_test, y_pred_test, average='weighted')

        # Calculate train accuracy and F1 score
        y_pred_train = predictions_train['prediction_label']
        train_accuracy = accuracy_score(y_train, y_pred_train)
        train_f1 = f1_score(y_train, y_pred_train, average='weighted')

        # Append the metrics for each model to the results DataFrame
        new_row = pd.DataFrame({
            'model': model_name,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'train_f1': train_f1,
            'test_f1': test_f1,
            'TT': testing_time
        }, index=[0])

        goalpoacher_df = pd.concat([goalpoacher_df, new_row], ignore_index=True)


Unnamed: 0,Description,Value
0,Session id,42
1,Target,Class_Goal Poacher
2,Target type,Binary
3,Original data shape,"(91, 15)"
4,Transformed data shape,"(91, 15)"
5,Transformed train set shape,"(63, 15)"
6,Transformed test set shape,"(28, 15)"
7,Numeric features,14
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.8905,0.9346,0.7833,0.8333,0.7767,0.7285,0.7531,0.031
et,Extra Trees Classifier,0.8905,0.9383,0.7333,0.8667,0.7733,0.7218,0.7446,0.107
lr,Logistic Regression,0.8571,0.9358,0.7667,0.8167,0.7767,0.6734,0.687,0.038
ridge,Ridge Classifier,0.8571,0.9167,0.7167,0.7833,0.73,0.6493,0.6652,0.016
lda,Linear Discriminant Analysis,0.8571,0.9167,0.7167,0.7833,0.73,0.6493,0.6652,0.019
qda,Quadratic Discriminant Analysis,0.8286,0.8333,0.5833,0.75,0.6371,0.5587,0.5807,0.022
rf,Random Forest Classifier,0.8071,0.9058,0.6167,0.7583,0.6557,0.5397,0.5641,0.086
svm,SVM - Linear Kernel,0.7952,0.8742,0.65,0.6817,0.6229,0.53,0.5684,0.017
ada,Ada Boost Classifier,0.7762,0.8667,0.7167,0.6083,0.6424,0.4949,0.5195,0.067
xgboost,Extreme Gradient Boosting,0.75,0.855,0.6333,0.625,0.609,0.4293,0.4466,0.034


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.8905,0.9346,0.7833,0.8333,0.7767,0.7285,0.7531,0.029


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8905,0.9383,0.7333,0.8667,0.7733,0.7218,0.7446,0.102


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8571,0.9358,0.7667,0.8167,0.7767,0.6734,0.687,0.029


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.8571,0.9167,0.7167,0.7833,0.73,0.6493,0.6652,0.027


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.8571,0.9167,0.7167,0.7833,0.73,0.6493,0.6652,0.018


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8071,0.9058,0.6167,0.7583,0.6557,0.5397,0.5641,0.103


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.7952,0.8742,0.65,0.6817,0.6229,0.53,0.5684,0.017


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.7762,0.8667,0.7167,0.6083,0.6424,0.4949,0.5195,0.083


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.75,0.855,0.6333,0.625,0.609,0.4293,0.4466,0.025


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.7452,0.8717,0.7167,0.5917,0.639,0.4468,0.4608,0.053


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7452,0.7808,0.5333,0.7,0.57,0.4009,0.4318,0.031


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.731,0.8633,0.7,0.6733,0.6371,0.44,0.4844,0.013


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.6833,0.6658,0.6167,0.5833,0.5657,0.3317,0.3524,0.013


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.6524,0.5,0.0,0.0,0.0,0.0,0.0,0.012


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

In [32]:
goalpoacher_df

Unnamed: 0,model,train_accuracy,test_accuracy,train_f1,test_f1,TT
0,K Neighbors Classifier,0.923077,0.869565,0.92279,0.864803,0.031
1,Extra Trees Classifier,1.0,0.913043,1.0,0.913043,0.107
2,Logistic Regression,1.0,0.956522,1.0,0.956687,0.038
3,Ridge Classifier,0.89011,0.869565,0.89011,0.870062,0.016
4,Linear Discriminant Analysis,0.89011,0.869565,0.89011,0.870062,0.019
5,Random Forest Classifier,1.0,0.826087,1.0,0.822636,0.086
6,SVM - Linear Kernel,0.912088,0.956522,0.910599,0.956183,0.017
7,Ada Boost Classifier,1.0,0.826087,1.0,0.826087,0.067
8,Extreme Gradient Boosting,1.0,0.913043,1.0,0.913043,0.034
9,Gradient Boosting Classifier,1.0,0.913043,1.0,0.913043,0.07


## Target Man

In [33]:
import pandas as pd
from pycaret.classification import setup, compare_models, predict_model, finalize_model, pull
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import joblib

# Load the CSV file into a DataFrame
df = pd.read_csv('../data/players_df_sin_reco.csv')

# Select features (columns 7 to 20, 0-based index: 6 to 19)
X = df.iloc[:, 6:20]

# Select the last column from the end as the target
y = df.iloc[:, -1]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an empty DataFrame to store results
targetman_df = pd.DataFrame(columns=['model', 'train_accuracy', 'test_accuracy', 'train_f1', 'test_f1', 'TT'])

# Concatenate X_train and y_train into a DataFrame for PyCaret setup
df_target = pd.concat([X_train, y_train], axis=1)

# Setup PyCaret environment
clf = setup(data=df_target, target=y.name, session_id=42)

# Compare different models and store the results
best_model = compare_models()  # This returns the top model
compare_results = pull()  # Pulls all model results as a DataFrame

# Define the mapping from model names to PyCaret identifiers
model_mapping = {
    'K Neighbors Classifier': 'knn',
    'Random Forest Classifier': 'rf',
    'Logistic Regression': 'lr',
    'Decision Tree Classifier': 'dt',
    'Ada Boost Classifier': 'ada',
    'Gradient Boosting Classifier': 'gbc',
    'Naive Bayes': 'nb',
    'SVM - Linear Kernel': 'svm',
    'Extra Trees Classifier': 'et',
    'Light Gradient Boosting Machine': 'lightgbm',
    'Ridge Classifier': 'ridge',
    'Linear Discriminant Analysis': 'lda',
    'Extreme Gradient Boosting': 'xgboost',
    'Dummy Classifier': 'dummy'
}

# Loop over each model in the compare_results DataFrame
for idx, row in compare_results.iterrows():
    model_name = row['Model']
    testing_time = row['TT (Sec)']

    # Get the PyCaret identifier for the model
    model_identifier = model_mapping.get(model_name)

    # Instead of finalizing by the model name, select the model by identifier
    if model_identifier:
        best_model = compare_models(include=[model_identifier], n_select=1)

        # Finalize the model (train it on the entire dataset)
        final_model = finalize_model(best_model)

        # Predict on train and test sets
        predictions_train = predict_model(final_model, data=X_train)
        predictions_test = predict_model(final_model, data=X_test)

        # Calculate test accuracy and F1 score
        y_pred_test = predictions_test['prediction_label']
        test_accuracy = accuracy_score(y_test, y_pred_test)
        test_f1 = f1_score(y_test, y_pred_test, average='weighted')

        # Calculate train accuracy and F1 score
        y_pred_train = predictions_train['prediction_label']
        train_accuracy = accuracy_score(y_train, y_pred_train)
        train_f1 = f1_score(y_train, y_pred_train, average='weighted')

        # Append the metrics for each model to the results DataFrame
        new_row = pd.DataFrame({
            'model': model_name,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'train_f1': train_f1,
            'test_f1': test_f1,
            'TT': testing_time
        }, index=[0])

        targetman_df = pd.concat([targetman_df, new_row], ignore_index=True)


Unnamed: 0,Description,Value
0,Session id,42
1,Target,Class_Target Man
2,Target type,Binary
3,Original data shape,"(91, 15)"
4,Transformed data shape,"(91, 15)"
5,Transformed train set shape,"(63, 15)"
6,Transformed test set shape,"(28, 15)"
7,Numeric features,14
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.969,1.0,1.0,0.955,0.9746,0.9362,0.9437,0.08
xgboost,Extreme Gradient Boosting,0.969,0.9889,1.0,0.955,0.9746,0.9362,0.9437,0.031
et,Extra Trees Classifier,0.9548,1.0,1.0,0.935,0.9635,0.9058,0.9168,0.07
knn,K Neighbors Classifier,0.9381,0.9681,1.0,0.91,0.9492,0.8725,0.8875,0.021
ada,Ada Boost Classifier,0.9214,0.9889,0.95,0.935,0.9349,0.832,0.8527,0.052
lr,Logistic Regression,0.919,0.9556,0.975,0.915,0.9353,0.8362,0.8592,0.022
gbc,Gradient Boosting Classifier,0.9048,0.9167,0.95,0.91,0.9206,0.8058,0.8289,0.049
lightgbm,Light Gradient Boosting Machine,0.8929,0.9722,0.9167,0.9017,0.9052,0.7808,0.7918,0.037
ridge,Ridge Classifier,0.8857,0.9556,0.9083,0.8967,0.8984,0.7625,0.7716,0.015
lda,Linear Discriminant Analysis,0.8857,0.9556,0.9083,0.8967,0.8984,0.7625,0.7716,0.015


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.969,1.0,1.0,0.955,0.9746,0.9362,0.9437,0.088


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.969,0.9889,1.0,0.955,0.9746,0.9362,0.9437,0.03


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9548,1.0,1.0,0.935,0.9635,0.9058,0.9168,0.073


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9381,0.9681,1.0,0.91,0.9492,0.8725,0.8875,0.036


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.9214,0.9889,0.95,0.935,0.9349,0.832,0.8527,0.052


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.919,0.9556,0.975,0.915,0.9353,0.8362,0.8592,0.022


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.9048,0.9167,0.95,0.91,0.9206,0.8058,0.8289,0.055


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8929,0.9722,0.9167,0.9017,0.9052,0.7808,0.7918,0.039


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.8857,0.9556,0.9083,0.8967,0.8984,0.7625,0.7716,0.018


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.8857,0.9556,0.9083,0.8967,0.8984,0.7625,0.7716,0.021


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.8762,0.875,0.95,0.8767,0.9006,0.7422,0.776,0.017


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.8714,0.8729,0.9083,0.8683,0.8861,0.7226,0.7348,0.017


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.869,0.9667,0.8583,0.935,0.8659,0.7334,0.7724,0.02


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.5881,0.5,1.0,0.5881,0.7382,0.0,0.0,0.016


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

In [34]:
targetman_df

Unnamed: 0,model,train_accuracy,test_accuracy,train_f1,test_f1,TT
0,Random Forest Classifier,1.0,0.73913,1.0,0.725064,0.08
1,Extreme Gradient Boosting,1.0,0.869565,1.0,0.857369,0.031
2,Extra Trees Classifier,1.0,0.782609,1.0,0.762281,0.07
3,K Neighbors Classifier,0.956044,0.869565,0.955833,0.857369,0.021
4,Ada Boost Classifier,1.0,0.826087,1.0,0.816709,0.052
5,Logistic Regression,1.0,0.869565,1.0,0.866525,0.022
6,Gradient Boosting Classifier,1.0,0.826087,1.0,0.826087,0.049
7,Light Gradient Boosting Machine,1.0,0.782609,1.0,0.762281,0.037
8,Ridge Classifier,0.934066,0.73913,0.934066,0.701449,0.015
9,Linear Discriminant Analysis,0.934066,0.73913,0.934066,0.701449,0.015
