In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/feature_engineered_data_v2.csv')

print(f"Dataset shape: {df.shape}")
print(f"\nFIFA version distribution:")
print(df['fifa_version'].value_counts().sort_index())
print(f"\nTarget distribution:")
print(df['big_potential'].value_counts())

Dataset shape: (41458, 17)

FIFA version distribution:
fifa_version
17.0    8889
18.0    8814
19.0    8777
20.0    8091
21.0    6887
Name: count, dtype: int64

Target distribution:
big_potential
0    31162
1    10296
Name: count, dtype: int64


In [3]:
# Define temporal splits
train_versions = [17.0, 18.0, 19.0,20.0]
val_version = 21.0
test_version = 21.0

# Split data by FIFA version
df_train = df[df['fifa_version'].isin(train_versions)]
df_val = df[df['fifa_version'] == val_version]
df_test = df[df['fifa_version'] == test_version]

print(f"Train (FIFA 17-20): {len(df_train)} samples")
print(f"Validation (FIFA 21): {len(df_val)} samples")
print(f"Test (FIFA 21): {len(df_test)} samples")

Train (FIFA 17-20): 34571 samples
Validation (FIFA 21): 6887 samples
Test (FIFA 21): 6887 samples


In [4]:
# All columns except fifa_version and big_potential are features
feature_columns = [col for col in df.columns if col not in ['fifa_version', 'big_potential']]
target = 'big_potential'

print(f"Number of features: {len(feature_columns)}")
print(f"Features: {feature_columns}")

# Prepare datasets
X_train = df_train[feature_columns]
y_train = df_train[target]

X_val = df_val[feature_columns]
y_val = df_val[target]

X_test = df_test[feature_columns]
y_test = df_test[target]

Number of features: 15
Features: ['age', 'mentality_interceptions', 'defending', 'defending_standing_tackle', 'defending_sliding_tackle', 'attacking_heading_accuracy', 'skill_dribbling', 'attacker_position', 'skill_ball_control', 'midfielder_position', 'attacking_finishing', 'attacking_volleys', 'attacking_short_passing', 'mentality_positioning', 'shooting']


In [5]:
from sklearn.model_selection import train_test_split
# Stratified split: 10% labeled, 90% unlabeled from training data
X_labeled, X_unlabeled, y_labeled, y_unlabeled_true = train_test_split(
    X_train, y_train, 
    test_size=0.8,  # 90% unlabeled
    stratify=y_train, 
    random_state=42
)

# Create unlabeled labels (-1 for semi-supervised algorithms)
y_unlabeled = np.full(len(y_unlabeled_true), -1)

# Combine for semi-supervised training
X_train_ssl = np.vstack([X_labeled.values, X_unlabeled.values])
y_train_ssl = np.concatenate([y_labeled.values, y_unlabeled])

print(f"Labeled samples: {len(y_labeled)} ({len(y_labeled)/len(y_train)*100:.1f}%)")
print(f"Unlabeled samples: {len(y_unlabeled)} ({len(y_unlabeled)/len(y_train)*100:.1f}%)")

Labeled samples: 6914 (20.0%)
Unlabeled samples: 27657 (80.0%)


In [6]:
print(type(y_train_ssl))
print(type(X_train_ssl))
X_val_array = X_val.values if hasattr(X_val, 'values') else X_val
print(type(X_val_array))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_ssl_scaled = scaler.fit_transform(X_train_ssl)
X_val_array_scaled = scaler.transform(X_val_array)
X_labeled_scaled = scaler.transform(X_labeled.to_numpy())

In [8]:
print("Class distribution across splits:")
print(f"Train total: {y_train.value_counts().to_dict()}")
print(f"Labeled only: {y_labeled.value_counts().to_dict()}")
print(f"Validation: {y_val.value_counts().to_dict()}")
print(f"Test: {y_test.value_counts().to_dict()}")

Class distribution across splits:
Train total: {0: 25968, 1: 8603}
Labeled only: {0: 5193, 1: 1721}
Validation: {0: 5194, 1: 1693}
Test: {0: 5194, 1: 1693}


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, 
    f1_score, roc_auc_score, accuracy_score,
    precision_score, recall_score
)
from xgboost import XGBClassifier as xgb


baseline = xgb(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    scale_pos_weight=3,    
    eval_metric="logloss",
    tree_method="hist",
    random_state=42
)
baseline.fit(X_labeled, y_labeled)
# Predictions on validation set
y_val_pred = baseline.predict(X_val)
y_val_proba = baseline.predict_proba(X_val)[:, 1]

# Calculate metrics
print("="*60)
print("BASELINE MODEL - Validation Set (FIFA 21) Metrics")
print("="*60)
print(f"\nAccuracy:  {accuracy_score(y_val, y_val_pred):.4f}")
print(f"Precision: {precision_score(y_val, y_val_pred):.4f}")
print(f"Recall:    {recall_score(y_val, y_val_pred):.4f}")
print(f"F1 Score:  {f1_score(y_val, y_val_pred):.4f}")
print(f"ROC-AUC:   {roc_auc_score(y_val, y_val_proba):.4f}")

print("\n" + "-"*60)
print("Classification Report:")
print("-"*60)
print(classification_report(y_val, y_val_pred, target_names=['No Big Potential', 'Big Potential']))

print("-"*60)
print("Confusion Matrix:")
print("-"*60)
cm = confusion_matrix(y_val, y_val_pred)
print(f"                  Predicted")
print(f"                  No BP    BP")
print(f"Actual No BP     {cm[0,0]:5d}  {cm[0,1]:5d}")
print(f"Actual BP        {cm[1,0]:5d}  {cm[1,1]:5d}")

BASELINE MODEL - Validation Set (FIFA 21) Metrics

Accuracy:  0.6851
Precision: 0.4085
Recall:    0.6273
F1 Score:  0.4948
ROC-AUC:   0.7245

------------------------------------------------------------
Classification Report:
------------------------------------------------------------
                  precision    recall  f1-score   support

No Big Potential       0.85      0.70      0.77      5194
   Big Potential       0.41      0.63      0.49      1693

        accuracy                           0.69      6887
       macro avg       0.63      0.67      0.63      6887
    weighted avg       0.74      0.69      0.70      6887

------------------------------------------------------------
Confusion Matrix:
------------------------------------------------------------
                  Predicted
                  No BP    BP
Actual No BP      3656   1538
Actual BP          631   1062


In [13]:
# Evaluate self-training model on validation set (FIFA 21)
from sklearn.metrics import (
    classification_report, confusion_matrix, 
    f1_score, roc_auc_score, accuracy_score,
    precision_score, recall_score
)

#self-Training classifier
from sklearn.semi_supervised import SelfTrainingClassifier
from xgboost import XGBClassifier as xgb

xgb_baseline = xgb(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    scale_pos_weight=3,    
    eval_metric="logloss",
    tree_method="hist",
    random_state=42
)

self_training = SelfTrainingClassifier(
    xgb_baseline,
    threshold=0.85,     # XGBoost gives good confidence scores
    max_iter=12,
    verbose=1
)

self_training.fit(X_train_ssl_scaled, y_train_ssl)
# Predictions on validation set
y_ssl_pred = self_training.predict(X_val_array_scaled)
y_ssl_proba = self_training.predict_proba(X_val_array_scaled)[:, 1]

# Calculate metrics
print("="*60)
print("SELF-TRAINING MODEL - Validation Set (FIFA 21) Metrics")
print("="*60)
print(f"\nAccuracy:  {accuracy_score(y_val, y_ssl_pred):.4f}")
print(f"Precision: {precision_score(y_val, y_ssl_pred):.4f}")
print(f"Recall:    {recall_score(y_val, y_ssl_pred):.4f}")
print(f"F1 Score:  {f1_score(y_val, y_ssl_pred):.4f}")
print(f"ROC-AUC:   {roc_auc_score(y_val, y_ssl_proba):.4f}")

print("\n" + "-"*60)
print("Classification Report:")
print("-"*60)
print(classification_report(y_val, y_ssl_pred, target_names=['No Big Potential', 'Big Potential']))

print("-"*60)
print("Confusion Matrix:")
print("-"*60)
cm = confusion_matrix(y_val, y_ssl_pred)
print(f"                  Predicted")
print(f"                  No BP    BP")
print(f"Actual No BP     {cm[0,0]:5d}  {cm[0,1]:5d}")
print(f"Actual BP        {cm[1,0]:5d}  {cm[1,1]:5d}")



End of iteration 1, added 3713 new labels.
End of iteration 2, added 2866 new labels.
End of iteration 3, added 2340 new labels.
End of iteration 4, added 1950 new labels.
End of iteration 5, added 1334 new labels.
End of iteration 6, added 912 new labels.
End of iteration 7, added 633 new labels.
End of iteration 8, added 415 new labels.
End of iteration 9, added 414 new labels.
End of iteration 10, added 407 new labels.
End of iteration 11, added 282 new labels.
End of iteration 12, added 240 new labels.
SELF-TRAINING MODEL - Validation Set (FIFA 21) Metrics

Accuracy:  0.6659
Precision: 0.3928
Recall:    0.6580
F1 Score:  0.4919
ROC-AUC:   0.7118

------------------------------------------------------------
Classification Report:
------------------------------------------------------------
                  precision    recall  f1-score   support

No Big Potential       0.86      0.67      0.75      5194
   Big Potential       0.39      0.66      0.49      1693

        accuracy    

In [11]:
y_labeled_pred = self_training.predict(X_labeled_scaled)
y_labeled_proba = self_training.predict_proba(X_labeled_scaled)[:, 1]

# Calculate metrics on original labeled data
print("="*60)
print("SELF-TRAINING MODEL - Training Set Metrics (Original Labeled Data)")
print("="*60)
print(f"\nLabeled samples: {len(y_labeled)}")
print(f"\nAccuracy:  {accuracy_score(y_labeled, y_labeled_pred):.4f}")
print(f"Precision: {precision_score(y_labeled, y_labeled_pred):.4f}")
print(f"Recall:    {recall_score(y_labeled, y_labeled_pred):.4f}")
print(f"F1 Score:  {f1_score(y_labeled, y_labeled_pred):.4f}")
print(f"ROC-AUC:   {roc_auc_score(y_labeled, y_labeled_proba):.4f}")

print("\n" + "-"*60)
print("Classification Report:")
print("-"*60)
print(classification_report(y_labeled, y_labeled_pred, target_names=['No Big Potential', 'Big Potential']))

print("-"*60)
print("Confusion Matrix:")
print("-"*60)
cm = confusion_matrix(y_labeled, y_labeled_pred)
print(f"                  Predicted")
print(f"                  No BP    BP")
print(f"Actual No BP     {cm[0,0]:5d}  {cm[0,1]:5d}")
print(f"Actual BP        {cm[1,0]:5d}  {cm[1,1]:5d}")

SELF-TRAINING MODEL - Training Set Metrics (Original Labeled Data)

Labeled samples: 6914

Accuracy:  0.7102
Precision: 0.4499
Recall:    0.7379
F1 Score:  0.5590
ROC-AUC:   0.7925

------------------------------------------------------------
Classification Report:
------------------------------------------------------------
                  precision    recall  f1-score   support

No Big Potential       0.89      0.70      0.78      5193
   Big Potential       0.45      0.74      0.56      1721

        accuracy                           0.71      6914
       macro avg       0.67      0.72      0.67      6914
    weighted avg       0.78      0.71      0.73      6914

------------------------------------------------------------
Confusion Matrix:
------------------------------------------------------------
                  Predicted
                  No BP    BP
Actual No BP      3640   1553
Actual BP          451   1270
