In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/feature_engineered_data.csv')

print(f"Dataset shape: {df.shape}")
print(f"\nFIFA version distribution:")
print(df['fifa_version'].value_counts().sort_index())
print(f"\nTarget distribution:")
print(df['big_potential'].value_counts())

Dataset shape: (41458, 29)

FIFA version distribution:
fifa_version
17.0    8889
18.0    8814
19.0    8777
20.0    8091
21.0    6887
Name: count, dtype: int64

Target distribution:
big_potential
0    31162
1    10296
Name: count, dtype: int64


In [2]:
# Define temporal splits
train_versions = [17.0, 18.0, 19.0,20.0]
val_version = 21.0
test_version = 21.0

# Split data by FIFA version
df_train = df[df['fifa_version'].isin(train_versions)]
df_val = df[df['fifa_version'] == val_version]
df_test = df[df['fifa_version'] == test_version]

print(f"Train (FIFA 17-20): {len(df_train)} samples")
print(f"Validation (FIFA 21): {len(df_val)} samples")
print(f"Test (FIFA 21): {len(df_test)} samples")

Train (FIFA 17-20): 34571 samples
Validation (FIFA 21): 6887 samples
Test (FIFA 21): 6887 samples


In [3]:
# All columns except fifa_version and big_potential are features
feature_columns = [col for col in df.columns if col not in ['fifa_version', 'big_potential']]
target = 'big_potential'

print(f"Number of features: {len(feature_columns)}")
print(f"Features: {feature_columns}")

# Prepare datasets
X_train = df_train[feature_columns]
y_train = df_train[target]

X_val = df_val[feature_columns]
y_val = df_val[target]

X_test = df_test[feature_columns]
y_test = df_test[target]

Number of features: 27
Features: ['age', 'physic', 'mentality_aggression', 'mentality_interceptions', 'power_stamina', 'power_strength', 'defending_marking_awareness', 'power_jumping', 'defending_standing_tackle', 'defending_sliding_tackle', 'attacking_heading_accuracy', 'mentality_composure', 'movement_reactions', 'skill_long_passing', 'skill_dribbling', 'skill_fk_accuracy', 'skill_ball_control', 'attacking_crossing', 'power_shot_power', 'attacking_finishing', 'skill_curve', 'movement_balance', 'attacking_volleys', 'power_long_shots', 'mentality_vision', 'mentality_penalties', 'movement_agility']


In [4]:
X_train

Unnamed: 0,age,physic,mentality_aggression,mentality_interceptions,power_stamina,power_strength,defending_marking_awareness,power_jumping,defending_standing_tackle,defending_sliding_tackle,...,attacking_crossing,power_shot_power,attacking_finishing,skill_curve,movement_balance,attacking_volleys,power_long_shots,mentality_vision,mentality_penalties,movement_agility
0,32,60.0,58,68,64,59,57,54,57,56,...,79,65,73,80,86,74,74,94,71,82
1,33,58.0,58,66,58,58,57,52,57,56,...,77,65,70,80,84,74,71,94,71,79
2,34,57.0,58,66,55,58,67,47,57,56,...,77,65,70,80,84,74,71,94,71,79
3,35,66.0,87,68,51,64,64,72,66,65,...,78,79,68,81,84,69,77,77,85,66
4,31,66.0,70,81,55,69,80,70,80,76,...,34,68,20,49,63,19,40,46,35,52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40815,18,49.0,51,61,60,43,58,55,63,58,...,60,26,23,33,77,28,23,37,39,59
40817,20,70.0,74,57,65,71,58,61,64,62,...,52,42,21,28,54,23,24,42,30,58
40819,18,51.0,32,42,59,53,40,61,43,42,...,39,44,41,35,52,31,39,47,41,65
40821,17,56.0,55,47,61,54,36,59,38,40,...,30,46,28,36,57,31,30,47,39,59


In [5]:

from sklearn.model_selection import train_test_split
# Stratified split: 10% labeled, 90% unlabeled from training data
X_labeled, X_unlabeled, y_labeled, y_unlabeled_true = train_test_split(
    X_train, y_train, 
    test_size=0.9,  # 90% unlabeled
    stratify=y_train, 
    random_state=42
)

# Create unlabeled labels (-1 for semi-supervised algorithms)
y_unlabeled = np.full(len(y_unlabeled_true), -1)

# Combine for semi-supervised training
X_train_ssl = np.vstack([X_labeled.values, X_unlabeled.values])
y_train_ssl = np.concatenate([y_labeled.values, y_unlabeled])

print(f"Labeled samples: {len(y_labeled)} ({len(y_labeled)/len(y_train)*100:.1f}%)")
print(f"Unlabeled samples: {len(y_unlabeled)} ({len(y_unlabeled)/len(y_train)*100:.1f}%)")

Labeled samples: 3457 (10.0%)
Unlabeled samples: 31114 (90.0%)


In [6]:
X_unlabeled

Unnamed: 0,age,physic,mentality_aggression,mentality_interceptions,power_stamina,power_strength,defending_marking_awareness,power_jumping,defending_standing_tackle,defending_sliding_tackle,...,attacking_crossing,power_shot_power,attacking_finishing,skill_curve,movement_balance,attacking_volleys,power_long_shots,mentality_vision,mentality_penalties,movement_agility
30317,20,75.0,76,74,86,68,74,83,76,69,...,52,51,48,52,84,33,46,71,51,79
34485,20,60.0,71,38,68,52,60,57,53,46,...,67,66,62,58,64,49,44,66,51,70
9879,26,74.0,73,62,81,72,57,65,65,62,...,75,74,67,76,82,60,68,70,63,77
11620,25,71.0,70,45,66,74,38,78,40,42,...,76,82,70,72,76,64,74,75,75,85
34520,23,75.0,75,67,74,75,72,71,71,68,...,21,34,21,25,35,24,18,33,34,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22779,21,64.0,61,60,75,60,61,63,64,63,...,61,36,32,34,80,31,31,49,35,72
9689,23,63.0,40,18,79,63,16,78,16,14,...,66,64,73,61,87,58,68,61,48,85
29897,28,54.0,29,31,69,59,33,36,32,24,...,67,56,71,41,77,45,42,69,60,78
37711,24,64.0,59,43,63,67,59,54,44,35,...,58,67,62,63,63,52,59,68,58,58


In [7]:
y_unlabeled

array([-1, -1, -1, ..., -1, -1, -1], shape=(31114,))

In [8]:
print(type(y_train_ssl))
print(type(X_train_ssl))


<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [9]:
# converting to numpy array
X_val_array = X_val.values if hasattr(X_val, 'values') else X_val
print(type(X_val_array))


<class 'numpy.ndarray'>


In [10]:
#apply z-score scaling 
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_ssl_scaled = scaler.fit_transform(X_train_ssl)
X_val_array_scaled = scaler.transform(X_val_array)
X_labeled_scaled = scaler.transform(X_labeled.to_numpy())



In [11]:
print("Class distribution across splits:")
print(f"Train total: {y_train.value_counts().to_dict()}")
print(f"Labeled only: {y_labeled.value_counts().to_dict()}")
print(f"Validation: {y_val.value_counts().to_dict()}")
print(f"Test: {y_test.value_counts().to_dict()}")

Class distribution across splits:
Train total: {0: 25968, 1: 8603}
Labeled only: {0: 2597, 1: 860}
Validation: {0: 5194, 1: 1693}
Test: {0: 5194, 1: 1693}


In [12]:
from sklearn.ensemble import RandomForestClassifier

baseline = RandomForestClassifier(class_weight='balanced', random_state=42)
baseline.fit(X_labeled, y_labeled)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
# Evaluate baseline model on validation set (FIFA 20)
from sklearn.metrics import (
    classification_report, confusion_matrix, 
    f1_score, roc_auc_score, accuracy_score,
    precision_score, recall_score
)

# Predictions on validation set
y_val_pred = baseline.predict(X_val)
y_val_proba = baseline.predict_proba(X_val)[:, 1]

# Calculate metrics
print("="*60)
print("BASELINE MODEL - Validation Set (FIFA 21) Metrics")
print("="*60)
print(f"\nAccuracy:  {accuracy_score(y_val, y_val_pred):.4f}")
print(f"Precision: {precision_score(y_val, y_val_pred):.4f}")
print(f"Recall:    {recall_score(y_val, y_val_pred):.4f}")
print(f"F1 Score:  {f1_score(y_val, y_val_pred):.4f}")
print(f"ROC-AUC:   {roc_auc_score(y_val, y_val_proba):.4f}")

print("\n" + "-"*60)
print("Classification Report:")
print("-"*60)
print(classification_report(y_val, y_val_pred, target_names=['No Big Potential', 'Big Potential']))

print("-"*60)
print("Confusion Matrix:")
print("-"*60)
cm = confusion_matrix(y_val, y_val_pred)
print(f"                  Predicted")
print(f"                  No BP    BP")
print(f"Actual No BP     {cm[0,0]:5d}  {cm[0,1]:5d}")
print(f"Actual BP        {cm[1,0]:5d}  {cm[1,1]:5d}")


BASELINE MODEL - Validation Set (FIFA 21) Metrics

Accuracy:  0.7642
Precision: 0.5827
Recall:    0.1435
F1 Score:  0.2303
ROC-AUC:   0.7147

------------------------------------------------------------
Classification Report:
------------------------------------------------------------
                  precision    recall  f1-score   support

No Big Potential       0.78      0.97      0.86      5194
   Big Potential       0.58      0.14      0.23      1693

        accuracy                           0.76      6887
       macro avg       0.68      0.56      0.55      6887
    weighted avg       0.73      0.76      0.71      6887

------------------------------------------------------------
Confusion Matrix:
------------------------------------------------------------
                  Predicted
                  No BP    BP
Actual No BP      5020    174
Actual BP         1450    243


In [51]:
# Evaluate self-training model on validation set (FIFA 21)
from sklearn.metrics import (
    classification_report, confusion_matrix, 
    f1_score, roc_auc_score, accuracy_score,
    precision_score, recall_score
)

#self-Training classifier
from sklearn.semi_supervised import SelfTrainingClassifier
from xgboost import XGBClassifier as xgb
from sklearn.calibration import CalibratedClassifierCV
 # self_training = SelfTrainingClassifier(
  # RandomForestClassifier(
   # n_estimators=200,
   # max_depth=10,
   # min_samples_split=5,
   # min_samples_leaf=4,
   # max_features="sqrt",
   # class_weight="balanced",
   # random_state=42,
   # n_jobs=-1
   # ),
   # threshold=0.9,
   # max_iter=3
#) 
xgb_baseline = xgb(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    scale_pos_weight=3,    
    eval_metric="logloss",
    tree_method="hist",
    random_state=42
)

self_training = SelfTrainingClassifier(
    xgb_baseline,
    threshold=0.90,     # XGBoost gives good confidence scores
    max_iter=16,
    verbose=1
)
self_training.fit(X_train_ssl_scaled, y_train_ssl)
# Predictions on validation set
y_ssl_pred = self_training.predict(X_val_array_scaled)
y_ssl_proba = self_training.predict_proba(X_val_array_scaled)[:, 1]

# Calculate metrics
print("="*60)
print("SELF-TRAINING MODEL - Validation Set (FIFA 21) Metrics")
print("="*60)
print(f"\nAccuracy:  {accuracy_score(y_val, y_ssl_pred):.4f}")
print(f"Precision: {precision_score(y_val, y_ssl_pred):.4f}")
print(f"Recall:    {recall_score(y_val, y_ssl_pred):.4f}")
print(f"F1 Score:  {f1_score(y_val, y_ssl_pred):.4f}")
print(f"ROC-AUC:   {roc_auc_score(y_val, y_ssl_proba):.4f}")

print("\n" + "-"*60)
print("Classification Report:")
print("-"*60)
print(classification_report(y_val, y_ssl_pred, target_names=['No Big Potential', 'Big Potential']))

print("-"*60)
print("Confusion Matrix:")
print("-"*60)
cm = confusion_matrix(y_val, y_ssl_pred)
print(f"                  Predicted")
print(f"                  No BP    BP")
print(f"Actual No BP     {cm[0,0]:5d}  {cm[0,1]:5d}")
print(f"Actual BP        {cm[1,0]:5d}  {cm[1,1]:5d}")



End of iteration 1, added 3865 new labels.
End of iteration 2, added 3009 new labels.
End of iteration 3, added 1744 new labels.
End of iteration 4, added 1793 new labels.
End of iteration 5, added 2218 new labels.
End of iteration 6, added 2307 new labels.
End of iteration 7, added 1933 new labels.
End of iteration 8, added 1310 new labels.
End of iteration 9, added 1056 new labels.
End of iteration 10, added 970 new labels.
End of iteration 11, added 690 new labels.
End of iteration 12, added 428 new labels.
End of iteration 13, added 451 new labels.
End of iteration 14, added 402 new labels.
End of iteration 15, added 321 new labels.
End of iteration 16, added 216 new labels.
SELF-TRAINING MODEL - Validation Set (FIFA 21) Metrics

Accuracy:  0.6572
Precision: 0.3855
Recall:    0.6645
F1 Score:  0.4880
ROC-AUC:   0.7152

------------------------------------------------------------
Classification Report:
------------------------------------------------------------
                  pr

In [52]:
y_labeled_pred = self_training.predict(X_labeled_scaled)
y_labeled_proba = self_training.predict_proba(X_labeled_scaled)[:, 1]

# Calculate metrics on original labeled data
print("="*60)
print("SELF-TRAINING MODEL - Training Set Metrics (Original Labeled Data)")
print("="*60)
print(f"\nLabeled samples: {len(y_labeled)}")
print(f"\nAccuracy:  {accuracy_score(y_labeled, y_labeled_pred):.4f}")
print(f"Precision: {precision_score(y_labeled, y_labeled_pred):.4f}")
print(f"Recall:    {recall_score(y_labeled, y_labeled_pred):.4f}")
print(f"F1 Score:  {f1_score(y_labeled, y_labeled_pred):.4f}")
print(f"ROC-AUC:   {roc_auc_score(y_labeled, y_labeled_proba):.4f}")

print("\n" + "-"*60)
print("Classification Report:")
print("-"*60)
print(classification_report(y_labeled, y_labeled_pred, target_names=['No Big Potential', 'Big Potential']))

print("-"*60)
print("Confusion Matrix:")
print("-"*60)
cm = confusion_matrix(y_labeled, y_labeled_pred)
print(f"                  Predicted")
print(f"                  No BP    BP")
print(f"Actual No BP     {cm[0,0]:5d}  {cm[0,1]:5d}")
print(f"Actual BP        {cm[1,0]:5d}  {cm[1,1]:5d}")

SELF-TRAINING MODEL - Training Set Metrics (Original Labeled Data)

Labeled samples: 3457

Accuracy:  0.7130
Precision: 0.4547
Recall:    0.7698
F1 Score:  0.5717
ROC-AUC:   0.8163

------------------------------------------------------------
Classification Report:
------------------------------------------------------------
                  precision    recall  f1-score   support

No Big Potential       0.90      0.69      0.78      2597
   Big Potential       0.45      0.77      0.57       860

        accuracy                           0.71      3457
       macro avg       0.68      0.73      0.68      3457
    weighted avg       0.79      0.71      0.73      3457

------------------------------------------------------------
Confusion Matrix:
------------------------------------------------------------
                  Predicted
                  No BP    BP
Actual No BP      1803    794
Actual BP          198    662
