In [37]:
import os 
import kagglehub
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer

In [38]:
current_folder = os.getcwd()

pickle_dir = os.path.join(current_folder, "data")
if not os.path.exists(pickle_dir):
    os.makedirs(pickle_dir)
pickle_file = os.path.join(pickle_dir, "players_22.pkl")

if os.path.exists(pickle_file):
    print(f"The pickle file already exists at: {pickle_file}")
else:
    os.environ["KAGGLEHUB_CACHE"] = current_folder + "/kagglehub_cache"
    path = kagglehub.dataset_download(
        "stefanoleone992/fifa-22-complete-player-dataset"
    )
    print("Downloaded path: ", path)
    output_file = os.path.join(path, "players_22.csv")
    pd.read_csv(output_file).to_pickle(pickle_file)

df = pd.read_pickle(pickle_file)

The pickle file already exists at: f:\git-repositories\202511-tec-3d\data\players_22.pkl


In [39]:
df.head()

Unnamed: 0,sofifa_id,player_url,short_name,long_name,player_positions,overall,potential,value_eur,wage_eur,age,...,lcb,cb,rcb,rb,gk,player_face_url,club_logo_url,club_flag_url,nation_logo_url,nation_flag_url
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,"RW, ST, CF",93,93,78000000.0,320000.0,34,...,50+3,50+3,50+3,61+3,19+3,https://cdn.sofifa.net/players/158/023/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,https://cdn.sofifa.net/teams/1369/60.png,https://cdn.sofifa.net/flags/ar.png
1,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,ST,92,92,119500000.0,270000.0,32,...,60+3,60+3,60+3,61+3,19+3,https://cdn.sofifa.net/players/188/545/22_120.png,https://cdn.sofifa.net/teams/21/60.png,https://cdn.sofifa.net/flags/de.png,https://cdn.sofifa.net/teams/1353/60.png,https://cdn.sofifa.net/flags/pl.png
2,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",91,91,45000000.0,270000.0,36,...,53+3,53+3,53+3,60+3,20+3,https://cdn.sofifa.net/players/020/801/22_120.png,https://cdn.sofifa.net/teams/11/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1354/60.png,https://cdn.sofifa.net/flags/pt.png
3,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Júnior,"LW, CAM",91,91,129000000.0,270000.0,29,...,50+3,50+3,50+3,62+3,20+3,https://cdn.sofifa.net/players/190/871/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,,https://cdn.sofifa.net/flags/br.png
4,192985,https://sofifa.com/player/192985/kevin-de-bruy...,K. De Bruyne,Kevin De Bruyne,"CM, CAM",91,91,125500000.0,350000.0,30,...,69+3,69+3,69+3,75+3,21+3,https://cdn.sofifa.net/players/192/985/22_120.png,https://cdn.sofifa.net/teams/10/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1325/60.png,https://cdn.sofifa.net/flags/be.png


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19239 entries, 0 to 19238
Columns: 110 entries, sofifa_id to nation_flag_url
dtypes: float64(16), int64(44), object(50)
memory usage: 16.1+ MB


In [41]:
df.describe()

Unnamed: 0,sofifa_id,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_team_id,league_level,...,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed
count,19239.0,19239.0,19239.0,19165.0,19178.0,19239.0,19239.0,19239.0,19178.0,19178.0,...,19239.0,19239.0,19239.0,19239.0,19239.0,19239.0,19239.0,19239.0,19239.0,2132.0
mean,231468.086959,65.772182,71.07937,2850452.0,9017.989363,25.210822,181.299704,74.943032,50580.498123,1.354364,...,57.92983,46.601746,48.045584,45.9067,16.406102,16.192474,16.055356,16.229274,16.491814,36.439962
std,27039.717497,6.880232,6.086213,7613700.0,19470.176724,4.748235,6.863179,7.069434,54401.868535,0.747865,...,12.159326,20.200807,21.232718,20.755683,17.574028,16.839528,16.564554,17.059779,17.884833,10.751563
min,41.0,47.0,49.0,9000.0,500.0,16.0,155.0,49.0,1.0,1.0,...,12.0,4.0,5.0,5.0,2.0,2.0,2.0,2.0,2.0,15.0
25%,214413.5,61.0,67.0,475000.0,1000.0,21.0,176.0,70.0,479.0,1.0,...,50.0,29.0,28.0,25.0,8.0,8.0,8.0,8.0,8.0,27.0
50%,236543.0,66.0,71.0,975000.0,3000.0,25.0,181.0,75.0,1938.0,1.0,...,59.0,52.0,56.0,53.0,11.0,11.0,11.0,11.0,11.0,36.0
75%,253532.5,70.0,75.0,2000000.0,8000.0,29.0,186.0,80.0,111139.0,1.0,...,66.0,63.0,65.0,63.0,14.0,14.0,14.0,14.0,14.0,45.0
max,264640.0,93.0,95.0,194000000.0,350000.0,54.0,206.0,110.0,115820.0,5.0,...,96.0,93.0,93.0,92.0,91.0,92.0,93.0,92.0,90.0,65.0


In [42]:
features = [
    'value_eur','wage_eur','age',
    'height_cm','weight_kg','shooting','passing','dribbling'
]

In [43]:
df = df.dropna(subset=features)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17041 entries, 0 to 19238
Columns: 110 entries, sofifa_id to nation_flag_url
dtypes: float64(16), int64(44), object(50)
memory usage: 14.4+ MB


In [44]:
df.describe()

Unnamed: 0,sofifa_id,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_team_id,league_level,...,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed
count,17041.0,17041.0,17041.0,17041.0,17041.0,17041.0,17041.0,17041.0,17041.0,17041.0,...,17041.0,17041.0,17041.0,17041.0,17041.0,17041.0,17041.0,17041.0,17041.0,0.0
mean,232218.270055,65.930286,71.243061,2965082.0,9354.732703,25.065959,180.389179,74.089314,50445.974415,1.356376,...,60.019835,50.699137,52.315768,49.920427,10.346165,10.393815,10.359721,10.372279,10.331377,
std,25812.142254,6.773755,6.048461,7717024.0,19880.724994,4.605071,6.545547,6.696454,54392.633811,0.749833,...,10.262801,17.369045,18.449722,18.354884,3.014197,2.996654,3.038031,3.003834,3.033476,
min,41.0,47.0,49.0,15000.0,500.0,16.0,155.0,49.0,1.0,1.0,...,30.0,10.0,10.0,10.0,2.0,2.0,2.0,2.0,2.0,
25%,215270.0,62.0,67.0,525000.0,1000.0,21.0,176.0,70.0,479.0,1.0,...,53.0,37.0,37.0,34.0,8.0,8.0,8.0,8.0,8.0,
50%,236883.0,66.0,71.0,1000000.0,3000.0,25.0,180.0,74.0,1936.0,1.0,...,60.0,55.0,59.0,56.0,10.0,10.0,10.0,10.0,10.0,
75%,253586.0,70.0,75.0,2100000.0,9000.0,28.0,185.0,78.0,111138.0,1.0,...,67.0,64.0,66.0,64.0,13.0,13.0,13.0,13.0,13.0,
max,264640.0,93.0,95.0,194000000.0,350000.0,39.0,203.0,110.0,115820.0,5.0,...,96.0,93.0,93.0,92.0,32.0,33.0,38.0,33.0,37.0,


In [45]:

X = df[features]
y = df['overall']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
rmse = mean_squared_error(y_test, predictions)
print(f'Root mean squared error: {rmse:.2f}')

Root mean squared error: 0.38


2 

In [47]:
X.isna().mean()

value_eur    0.0
wage_eur     0.0
age          0.0
height_cm    0.0
weight_kg    0.0
shooting     0.0
passing      0.0
dribbling    0.0
dtype: float64

In [48]:
y.isna().mean()

np.float64(0.0)

In [49]:

pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('rf', RandomForestRegressor(random_state=42))
])

kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf 

KFold(n_splits=5, random_state=42, shuffle=True)

In [50]:

rmse_scorer = make_scorer(mean_squared_error)
rmse_scorer

make_scorer(mean_squared_error, response_method='predict')

In [51]:
scores = cross_val_score(pipeline, X, y, cv=kf, scoring=rmse_scorer)
scores

array([0.38084236, 0.38432116, 0.41362661, 0.36608169, 0.3953382 ])

In [52]:

print(f'5-fold CV RMSE: {np.mean(scores):.2f} ± {np.std(scores):.2f}')

5-fold CV RMSE: 0.39 ± 0.02


3

In [53]:
from sklearn.model_selection import GridSearchCV, KFold
import numpy as np

param_grid = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_features': ['log2', 'sqrt'],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_leaf': [1, 2, 4]
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

grid.fit(X, y)

best_rmse = np.sqrt(-grid.best_score_)
print(f'Best CV RMSE: {best_rmse:.2f}')
print('Best params:', grid.best_params_)


Best CV RMSE: 0.74
Best params: {'rf__max_depth': None, 'rf__max_features': 'log2', 'rf__min_samples_leaf': 1, 'rf__n_estimators': 300}


In [54]:
import pandas as pd

best_model = grid.best_estimator_.named_steps['rf']
importances = best_model.feature_importances_
importance_df = pd.DataFrame({
    'feature': features,
    'importance': importances
}).sort_values('importance', ascending=False)

print(importance_df)


     feature  importance
0  value_eur    0.452893
1   wage_eur    0.195875
7  dribbling    0.112164
2        age    0.107224
6    passing    0.087426
5   shooting    0.030199
4  weight_kg    0.009158
3  height_cm    0.005059
