In [1]:
from warnings import filterwarnings

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the basic goalkeeping table from CSV and preview its first five rows.
goal_keep = pd.read_csv(filepath_or_buffer='player_goalkeeping.csv')
goal_keep.head(n=5)

Unnamed: 0,Rk,Player,Pos,Squad,Age,Born,MP,Starts,Min,90s,...,D,L,CS,CS%,PKatt,PKA,PKsv,PKm,Save%.1,Matches
0,1,Altay Bayındır,GK,tr Türkiye,25,1998,1,1,90,1.0,...,0,1,0,0.0,0,0,0,0,,Matches
1,2,Koen Casteels,GK,be Belgium,31,1992,4,4,360,4.0,...,1,2,2,50.0,0,0,0,0,,Matches
2,3,Diogo Costa,GK,pt Portugal,24,1999,5,5,510,5.7,...,2,1,3,60.0,1,1,0,0,0.0,Matches
3,4,Gianluigi Donnarumma,GK,it Italy,24,1999,4,4,360,4.0,...,1,2,0,0.0,1,0,1,0,100.0,Matches
4,5,Martin Dúbravka,GK,sk Slovakia,35,1989,4,4,390,4.3,...,1,2,1,25.0,1,1,0,0,0.0,Matches


In [3]:
# (rows, columns) of the table as loaded.
goal_keep.shape

(29, 26)

In [4]:
# Trim leading/trailing whitespace from every column label, then list them.
goal_keep = goal_keep.rename(columns=str.strip)
goal_keep.columns

Index(['Rk', 'Player', 'Pos', 'Squad', 'Age', 'Born', 'MP', 'Starts', 'Min',
       '90s', 'GA', 'GA90', 'SoTA', 'Saves', 'Save%', 'W', 'D', 'L', 'CS',
       'CS%', 'PKatt', 'PKA', 'PKsv', 'PKm', 'Save%.1', 'Matches'],
      dtype='object')

In [5]:
# Remove columns that are not used anywhere in the analysis below.
# The result is assigned (no inplace=True) and errors='ignore' is passed so that
# re-running this cell after the columns are already gone does not raise KeyError.
goal_keep = goal_keep.drop(
    columns=['Rk', 'W', 'D', 'L', 'Save%.1', 'Matches'],
    errors='ignore',
)

In [6]:
# Column labels remaining after the drop.
goal_keep.columns

Index(['Player', 'Pos', 'Squad', 'Age', 'Born', 'MP', 'Starts', 'Min', '90s',
       'GA', 'GA90', 'SoTA', 'Saves', 'Save%', 'CS', 'CS%', 'PKatt', 'PKA',
       'PKsv', 'PKm'],
      dtype='object')

### Columns Description
    Player
    Pos: Position
    Squad: Country
    Age
    Born
    MP: Matches played
    Starts: Matches started
    Min: Total Minutes played
    90s: Minutes played divided by 90
    GA: Total goals conceded
    GA90: Goals conceded per 90 minutes
    SoTA: Shots on target against (faced)
    Saves: Total shots on target saved
    Save%: Percentage of shots on target saved (Saves/SoTA)*100
    CS: Clean sheets
    CS%: Percentage of matches played that ended in a clean sheet
    PKatt: Number of penalties faced
    PKA: Number of penalties allowed
    PKsv: Number of penalties saved
    PKm: Number of penalties missed by opposition

In [7]:
# Treat a missing clean-sheet percentage as 0.
goal_keep.loc[goal_keep['CS%'].isna(), 'CS%'] = 0

In [8]:
# Total number of missing cells across the whole frame (expected to be 0 here).
goal_keep.isna().sum(axis=0).sum()

0

In [9]:
# Strip the lowercase country-code prefix from Squad (e.g. 'tr Türkiye' -> 'Türkiye').
# The pattern only matches a short lowercase code followed by whitespace, so
# re-running the cell cannot eat the first word of a multi-word country name
# (the previous r'^\S+\s+' would).
# NOTE(review): assumes the prefix is always 2-3 lowercase ASCII letters — confirm.
goal_keep['Squad'] = goal_keep['Squad'].str.replace(r'^[a-z]{2,3}\s+', '', regex=True)

In [10]:
# Per-column dtype and non-null count summary.
goal_keep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  29 non-null     object 
 1   Pos     29 non-null     object 
 2   Squad   29 non-null     object 
 3   Age     29 non-null     int64  
 4   Born    29 non-null     int64  
 5   MP      29 non-null     int64  
 6   Starts  29 non-null     int64  
 7   Min     29 non-null     int64  
 8   90s     29 non-null     float64
 9   GA      29 non-null     int64  
 10  GA90    29 non-null     float64
 11  SoTA    29 non-null     int64  
 12  Saves   29 non-null     int64  
 13  Save%   29 non-null     float64
 14  CS      29 non-null     int64  
 15  CS%     29 non-null     float64
 16  PKatt   29 non-null     int64  
 17  PKA     29 non-null     int64  
 18  PKsv    29 non-null     int64  
 19  PKm     29 non-null     int64  
dtypes: float64(4), int64(13), object(3)
memory usage: 4.7+ KB


In [11]:
# Descriptive statistics, transposed so that each row is one column of the table.
goal_keep.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,29.0,28.896552,4.708236,21.0,25.0,28.0,33.0,37.0
Born,29.0,1994.275862,4.682006,1986.0,1990.0,1995.0,1998.0,2002.0
MP,29.0,3.551724,1.660168,1.0,3.0,4.0,4.0,7.0
Starts,29.0,3.517241,1.724212,0.0,3.0,4.0,4.0,7.0
Min,29.0,326.896552,165.677472,36.0,234.0,360.0,390.0,690.0
90s,29.0,3.627586,1.838263,0.4,2.6,4.0,4.3,7.7
GA,29.0,4.034483,2.026305,0.0,3.0,4.0,5.0,8.0
GA90,29.0,1.312759,0.802019,0.0,0.67,1.17,1.67,3.0
SoTA,29.0,14.517241,7.149529,2.0,11.0,14.0,17.0,38.0
Saves,29.0,10.724138,5.737698,1.0,8.0,11.0,13.0,30.0


In [12]:
# Work on an independent copy so the cleaned source table stays untouched.
gk = goal_keep.copy(deep=True)
gk.shape

(29, 20)

In [13]:
# Keep only goalkeepers with at least 180 minutes played (two full 90-minute matches),
# so the per-90 and percentage metrics are not driven by tiny samples.
# .copy() makes the result an independent frame; without it the column
# assignments that follow operate on a slice and can trigger SettingWithCopyWarning.
gk = gk[gk['Min'] >= 180].copy()
gk.shape

(24, 20)

In [14]:
# Derived rate metrics:
#   Saves_per90 / SoTA_per90 : saves and shots on target per '90s' unit
#   GA_perSoTA               : goals conceded per shot on target
#   PKsv%                    : share of penalties saved, in percent
#                              (NaN when PKatt is 0 because 0/0 is undefined)
gk = gk.assign(
    Saves_per90=gk['Saves'] / gk['90s'],
    SoTA_per90=gk['SoTA'] / gk['90s'],
    GA_perSoTA=gk['GA'] / gk['SoTA'],
    **{'PKsv%': gk['PKsv'] / gk['PKatt'] * 100},
)

In [15]:
# Four derived columns were added, so the column count grows by four.
gk.shape

(24, 24)

In [16]:
# Replace the undefined penalty save rate with 0.
# NOTE(review): this makes "faced no penalties" indistinguishable from
# "faced penalties and saved none" in the later PKsv_z score — confirm intended.
gk = gk.fillna(value={'PKsv%': 0})

In [17]:
# Total number of missing cells after the fill (expected to be 0).
gk.isna().sum(axis=0).sum()

0

In [18]:
# Column labels including the four derived metrics.
gk.columns

Index(['Player', 'Pos', 'Squad', 'Age', 'Born', 'MP', 'Starts', 'Min', '90s',
       'GA', 'GA90', 'SoTA', 'Saves', 'Save%', 'CS', 'CS%', 'PKatt', 'PKA',
       'PKsv', 'PKm', 'Saves_per90', 'SoTA_per90', 'GA_perSoTA', 'PKsv%'],
      dtype='object')

In [19]:
# Standardise the three shot-stopping inputs to z-scores so they can be combined
# on a common scale. StandardScaler is imported in the notebook's top import cell
# rather than mid-notebook.
# 'GAz' holds the raw z-score of GA_perSoTA at this point; the next cell negates
# it so that higher always means better.
metrics = ['Save%', 'GA_perSoTA', 'Saves_per90']
sc = StandardScaler().fit(gk[metrics])
gk[['Save_z', 'GAz', 'Saves90_z']] = sc.transform(gk[metrics])

In [20]:
# GA_perSoTA is a "lower is better" rate, so its z-score is negated to point the
# same way as the other z-scores (higher = better):
#   GK A: 0.15 -> concedes 1 goal every ~7 shots on target
#   GK B: 0.35 -> concedes 1 goal every ~3 shots on target
# Lower goals conceded per shot -> higher value; same information, flipped direction.
#
# The negated value is recomputed from the already-fitted scaler instead of
# flipping the stored column in place (gk['GAz'] = -gk['GAz']), which would
# toggle the sign back every time this cell is re-run.
gk['GAz'] = -sc.transform(gk[metrics])[:, metrics.index('GA_perSoTA')]

In [21]:
# Weighted blend of the three z-scores: 50% Save%, 30% negated GA per shot on
# target, 20% saves per 90.
gk['ShotStoppingScore'] = (
    gk['Save_z'].mul(0.5)
    .add(gk['GAz'].mul(0.3))
    .add(gk['Saves90_z'].mul(0.2))
)

    Save% (weight: 0.5)
    
    When shots are on target, how often does the keeper stop them?
    Core goalkeeper skill
    Most stable and widely trusted metric
    Heavily weighted because it directly measures outcomes
    
    That’s why it carries 50% of the score. It’s the backbone.


    GAz (Goals conceded per shot on target) (weight: 0.3)
    
    Given a shot on target, how rarely does it turn into a goal?
    GA_perSoTA = Goals Allowed / Shots on Target Against
    
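    This is intended to capture:
    shot quality faced
    positioning and reflexes
    ability to stop dangerous shots, not just easy ones
    
    Weighted at 30% because it adds context beyond raw saves.
    Caveat: almost every shot on target is either saved or conceded, so GA_perSoTA is typically close to 1 − Save%/100. This term therefore largely reinforces Save% rather than measuring shot quality independently.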


    Saves90_z (Saves per 90 minutes) (weight: 0.2)
    How much shot-stopping work does the keeper actually do?
    
    Reflects workload
    Prevents a low-volume keeper with a few easy saves from looking elite
    
    Weighted lower (20%) so volume doesn’t overpower efficiency.

In [22]:
# Rank keepers from best to worst ShotStoppingScore.
# reset_index() keeps the previous row labels as a new 'index' column.
gk = (
    gk.sort_values(by='ShotStoppingScore', ascending=False)
    .reset_index(drop=False)
)

In [23]:
# Plain-text preview of the top five ranked keepers plus the frame dimensions.
print(gk.head(n=5))
print('Rows and Columns:', gk.shape)

   index                Player Pos     Squad  Age  Born  MP  Starts  Min  90s  \
0     27       Anatolii Trubin  GK   Ukraine   22  2001   2       2  180  2.0   
1     15             Jan Oblak  GK  Slovenia   31  1993   4       4  390  4.3   
2     12  Giorgi Mamardashvili  GK   Georgia   23  2000   4       4  360  4.0   
3      1         Koen Casteels  GK   Belgium   31  1992   4       4  360  4.0   
4     11          Mike Maignan  GK    France   28  1995   6       6  570  6.3   

   ...  PKsv  PKm  Saves_per90  SoTA_per90  GA_perSoTA  PKsv%    Save_z  \
0  ...     0    0     4.000000    4.500000    0.111111    0.0  1.494056   
1  ...     1    0     3.255814    3.953488    0.117647  100.0  1.422320   
2  ...     0    0     7.500000    9.500000    0.210526    0.0  0.469268   
3  ...     0    0     3.250000    3.500000    0.142857    0.0  1.166123   
4  ...     0    0     2.380952    2.857143    0.166667    0.0  1.494056   

        GAz  Saves90_z  ShotStoppingScore  
0  1.731347   0.72

In [24]:
# Read the advanced goalkeeping table from CSV and preview its first five rows.
adv_goalkeep = pd.read_csv(filepath_or_buffer='player_advanced_goalkeeping.csv')
adv_goalkeep.head(n=5)

Unnamed: 0,Rk,Player,Pos,Squad,Age,Born,90s,GA,PKA,FK,...,Att.1,Launch%.1,AvgLen.1,Opp,Stp,Stp%,#OPA,#OPA/90,AvgDist,Matches
0,1,Altay Bayındır,GK,tr Türkiye,25,1998,1.0,3,0,0,...,2,0.0,7.5,8,2,25.0,0,0.0,12.2,Matches
1,2,Koen Casteels,GK,be Belgium,31,1992,4.0,2,0,0,...,35,34.3,34.7,48,4,8.3,4,1.0,13.8,Matches
2,3,Diogo Costa,GK,pt Portugal,24,1999,5.7,3,1,0,...,26,3.8,25.6,41,2,4.9,6,1.2,21.0,Matches
3,4,Gianluigi Donnarumma,GK,it Italy,24,1999,4.0,5,0,0,...,16,25.0,27.1,41,3,7.3,2,0.5,10.1,Matches
4,5,Martin Dúbravka,GK,sk Slovakia,35,1989,4.3,5,1,0,...,23,73.9,54.7,57,2,3.5,4,1.0,11.8,Matches


In [25]:
# Column labels of the advanced table as loaded.
adv_goalkeep.columns

Index(['Rk', 'Player', 'Pos', 'Squad', 'Age', 'Born', '90s', 'GA', 'PKA', 'FK',
       'CK', 'OG', 'PSxG', 'PSxG/SoT', 'PSxG+/-', '/90', 'Cmp', 'Att', 'Cmp%',
       'Att (GK)', 'Thr', 'Launch%', 'AvgLen', 'Att.1', 'Launch%.1',
       'AvgLen.1', 'Opp', 'Stp', 'Stp%', '#OPA', '#OPA/90', 'AvgDist',
       'Matches'],
      dtype='object')

In [26]:
# Discard the rank, birth-year and 'Matches' columns from the advanced table.
adv_goalkeep = adv_goalkeep.drop(
    labels=['Rk', 'Born', 'Matches'],
    axis='columns',
)

In [27]:
# Column labels after removing 'Rk', 'Born' and 'Matches'.
adv_goalkeep.columns

Index(['Player', 'Pos', 'Squad', 'Age', '90s', 'GA', 'PKA', 'FK', 'CK', 'OG',
       'PSxG', 'PSxG/SoT', 'PSxG+/-', '/90', 'Cmp', 'Att', 'Cmp%', 'Att (GK)',
       'Thr', 'Launch%', 'AvgLen', 'Att.1', 'Launch%.1', 'AvgLen.1', 'Opp',
       'Stp', 'Stp%', '#OPA', '#OPA/90', 'AvgDist'],
      dtype='object')

In [28]:
# Discard the advanced columns that are not carried forward.
# NOTE(review): '/90' is presumably the per-90 version of PSxG+/- — confirm
# before relying on the cumulative PSxG+/- alone.
adv_goalkeep = adv_goalkeep.drop(
    labels=[
        'Att', 'Att.1', 'Thr', 'AvgLen.1', 'Launch%.1', 'Opp',
        'FK', 'CK', 'OG', '/90', 'Att (GK)',
    ],
    axis='columns',
)

In [29]:
# Column labels kept for the advanced analysis.
adv_goalkeep.columns

Index(['Player', 'Pos', 'Squad', 'Age', '90s', 'GA', 'PKA', 'PSxG', 'PSxG/SoT',
       'PSxG+/-', 'Cmp', 'Cmp%', 'Launch%', 'AvgLen', 'Stp', 'Stp%', '#OPA',
       '#OPA/90', 'AvgDist'],
      dtype='object')

In [30]:
# Trim leading/trailing whitespace from every column label.
adv_goalkeep = adv_goalkeep.rename(columns=str.strip)

In [31]:
# Strip the lowercase country-code prefix from Squad (e.g. 'be Belgium' -> 'Belgium')
# so the values line up with the Squad key used for the merge below.
# The pattern only matches a short lowercase code followed by whitespace, so
# re-running the cell cannot eat the first word of a multi-word country name
# (the previous r'^\S+\s+' would).
# NOTE(review): assumes the prefix is always 2-3 lowercase ASCII letters — confirm.
adv_goalkeep['Squad'] = adv_goalkeep['Squad'].str.replace(r'^[a-z]{2,3}\s+', '', regex=True)

In [32]:
# Preview the cleaned advanced table.
adv_goalkeep.head()

Unnamed: 0,Player,Pos,Squad,Age,90s,GA,PKA,PSxG,PSxG/SoT,PSxG+/-,Cmp,Cmp%,Launch%,AvgLen,Stp,Stp%,#OPA,#OPA/90,AvgDist
0,Altay Bayındır,GK,Türkiye,25,1.0,3,0,1.6,0.54,-0.4,3,42.9,21.2,31.2,2,25.0,0,0.0,12.2
1,Koen Casteels,GK,Belgium,31,4.0,2,0,2.7,0.2,1.7,8,25.8,17.8,27.1,4,8.3,4,1.0,13.8
2,Diogo Costa,GK,Portugal,24,5.7,3,1,3.5,0.18,0.5,6,33.3,13.2,27.2,2,4.9,6,1.2,21.0
3,Gianluigi Donnarumma,GK,Italy,24,4.0,5,0,6.0,0.3,2.0,10,43.5,22.1,26.3,3,7.3,2,0.5,10.1
4,Martin Dúbravka,GK,Slovakia,35,4.3,5,1,5.0,0.25,0.0,18,29.5,31.9,30.1,2,3.5,4,1.0,11.8


In [33]:
# Missing-value count per column of the advanced table.
adv_goalkeep.isna().sum(axis=0)

Player      0
Pos         0
Squad       0
Age         0
90s         0
GA          0
PKA         0
PSxG        0
PSxG/SoT    0
PSxG+/-     0
Cmp         0
Cmp%        1
Launch%     0
AvgLen      0
Stp         0
Stp%        0
#OPA        0
#OPA/90     0
AvgDist     1
dtype: int64

In [34]:
# Attach the selected advanced metrics to the filtered/ranked keepers.
# A left join keeps every row of gk; keepers without a match would get NaN.
# validate='one_to_one' makes pandas raise MergeError if (Player, Squad) is
# duplicated on either side, instead of silently multiplying rows.
adv_gk = gk.merge(
    adv_goalkeep[[
        'Player', 'Squad', 'PSxG', 'PSxG+/-', '#OPA/90', 'Stp%', 'AvgDist', 'Cmp%', 'Launch%'
    ]],
    on=['Player', 'Squad'],
    how='left',
    validate='one_to_one',
)

In [35]:
# Row count should equal gk's row count if the merge did not duplicate any keeper.
adv_gk.shape

(24, 36)

In [36]:
# Column labels after the merge; the advanced metrics are appended at the end.
adv_gk.columns

Index(['index', 'Player', 'Pos', 'Squad', 'Age', 'Born', 'MP', 'Starts', 'Min',
       '90s', 'GA', 'GA90', 'SoTA', 'Saves', 'Save%', 'CS', 'CS%', 'PKatt',
       'PKA', 'PKsv', 'PKm', 'Saves_per90', 'SoTA_per90', 'GA_perSoTA',
       'PKsv%', 'Save_z', 'GAz', 'Saves90_z', 'ShotStoppingScore', 'PSxG',
       'PSxG+/-', '#OPA/90', 'Stp%', 'AvgDist', 'Cmp%', 'Launch%'],
      dtype='object')

### Newly Added Advanced Goal Keeping Columns
    
    PSxG (Post Shot Expected Goals): Expected goals after the shot has been taken, considering shot placement and power
    PSxG/SoT: Average post-shot expected goals per shot on target faced
    PSxG+/-: PSxG − Goals Against, own goals excluded [goals prevented (positive) or goals conceded above expectation (negative)]
    #OPA (Out of Penalty Area Actions): Number of defensive actions made outside the area
    #OPA/90: Number of defensive actions made outside the area per 90 minutes
    AvgDist (Average Distance from Goal): Average distance of GK actions from the goal line
    Stp (Crosses Stopped): Number of opponent crosses stopped or claimed
    Stp% (Cross Stop Percentage): Percentage of opponent crosses stopped
    Cmp% (Pass Completion %): Percentage of completed passes
    Launch%: Percentage of passes that are long launches
    AvgLen (Average Pass Length): Average length of GK passes

In [37]:
# Total missing cells after the merge; 0 means every keeper found a match
# with complete advanced metrics.
adv_gk.isna().sum(axis=0).sum()

0

In [38]:
# Dimensions before the advanced z-score columns are added.
adv_gk.shape

(24, 36)

In [39]:
# Standardise the six advanced inputs to z-scores with a separate scaler.
# NOTE(review): PSxG+/- looks like a tournament total rather than a per-90 rate,
# so it may favour keepers with more minutes — confirm.
adv_metrics = ['PSxG+/-', 'SoTA_per90', '#OPA/90', 'AvgDist', 'Stp%', 'PKsv%']
sc_adv = StandardScaler().fit(adv_gk[adv_metrics])
adv_gk[
    ['PSxG_z', 'SoTA90_z', 'OPA90_z', 'AvgDist_z', 'Stp_z', 'PKsv_z']
] = sc_adv.transform(adv_gk[adv_metrics])

In [40]:
# Column labels including the six new z-score columns.
adv_gk.columns

Index(['index', 'Player', 'Pos', 'Squad', 'Age', 'Born', 'MP', 'Starts', 'Min',
       '90s', 'GA', 'GA90', 'SoTA', 'Saves', 'Save%', 'CS', 'CS%', 'PKatt',
       'PKA', 'PKsv', 'PKm', 'Saves_per90', 'SoTA_per90', 'GA_perSoTA',
       'PKsv%', 'Save_z', 'GAz', 'Saves90_z', 'ShotStoppingScore', 'PSxG',
       'PSxG+/-', '#OPA/90', 'Stp%', 'AvgDist', 'Cmp%', 'Launch%', 'PSxG_z',
       'SoTA90_z', 'OPA90_z', 'AvgDist_z', 'Stp_z', 'PKsv_z'],
      dtype='object')

In [41]:
# Four component scores, each a fixed-weight blend of two z-scores.

# Goal Prevention: 70% PSxG+/- z-score, 30% negated GA-per-shot z-score.
adv_gk['GoalPrevention'] = (
    adv_gk['PSxG_z'].mul(0.7).add(adv_gk['GAz'].mul(0.3))
)

# Pressure Handling: 60% shots on target per 90, 40% saves per 90.
adv_gk['PressureHandling'] = (
    adv_gk['SoTA90_z'].mul(0.6).add(adv_gk['Saves90_z'].mul(0.4))
)

# Sweeper Control: 60% #OPA/90, 40% AvgDist.
adv_gk['SweeperControl'] = (
    adv_gk['OPA90_z'].mul(0.6).add(adv_gk['AvgDist_z'].mul(0.4))
)

# Chaos Control: 60% Stp%, 40% PKsv%.
adv_gk['ChaosControl'] = (
    adv_gk['Stp_z'].mul(0.6).add(adv_gk['PKsv_z'].mul(0.4))
)

In [42]:
# Overall index: 40% goal prevention, 25% pressure handling,
# 20% sweeper control, 15% chaos control.
adv_gk['WC_GK_Index'] = (
    adv_gk['GoalPrevention'].mul(0.40)
    .add(adv_gk['PressureHandling'].mul(0.25))
    .add(adv_gk['SweeperControl'].mul(0.20))
    .add(adv_gk['ChaosControl'].mul(0.15))
)

In [43]:
# Rank by WC_GK_Index (best first) and keep the top ten keepers.
# Because an 'index' column already exists, reset_index() stores the previous
# row labels in a new 'level_0' column.
adv_gk = (
    adv_gk.sort_values(by='WC_GK_Index', ascending=False)
    .reset_index(drop=False)
    .iloc[:10]
)

In [44]:
# Column labels of the top-ten frame, including the reset_index bookkeeping columns.
adv_gk.columns

Index(['level_0', 'index', 'Player', 'Pos', 'Squad', 'Age', 'Born', 'MP',
       'Starts', 'Min', '90s', 'GA', 'GA90', 'SoTA', 'Saves', 'Save%', 'CS',
       'CS%', 'PKatt', 'PKA', 'PKsv', 'PKm', 'Saves_per90', 'SoTA_per90',
       'GA_perSoTA', 'PKsv%', 'Save_z', 'GAz', 'Saves90_z',
       'ShotStoppingScore', 'PSxG', 'PSxG+/-', '#OPA/90', 'Stp%', 'AvgDist',
       'Cmp%', 'Launch%', 'PSxG_z', 'SoTA90_z', 'OPA90_z', 'AvgDist_z',
       'Stp_z', 'PKsv_z', 'GoalPrevention', 'PressureHandling',
       'SweeperControl', 'ChaosControl', 'WC_GK_Index'],
      dtype='object')

In [46]:
# Remove the bookkeeping columns left behind by the two reset_index() calls.
# errors='ignore' keeps the cell re-runnable: once the columns are gone (or if an
# upstream reset_index is changed to drop=True) the drop is simply a no-op
# instead of raising KeyError.
adv_gk = adv_gk.drop(columns=['level_0', 'index'], errors='ignore')

In [49]:
# Write the top-ten table to CSV without the row index.
adv_gk.to_csv(path_or_buf='gk7_uefa.csv', index=False)

***The World Cup GK Index weights goal prevention most heavily, while also accounting for pressure handling, sweeper ability,
and knockout-specific chaos control. This aligns goalkeeper evaluation with tournament-specific demands rather than league performance.***