# FOOTBALL ANALYSIS DATASET CLEANING

In [478]:
# Import required libraries

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib as mp
import os
import re

In [479]:
# Load the dataset to a pandas dataframe, specifying the encoding as 'latin-1' and delimiter as ';'

data = pd.read_csv('2021-2022 Football Player Stats.csv', encoding='latin-1', delimiter=';')
data.head()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Off,Crs,TklW,PKwon,PKcon,OG,Recov,AerWon,AerLost,AerWon%
0,1,Max Aarons,ENG,DF,Norwich City,Premier League,22.0,2000,34,32,...,0.03,1.41,1.16,0.0,0.06,0.03,5.53,0.47,1.59,22.7
1,2,Yunis Abdelhamid,MAR,DF,Reims,Ligue 1,34.0,1987,34,34,...,0.0,0.06,1.39,0.0,0.03,0.0,6.77,2.02,1.36,59.8
2,3,Salis Abdul Samed,GHA,MF,Clermont Foot,Ligue 1,22.0,2000,31,29,...,0.0,0.36,1.24,0.0,0.0,0.0,8.76,0.88,0.88,50.0
3,4,Laurent Abergel,FRA,MF,Lorient,Ligue 1,29.0,1993,34,34,...,0.03,0.79,2.23,0.0,0.0,0.0,8.87,0.43,0.43,50.0
4,5,Charles Abi,FRA,FW,Saint-Étienne,Ligue 1,22.0,2000,1,1,...,0.0,2.0,0.0,0.0,0.0,0.0,4.0,2.0,0.0,100.0


## Exploratory Data Analysis

In [480]:
# Check the number of rows and columns (respectively) in the dataset

data.shape

(2921, 143)

In [481]:
# List the columns in the dataset

data.columns
for column in data.columns:
    print(column)

Rk
Player
Nation
Pos
Squad
Comp
Age
Born
MP
Starts
Min
90s
Goals
Shots
SoT
SoT%
G/Sh
G/SoT
ShoDist
ShoFK
ShoPK
PKatt
PasTotCmp
PasTotAtt
PasTotCmp%
PasTotDist
PasTotPrgDist
PasShoCmp
PasShoAtt
PasShoCmp%
PasMedCmp
PasMedAtt
PasMedCmp%
PasLonCmp
PasLonAtt
PasLonCmp%
Assists
PasAss
Pas3rd
PPA
CrsPA
PasProg
PasAtt
PasLive
PasDead
PasFK
TB
PasPress
Sw
PasCrs
CK
CkIn
CkOut
CkStr
PasGround
PasLow
PasHigh
PaswLeft
PaswRight
PaswHead
TI
PaswOther
PasCmp
PasOff
PasOut
PasInt
PasBlocks
SCA
ScaPassLive
ScaPassDead
ScaDrib
ScaSh
ScaFld
ScaDef
GCA
GcaPassLive
GcaPassDead
GcaDrib
GcaSh
GcaFld
GcaDef
Tkl
TklWon
TklDef3rd
TklMid3rd
TklAtt3rd
TklDri
TklDriAtt
TklDri%
TklDriPast
Press
PresSucc
Press%
PresDef3rd
PresMid3rd
PresAtt3rd
Blocks
BlkSh
BlkShSv
BlkPass
Int
Tkl+Int
Clr
Err
Touches
TouDefPen
TouDef3rd
TouMid3rd
TouAtt3rd
TouAttPen
TouLive
DriSucc
DriAtt
DriSucc%
DriPast
DriMegs
Carries
CarTotDist
CarPrgDist
CarProg
Car3rd
CPA
CarMis
CarDis
RecTarg
Rec
Rec%
RecProg
CrdY
CrdR
2CrdY
Fls
Fld
Off
Crs


### Handling datatypes

In [482]:
# Check the datatypes in the dataset

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2921 entries, 0 to 2920
Columns: 143 entries, Rk to AerWon%
dtypes: float64(133), int64(5), object(5)
memory usage: 3.2+ MB


In [483]:
# Display the columns with object datatype

object_columns = data.select_dtypes(include='object')
object_columns

Unnamed: 0,Player,Nation,Pos,Squad,Comp
0,Max Aarons,ENG,DF,Norwich City,Premier League
1,Yunis Abdelhamid,MAR,DF,Reims,Ligue 1
2,Salis Abdul Samed,GHA,MF,Clermont Foot,Ligue 1
3,Laurent Abergel,FRA,MF,Lorient,Ligue 1
4,Charles Abi,FRA,FW,Saint-Étienne,Ligue 1
...,...,...,...,...,...
2916,Martín Zubimendi,ESP,MF,Real Sociedad,La Liga
2917,Szymon ?urkowski,POL,MF,Empoli,Serie A
2918,Martin Ødegaard,NOR,MF,Arsenal,Premier League
2919,Milan ?uri?,BIH,FW,Salernitana,Serie A


In [484]:
# Check the unique values of each column with an object datatype
columns_to_check = ['Player', 'Nation', 'Pos', 'Squad', 'Comp']

for column in columns_to_check:
    unique_values = data[column].unique()
    print(f"Column: {column}, Unique Values: {unique_values}")

Column: Player, Unique Values: ['Max Aarons' 'Yunis Abdelhamid' 'Salis Abdul Samed' ... 'Martin Ødegaard'
 'Milan ?uri?' 'Filip ?uri?i?']
Column: Nation, Unique Values: ['ENG' 'MAR' 'GHA' 'FRA' 'NGA' 'PER' 'ITA' 'GER' 'ARG' 'SCO' 'USA' 'ARM'
 'SUI' 'ESP' 'COL' 'BIH' 'CIV' 'NOR' 'REU' 'EQG' 'NED' 'AUT' 'CHI' 'GUF'
 'PAR' 'TUR' 'ROU' 'BRA' 'GAB' 'SEN' 'WAL' 'DEN' 'SWE' 'PAN' 'JAM' 'URU'
 'MEX' 'JPN' 'KVX' 'ALB' 'TOG' 'ALG' 'IRN' 'CRO' 'LUX' 'CMR' 'COD' 'GRN'
 'CZE' 'MKD' 'GAM' 'GUI' 'CPV' 'SUR' 'POL' 'SVN' 'SVK' 'BEL' 'ISL' 'GNB'
 'MLI' 'SRB' 'TUN' 'NZL' 'ECU' 'POR' 'NIR' 'RUS' 'IRL' 'VEN' 'ANG' 'ISR'
 'ZAM' 'CAN' 'MTQ' 'KOR' 'GRE' 'BEN' 'CRC' 'HON' 'EGY' 'MAD' 'FIN' 'CGO'
 'DOM' 'HUN' 'PHI' 'BUL' 'AUS' 'UKR' 'MNE' 'BFA' 'ZIM' 'SLE' 'CYP' 'CTA'
 'CHN' nan 'GEO' 'MOZ' 'RSA' 'CHA' 'UZB']
Column: Pos, Unique Values: ['DF' 'MF' 'FW' 'MFFW' 'FWMF' 'GK' 'DFMF' 'FWDF' 'MFDF' 'DFFW' 'GKMF']
Column: Squad, Unique Values: ['Norwich City' 'Reims' 'Clermont Foot' 'Lorient' 'Saint-Étienne'
 'Greuther

In [475]:
# Convert the 'Age' column to integer type

data['Age'] = data['Age'].astype(int)
data.head()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Off,Crs,TklW,PKwon,PKcon,OG,Recov,AerWon,AerLost,AerWon%
0,1,Max Aarons,ENG,DF,Norwich City,Premier League,22,2000,34,32,...,0.03,1.41,1.16,0.0,0.06,0.03,5.53,0.47,1.59,22.7
1,2,Yunis Abdelhamid,MAR,DF,Reims,Ligue 1,34,1987,34,34,...,0.0,0.06,1.39,0.0,0.03,0.0,6.77,2.02,1.36,59.8
2,3,Salis Abdul Samed,GHA,MF,Clermont Foot,Ligue 1,22,2000,31,29,...,0.0,0.36,1.24,0.0,0.0,0.0,8.76,0.88,0.88,50.0
3,4,Laurent Abergel,FRA,MF,Lorient,Ligue 1,29,1993,34,34,...,0.03,0.79,2.23,0.0,0.0,0.0,8.87,0.43,0.43,50.0
4,5,Charles Abi,FRA,FW,Saint-Étienne,Ligue 1,22,2000,1,1,...,0.0,2.0,0.0,0.0,0.0,0.0,4.0,2.0,0.0,100.0


In [None]:
# Identify the datatype of 'Player' column

data['Player'].dtype

### Handling duplicates and missing values

In [463]:
# Check for duplicates in the dataset

data.duplicated().sum()

0

In [464]:
# Check for missing values in the dataset

data.isna().sum().sum()

2

As shown above, there are no duplicate rows in the dataset. However, the dataset has two missing values. The next code will identify the row(s) with the missing values.

In [465]:
# Identify row(s) with missing values
rows_with_missing_values = df[df.isna().any(axis=1)]

# Display the row(s) with missing values
rows_with_missing_values

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Off,Crs,TklW,PKwon,PKcon,OG,Recov,AerWon,AerLost,AerWon%


As shown above, the two missing values are in one row. This row bears the details of Javier Llabrés who played for Mallorca in La Liga. The row does not contain the information on his country and age. Also note that his year of birth is being indicated as '0' in the 'Born' column. Since the information is available online, it shall be extracted and inputted manually into the dataset.

On research, it was found that Javier Llabrés is a Spanish footballer, and he played for Mallorca during the 2021-2022 football season. He was born on 11th September, 2002.  This can be found here: https://en.wikipedia.org/wiki/Javier_Llabr%C3%A9s

His age during the 2021-2022 season will be inputted as that is the period the dataset was captured. Since he was born in 2002, he would be 19 by 2021. Note that this same format was used to calculate the ages of other players in the dataset (that is, by subtracting the year of birth from 2021).

In [466]:
# Specify the row for Javier Llabrés
javier_row = data[data['Player'] == 'Javier Llabrés']

# Input the values for 'Nation' and 'Age', and set 'Born' to the correct value
data.loc[javier_row.index, 'Nation'] = 'ESP' # Spain is denoted as 'ESP' in the dataset
data.loc[javier_row.index, 'Age'] = 19
data.loc[javier_row.index, 'Born'] = 2002

javier_row

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Off,Crs,TklW,PKwon,PKcon,OG,Recov,AerWon,AerLost,AerWon%
1531,1532,Javier Llabrés,,FWMF,Mallorca,La Liga,,0,5,1,...,0.67,3.33,2.0,0.0,0.0,0.0,5.33,0.67,2.0,25.0


In [467]:
# Confirm that there are no more missing values in the dataset

data.isna().sum().sum()

0

In [443]:
# Investigating the values in each column on the dataset.

data.columns
for column in data.columns:
    print('column: {} - unique value: {}'.format(column, data[column].unique()))

column: Rk - unique value: [   1    2    3 ... 2919 2920 2921]
column: Player - unique value: ['Max Aarons' 'Yunis Abdelhamid' 'Salis Abdul Samed' ... 'Martin Ødegaard'
 'Milan ?uri?' 'Filip ?uri?i?']
column: Nation - unique value: ['ENG' 'MAR' 'GHA' 'FRA' 'NGA' 'PER' 'ITA' 'GER' 'ARG' 'SCO' 'USA' 'ARM'
 'SUI' 'ESP' 'COL' 'BIH' 'CIV' 'NOR' 'REU' 'EQG' 'NED' 'AUT' 'CHI' 'GUF'
 'PAR' 'TUR' 'ROU' 'BRA' 'GAB' 'SEN' 'WAL' 'DEN' 'SWE' 'PAN' 'JAM' 'URU'
 'MEX' 'JPN' 'KVX' 'ALB' 'TOG' 'ALG' 'IRN' 'CRO' 'LUX' 'CMR' 'COD' 'GRN'
 'CZE' 'MKD' 'GAM' 'GUI' 'CPV' 'SUR' 'POL' 'SVN' 'SVK' 'BEL' 'ISL' 'GNB'
 'MLI' 'SRB' 'TUN' 'NZL' 'ECU' 'POR' 'NIR' 'RUS' 'IRL' 'VEN' 'ANG' 'ISR'
 'ZAM' 'CAN' 'MTQ' 'KOR' 'GRE' 'BEN' 'CRC' 'HON' 'EGY' 'MAD' 'FIN' 'CGO'
 'DOM' 'HUN' 'PHI' 'BUL' 'AUS' 'UKR' 'MNE' 'BFA' 'ZIM' 'SLE' 'CYP' 'CTA'
 'CHN' 'GEO' 'MOZ' 'RSA' 'CHA' 'UZB']
column: Pos - unique value: ['DF' 'MF' 'FW' 'MFFW' 'FWMF' 'GK' 'DFMF' 'FWDF' 'MFDF' 'DFFW' 'GKMF']
column: Squad - unique value: ['Norwich City' '

In [444]:
# Identify players whose squad is 'Manchester City'

mancity_players = data[data['Squad'] == 'Manchester City']
mancity_players

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Off,Crs,TklW,PKwon,PKcon,OG,Recov,AerWon,AerLost,AerWon%
40,41,Nathan Aké,NED,DF,Manchester City,Premier League,27,1995,14,10,...,0.0,0.0,0.49,0.0,0.0,0.0,7.57,3.98,0.78,83.7
450,451,João Cancelo,POR,DF,Manchester City,Premier League,28,1994,36,36,...,0.06,2.87,1.5,0.0,0.0,0.0,9.72,1.36,1.25,52.1
507,508,Kayky Chagas,BRA,FW,Manchester City,Premier League,18,2003,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
646,647,Kevin De Bruyne,BEL,MF,Manchester City,Premier League,30,1991,30,25,...,0.2,4.12,0.57,0.0,0.0,0.0,7.1,0.61,1.18,34.1
662,663,Liam Delap,ENG,FW,Manchester City,Premier League,19,2003,1,0,...,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
705,706,Rúben Dias,POR,DF,Manchester City,Premier League,25,1997,29,27,...,0.07,0.19,1.01,0.0,0.0,0.0,8.13,1.99,1.35,59.6
799,800,Ederson,BRA,GK,Manchester City,Premier League,28,1993,37,37,...,0.0,0.0,0.0,0.0,0.03,0.0,3.27,0.0,0.0,0.0
801,802,CJ Egan-Riley,ENG,DF,Manchester City,Premier League,19,2003,1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
882,883,Fernandinho,BRA,MFDF,Manchester City,Premier League,37,1985,19,10,...,0.0,0.37,1.5,0.0,0.0,0.0,10.2,1.96,1.4,58.3
899,900,Phil Foden,ENG,FW,Manchester City,Premier League,22,2000,28,24,...,0.55,2.5,0.64,0.0,0.0,0.0,4.87,0.34,1.4,19.5


In [445]:
# Identify players whose squad is 'Chelsea'

chelsea_players = data[data['Squad'] == 'Chelsea']
chelsea_players

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Off,Crs,TklW,PKwon,PKcon,OG,Recov,AerWon,AerLost,AerWon%
71,72,Marcos Alonso,ESP,DF,Chelsea,Premier League,31,1990,28,25,...,0.12,2.66,1.16,0.0,0.04,0.0,8.96,2.45,1.08,69.4
123,124,Kepa Arrizabalaga,ESP,GK,Chelsea,Premier League,27,1994,4,4,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
155,156,César Azpilicueta,ESP,DF,Chelsea,Premier League,32,1989,27,24,...,0.09,1.61,1.35,0.0,0.04,0.0,7.35,2.26,1.48,60.5
207,208,Ross Barkley,ENG,MF,Chelsea,Premier League,28,1993,6,1,...,0.0,0.5,0.5,0.0,0.0,0.0,6.0,3.0,1.0,75.0
509,510,Trevoh Chalobah,ENG,DF,Chelsea,Premier League,22,1999,20,17,...,0.0,0.19,1.3,0.0,0.06,0.0,10.0,2.17,1.55,58.3
525,526,Ben Chilwell,ENG,DF,Chelsea,Premier League,25,1996,7,6,...,0.33,1.67,0.83,0.0,0.17,0.0,7.0,1.33,1.83,42.1
531,532,Andreas Christensen,DEN,DF,Chelsea,Premier League,26,1996,19,17,...,0.0,0.12,0.72,0.0,0.0,0.0,10.1,3.61,2.23,61.9
1144,1145,Kai Havertz,GER,FWMF,Chelsea,Premier League,22,1999,29,22,...,0.6,1.0,0.65,0.05,0.0,0.0,4.33,1.99,3.03,39.6
1208,1209,Callum Hudson-Odoi,ENG,FWMF,Chelsea,Premier League,21,2000,15,11,...,0.37,2.43,0.56,0.09,0.0,0.0,5.14,0.47,1.12,29.4
1264,1265,Reece James,ENG,DF,Chelsea,Premier League,22,1999,26,22,...,0.19,3.96,1.11,0.0,0.0,0.05,8.07,1.06,1.16,47.8


### Handling name errors

The name of one of Chelsea's players - Mateo Kovačić was wrongly spelt in the dataset. The next code will identify the row and it's details.

In [446]:
# Identify the row of Mateo Kovačić

data[data['Rk'] == 1428]

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Off,Crs,TklW,PKwon,PKcon,OG,Recov,AerWon,AerLost,AerWon%
1427,1428,Mateo Kova?i?,CRO,MF,Chelsea,Premier League,28,1994,25,16,...,0.0,0.29,2.31,0.0,0.0,0.0,9.36,0.29,0.52,35.7


The correct name is Mateo Kovačić. It seems the data collection system is not familiar with 'č' and 'ć' alphabets, hence the naming error. Since this player is from Croatia, it is possible to have these same errors on the names of other Croatian players, or even other players. The next code will display the rows of Croatian players.

In [447]:
# Identify the rows of Croatian players

data[data['Nation'] == 'CRO']

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Off,Crs,TklW,PKwon,PKcon,OG,Recov,AerWon,AerLost,AerWon%
160,161,Milan Badelj,CRO,MF,Genoa,Serie A,33,1989,34,34,...,0.0,0.32,1.39,0.03,0.0,0.0,12.6,1.33,1.33,50.0
220,221,Toma Bai?,CRO,MF,Lazio,Serie A,25,1996,29,9,...,0.0,0.92,0.73,0.0,0.0,0.0,6.51,0.92,0.92,50.0
221,222,Toma Bai?,CRO,MF,Bordeaux,Ligue 1,25,1996,3,3,...,0.0,3.04,1.3,0.0,0.0,0.0,5.65,0.0,0.43,0.0
275,276,Filip Benkovi?,CRO,DFFW,Udinese,Serie A,24,1997,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,30.0,0.0,100.0
333,334,Luka Bogdan,CRO,DF,Salernitana,Serie A,26,1996,6,4,...,0.0,0.45,0.91,0.0,0.0,0.0,5.0,1.82,1.59,53.3
361,362,Domagoj Bradari?,CRO,DFMF,Lille,Ligue 1,22,1999,15,2,...,0.0,3.48,1.52,0.0,0.0,0.0,9.13,1.74,3.26,34.8
372,373,Josip Brekalo,CRO,MF,Wolfsburg,Bundesliga,23,1998,1,0,...,0.0,10.0,0.0,0.0,0.0,0.0,25.0,5.0,0.0,100.0
373,374,Josip Brekalo,CRO,FWMF,Torino,Serie A,23,1998,32,26,...,0.2,3.07,0.24,0.0,0.0,0.0,6.61,0.79,1.65,32.3
383,384,Marcelo Brozovi?,CRO,MF,Inter,Serie A,29,1992,35,35,...,0.0,0.49,1.44,0.0,0.0,0.0,11.6,0.61,0.89,40.8
389,390,Ante Budimir,CRO,FW,Osasuna,La Liga,30,1991,28,20,...,0.83,0.26,0.57,0.05,0.0,0.0,4.15,6.11,6.17,49.8


In [487]:
# Condition to identify Croatian players
cro_players_condition = data['Nation'] == 'CRO'

# Split the 'Player' column into first name and last name based on space delimiter
data.loc[cro_players_condition, ['FirstName', 'LastName']] = data[cro_players_condition]['Player'].str.split(n=1, expand=True)

# Display the result
data[data['Nation'] == 'CRO']

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,TklW,PKwon,PKcon,OG,Recov,AerWon,AerLost,AerWon%,FirstName,LastName
160,161,Milan Badelj,CRO,MF,Genoa,Serie A,33.0,1989,34,34,...,1.39,0.03,0.0,0.0,12.6,1.33,1.33,50.0,,
220,221,Toma Bai?,CRO,MF,Lazio,Serie A,25.0,1996,29,9,...,0.73,0.0,0.0,0.0,6.51,0.92,0.92,50.0,,
221,222,Toma Bai?,CRO,MF,Bordeaux,Ligue 1,25.0,1996,3,3,...,1.3,0.0,0.0,0.0,5.65,0.0,0.43,0.0,,
275,276,Filip Benkovi?,CRO,DFFW,Udinese,Serie A,24.0,1997,2,0,...,0.0,0.0,0.0,0.0,10.0,30.0,0.0,100.0,,
333,334,Luka Bogdan,CRO,DF,Salernitana,Serie A,26.0,1996,6,4,...,0.91,0.0,0.0,0.0,5.0,1.82,1.59,53.3,,
361,362,Domagoj Bradari?,CRO,DFMF,Lille,Ligue 1,22.0,1999,15,2,...,1.52,0.0,0.0,0.0,9.13,1.74,3.26,34.8,,
372,373,Josip Brekalo,CRO,MF,Wolfsburg,Bundesliga,23.0,1998,1,0,...,0.0,0.0,0.0,0.0,25.0,5.0,0.0,100.0,,
373,374,Josip Brekalo,CRO,FWMF,Torino,Serie A,23.0,1998,32,26,...,0.24,0.0,0.0,0.0,6.61,0.79,1.65,32.3,,
383,384,Marcelo Brozovi?,CRO,MF,Inter,Serie A,29.0,1992,35,35,...,1.44,0.0,0.0,0.0,11.6,0.61,0.89,40.8,,
389,390,Ante Budimir,CRO,FW,Osasuna,La Liga,30.0,1991,28,20,...,0.57,0.05,0.0,0.0,4.15,6.11,6.17,49.8,,


In [None]:
# Remove unnecessary spaces in the first name and last name columns
data.loc[cro_players_condition, 'FirstName'] = data[cro_players_condition]['FirstName'].str.strip()
data.loc[cro_players_condition, 'LastName'] = data[cro_players_condition]['LastName'].str.strip()

# Replace '?' with 'ć' and '' with 'š' in both first and last names
data.loc[cro_players_condition, 'FirstName'] = data[cro_players_condition]['FirstName'].str.replace(r'\?', 'ć', regex=True)
data.loc[cro_players_condition, 'LastName'] = data[cro_players_condition]['LastName'].str.replace(r'\?', 'ć', regex=True)
data.loc[cro_players_condition, 'FirstName'] = data[cro_players_condition]['FirstName'].str.replace(r'', 'š', regex=True)
data.loc[cro_players_condition, 'LastName'] = data[cro_players_condition]['LastName'].str.replace(r'', 'š', regex=True)

# Concatenate the first and last names back into the 'Player' column
data.loc[cro_players_condition, 'Player'] = data[cro_players_condition]['FirstName'] + ' ' + data[cro_players_condition]['LastName']

# Drop the intermediate columns
data.loc[cro_players_condition, ['FirstName', 'LastName']] = None

# Display the result
data[data['Nation'] == 'CRO']

In [453]:
data['Player'].dtype

dtype('O')

In [452]:
# Replace '?' with 'ć' at the beginning or end of first and last names, and 'č' elsewhere
data.loc[cro_players_condition, 'FirstName'] = data[cro_players_condition]['FirstName'].str.replace(r'^\?', 'ć', regex=True)
data.loc[cro_players_condition, 'FirstName'] = data[cro_players_condition]['FirstName'].str.replace(r'\?$', 'ć', regex=True)
data.loc[cro_players_condition, 'FirstName'] = data[cro_players_condition]['FirstName'].str.replace(r'\?', 'č', regex=True)

data.loc[cro_players_condition, 'LastName'] = data[cro_players_condition]['LastName'].str.replace(r'^\?', 'ć', regex=True)
data.loc[cro_players_condition, 'LastName'] = data[cro_players_condition]['LastName'].str.replace(r'\?$', 'ć', regex=True)
data.loc[cro_players_condition, 'LastName'] = data[cro_players_condition]['LastName'].str.replace(r'\?', 'č', regex=True)

# Replace '' with 'š' in both first and last names
data.loc[cro_players_condition, 'FirstName'] = data[cro_players_condition]['FirstName'].str.replace(r'', 'š', regex=True)
data.loc[cro_players_condition, 'LastName'] = data[cro_players_condition]['LastName'].str.replace(r'', 'š', regex=True)

# Capitalize the first letter of each name
data.loc[cro_players_condition, 'FirstName'] = data[cro_players_condition]['FirstName'].str.capitalize()
data.loc[cro_players_condition, 'LastName'] = data[cro_players_condition]['LastName'].str.capitalize()

# Display the result
data[data['Nation'] == 'CRO']

AttributeError: Can only use .str accessor with string values!

In [None]:
# Concatenate the first and last names back into the 'Player' column
data.loc[cro_players_condition, 'Player'] = data[cro_players_condition]['FirstName'] + ' ' + data[cro_players_condition]['LastName']

# Drop the intermediate columns
data.loc[cro_players_condition, ['FirstName', 'LastName']] = None

# Display the result
data[data['Nation'] == 'CRO']

In [410]:
# Condition to identify Croatian players
cro_players_condition = data['Nation'] == 'CRO'

# Replace '?' with 'Ć' at the beginning of names, 'ć' at the end, and 'č' elsewhere
data.loc[cro_players_condition, 'Player'] = data[cro_players_condition]['Player'].str.replace(r'^\?', 'Ć', regex=True)
data.loc[cro_players_condition, 'Player'] = data[cro_players_condition]['Player'].str.replace(r'\?$', 'ć', regex=True)
data.loc[cro_players_condition, 'Player'] = data[cro_players_condition]['Player'].str.replace(r'\?', 'č', regex=True)

# Replace '' with 'Š' at the beginning of names and 'š' elsewhere
data.loc[cro_players_condition, 'Player'] = data[cro_players_condition]['Player'].str.replace(r'^', 'Š', regex=True)
data.loc[cro_players_condition, 'Player'] = data[cro_players_condition]['Player'].str.replace(r'', 'š', regex=True)

# Display the results
data[data['Nation'] == 'CRO']

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Off,Crs,TklW,PKwon,PKcon,OG,Recov,AerWon,AerLost,AerWon%
160,161,Milan Badelj,CRO,MF,Genoa,Serie A,33,1989,34,34,...,0.0,0.32,1.39,0.03,0.0,0.0,12.6,1.33,1.33,50.0
220,221,Toma Bašić,CRO,MF,Lazio,Serie A,25,1996,29,9,...,0.0,0.92,0.73,0.0,0.0,0.0,6.51,0.92,0.92,50.0
221,222,Toma Bašić,CRO,MF,Bordeaux,Ligue 1,25,1996,3,3,...,0.0,3.04,1.3,0.0,0.0,0.0,5.65,0.0,0.43,0.0
275,276,Filip Benković,CRO,DFFW,Udinese,Serie A,24,1997,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,30.0,0.0,100.0
333,334,Luka Bogdan,CRO,DF,Salernitana,Serie A,26,1996,6,4,...,0.0,0.45,0.91,0.0,0.0,0.0,5.0,1.82,1.59,53.3
361,362,Domagoj Bradarić,CRO,DFMF,Lille,Ligue 1,22,1999,15,2,...,0.0,3.48,1.52,0.0,0.0,0.0,9.13,1.74,3.26,34.8
372,373,Josip Brekalo,CRO,MF,Wolfsburg,Bundesliga,23,1998,1,0,...,0.0,10.0,0.0,0.0,0.0,0.0,25.0,5.0,0.0,100.0
373,374,Josip Brekalo,CRO,FWMF,Torino,Serie A,23,1998,32,26,...,0.2,3.07,0.24,0.0,0.0,0.0,6.61,0.79,1.65,32.3
383,384,Marcelo Brozović,CRO,MF,Inter,Serie A,29,1992,35,35,...,0.0,0.49,1.44,0.0,0.0,0.0,11.6,0.61,0.89,40.8
389,390,Ante Budimir,CRO,FW,Osasuna,La Liga,30,1991,28,20,...,0.83,0.26,0.57,0.05,0.0,0.0,4.15,6.11,6.17,49.8


In [330]:
# Update the correct name for Adrian Šemper and Šime Vrsaljko

# Replace the player's name
data['Player'] = data['Player'].replace('Adrian emper', 'Adrian Šemper')
data['Player'] = data['Player'].replace('ime Vrsaljko', 'Šime Vrsaljko')

# Display the results
data[data['Nation'] == 'CRO']

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Off,Crs,TklW,PKwon,PKcon,OG,Recov,AerWon,AerLost,AerWon%
160,161,Milan Badelj,CRO,MF,Genoa,Serie A,33,1989,34,34,...,0.0,0.32,1.39,0.03,0.0,0.0,12.6,1.33,1.33,50.0
220,221,Toma Bašić,CRO,MF,Lazio,Serie A,25,1996,29,9,...,0.0,0.92,0.73,0.0,0.0,0.0,6.51,0.92,0.92,50.0
221,222,Toma Bašić,CRO,MF,Bordeaux,Ligue 1,25,1996,3,3,...,0.0,3.04,1.3,0.0,0.0,0.0,5.65,0.0,0.43,0.0
275,276,Filip Benković,CRO,DFFW,Udinese,Serie A,24,1997,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,30.0,0.0,100.0
333,334,Luka Bogdan,CRO,DF,Salernitana,Serie A,26,1996,6,4,...,0.0,0.45,0.91,0.0,0.0,0.0,5.0,1.82,1.59,53.3
361,362,Domagoj Bradarić,CRO,DFMF,Lille,Ligue 1,22,1999,15,2,...,0.0,3.48,1.52,0.0,0.0,0.0,9.13,1.74,3.26,34.8
372,373,Josip Brekalo,CRO,MF,Wolfsburg,Bundesliga,23,1998,1,0,...,0.0,10.0,0.0,0.0,0.0,0.0,25.0,5.0,0.0,100.0
373,374,Josip Brekalo,CRO,FWMF,Torino,Serie A,23,1998,32,26,...,0.2,3.07,0.24,0.0,0.0,0.0,6.61,0.79,1.65,32.3
383,384,Marcelo Brozović,CRO,MF,Inter,Serie A,29,1992,35,35,...,0.0,0.49,1.44,0.0,0.0,0.0,11.6,0.61,0.89,40.8
389,390,Ante Budimir,CRO,FW,Osasuna,La Liga,30,1991,28,20,...,0.83,0.26,0.57,0.05,0.0,0.0,4.15,6.11,6.17,49.8


In [338]:
# List the names in the 'Player' column

player_names = data['Player'].unique
player_names

<bound method Series.unique of 0              Max Aarons
1        Yunis Abdelhamid
2       Salis Abdul Samed
3         Laurent Abergel
4             Charles Abi
              ...        
2916     Martín Zubimendi
2917     Szymon čurkowski
2918      Martin Ødegaard
2919          Milan čurić
2920        Filip čuričić
Name: Player, Length: 2921, dtype: object>

In [178]:
# Assuming 'data' is the name of your DataFrame
data.reset_index(drop=True, inplace=True)

In [179]:
data.head()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Off,Crs,TklW,PKwon,PKcon,OG,Recov,AerWon,AerLost,AerWon%
0,1,Max Aarons,ENG,DF,Norwich City,Premier League,22,2000,34,32,...,0.03,1.41,1.16,0.0,0.06,0.03,5.53,0.47,1.59,22.7
1,2,Yunis Abdelhamid,MAR,DF,Reims,Ligue 1,34,1987,34,34,...,0.0,0.06,1.39,0.0,0.03,0.0,6.77,2.02,1.36,59.8
2,3,Salis Abdul Samed,GHA,MF,Clermont Foot,Ligue 1,22,2000,31,29,...,0.0,0.36,1.24,0.0,0.0,0.0,8.76,0.88,0.88,50.0
3,4,Laurent Abergel,FRA,MF,Lorient,Ligue 1,29,1993,34,34,...,0.03,0.79,2.23,0.0,0.0,0.0,8.87,0.43,0.43,50.0
4,5,Charles Abi,FRA,FW,Saint-Étienne,Ligue 1,22,2000,1,1,...,0.0,2.0,0.0,0.0,0.0,0.0,4.0,2.0,0.0,100.0
