In [1]:
# Basic import
import numpy as np
import pandas as pd

# Plotting figures
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# Display file path
import os
for dirname, _, filenames in os.walk('Data/fifa19'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

Data/fifa19/data.csv


In [2]:
# Read the data
%time data = pd.read_csv('Data/fifa19/data.csv')
data.shape

CPU times: user 138 ms, sys: 9.08 ms, total: 147 ms
Wall time: 157 ms


(18207, 89)

### Pre-processing


In [3]:
# Trim down useless columns
data = data.drop(columns=['Unnamed: 0', 'Photo', 'Club Logo','Flag',
                          'Real Face','Loaned From','Special', 
                          'Release Clause', 'LS', 'ST', 'RS','LW', 
                          'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 
                          'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM',
                          'LWB', 'LDM', 'CDM', 'RDM', 'RWB',
                          'LB', 'LCB', 'CB', 'RCB', 'RB'])

In [4]:
# Check for NULL value
data.isnull().sum()

ID                             0
Name                           0
Age                            0
Nationality                    0
Overall                        0
Potential                      0
Club                         241
Value                          0
Wage                           0
Preferred Foot                48
International Reputation      48
Weak Foot                     48
Skill Moves                   48
Work Rate                     48
Body Type                     48
Position                      60
Jersey Number                 60
Joined                      1553
Contract Valid Until         289
Height                        48
Weight                        48
Crossing                      48
Finishing                     48
HeadingAccuracy               48
ShortPassing                  48
Volleys                       48
Dribbling                     48
Curve                         48
FKAccuracy                    48
LongPassing                   48
BallContro

In [5]:
# Remove not filled rows
data = data[~data['Height'].isnull()]
data = data[~data['Position'].isnull()]

In [6]:
# Dataframe for players' basic information
info = data[['ID', 'Name', 'Nationality', 'Position', 'Age', 
                    'Club', 'Jersey Number', 'Joined', 
                    'Contract Valid Until', 'Value', 'Wage',
                    'International Reputation']]
info.sample(n=5)

Unnamed: 0,ID,Name,Nationality,Position,Age,Club,Jersey Number,Joined,Contract Valid Until,Value,Wage,International Reputation
9137,242768,G. Kharaishvili,Georgia,LW,21,IFK Göteborg,22.0,"Feb 23, 2018",2022,€950K,€1K,1.0
8244,201352,A. Iacobucci,Italy,GK,27,Frosinone,91.0,"Aug 7, 2018",2020,€600K,€4K,1.0
2908,212350,L. Diony,France,LM,25,AS Saint-Étienne,9.0,"Jul 7, 2017",2021,€5M,€26K,2.0
3094,227789,Kim Seung Joon,Korea Republic,RW,23,Ulsan Hyundai FC,19.0,"Jan 1, 2015",2019,€4.9M,€7K,1.0
7585,232691,Kim Min Hyeok,Korea Republic,CB,26,Sagan Tosu,5.0,"Jan 1, 2014",2021,€925K,€2K,1.0


In [17]:
info.isnull().sum()

ID                          0
Name                        0
Nationality                 0
Position                    0
Age                         0
Club                        0
Jersey Number               0
Joined                      0
Contract Valid Until        0
Value                       0
Wage                        0
International Reputation    0
dtype: int64

In [13]:
# Fill in the missing values
info['Club'].fillna('No Club', inplace=True)
info['Contract Valid Until'].fillna('2019', inplace=True)
info['Joined'].fillna('2019', inplace=True)
info['Joined'] = pd.DatetimeIndex(info['Joined']).year

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [14]:
info.head()

Unnamed: 0,ID,Name,Nationality,Position,Age,Club,Jersey Number,Joined,Contract Valid Until,Value,Wage,International Reputation
0,158023,L. Messi,Argentina,RF,31,FC Barcelona,10.0,2004,2021,€110.5M,€565K,5.0
1,20801,Cristiano Ronaldo,Portugal,ST,33,Juventus,7.0,2018,2022,€77M,€405K,5.0
2,190871,Neymar Jr,Brazil,LW,26,Paris Saint-Germain,10.0,2017,2022,€118.5M,€290K,5.0
3,193080,De Gea,Spain,GK,27,Manchester United,1.0,2011,2020,€72M,€260K,4.0
4,192985,K. De Bruyne,Belgium,RCM,27,Manchester City,7.0,2015,2023,€102M,€355K,4.0


In [16]:
# Dataframe for players' attributes
attrib = data[['Name', 'Position', 'Overall', 'Potential', 
                      'Weak Foot', 'Skill Moves', 'Work Rate',
                      'Body Type', 'Height', 'Weight', 'Crossing', 
                      'Finishing','HeadingAccuracy', 'ShortPassing',
                      'Volleys', 'Dribbling', 'Curve', 'FKAccuracy',
                      'LongPassing', 'BallControl', 'Acceleration',
                      'SprintSpeed', 'Agility', 'Reactions', 
                      'Balance', 'ShotPower','Jumping', 'Stamina', 
                      'Strength', 'LongShots', 'Aggression',
                      'Interceptions', 'Positioning', 'Vision', 
                      'Penalties', 'Composure','Marking', 
                      'StandingTackle', 'SlidingTackle', 'GKDiving',
                      'GKHandling','GKKicking', 'GKPositioning', 
                      'GKReflexes']]
attrib.sample(n=5)

Unnamed: 0,Name,Position,Overall,Potential,Weak Foot,Skill Moves,Work Rate,Body Type,Height,Weight,...,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes
13829,J. Kitolano,LWB,62,77,3.0,2.0,High/ Medium,Lean,5'9,143lbs,...,24.0,56.0,54.0,58.0,55.0,12.0,12.0,7.0,6.0,14.0
9532,J. Norwood,RS,66,66,3.0,2.0,High/ Medium,Normal,5'9,161lbs,...,47.0,53.0,40.0,25.0,16.0,11.0,14.0,8.0,10.0,12.0
28,J. Rodríguez,LAM,88,89,3.0,4.0,Medium/ Medium,Normal,5'11,172lbs,...,81.0,87.0,52.0,41.0,44.0,15.0,15.0,15.0,5.0,14.0
2293,B. Ecuélé Manga,RB,74,74,3.0,2.0,Medium/ High,Normal,6'1,168lbs,...,46.0,67.0,70.0,75.0,74.0,12.0,16.0,8.0,14.0,7.0
1911,A. Flint,LCB,75,76,3.0,2.0,Medium/ High,Lean,6'6,183lbs,...,38.0,60.0,74.0,70.0,74.0,13.0,10.0,8.0,10.0,11.0


In [85]:
player_attrib['Body Type'].unique()

array(['Messi', 'C. Ronaldo', 'Neymar', 'Lean', 'Normal', 'Courtois',
       'Stocky', 'PLAYER_BODY_TYPE_25', 'Shaqiri', 'Akinfenwa', nan],
      dtype=object)