In [1]:
# Basic import
import numpy as np
import pandas as pd

# Plotting figures
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# Display file path
import os
# Walk the data directory lazily and print the full path of every file found.
data_paths = (
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('Data/fifa19')
    for fname in fnames
)
for path in data_paths:
    print(path)

Data/fifa19/data.csv


In [3]:
# Read the data
# Load Data/fifa19/data.csv into the DataFrame `data`; the %time line magic
# prints how long the read took.
%time data = pd.read_csv('Data/fifa19/data.csv')
# Last expression of the cell: displays the (rows, columns) tuple of `data`.
data.shape

CPU times: user 158 ms, sys: 12.3 ms, total: 170 ms
Wall time: 169 ms


(18207, 89)

### Pre-processing


In [4]:
# Discard the columns listed below and rebind `data` to the trimmed frame.
# drop() returns a new DataFrame; nothing is modified in place.
data = data.drop(
    [
        'Unnamed: 0', 'Photo', 'Club Logo', 'Flag', 'Real Face',
        'Loaned From', 'Special', 'Release Clause',
        'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
        'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM',
        'LWB', 'LDM', 'CDM', 'RDM', 'RWB',
        'LB', 'LCB', 'CB', 'RCB', 'RB',
    ],
    axis='columns',
)

In [5]:
# Count the missing entries in every column of `data`.
data.isna().sum()

ID                             0
Name                           0
Age                            0
Nationality                    0
Overall                        0
Potential                      0
Club                         241
Value                          0
Wage                           0
Preferred Foot                48
International Reputation      48
Weak Foot                     48
Skill Moves                   48
Work Rate                     48
Body Type                     48
Position                      60
Jersey Number                 60
Joined                      1553
Contract Valid Until         289
Height                        48
Weight                        48
Crossing                      48
Finishing                     48
HeadingAccuracy               48
ShortPassing                  48
Volleys                       48
Dribbling                     48
Curve                         48
FKAccuracy                    48
LongPassing                   48
BallContro

In [6]:
# Keep only the rows that have both a Height and a Position value.
data = data.dropna(subset=['Height', 'Position'])

In [7]:
# Dataframe for players' basic information.
# .copy() makes `info` an independent frame instead of a selection derived
# from `data`, so the column fills applied to it in a later cell do not
# trigger pandas' SettingWithCopyWarning.
info = data[['ID', 'Name', 'Nationality', 'Position', 'Age',
             'Club', 'Jersey Number', 'Joined',
             'Contract Valid Until', 'Value', 'Wage',
             'International Reputation']].copy()
# Display five randomly chosen rows as a quick sanity check.
info.sample(n=5)

Unnamed: 0,ID,Name,Nationality,Position,Age,Club,Jersey Number,Joined,Contract Valid Until,Value,Wage,International Reputation
11721,244912,T. Sugeno,Japan,GK,34,Hokkaido Consadole Sapporo,1.0,"Jul 26, 2018",2019,€180K,€1K,1.0
966,183871,N. Müller,Germany,RM,30,Eintracht Frankfurt,27.0,"Jul 1, 2018",2020,€8.5M,€34K,2.0
17492,240348,D. Łuczak,Poland,LM,21,Śląsk Wrocław,26.0,"May 31, 2017",2021,€100K,€1K,1.0
3102,181456,R. Genevois,Haiti,CB,30,Stade Malherbe Caen,29.0,"Jul 8, 2016",2019,€3M,€17K,1.0
7486,223195,O. Ovacıklı,Turkey,RB,29,Çaykur Rizespor,77.0,"Jun 30, 2012",2023,€675K,€6K,1.0


In [8]:
# Count the missing entries in every column of `info`.
info.isna().sum()

ID                             0
Name                           0
Nationality                    0
Position                       0
Age                            0
Club                         229
Jersey Number                  0
Joined                      1493
Contract Valid Until         229
Value                          0
Wage                           0
International Reputation       0
dtype: int64

In [144]:
# Fill in the missing values.
# Build a new frame instead of calling fillna(inplace=True) on a selected
# column: the chained in-place form operates on a temporary Series and is not
# guaranteed to update `info` (it never does under pandas Copy-on-Write).
info = info.fillna({'Club': 'No Club', 'Contract Valid Until': '2019'})

# Raw `Joined` values look like "Jul 1, 2018", so they are parsed with a
# matching format ('%Y' alone cannot parse them). Missing dates come out as NaT
# and are then set to 2019-01-01, which is what the '2019' placeholder would
# give when read as a bare year. The converted column is stored back on `info`
# so the conversion actually persists; re-running the cell on an already
# converted column is harmless.
joined = pd.to_datetime(info['Joined'], format='%b %d, %Y')
info = info.assign(Joined=joined.fillna(pd.Timestamp('2019-01-01')))
# Last expression of the cell: display the converted column.
info['Joined']

0       2004-07-01
1       2018-07-10
2       2017-08-03
3       2011-07-01
4       2015-08-30
           ...    
18202   2017-05-03
18203   2018-03-19
18204   2017-07-01
18205   2018-04-24
18206   2018-10-30
Name: Joined, Length: 18147, dtype: datetime64[ns]

In [145]:
# Preview the first five rows of `info`.
info.head(n=5)

Unnamed: 0,ID,Name,Nationality,Position,Age,Club,Jersey Number,Joined,Contract Valid Until,Value,Wage,International Reputation
0,158023,L. Messi,Argentina,RF,31,FC Barcelona,10.0,2004-07-01,2021,€110.5M,€565K,5.0
1,20801,Cristiano Ronaldo,Portugal,ST,33,Juventus,7.0,2018-07-10,2022,€77M,€405K,5.0
2,190871,Neymar Jr,Brazil,LW,26,Paris Saint-Germain,10.0,2017-08-03,2022,€118.5M,€290K,5.0
3,193080,De Gea,Spain,GK,27,Manchester United,1.0,2011-07-01,2020,€72M,€260K,4.0
4,192985,K. De Bruyne,Belgium,RCM,27,Manchester City,7.0,2015-08-30,2023,€102M,€355K,4.0


In [104]:
# Per-player attribute table: a column subset of `data`, in the order below.
player_attrib = data[
    # Identity, overall ratings and general/physical descriptors.
    ['Name', 'Position', 'Overall', 'Potential', 'Weak Foot',
     'Skill Moves', 'Work Rate', 'Body Type', 'Height', 'Weight']
    # Individual skill columns.
    + ['Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing',
       'Volleys', 'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing',
       'BallControl', 'Acceleration', 'SprintSpeed', 'Agility',
       'Reactions', 'Balance', 'ShotPower', 'Jumping', 'Stamina',
       'Strength', 'LongShots', 'Aggression', 'Interceptions',
       'Positioning', 'Vision', 'Penalties', 'Composure', 'Marking',
       'StandingTackle', 'SlidingTackle']
    # Columns carrying the GK prefix.
    + ['GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning',
       'GKReflexes']
]
# Uncomment to preview a few random rows.
#player_attrib.sample(n=5)

In [85]:
# List the distinct values stored in the 'Body Type' column (NaN included).
player_attrib.loc[:, 'Body Type'].unique()

array(['Messi', 'C. Ronaldo', 'Neymar', 'Lean', 'Normal', 'Courtois',
       'Stocky', 'PLAYER_BODY_TYPE_25', 'Shaqiri', 'Akinfenwa', nan],
      dtype=object)