In [56]:
# Basic import
import numpy as np
import pandas as pd

# Plotting figures
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# Display file path
import os
# Walk the dataset folder and print the path of every file found in it.
for root, _subdirs, files in os.walk('Data/fifa19'):
    for fname in files:
        print(os.path.join(root, fname))

Data/fifa19/data.csv


In [79]:
# Read the data: load the CSV under Data/fifa19 into `data`. The %time line
# magic prints how long the read took.
%time data = pd.read_csv('Data/fifa19/data.csv')
# Last expression of the cell: display the frame's (rows, columns).
data.shape

CPU times: user 218 ms, sys: 36.1 ms, total: 254 ms
Wall time: 251 ms


(18207, 89)

### Pre-processing


In [92]:
# Trim down the columns this notebook does not need (the CSV's unnamed
# index column, media/flag fields, loan and release-clause fields,
# 'Special', and the position-code columns 'LS' ... 'RB').
unused_columns = [
    'Unnamed: 0', 'Photo', 'Club Logo', 'Flag',
    'Real Face', 'Loaned From', 'Special',
    'Release Clause',
    'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
    'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM',
    'LWB', 'LDM', 'CDM', 'RDM', 'RWB',
    'LB', 'LCB', 'CB', 'RCB', 'RB',
]

# `data` is reassigned from itself, so running this cell a second time used
# to raise KeyError (the columns were already gone). errors='ignore' makes
# the cell safe to re-run; the trade-off is that a misspelled name in the
# list above is skipped silently instead of raising.
data = data.drop(columns=unused_columns, errors='ignore')

In [96]:
# Count the missing entries in each column of `data`.
data.isna().sum()

ID                             0
Name                           0
Age                            0
Nationality                    0
Overall                        0
Potential                      0
Club                         229
Value                          0
Wage                           0
Preferred Foot                 0
International Reputation       0
Weak Foot                      0
Skill Moves                    0
Work Rate                      0
Body Type                      0
Position                       0
Jersey Number                  0
Joined                      1493
Contract Valid Until         229
Height                         0
Weight                         0
Crossing                       0
Finishing                      0
HeadingAccuracy                0
ShortPassing                   0
Volleys                        0
Dribbling                      0
Curve                          0
FKAccuracy                     0
LongPassing                    0
BallContro

In [95]:
# Keep only the rows where both 'Height' and 'Position' are present.
data = data.dropna(subset=['Height', 'Position'])

In [113]:
# Dataframe for players' basic information.
# .copy() makes `info` an independent frame instead of a selection of
# `data`, so later edits to `info` do not raise SettingWithCopyWarning and
# can never leak back into `data`.
info = data[['ID', 'Name', 'Nationality', 'Position', 'Age',
             'Club', 'Jersey Number', 'Joined',
             'Contract Valid Until', 'Value', 'Wage',
             'International Reputation']].copy()

# Show five random rows as a quick sanity check.
info.sample(n=5)

Unnamed: 0,ID,Name,Nationality,Position,Age,Club,Jersey Number,Joined,Contract Valid Until,Value,Wage,International Reputation
13064,243967,A. Mendes Moreira,Netherlands,LM,23,FC Groningen,18.0,"Jun 1, 2018",2019,€500K,€2K,1.0
16987,244970,Y. Magnin,France,CM,21,Clermont Foot 63,33.0,"Jul 1, 2018",2021,€130K,€1K,1.0
17526,245753,Y. Omoto,Japan,RM,23,V-Varen Nagasaki,41.0,"Aug 15, 2018",2021,€90K,€1K,1.0
18004,242667,J. O'Keeffe,Republic of Ireland,ST,18,Dundalk,25.0,"Oct 7, 2017",2018,€80K,€1K,1.0
13877,180717,L. Reddy,Australia,GK,36,Perth Glory,33.0,"Jun 24, 2016",2019,€40K,€1K,1.0


In [121]:
# Count the missing entries in each column of `info`.
info.isna().sum()

ID                             0
Name                           0
Nationality                    0
Position                       0
Age                            0
Club                           0
Jersey Number                  0
Joined                      1493
Contract Valid Until           0
Value                          0
Wage                           0
International Reputation       0
dtype: int64

In [144]:
# Fill in the missing values.
#
# Each column is filled and assigned back explicitly. The previous
# `info[col].fillna(..., inplace=True)` acted on a temporary Series: it
# emits chained-assignment warnings and does not update `info` at all when
# pandas Copy-on-Write is active.
#
# 'Joined' is also converted to datetime and stored; the previous
# `pd.to_datetime(...)` result was only displayed, never kept. The sample
# rows show values like "Jul 1, 2004", so the format is '%b %d, %Y' (the
# old '%Y' does not describe that layout and is rejected by current pandas).
# NOTE(review): format is taken from the visible sample rows - confirm it
# holds for the whole column.
# Missing join dates get 2019-01-01, i.e. the same '2019' placeholder used
# before. Re-running the cell is safe: an already-datetime column passes
# through pd.to_datetime unchanged.
info = info.assign(**{
    'Club': info['Club'].fillna('No Club'),
    'Contract Valid Until': info['Contract Valid Until'].fillna('2019'),
    'Joined': pd.to_datetime(
        info['Joined'], format='%b %d, %Y'
    ).fillna(pd.Timestamp('2019-01-01')),
})

# Display the parsed join dates as the cell output.
info['Joined']

0       2004-07-01
1       2018-07-10
2       2017-08-03
3       2011-07-01
4       2015-08-30
           ...    
18202   2017-05-03
18203   2018-03-19
18204   2017-07-01
18205   2018-04-24
18206   2018-10-30
Name: Joined, Length: 18147, dtype: datetime64[ns]

In [145]:
# Preview the first rows of `info` after the missing values were filled.
info.head()

Unnamed: 0,ID,Name,Nationality,Position,Age,Club,Jersey Number,Joined,Contract Valid Until,Value,Wage,International Reputation
0,158023,L. Messi,Argentina,RF,31,FC Barcelona,10.0,2004-07-01,2021,€110.5M,€565K,5.0
1,20801,Cristiano Ronaldo,Portugal,ST,33,Juventus,7.0,2018-07-10,2022,€77M,€405K,5.0
2,190871,Neymar Jr,Brazil,LW,26,Paris Saint-Germain,10.0,2017-08-03,2022,€118.5M,€290K,5.0
3,193080,De Gea,Spain,GK,27,Manchester United,1.0,2011-07-01,2020,€72M,€260K,4.0
4,192985,K. De Bruyne,Belgium,RCM,27,Manchester City,7.0,2015-08-30,2023,€102M,€355K,4.0


In [104]:
# Dataframe for players' attributes, built from three column groups that
# are concatenated in the order they appear in the final frame.
profile_cols = [
    'Name', 'Position', 'Overall', 'Potential',
    'Weak Foot', 'Skill Moves', 'Work Rate',
    'Body Type', 'Height', 'Weight',
]
skill_cols = [
    'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing',
    'Volleys', 'Dribbling', 'Curve', 'FKAccuracy',
    'LongPassing', 'BallControl', 'Acceleration',
    'SprintSpeed', 'Agility', 'Reactions',
    'Balance', 'ShotPower', 'Jumping', 'Stamina',
    'Strength', 'LongShots', 'Aggression',
    'Interceptions', 'Positioning', 'Vision',
    'Penalties', 'Composure', 'Marking',
    'StandingTackle', 'SlidingTackle',
]
goalkeeper_cols = [
    'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning',
    'GKReflexes',
]

player_attrib = data[profile_cols + skill_cols + goalkeeper_cols]

In [85]:
# List the distinct 'Body Type' values. Besides 'Lean', 'Normal' and
# 'Stocky', the recorded output also contains player-specific labels
# (e.g. 'Messi', 'Shaqiri'), a 'PLAYER_BODY_TYPE_25' entry and NaN.
player_attrib['Body Type'].unique()

array(['Messi', 'C. Ronaldo', 'Neymar', 'Lean', 'Normal', 'Courtois',
       'Stocky', 'PLAYER_BODY_TYPE_25', 'Shaqiri', 'Akinfenwa', nan],
      dtype=object)