In [3]:
import pandas as pd
import numpy as np
import random as rnd
import seaborn as sns
import scipy as sp
import plotly.offline as py
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import matplotlib.pyplot as plt
import missingno as msno
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the FIFA player dataset (cp1252-encoded CSV) and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the FileNotFoundError recorded
# for this cell shows it does not resolve on every machine. Point it at your local copy.
fifa = pd.read_csv("/Users/mansikhanna/Downloads/FifaDataset.csv",encoding='cp1252')
fifa.head()

FileNotFoundError: ignored

In [None]:
# Summarise column dtypes, non-null counts and memory usage.
fifa.info()

The dataset occupies about 12.4 MB in memory. It contains three kinds of data types: float, int and object.

In [None]:
# Descriptive statistics of the numeric columns, transposed so each column is a row.
fifa.describe().T

In [None]:
# (rows, columns) of the raw dataset.
fifa.shape

So we have a total of 18,207 rows and 89 columns, of which 45 are categorical features and 44 are numerical features.

In [None]:
# List every column name in the dataset.

fifa.columns

In [None]:
# Object-dtype columns are treated as categorical; every other dtype counts as numerical.
categorical_cols = fifa.select_dtypes(include=object).columns
numerical_cols = fifa.select_dtypes(exclude=object).columns
print('Number of Categorical Columns: ', len(categorical_cols))
print('Number of Numerical Columns: ', len(numerical_cols))

In [None]:
# Count the distinct values in each column.
fifa.nunique()

In [None]:
# For each column, flag whether it contains at least one missing value.
fifa.isnull().any()



  **After understanding the basic description of our data, we now go ahead with the cleaning, EDA and preprocessing part.**







In [None]:
# Bar chart of the number of non-null values per column.
# The whole frame is passed directly: sampling every row only shuffled the order (which a
# per-column count chart ignores) and the hard-coded row count made the call fail as soon
# as `fifa` had fewer rows, e.g. when this cell is re-run after rows are dropped.
msno.bar(fifa, figsize=(50, 25), color='orange')

In the above graph, we see that the columns RB to LS have same number of missing values.
And, the variable 'Loaned From' has most of the values missing in the column, which is obvious because not every player is loaned in their football career.

In [None]:

# Number of missing values in each column.
fifa.isnull().sum()

48 is repeating many times. Let's check if the same rows are null throughout all these columns

In [None]:
# Index labels of the rows where each goalkeeper attribute is missing.
missing_1 = fifa.index[fifa['GKHandling'].isnull()].tolist()
missing_2 = fifa.index[fifa['GKReflexes'].isnull()].tolist()

# Identical label lists mean the same rows are incomplete in both columns.
print('They are same' if missing_1 == missing_2 else 'They are different')

Since the two lists are identical, we assume that the same rows are also the ones missing in the other columns that show 48 nulls. We check this assumption after dropping those rows.

In [None]:
# Drop the rows whose attribute values are missing.
# `missing_1` holds index *labels*, so it is passed to `drop` as labels. Looking the labels
# up positionally (`fifa.index[missing_1]`) only works while labels equal positions, and
# removes the wrong rows if this cell is re-run after the first drop.
# errors='ignore' makes a re-run a no-op instead of a KeyError.
fifa.drop(index=missing_1, inplace=True, errors='ignore')

In [None]:
# Re-check the per-column missing-value counts after dropping the incomplete rows.
fifa.isnull().sum()

We assumed it correctly and now we have fewer null values as we can see.

In [None]:
# Remove the remaining columns that are largely null and are not significant
# for the classification analysis.
sparse_columns = ['Loaned From', 'Release Clause', 'Joined']
fifa.drop(columns=sparse_columns, inplace=True)

Let's see the best player as per all the different performance attributes mentioned in the dataframe:-

In [None]:
# Performance attributes for which the top-rated player is reported.
pr_cols = [
    'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
    'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
    'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
    'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
    'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
    'Composure', 'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving',
    'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
]
print('Best player in different aspects:')

for attribute in pr_cols:
    # idxmax returns the index label of the first row holding the maximum value.
    top_label = fifa[attribute].idxmax()
    print('Best {0} : {1}'.format(attribute, fifa.loc[top_label]['Name']))

**Converting categorical features to appropriate numerical features**

In [None]:
# Function to convert value and wage of the player.

def currencyConverter(val):
    """Convert a money string such as '<symbol>110.5M' or '<symbol>565K' to a number.

    The first character is treated as the currency symbol and discarded. A
    trailing 'M' multiplies the amount by 1,000,000 and a trailing 'K' by 1,000.
    An amount without a suffix (e.g. '<symbol>500') is returned as-is.

    Parameters
    ----------
    val : str
        Raw value/wage text. Missing values (NaN/None) and empty strings are
        tolerated.

    Returns
    -------
    float or int
        The numeric amount, or 0 when the input is missing, empty or not
        parseable as a plain number.

    Raises
    ------
    ValueError
        If the text ends in 'M' or 'K' but the part between the symbol and the
        suffix is not a number.
    """
    # Missing cells arrive as float NaN; indexing them would raise TypeError.
    if not isinstance(val, str) or not val:
        return 0

    multipliers = {'M': 1000000, 'K': 1000}
    suffix = val[-1]
    if suffix in multipliers:
        return float(val[1:-1]) * multipliers[suffix]

    # No magnitude suffix: the text after the currency symbol is the amount itself.
    try:
        return float(val[1:])
    except ValueError:
        return 0

In [None]:
# Replace the textual 'Value' and 'Wage' columns with numeric equivalents.
# NOTE(review): the new column names say "Pounds" but currencyConverter only strips the
# leading symbol without converting currency; confirm the unit of the source data.
for raw_col, numeric_col in (('Value', 'Value in Pounds'), ('Wage', 'Wage in Pounds')):
    fifa[numeric_col] = fifa[raw_col].apply(currencyConverter)

fifa.drop(['Value', 'Wage'], axis=1, inplace=True)


In [None]:
#Converting height to cms, & converting weight to int

def height_converter(val):
    """Convert a height string written as feet'inches (e.g. "5'7") to centimetres."""
    parts = val.split("'")
    feet_text, inches_text = parts[0], parts[1]
    # 1 foot = 30.48 cm, 1 inch = 2.54 cm.
    return (int(feet_text) * 30.48) + (int(inches_text) * 2.54)

def weight_converter(val):
    """Return the integer number that precedes 'lbs' in a weight string such as '159lbs'."""
    number_text, _, _ = val.partition('lbs')
    return int(number_text)

In [None]:
# Derive numeric height (cm) and weight (lbs) columns, then discard the text originals.
fifa['Height in Cms'] = fifa['Height'].map(height_converter)
fifa['Weight in Pounds'] = fifa['Weight'].map(weight_converter)

fifa.drop(['Height', 'Weight'], axis=1, inplace=True)
fifa[['Height in Cms', 'Weight in Pounds']].head()

In [None]:
# Preview the four derived numeric columns side by side.
fifa[['Height in Cms', 'Weight in Pounds','Value in Pounds','Wage in Pounds']].head()

In [None]:
# Preview the per-position rating columns before they are converted to numbers.
fifa[['LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM',
       'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM',
       'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB']].head()

In [None]:
#Let's convert the rating at each position to int

def ratingConverter(val):
    """Convert a position rating written as 'base+modifier' (e.g. '88+2') to an int.

    The text is split on the first '+' and the two parts are summed, so the
    base and the modifier may each have any number of digits. A string without
    a '+' is read as a plain rating with no modifier.

    Parameters
    ----------
    val : str or any
        Raw rating. Non-string values (already numeric ratings, NaN) are
        returned unchanged.

    Returns
    -------
    int or the original value
        base + modifier for strings; `val` itself otherwise.

    Raises
    ------
    ValueError
        If either part of the string is not an integer.
    """
    if not isinstance(val, str):
        return val

    # Splitting on '+' avoids fixed-width slicing, which added the last digit of a
    # plain rating to itself ('75' -> 80) and dropped digits of a two-digit modifier.
    base, _, modifier = val.partition('+')
    return int(base) + (int(modifier) if modifier else 0)

In [None]:
# Position-rating columns, grouped roughly from attack to defence.
skill_columns = [
    'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
    'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM',
    'LWB', 'LDM', 'CDM', 'RDM', 'RWB',
    'LB', 'LCB', 'CB', 'RCB', 'RB',
]

# Overwrite each rating column with its numeric form, one column at a time.
for col in skill_columns:
    fifa[col] = fifa[col].map(ratingConverter)

In [None]:
# Preview the same position rating columns after conversion.
fifa[['LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM',
       'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM',
       'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB']].head()

In [None]:
# Age distribution of players: a 58-bin count histogram with no KDE overlay.

sns.set(style="dark", palette="rocket", color_codes=True)
x = fifa.Age
plt.figure(figsize=(15, 10))
ax = sns.distplot(x, bins=58, kde=False, color='b')
ax.set_xlabel("Player\'s age", fontsize=16)
ax.set_ylabel('Number of players', fontsize=16)
ax.set_title('Histogram of players age', fontsize=20)
plt.show()

We see that most players are between 21 and 27 years old.

In [None]:
# Distribution of players' overall ratings: a 40-bin count histogram with no KDE overlay.

sns.set(style="dark", palette="rocket", color_codes=True)
x = fifa.Overall
plt.figure(figsize=(15, 10))
ax = sns.distplot(x, bins=40, kde=False, color='r')
ax.set_xlabel("Player\'s overall", fontsize=16)
ax.set_ylabel('Number of players', fontsize=16)
ax.set_title('Histogram of players overall ratings', fontsize=20)
plt.show()

We see that the mean of overall ratings is around 65

In [None]:
# Joint plot of Age against Potential: small, highly transparent green points so that
# dense regions stand out, with green marginal distributions.
p=sns.jointplot(x=fifa['Age'],y=fifa['Potential'], 
              joint_kws={'alpha':0.1,'s':5,'color':'green'},
              marginal_kws={'color':'green'})
p.fig.suptitle("Age vs Potential")


# Scatter of Age against SprintSpeed with a LOWESS-smoothed trend line drawn in red.
q=sns.lmplot(data = fifa, x = 'Age', y = 'SprintSpeed',lowess=True,scatter_kws={'alpha':0.01, 's':5,'color':'blue'}, 
           line_kws={'color':'red'})
q.fig.suptitle("Age vs Sprint")

We see that the potential decreases as the age increases, which is again intuitive.

In [None]:
# Side-by-side histograms of player height (left) and weight (right).
fig = tools.make_subplots(rows=1, cols=2)

aa = go.Histogram(x=fifa['Height in Cms'], nbinsx=25, opacity=0.7, name='Height in cms')
bb = go.Histogram(x=fifa['Weight in Pounds'], nbinsx=30, opacity=0.7, name='Weight in Pounds')

fig.append_trace(aa, 1,1)
fig.append_trace(bb, 1,2)

# The y-axis title markup previously closed its tags in the wrong order
# ('<b><i>Count</b></i>'); they are now properly nested.
fig['layout'].update(title='<b>Height & Weight Distribution</b>',
                     xaxis=dict(automargin=True),
                     yaxis=dict(title='<b><i>Count</i></b>')
                    )
py.iplot(fig)

The majority of players are between 175 cm and 190 cm tall.



The majority of players weigh between 150 lbs and 174 lbs.



---



---


Now we see player distribution across Countries

In [None]:
# Number of players per nationality, as a two-column frame: Nation, Count.
df_nations = (
    fifa.groupby(by='Nationality')
        .size()
        .reset_index(name='Count')
        .rename(columns={'Nationality': 'Nation'})
)

# Show the four home nations that together make up the United Kingdom.
home_nations = ['England', 'Wales', 'Scotland', 'Northern Ireland']
df_nations[df_nations['Nation'].isin(home_nations)]

In [None]:
# Add an aggregate 'United Kingdom' row for the choropleth map.
# The count is computed from the data instead of being hard-coded, so it stays correct
# when rows are dropped or the dataset changes.
uk_nations = ['England', 'Wales', 'Scotland', 'Northern Ireland']
uk_count = df_nations.loc[df_nations['Nation'].isin(uk_nations), 'Count'].sum()
df_temp = pd.DataFrame(data=[['United Kingdom', uk_count]], columns=['Nation', 'Count'])

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. Any existing
# 'United Kingdom' row is filtered out first so re-running the cell does not duplicate it.
df_nations = pd.concat(
    [df_nations[df_nations['Nation'] != 'United Kingdom'], df_temp],
    ignore_index=True,
)
df_nations.tail()

We add the combined count of England, Northern Ireland, Scotland and Wales under 'United Kingdom', because the choropleth map treats these four countries as a single region, the United Kingdom.


---


We hover over the map to confirm the same.

In [None]:
# World map coloured by the number of players per country (matched by country name).
countrymap = {
    'type': 'choropleth',
    'locations': df_nations['Nation'],
    'z': df_nations['Count'],
    'locationmode': 'country names',
    'colorscale': 'Portland',
}

geo_settings = {
    'showocean': True,
    'oceancolor': '#AEDFDF',
    'projection': {'type': 'natural earth'},
}
layout = go.Layout(title='<b>Number of Players in each Country</b>', geo=geo_settings)

fig = go.Figure(data=[countrymap], layout=layout)
py.iplot(fig)

So we see that most players are from European and South American countries.

Top 5 countries -

England - 1657

Germany - 1195

Spain - 1071

Argentina - 936

France - 911

Hover over the red spot on the map to get the value of United Kingdom which includes England, Scotland, Northern Ireland and Wales.

In [None]:
# Attributes considered when ranking the strongest features of each position.
player_features = (
    'Acceleration', 'Aggression', 'Agility', 
    'Balance', 'BallControl', 'Composure', 
    'Crossing', 'Dribbling', 'FKAccuracy', 
    'Finishing', 'GKDiving', 'GKHandling', 
    'GKKicking', 'GKPositioning', 'GKReflexes', 
    'HeadingAccuracy', 'Interceptions', 'Jumping', 
    'LongPassing', 'LongShots', 'Marking', 'Penalties'
)

# Print, for each position, the three attributes with the highest mean value.
# The column selection is passed as a list: indexing a groupby with a tuple of
# column names raises an error in pandas 2.x.
position_means = fifa.groupby('Position')[list(player_features)].mean()
for position, feature_means in position_means.iterrows():
    print('Position {}: {}, {}, {}'.format(position, *tuple(feature_means.nlargest(3).index)))

Let us simplify the above positions into 4 simple categories of - **Forwards**, **Midfielders**, **Defenders** and **GoalKeepers**

In [None]:
def position_classifier(val):
    """Map a detailed position code to 'Forward', 'Midfielder' or 'Defender'.

    Any value that is not one of the listed codes (for example 'GK') is
    returned unchanged.
    """
    forwards = ('RF', 'ST', 'LF', 'RS', 'LS', 'CF')
    midfielders = ('LW', 'RCM', 'LCM', 'LDM', 'CAM', 'CDM', 'RM',
                   'LAM', 'LM', 'RDM', 'RW', 'CM', 'RAM')
    defenders = ('RCB', 'CB', 'LCB', 'LB', 'RB', 'RWB', 'LWB')

    for group_name, codes in (('Forward', forwards),
                              ('Midfielder', midfielders),
                              ('Defender', defenders)):
        if val in codes:
            return group_name

    return val

In [None]:
# Collapse the detailed position codes into the broad groups and count players per group.
fifa['Position'] = fifa['Position'].map(position_classifier)
fifa['Position'].value_counts()

Plotting a few visualizations which give us more information about the aggregated positions

In [None]:
# Donut chart of the number of players in each position group.
position_counts = fifa['Position'].value_counts()
ccc = go.Pie(values=position_counts.values,
             labels=position_counts.index.values,
             hole=0.3)

layout = go.Layout(title='<b>Distribution of Players Position-Wise</b>')

fig = go.Figure(data=[ccc], layout=layout)
py.iplot(fig)

In [None]:
# 2x2 grid of box plots comparing position groups on overall rating, age, weight and height.
# (The extra plt.figure(...) call that used to precede this only produced an empty figure.)
f, axes = plt.subplots(2, 2, figsize=(15, 15), sharex=False)
sns.despine(left=True)

# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally,
# while the keyword form works on older releases as well.
sns.boxplot(x='Position', y='Overall', data=fifa, ax=axes[0, 0])
sns.boxplot(x='Position', y='Age', data=fifa, ax=axes[0, 1])

sns.boxplot(x='Position', y='Height in Cms', data=fifa, ax=axes[1, 1])
sns.boxplot(x='Position', y='Weight in Pounds', data=fifa, ax=axes[1, 0])

The overall ratings of goalkeepers are slightly lower than those of the other positions.

---


Defenders & goalkeepers have a smaller deviation in Age.

---


Midfielders tend to be shorter and lighter, since they need to be more agile when passing and dribbling.

---

Also, goalkeepers tend to be the tallest and heaviest players.

In [None]:
# Wage (left) and market value (right) by position group; outliers are hidden so the
# boxes stay readable.
f, axes = plt.subplots(ncols= 2, figsize=(30, 10), sharex=False)
sns.despine(left=True)

# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally,
# while the keyword form works on older releases as well.
sns.boxplot(x='Position', y='Wage in Pounds', data=fifa, showfliers=False, ax=axes[0])
sns.boxplot(x='Position', y='Value in Pounds', data=fifa, showfliers=False, ax=axes[1])

Strikers tend to have the highest wages and value

Goalkeepers have the lowest.

In [None]:
# Overlaid count histograms of 'Skill Moves', one per position group.
plt.figure(figsize=(10,5))

a = fifa[fifa['Position'] == 'Forward']
b = fifa[fifa['Position'] == 'Defender']
c = fifa[fifa['Position'] == 'Midfielder']
d = fifa[fifa['Position'] == 'GK']

# (subset, colour, legend label), drawn in this order on the same axes.
for subset, colour, tag in ((a, 'blue', 'Forward'),
                            (b, 'red', 'Defender'),
                            (c, 'green', 'Midfielder'),
                            (d, 'orange', 'GK')):
    sns.distplot(subset['Skill Moves'], color=colour, label=tag, kde=False)

plt.legend(fontsize = 'xx-large')

plt.legend(fontsize = 'xx-large')

Defenders & goalkeepers have lower skill moves score compared to Strikers and Midfielders, which could be one of the reasons why Defender & goalkeeper values are less than the other two positions.

In [None]:
# Average of six chosen attributes for each position group.
# The columns are selected with a list (double brackets): selecting several columns from
# a groupby with bare comma-separated keys raises an error in pandas 2.x.
df_skills = fifa.groupby(by='Position')[['Crossing', 'Finishing', 'FKAccuracy',
                                         'StandingTackle', 'Marking', 'Interceptions']].mean().reset_index()

In [None]:
# Radar chart of the averaged attributes, one filled trace per position group.
# 'Crossing' appears again as the last axis, presumably so each outline ends where it starts.
radar_axes = ['Crossing', 'Finishing', 'FKAccuracy',
              'StandingTackle', 'Marking', 'Interceptions',
              'Crossing']


def _position_trace(position, label):
    """Build the polar trace for one position group from its row in df_skills."""
    row_mask = df_skills['Position'] == position
    return go.Scatterpolar(theta=list(radar_axes),
                           r=df_skills[row_mask][radar_axes].values[0],
                           fill='toself',
                           name=label)


forward = _position_trace('Forward', 'Forwards')
midfielder = _position_trace('Midfielder', 'Midfielders')
defender = _position_trace('Defender', 'Defenders')
goalkeeper = _position_trace('GK', 'Goal Keepers')

layout = go.Layout(polar={'radialaxis': {'visible': True, 'range': [0, 100]}},
                   showlegend=True,
                   title='<b>Attributes by Position</b>')

fig = go.Figure(data=[forward, midfielder, defender, goalkeeper], layout=layout)
py.iplot(fig)


No surprise with Defenders getting the highest ratings in defending attributes such as StandingTackle, Marking and Interceptions.

Midfielders seem like all rounders. They are good at everything but excel in Crossing and FKAccuracy.

Forwards' main job is to score goals. Hence, it is expected they score the highest in Finishing.


## **Now, we do classification based on player positions**



In [None]:
#Transforming categorical feature into numeric.
#GoalKeeper = 0
#Defender = 1
#Midfielder = 2
#Forward = 3
def pos_numeric(val):
    """Encode a position group as an integer class label.

    'GK' -> 0, 'Defender' -> 1, 'Midfielder' -> 2, 'Forward' -> 3.

    A value that is already one of the codes 0-3 is returned as that code, so
    applying the function to an already-encoded column (e.g. when the cell is
    re-run) leaves it unchanged instead of turning every label into 3.

    Any other value, including a missing position, falls back to 3.
    NOTE(review): this fallback labels unknown/missing positions as forwards;
    confirm that is intended before training on the result.
    """
    codes = {'GK': 0, 'Defender': 1, 'Midfielder': 2, 'Forward': 3}
    if isinstance(val, str):
        return codes.get(val, 3)
    # Already-encoded labels pass through (NaN compares unequal to every code).
    if val in (0, 1, 2, 3):
        return int(val)
    return 3
    
# Overwrite the Position column with the integer codes produced by pos_numeric.
fifa['Position'] = fifa['Position'].apply(pos_numeric)

In [None]:
# Build the modelling frame as a separate object so `fifa` keeps all of its columns.
# 'Special' was listed twice in the original drop list; listing it once drops the same column.
metadata_columns = [
    'Age', 'Photo', 'Name', 'Nationality', 'Flag', 'Club', 'Club Logo', 'Special',
    'Real Face', 'Preferred Foot', 'International Reputation', 'Work Rate',
    'Body Type', 'Jersey Number', 'Contract Valid Until', 'Unnamed: 0',
]
position_rating_columns = [
    'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM',
    'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB',
    'LB', 'LCB', 'CB', 'RCB', 'RB',
]

# Dropping without inplace returns a new frame, leaving `fifa` untouched.
fifa_replica = fifa.drop(columns=metadata_columns + position_rating_columns)

fifa_replica.head().T

In [None]:
# (rows, columns) of the modelling frame after the column drop.
fifa_replica.shape

In [None]:
# Target: the encoded position. Features: every other column, with any remaining
# categorical columns one-hot encoded by get_dummies.
y = fifa_replica['Position']
X = pd.get_dummies(fifa_replica.drop(columns=['Position']))

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split into train and test sets: 20% of the rows are held out for testing, and the
# fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): this prints the training frame as plain text; X_train.head() as the last
# expression of the cell would give a shorter, richly rendered preview.
print(X_train)