## Loading the Dataset

In [175]:
# Import Libraries
import pandas as pd
import numpy as np

# Read the raw players_21.csv export into a DataFrame and preview the first rows
raw_csv_path = "D:/Data Science Projects/Player Performance Prediction/data/raw/players_21.csv"
df = pd.read_csv(raw_csv_path)

df.head()

Unnamed: 0,sofifa_id,player_url,short_name,long_name,age,dob,height_cm,weight_kg,nationality,club_name,...,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,33,1987-06-24,170,72,Argentina,FC Barcelona,...,66+3,65+3,65+3,65+3,66+3,62+3,52+3,52+3,52+3,62+3
1,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,35,1985-02-05,187,83,Portugal,Juventus,...,65+3,61+3,61+3,61+3,65+3,61+3,54+3,54+3,54+3,61+3
2,200389,https://sofifa.com/player/200389/jan-oblak/210002,J. Oblak,Jan Oblak,27,1993-01-07,188,87,Slovenia,Atlético Madrid,...,32+3,36+3,36+3,36+3,32+3,32+3,33+3,33+3,33+3,32+3
3,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,31,1988-08-21,184,80,Poland,FC Bayern München,...,64+3,65+3,65+3,65+3,64+3,61+3,60+3,60+3,60+3,61+3
4,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Júnior,28,1992-02-05,175,68,Brazil,Paris Saint-Germain,...,67+3,62+3,62+3,62+3,67+3,62+3,49+3,49+3,49+3,62+3


### Basic Inspection

In [176]:
# Structural summary: number of entries, column count, dtypes and memory usage
df.info()

# Only the last expression of a cell is rendered automatically, so the
# describe() table has to be displayed explicitly; as a bare middle statement
# its result is computed and then silently thrown away.
display(df.describe())

# Column labels (rendered as the cell's output value)
df.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Columns: 106 entries, sofifa_id to rb
dtypes: float64(18), int64(44), object(44)
memory usage: 15.3+ MB


Index(['sofifa_id', 'player_url', 'short_name', 'long_name', 'age', 'dob',
       'height_cm', 'weight_kg', 'nationality', 'club_name',
       ...
       'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb'],
      dtype='object', length=106)

#### What we are looking for:
1. Duplicate rows
2. Columns with too many missing values
3. Columns that are not useful for the analysis
4. Columns whose data types need to be converted
5. Repeated or near-identical values

In [177]:
# Remove Useless Columns
columns_to_drop = ['sofifa_id', 'player_url', 'long_name', 'dob', 'real_face', 'body_type']
# Rebind instead of mutating in place, and ignore labels that are already gone,
# so re-running this cell does not raise KeyError after the first execution.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Count missing entries per column and list the columns with the most gaps first
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

defending_marking       18944
loaned_from             18186
nation_jersey_number    17817
nation_position         17817
player_tags             17536
                        ...  
lb                          0
lcb                         0
cb                          0
rcb                         0
rb                          0
Length: 100, dtype: int64

In [180]:
# Drop columns with too many missing values.
# DataFrame.drop returns a new frame, so the result must be assigned back;
# calling it as a bare statement leaves df unchanged. errors='ignore' keeps the
# cell re-runnable once the columns have already been removed.
df = df.drop(columns=['release_clause_eur', 'loaned_from'], errors='ignore')

# Discard rows that have no club_name
df = df.dropna(subset=['club_name'])

# Fill missing values with zero (only the listed columns are touched)
df = df.fillna({'wage_eur': 0, 'value_eur': 0})

In [181]:
# Filter Out Goalkeepers
# A player is excluded when player_positions starts with 'GK'.
# na=False makes a missing position count as "not a goalkeeper"; without it the
# mask would hold NaN instead of a boolean for such rows and the `~` negation
# below would not be a valid boolean filter.
is_goalkeeper = df['player_positions'].str.startswith('GK', na=False)
df = df[~is_goalkeeper]

## Selecting Relevant Features

In [182]:
# Columns retained for the rest of the analysis, in the order they will appear
columns_to_keep = [
    # name and physical profile
    'short_name', 'age', 'height_cm', 'weight_kg',
    # value and wage
    'value_eur', 'wage_eur',
    # reputation / foot / skill ratings
    'international_reputation', 'weak_foot', 'skill_moves',
    # aggregate attribute scores
    'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic',
    # individual attribute scores
    'attacking_finishing', 'attacking_short_passing', 'mentality_vision',
    'movement_reactions', 'power_shot_power',
    # overall, potential and listed position(s)
    'overall', 'potential', 'player_positions',
]

# Keep every row, restrict to the selected columns
df = df.loc[:, columns_to_keep]

## Checking Duplicates and Data Types

In [183]:
# Flag rows that exactly repeat an earlier row, then count them
duplicate_mask = df.duplicated()
duplicate_mask.sum()

np.int64(0)

In [184]:
# Print the remaining columns with their non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16665 entries, 0 to 18943
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   short_name                16665 non-null  object 
 1   age                       16665 non-null  int64  
 2   height_cm                 16665 non-null  int64  
 3   weight_kg                 16665 non-null  int64  
 4   value_eur                 16665 non-null  int64  
 5   wage_eur                  16665 non-null  int64  
 6   international_reputation  16665 non-null  int64  
 7   weak_foot                 16665 non-null  int64  
 8   skill_moves               16665 non-null  int64  
 9   pace                      16665 non-null  float64
 10  shooting                  16665 non-null  float64
 11  passing                   16665 non-null  float64
 12  dribbling                 16665 non-null  float64
 13  defending                 16665 non-null  float64
 14  physic     

In [185]:
# Save Cleaned Dataset
# index=False: after the row filtering above the index is only the leftover,
# non-contiguous row labels of the raw file. Writing it would add an unnamed
# extra column that comes back as "Unnamed: 0" when the CSV is read again.
df.to_csv(
    "D:/Data Science Projects/Player Performance Prediction/data/processed/fifa_cleaned.csv",
    index=False,
)