### Data Cleaning Challenge

#### Importing Libraries and Data

In [4]:
import pandas as pd
import numpy as np

In [31]:
# Absolute path to the raw FIFA 21 CSV; it is machine-specific, so adjust it before running elsewhere.
path = r"C:\Users\KSD\Documents\archive\fifa21 raw data v2.csv"
# low_memory=False parses the file in a single pass so each column gets one consistently inferred dtype.
df = pd.read_csv(path, low_memory=False)

In [1]:
# df.columns
# df[['Value','Wage','Release Clause',]]

#### Looking out for Missing Values 

In [58]:
# Count missing values per column and show the three columns with the most.
df.isna().sum().sort_values(ascending=False).head(3)

Loan Date End    17966
Hits              2595
Name                 0
dtype: int64

##### Filling the missing values in the "Hits" column with 0

In [59]:
# fillna returns a new Series, so assign it back; otherwise the column keeps its NaNs.
df['Hits'] = df['Hits'].fillna(0)
# Confirm that no missing values remain in "Hits" (expected: 0).
df['Hits'].isna().sum()

0

#### Dealing with Incorrectly Entered Data

##### In the Weight column, some players' weights are recorded in kg while others are in lbs


In [20]:
# Function to remove suffixes from weight column and convert 'lbs' to kg
def weight_convert(weights):
    """Return a player's weight in whole kilograms.

    Parameters
    ----------
    weights : str or number
        Raw value such as ``'72kg'`` or ``'159lbs'``. Anything that is not a
        string (an already-converted number, NaN) is returned unchanged, so
        the cell that applies this function can be re-run without a TypeError.

    Returns
    -------
    int
        Kilograms. Values without a ``kg`` suffix are treated as pounds,
        divided by 2.205 and rounded to the nearest integer.

    Raises
    ------
    ValueError
        If the text left after removing the unit suffix is not an integer.
    """
    # Already numeric (or missing): nothing to parse.
    if not isinstance(weights, str):
        return weights

    text = weights.strip()
    if 'kg' in text:
        return int(text.replace('kg', ''))
    return round(int(text.replace('lbs', '')) / 2.205)
        

In [32]:
# Convert every entry of the Weight column with weight_convert
df['Weight'] = df['Weight'].map(weight_convert)

##### Similar to weight, the Height column has two separate units (cm and feet/inches)
##### We are converting everything to cm

In [61]:
# Looking at two neighbouring rows of the "Height" column to compare their units
df['Height'].iloc[859:861]

859    178
860    180
Name: Height, dtype: int64

In [9]:
def height_to_cm(Height):
    """Return a player's height in whole centimetres.

    Parameters
    ----------
    Height : str or number
        Raw value such as ``'170cm'`` or ``6'2"``. Anything that is not a
        string (an already-converted number, NaN) is returned unchanged, so
        the cell that applies this function can be re-run without a TypeError.

    Returns
    -------
    int
        Centimetres. Feet/inches values are converted with 30.48 cm per foot
        and 2.54 cm per inch, then rounded to the nearest integer.

    Raises
    ------
    ValueError
        If the numeric parts cannot be parsed as integers.
    """
    # Already numeric (or missing): nothing to parse.
    if not isinstance(Height, str):
        return Height

    text = Height.strip()
    if 'cm' in text:
        return int(text.replace('cm', ''))

    feet, inches = text.split('\'')
    inches = inches.replace('"', '').strip()
    # A value such as 6' carries no inches part; treat that as zero inches.
    return round((int(feet) * 30.48) + (int(inches or 0) * 2.54))

In [33]:
# Convert every entry of the Height column with height_to_cm
df['Height'] = df['Height'].map(height_to_cm)

##### The Club column has '\n\n\n\n' in front of every club name.

In [442]:
# Strip the leading newlines (and any other surrounding whitespace) from each club name.
# Unlike splitting on '\n\n\n\n' and keeping the second piece, this can be re-run
# on an already-cleaned column without raising or producing NaN.
df['Club'] = df['Club'].str.strip()
df['Club'].head()

0           FC Barcelona
1               Juventus
2        Atlético Madrid
3        Manchester City
4    Paris Saint-Germain
Name: Club, dtype: object

##### We have to remove the '★' in the Skill_moves, Weak_foot and Reputation columns

In [42]:
# Preview the three rating columns whose values carry a '★' suffix
df[['Skill_moves', 'Weak_foot', 'Reputation']].head()

Unnamed: 0,Skill_moves,Weak_foot,Reputation
0,4★,4 ★,5 ★
1,5★,4 ★,5 ★
2,1★,3 ★,3 ★
3,4★,5 ★,4 ★
4,5★,5 ★,5 ★


In [40]:
# Function to remove '★'
def remove_star(star):
    """Return a star rating such as ``'4★'`` or ``'4 ★'`` as an integer.

    Parameters
    ----------
    star : str or number
        Raw rating text. Anything that is not a string (an already-converted
        number, NaN) is returned unchanged, so the cell that applies this
        function can be re-run safely.

    Returns
    -------
    int
        The rating with the '★' removed. A string without a '★' is parsed
        as an integer directly instead of failing on an unassigned variable.

    Raises
    ------
    ValueError
        If the text left after removing '★' is not an integer.
    """
    if not isinstance(star, str):
        return star
    return int(star.replace('★', '').strip())

In [43]:
# Applying function to columns.
# A per-column Series.map is used instead of DataFrame.applymap, which is
# deprecated in recent pandas releases; the element-wise result is the same.
df[['Skill_moves', 'Weak_foot', 'Reputation']] = (
    df[['Skill_moves', 'Weak_foot', 'Reputation']]
    .apply(lambda column: column.map(remove_star))
)

#### Properly Formatting the Currency Columns ('Value', 'Wage' and 'Release Clause')

In [62]:
# The columns that hold currency amounts
df[['Value','Wage','Release Clause',]].head()

Unnamed: 0,Value,Wage,Release Clause
0,103500000.0,560000.0,138400000.0
1,63000000.0,220000.0,75900000.0
2,120000000.0,125000.0,159400000.0
3,129000000.0,370000.0,161000000.0
4,132000000.0,270000.0,166500000.0


In [28]:
# Creating function to convert Value, Wage and Release Clause
def convert_currency(value):
    """Convert a currency string such as ``'€103.5M'`` or ``'€560K'`` to a float.

    Parameters
    ----------
    value : str or number
        Raw amount text. Anything that is not a string (an already-converted
        number, NaN) is returned unchanged, so the cell that applies this
        function can be re-run safely.

    Returns
    -------
    float
        The amount with '€' removed; an 'M' suffix multiplies by 1,000,000
        and a 'K' suffix multiplies by 1,000.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    if not isinstance(value, str):
        return value

    amount = value.replace('€', '').strip()
    if 'M' in amount:
        return float(amount.replace('M', '')) * 1000000
    if 'K' in amount:
        return float(amount.replace('K', '')) * 1000
    return float(amount)

In [45]:
# Applying function on columns.
# A per-column Series.map is used instead of DataFrame.applymap, which is
# deprecated in recent pandas releases; the element-wise result is the same.
df[['Value', 'Wage', 'Release Clause']] = (
    df[['Value', 'Wage', 'Release Clause']]
    .apply(lambda column: column.map(convert_currency))
)

In [569]:
# List the distinct labels present in Workrate_Att
df['Workrate_Att'].unique()

array(['Medium', 'High', 'Low'], dtype=object)

In [572]:
# Replace each Workrate_Att label with its numeric code, one label at a time
for label, code in (("Low", 1), ("Medium", 2), ("High", 3)):
    df.loc[df['Workrate_Att'] == label, 'Workrate_Att'] = code

In [573]:
# List the distinct values in Workrate_Att again to check the replacement
df['Workrate_Att'].unique()

array([2, 3, 1], dtype=object)

In [None]:
# List the distinct labels present in Workrate_Def (unique must be called, not just referenced)
df['Workrate_Def'].unique()

In [574]:
# Replace each Workrate_Def label with its numeric code, one label at a time
for label, code in (("Low", 1), ("Medium", 2), ("High", 3)):
    df.loc[df['Workrate_Def'] == label, 'Workrate_Def'] = code

In [576]:
# Check the column that was just encoded (Workrate_Def, not Workrate_Att)
df['Workrate_Def'].unique()

array([2, 3, 1], dtype=object)

#### Properly Formatting Columns with Dates to DateTime format

In [580]:
# Parse both date columns into pandas datetimes, in the same order as before
for date_column in ('Joined', 'Loan Date End'):
    df[date_column] = pd.to_datetime(df[date_column])

#### Renaming Abbreviated Columns to Descriptive Names

In [None]:
# Give the abbreviated source columns descriptive names. The cleaning cells that
# reference 'Skill_moves', 'Weak_foot', 'Reputation', 'Workrate_Att' and
# 'Workrate_Def' rely on this rename having been run first.
# Only columns are renamed; the row index is left untouched (passing index=str
# here would convert every row label to a string).
df = df.rename(columns={'↓OVA': 'Overall',
                        'A/W': 'Workrate_Att',
                        'D/W': 'Workrate_Def',
                        'SM': 'Skill_moves',
                        'W/F': 'Weak_foot',
                        'IR': 'Reputation',
                        })