## 00. Coding Best Practices

In [2]:
# Basic Libraries
import os
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Encoding
from sklearn.preprocessing import LabelEncoder

# Machine Learning
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Settings
warnings.filterwarnings('ignore') 
pd.set_option('display.max_columns', None)

In [3]:
# Basic functions:
def data_info(): # improved data.info()
    print(f"The DataFrame shape is {data.shape}.")
    print()
    print("The DataFrame data types are:")
    print(data.dtypes.value_counts().tolist())
    print(data.dtypes.value_counts().index.tolist())
    print()
    print("DataFrame random row sample and full columns:")
    return data.sample(5)

def clean_columns(): # Standardizes the column name, returns the columns
    data_copy.columns = [i.lower().replace(' ', '_') for i in data_copy.columns]
    return data_copy.sample(0)
    
def check_null_cols(): # Checks for empty cells in the entire DataFrame
    null_cols = data_copy.isnull().sum()
    null_cols = null_cols[null_cols > 0]
    print(null_cols)
    
def is_significant(column_name, data_copy): # takes a column name
    n_data = len(data_copy)
    n_nulls = data_copy[column_name].isnull().sum() # counts all NaNs of column name
    null_percentage = n_nulls / n_data * 100
    print(f"{null_percentage:.2f}% of the '{column_name}' column are empty.")
    
# Specific functions for this dataset:
def convert_value(value): # function to convert value to unified format
    try:
        if isinstance(value, float):
            return value
        elif value.endswith('K'):
            return float(value[1:-1]) * 1000
        elif value.endswith('M'):
            return float(value[1:-1]) * 1000000
        else:
            return float(value)
    except ValueError:
        return None

## 01 - Getting the Data

In [4]:
file_path = os.path.join("C:/Users/apisi/01. IronData/01. GitHub/01. iron-activity/project-mbappe", "fifa21_male2.csv")
data = pd.read_csv(file_path)
data_info()

The DataFrame shape is (17125, 107).

The DataFrame data types are:
[52, 45, 10]
[dtype('O'), dtype('int64'), dtype('float64')]

DataFrame random row sample and full columns:


Unnamed: 0,ID,Name,Age,OVA,Nationality,Club,BOV,BP,Position,Player Photo,Club Logo,Flag Photo,POT,Team & Contract,Height,Weight,foot,Growth,Joined,Loan Date End,Value,Wage,Release Clause,Contract,Attacking,Crossing,Finishing,Heading Accuracy,Short Passing,Volleys,Skill,Dribbling,Curve,FK Accuracy,Long Passing,Ball Control,Movement,Acceleration,Sprint Speed,Agility,Reactions,Balance,Power,Shot Power,Jumping,Stamina,Strength,Long Shots,Mentality,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Defending,Marking,Standing Tackle,Sliding Tackle,Goalkeeping,GK Diving,GK Handling,GK Kicking,GK Positioning,GK Reflexes,Total Stats,Base Stats,W/F,SM,A/W,D/W,IR,PAC,SHO,PAS,DRI,DEF,PHY,Hits,LS,ST,RS,LW,LF,CF,RF,RW,LAM,CAM,RAM,LM,LCM,CM,RCM,RM,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB,GK,Gender
11427,239592,J. Macías,20,75,Mexico,Mexico,77,ST,ST,https://cdn.sofifa.com/players/239/592/20_120.png,https://cdn.sofifa.com/teams/1386/light_60.png,https://cdn.sofifa.com/flags/mx.png,84,Guadalajara 2017 ~ 2022,"5'10""",161lbs,Right,9,"Jul 1, 2017",,€10.5M,€34K,€20M,Guadalajara 2017 ~ 2022,337,52,79,74,72,60.0,304,76,50.0,50,52,76,354,73,73,65.0,72,71.0,327,73,59.0,64,63,68,272,40,19.0,80.0,55.0,78,67.0,60,21,19,20.0,42,10,10,6,11,5,1696,364,3 ★,3★,High,Medium,1 ★,73,75,59,74,25,58,275,75+2,75+2,75+2,72+0,74+0,74+0,74+0,72+0,71+2,71+2,71+2,70+2,63+2,63+2,63+2,70+2,49+2,46+2,46+2,46+2,49+2,46+2,41+2,41+2,41+2,46+2,16+2,Male
16059,255683,F. Píriz,22,61,Uruguay,Club Plaza Colonia,61,LB,LB,https://cdn.sofifa.com/players/255/683/20_120.png,https://cdn.sofifa.com/teams/114024/light_60.png,https://cdn.sofifa.com/flags/uy.png,67,Club Plaza Colonia 2020 ~ 2020,"5'9""",174lbs,Left,6,"Jan 8, 2020",,€300K,€500,€705K,2020 ~ 2020,243,55,33,60,56,39.0,207,47,37.0,31,36,56,313,66,62,55.0,60,70.0,254,37,62.0,64,59,32,229,47,56.0,40.0,46.0,40,42.0,178,52,66,60.0,52,13,12,13,6,8,1476,317,2 ★,2★,High,Medium,1 ★,64,35,49,52,59,58,3,48+2,48+2,48+2,50+0,49+0,49+0,49+0,50+0,49+2,49+2,49+2,52+2,50+2,50+2,50+2,52+2,58+2,55+2,55+2,55+2,58+2,59+2,58+2,58+2,58+2,59+2,15+2,Male
2165,187132,C. Zambrano,30,73,Peru,Peru,73,CB,CB,https://cdn.sofifa.com/players/187/132/20_120.png,https://cdn.sofifa.com/teams/111108/light_60.png,https://cdn.sofifa.com/flags/pe.png,73,Boca Juniors 2020 ~ 2023,"6'1""",183lbs,Right,0,"Feb 2, 2020",,€3.2M,€13K,€4.5M,Boca Juniors 2020 ~ 2023,270,51,41,74,68,36.0,270,66,36.0,33,68,67,298,58,54,65.0,65,56.0,328,63,77.0,61,73,54,292,90,69.0,44.0,45.0,44,73.0,225,74,75,76.0,65,14,12,13,12,14,1748,375,3 ★,2★,Low,High,2 ★,56,48,57,66,74,74,21,58+2,58+2,58+2,57+0,57+0,57+0,57+0,57+0,59+2,59+2,59+2,59+2,62+2,62+2,62+2,59+2,66+2,70+2,70+2,70+2,66+2,67+2,73+0,73+0,73+0,67+2,19+2,Male
11077,238263,F. Knudsen,23,64,Norway,FK Haugesund,66,CB,CB,https://cdn.sofifa.com/players/238/263/20_120.png,https://cdn.sofifa.com/teams/1463/light_60.png,https://cdn.sofifa.com/flags/no.png,73,FK Haugesund 2017 ~ 2021,"6'3""",183lbs,Right,9,"Jan 30, 2017",,€625K,€950,€906K,2017 ~ 2021,194,31,18,63,54,28.0,174,32,23.0,23,46,50,252,46,58,42.0,61,45.0,290,42,83.0,65,79,21,217,72,64.0,21.0,26.0,34,58.0,186,57,65,64.0,50,13,11,12,8,6,1363,295,3 ★,2★,Medium,High,1 ★,53,25,40,41,62,74,4,41+2,41+2,41+2,38+0,39+0,39+0,39+0,38+0,39+2,39+2,39+2,41+2,45+2,45+2,45+2,41+2,55+2,58+2,58+2,58+2,55+2,57+2,64+2,64+2,64+2,57+2,15+2,Male
296,104442,C. Burke,36,68,Scotland,Kilmarnock,68,RM,RM RW,https://cdn.sofifa.com/players/104/442/20_120.png,https://cdn.sofifa.com/teams/82/light_60.png,https://cdn.sofifa.com/flags/gb-sct.png,68,Kilmarnock 2017 ~ 2021,"5'9""",150lbs,Right,0,"Jul 18, 2017",,€300K,€3K,€525K,2017 ~ 2021,290,70,64,35,63,58.0,326,70,67.0,62,58,69,362,71,68,78.0,68,77.0,341,68,69.0,72,66,66,279,54,29.0,65.0,64.0,67,69.0,120,55,33,32.0,50,10,8,11,11,10,1768,373,4 ★,3★,High,Medium,1 ★,69,65,64,71,39,65,7,63+2,63+2,63+2,68+0,66+0,66+0,66+0,68+0,66+2,66+2,66+2,67+1,62+2,62+2,62+2,67+1,56+2,53+2,53+2,53+2,56+2,52+2,47+2,47+2,47+2,52+2,16+2,Male


This dataset contains **one-year** 17.125 player information distributed among 107 different columns. Also, our data types are mostly **numericals** (52 object / 45 integers / 10 floats).

Our **project_goal** is to predict who will be **the next Mbappé**, so after reading the dataset [documentation](https://www.kaggle.com/datasets/ekrembayar/fifa-21-complete-player-dataset?select=fifa21_male2.csv) we decide to proceed with the following **strategy**:

* We will work with numericals to make a Linear Regression model.
* The **target** for our dataset it will be the `OVA` (overall score) of a player.
* The objective is to **identify** promising talents through **EDA** to **predict** their future score.
_____________

From the numerical columns related to player potential, we will remove some columns such as `'attacking'`, `'crossing'`, `'finishing'`, `'heading_accuracy'`, and `'short_passing'` (see code below...) because we will calculate these columns as sums of other related columns. 

`'Value'`, `'Wage'`, and `'Release Clause'` are interesting columns to work with. However, they contain symbols such as € and K, so they need to be converted to numerical values. 

Additionally, we will use features such as `'height'`, `'weight'`, etc., and we will convert them to the metric system by removing quotation symbols or lbs.

## 02 - Cleaning the Data

In [5]:
data_copy = data.copy() # as best practices, we do first a copy of our dataframe

In [6]:
clean_columns() # Standardizes the column name from data_copy

Unnamed: 0,id,name,age,ova,nationality,club,bov,bp,position,player_photo,club_logo,flag_photo,pot,team_&_contract,height,weight,foot,growth,joined,loan_date_end,value,wage,release_clause,contract,attacking,crossing,finishing,heading_accuracy,short_passing,volleys,skill,dribbling,curve,fk_accuracy,long_passing,ball_control,movement,acceleration,sprint_speed,agility,reactions,balance,power,shot_power,jumping,stamina,strength,long_shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing_tackle,sliding_tackle,goalkeeping,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,total_stats,base_stats,w/f,sm,a/w,d/w,ir,pac,sho,pas,dri,def,phy,hits,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,gender


### Dealing with Null values

In [7]:
check_null_cols() # Checks for empty cells in the entire DataFrame

club                 23
position            413
club_logo            23
joined               58
loan_date_end     16215
volleys              58
curve                58
agility              58
balance              58
jumping              58
interceptions         7
positioning           7
vision               58
composure           423
sliding_tackle       58
a/w                  89
d/w                  89
dtype: int64


In [8]:
# We get rid of loan_date since it has way too many NaN values
data_copy = data_copy.drop(columns=["loan_date_end"])

In [9]:
is_significant('composure', data_copy)

2.47% of the 'composure' column are empty.


In [10]:
# 2.47% is not significant. We will drop it.
data_copy = data_copy.drop(columns=["composure"])

The features `value`, `wage`, and `'release Clause'` are interesting columns to work with. However, they contain symbols such as € and K, so they need to be converted to numerical values. 

In [11]:
data_check = data_copy[['value', 'wage', 'release_clause']]
data_check.head(10)

Unnamed: 0,value,wage,release_clause
0,€625K,€7K,€0
1,€600K,€7K,€1.1M
2,€1.1M,€15K,€0
3,€0,€0,€0
4,€5.5M,€12K,€7.2M
5,€725K,€5K,€0
6,€2.8M,€44K,€5.3M
7,€1K,€60K,€0
8,€0,€0,€0
9,€400K,€40K,€0


In [12]:
# We will use convert_value function to convert value to unified format
data_copy['value'] = data_copy['value'].apply(convert_value) # Applying the function
data_copy['wage'] = data_copy['wage'].apply(convert_value)
data_copy['release_clause'] = data_copy['release_clause'].apply(convert_value)
# Visualizing the changes
data_checkmate = data_copy[['value', 'wage', 'release_clause']] 
data_comparision = pd.concat([data_check, data_checkmate], axis=1) # merging to compare the changes
data_comparision.head(10)

Unnamed: 0,value,wage,release_clause,value.1,wage.1,release_clause.1
0,€625K,€7K,€0,625000.0,7000.0,
1,€600K,€7K,€1.1M,600000.0,7000.0,1100000.0
2,€1.1M,€15K,€0,1100000.0,15000.0,
3,€0,€0,€0,,,
4,€5.5M,€12K,€7.2M,5500000.0,12000.0,7200000.0
5,€725K,€5K,€0,725000.0,5000.0,
6,€2.8M,€44K,€5.3M,2800000.0,44000.0,5300000.0
7,€1K,€60K,€0,1000.0,60000.0,
8,€0,€0,€0,,,
9,€400K,€40K,€0,400000.0,40000.0,


### Encoding
* Some columns considered as **categoricals** are actually **numericals**
* e.g: `height`, `weight`, etc.

In [None]:
# Convert height data from feet and inches to centimeters
data_copy['h_cm'] = data_copy['height'].apply(lambda x: int(x.split("'")[0])*30.48 + int(x.split("'")[1].replace('"', ''))*2.54)
# Convert weight data from pounds to kilograms
data_copy['w_kg'] = data_copy['weight'].apply(lambda x: round(int(x[:-3]) / 2.2046, 2))
# Apply lambda function to 'weak_foot' column
data_copy['w_f'] = data_copy['w/f'].apply(lambda x: int(x[0]))
# Visualizing the changes
data_encoded = data_copy[['h_cm', 'w_kg', 'w_f']]
data_encoded.head(10)

### Further Encoding

In [13]:
data_check2 = data_copy[['sm', 'ir']]
data_check2.head(10)

Unnamed: 0,sm,ir
0,2★,2 ★
1,3★,1 ★
2,4★,2 ★
3,1★,1 ★
4,4★,4 ★
5,3★,2 ★
6,4★,2 ★
7,3★,3 ★
8,2★,3 ★
9,3★,2 ★


In [None]:
# Apply lambda to:
# skilled_moves
data_copy['sm'] = data_copy['sm'].apply(lambda x: int(x[0]))
# international_reputation
data_copy['ir'] = data_copy['ir'].apply(lambda x: int(x[0]))

In [None]:
# Initialize the LabelEncoder object
le = LabelEncoder()

# Fit and transform the data frame column
data_copy['a_wr'] = le.fit_transform(data_copy['attacking_work_rate'])
data_copy['d_wr'] = le.fit_transform(data_copy['defensive_work_rate'])

In [None]:
# Modify cells with 'K' character, leave others unchanged, turn it to integer
data_copy['hits'] = data_copy['hits'].apply(lambda x: float(x.replace('K','')) * 1000 if isinstance(x, str) and 'K' in x else x).astype(int)

In [None]:
# rename columns with their corresponding meanings for better encoding
data_copy = data_copy.rename(columns={
                        "w_f":"weak_foot",
                        "ir":"international_reputation", 
                        "sm":"skilled_moves",
                        "a/w":"attacking_work_rate",
                        "d/w":"defensive_work_rate",})

### Selecting numericals

In [None]:
# Create a new dataframe with only the numerical columns using select_dtypes
X_N = data_copy.select_dtypes(np.number).apply(lambda x: round(x,2)).fillna(0) # we also deal with NaN values
X_N.head(3)

In [None]:
data_headers = list(X_N.columns.values) # to get and check all column names
print("The Column Headers are :", data_headers) 

* **"attacking"** = sum ("crossing","finishing","heading_accuracy","short_passing","volleys")
* **"skill"** = sum ("dribbling","curve","fk_accuracy","long_passing","ball_control")
* **"movement"** = sum ("acceleration","sprint_speed","agility","reactions","balance")
* **"power"** = sum ("shot_power","jumping","stamina","strength","long_shots")
* **"mentality"** = sum("aggression","interceptions","positioning","vision","penalties","composure")
* **"defending"** = sum ("marking","standing_tackle","sliding_tackle")
* **"goalkeeping"** = sum ("gk_diving","gk_handling","gk_kicking","gk_positioning","gk_reflexes")
* **"base_stats"** = sum ("pac","sho","pas","dri","def","phy")
* **"total_stats"** = sum("attacking","skill","movement","power","mentality",("defending","goalkeeping")

* To make it simple to work with our model, we will get rid of many columns since they already summerized in: `"attacking"`, `"skill"`, `"movement"`, "power", `"mentality"`, `"defending"`, `"goalkeeping"`, `"base_stats"` and `"total_stats"`

In [None]:
# We simply move our target Value to the right, for readibility
X_N = X_N[['age', 'best_overall', 'potential_scores', 'value', 'wage', 
           'attacking', 'skill', 'movement', 'power', 'mentality', 'defending',
           'goalkeeping', 'total_stats', 'base_stats', 'hits', 'h_cm', 'w_kg', 
           'weak_foot_ecd', 'skilled_moves_ecd', 'international_reputation_ecd',
           'attacking_work_rate_ecd', 'defensive_work_rate_ecd', 'overall_scores']]
X_N.shape

* **Now**, we got a cleaned dataset with `17.125` numerical player information in `23` distinct features.

In [None]:
X_N.sample(3)

![image.png](attachment:image.png)

In [None]:
# rename columns with their corresponding meanings
data = data.rename(columns={"ova":"overall_scores",
                        "bov":"best_overall",
                        "pot":"potential_scores",
                        "w/f":"weak_foot",
                        "ir":"international_reputation", 
                        "sm":"skilled_moves",
                        "a/w":"attacking_work_rate",
                        "d/w":"defensive_work_rate",})

* We will reduce that number, since we haven't dropped categoricals yet (social media information, among others).

## 03 - Exploratory Data Analysis

In [None]:
X_N.describe() # basic statistics for all numerical values

In [None]:
# Looking at Most Valuable Player with rating >80
mvps = data_copy[(data_copy['overall_scores'] > 80)]

# Sort the MVPs
mvps_sorted = mvps.sort_values('value', ascending=False)
mvps_sorted.head(5)

In [None]:
# Group the data by country and count the number of players in each country
mvps_by_country = mvps.groupby('nationality')['name'].count().sort_values(ascending=False)

# Plot a bar chart of the top 10 countries with the most players
mvps_by_country.head(10).plot(kind='bar')

# Add labels and a title to the chart
plt.xlabel('Country')
plt.ylabel('Number of players')
plt.title('Top 10 countries with the most valuable players')

# Display the chart
plt.show()

In [None]:
# Looking at promising talents
data_copy['pot_minus_ova'] = data_copy['potential_scores'] - data_copy['overall_scores'] 

# Create a new DataFrame that contains only the rows with "difference" > 10 and age < 21
talents = data_copy[(data_copy['pot_minus_ova'] > 10) & (data_copy['age'] < 21)]
talents_sorted = talents.sort_values('pot_minus_ova', ascending=False)
talents_sorted.head(3)

In [None]:
# Group the data by country and count the number of players in each country
talents_by_country = talents.groupby('nationality')['name'].count().sort_values(ascending=False)

# Plot a bar chart of the top 10 countries with the most players
talents_by_country.head(10).plot(kind='bar')

# Add labels and a title to the chart
plt.xlabel('Country')
plt.ylabel('Number of players')
plt.title('Top 10 countries with the most promising talents')

# Display the chart
plt.show()

In [None]:
# create a dataframe with 'name' and 'value' columns
df = pd.DataFrame(data_copy, columns=['name', 'value'])

# select the rows for the six specific players you want to plot
players = ['K. Mbappé', 'Neymar Jr', 'K. De Bruyne', 'R. Richards', 'A. Hajdari','B. Arrey-Mbi']
df_players = df[df['name'].isin(players)]

# create a bar chart
plt.figure(figsize=(8,6))
bars = plt.bar(df_players['name'], df_players['value'])
plt.ylabel('Value in €100Mio')
plt.title('Comparison of MVPs and Talents')

# show the chart
plt.show()

## 04 - Processing Data

### Dealing with multicollinearity

In [None]:
# Correlation Matrix
X_N_corr = round(X_N.corr(),2)
X_N_corr

In [None]:
# Correlation Matrix-Heatmap Plot
mask = np.zeros_like(X_N_corr)
mask[np.triu_indices_from(mask)] = True # optional, to hide repeat half of the matrix
f, ax = plt.subplots(figsize=(15, 15))
ax = sns.heatmap(X_N_corr, mask=mask, annot=True, linewidths=.5, ax=ax)
plt.show()

* In another attempted we did we dropped `bov` and `pas` columns because our target was `value`. Which, at the end, was not feeling right to predict the best next player.
* So, for this dataset, after checking the correlation between variables we do not detect any multicollinearity, so we proceed to normalize the data.

### Normalizing the Data

### X-Y Split

In [None]:
##X_Y SPLIT BEFORE normalizing. ~ Xisca. We don't want to normalize the target
Y = X_N['overall_scores']### X-Y Split
X_N = X_N.drop(['overall_scores'], axis=1)

In [None]:
transformer = MinMaxScaler().fit(X_N) # rescale the Data into 0,1 for our model
X_N_minmax = transformer.transform(X_N) 

In [None]:
# create new data frame using data from 'X_N_minmax'
X_N_normalized = pd.DataFrame(X_N_minmax, columns = X_N.columns)
X_N_normalized.sample(3) 

Красивый!
![image.png](attachment:image.png)

In [None]:
for i in X_N_normalized:
    sns.distplot(X_N_normalized[i])
    plt.show()

## 05 - Modeling

### Train-Test Split

In [None]:
# We define train and test for X and Y
X_train, X_test, y_train, y_test = train_test_split(X_N, Y, test_size=0.3, random_state=42) 
# test_size = We give 30% for testing and 70% for testing
# random_state = it'll improve the model to divide always the model in the same way

In [None]:
len(X_N)*0.7 # Len before the test

In [None]:
len(X_test) # Len after the test

In [None]:
model = LinearRegression() # We have the model
model.fit(X_train,y_train) # We fit the training data into the model

## 06 - Model Validation

In [None]:
predictions = model.predict(X_test)

In [None]:
r2_score(y_test, predictions), mean_squared_error(y_test, predictions, squared=False), mean_squared_error(y_test, predictions)

In [None]:
r2 = r2_score(y_test, predictions)
RMSE = mean_squared_error(y_test, predictions, squared=False)
MSE = mean_squared_error(y_test, predictions)
MAE = mean_absolute_error(y_test, predictions)
print("R2 = ", r2)
print("RMSE = ", RMSE)
print("The value of the metric MSE is ", MSE)
print("MAE = ", MAE)

### Present results

In [None]:
results = pd.DataFrame()
results['true'] = y_test
results['pred'] = predictions
results['diff'] = results.apply(lambda x: abs(x['true'] - x['pred']), axis=1)
results

In [None]:
results.sort_values('diff', ascending=False).head(10)

In [None]:
sns.regplot(results['true'], results['pred'])

## 07 - Reporting

In [None]:
X_N.iloc[[2358]]

In [None]:
X_N.iloc[[2358]]

In [None]:
results.iloc[[2358]]

In [None]:
# Creating a new 
names = data_copy.loc[results.index.to_list(), ["id","name"]]
names

In [None]:
# Merge the the results with 
test = pd.concat([names, results], axis=1)
test

In [None]:
test[test.name=='R. Richards']

In [None]:
data_copy[data_copy.name=='R. Richards']

[Presentation](https://docs.google.com/presentation/d/1Td9rJDfuB_epbsVk1nrdaiT-oH_MAjnKdBoWULncyT0/edit#slide=id.gc6f9e470d_0_126)