# 1. Downloading packages

In [79]:
# ! pip install pandas
# ! pip install matplotlib
! pip install plotly




In [80]:
# help('modules')

# 2. Importing packages

In [81]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import os
import plotly.express as px

from scipy.special import inv_boxcox

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, Normalizer, LabelEncoder


import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_columns = None


# 3. Loading data

Loading data and seeing column headers for the 1st time.

In [82]:
df = pd.read_csv("/Users/ignaciolorenzoqueralt/Documents/Ironhack/projects/moneyball/Project_FIFA_MoneyBall/Data/fifa21_male2.csv")

# 4. Cleaning Data

## 4.1. Standardize headers:

In [83]:
def standardize_headers(df):
    df.columns = df.columns.str.lower().str.replace(' ', '_')
standardize_headers(df)

## 4.2. Dropping irrelevant columns

Based on myfootball-related knowledge, lets drop columns that are useless to predict a player's market value
- name
- player_photo
- flag_photo

In [84]:
df=df.drop(columns=['id', 'name','player_photo', 'club_logo', 'flag_photo'])

As i haven't been able to find what is 'bov' and it contains the same values as 'ova', we drop the column for the sake of simplifying the dataset:

In [85]:
df=df.drop(columns=['bov'])

## 4.3. Cleaning weight and height data and transforming to metric units

- Height: from feet to cms

In [86]:
df['height'] = df['height'].str.replace('"', '')
df['height'] = df['height'].apply(lambda x: int(x.split("'")[0])*30.48 + int(x.split("'")[1])*2.54)

- Weight: from lbs to kg

In [87]:
df['weight'] = df['weight'].str.replace('lbs', '')
df['weight'] = df['weight'].apply(lambda x: int(x))*0.4535

## 4.4 Dealing with 'position':

We don't think that knowing each player's potential position will help us infer a player's value. To do so, we only need to know each player's best position.

'bp' stands for best position, which is the information we find relevant as a player's value will be more influenced by its best position rather than by his potential positions.

Therefore, we drop 'position' data and we will use 'bp' so as to know each player's position.

Moreover, position has more NaN values than bp

In [88]:
print("The count of 'bp' null values is ", df['bp'].isna().sum())
print("The count of 'position' null values is ", df['position'].isna().sum())


The count of 'bp' null values is  0
The count of 'position' null values is  413


In [89]:
df=df.drop(columns=['position'])

## 4.5. Dealing with columns which show a player's score in all positions

As we an see below, the ova has the same score as the players score in his best position (bp). E.G: 'id' 488 is GK and has OVA of 87, the same socre as in the column gk. 

In [90]:
df[df.bp=='GK'].head(3)

Unnamed: 0,age,ova,nationality,club,bp,pot,team_&_contract,height,weight,foot,growth,joined,loan_date_end,value,wage,release_clause,contract,attacking,crossing,finishing,heading_accuracy,short_passing,volleys,skill,dribbling,curve,fk_accuracy,long_passing,ball_control,movement,acceleration,sprint_speed,agility,reactions,balance,power,shot_power,jumping,stamina,strength,long_shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing_tackle,sliding_tackle,goalkeeping,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,total_stats,base_stats,w/f,sm,a/w,d/w,ir,pac,sho,pas,dri,def,phy,hits,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,gender
14,38,87,Germany,FC Bayern München,GK,90,FC Bayern München 2008,187.96,89.793,Right,3,,,€0,€0,€0,2008,84,21,21,21,21,,113,21,,7,62,23,204,57,62,,85,,179,22,,51,85,21,228,94,91.0,22.0,,21,,42,21,21,,346,83,88,62,92,83,1196,468,1 ★,1★,,,4 ★,83,88,62,83,60,92,4,29+0,29+0,29+0,29+0,29+0,29+0,29+0,29+0,33+0,33+0,33+0,29+0,36+0,36+0,36+0,29+0,29+0,31+0,31+0,31+0,29+0,31+0,31+0,31+0,31+0,31+0,87+0,Male
18,42,73,Australia,Leicester City,GK,73,Leicester City 2015 ~ 2016,193.04,94.7815,Right,0,"Jan 6, 2015",,€500K,€15K,€0,2015 ~ 2016,79,13,11,14,24,17.0,78,13,14.0,11,23,17,204,23,26,36.0,76,43.0,191,29,55.0,19,70,18,146,40,22.0,19.0,31.0,34,,56,17,19,20.0,357,70,73,67,78,69,1111,382,3 ★,1★,Medium,Medium,2 ★,70,73,67,69,25,78,3,25+0,25+0,25+0,23+0,24+0,24+0,24+0,23+0,25+0,25+0,25+0,23+0,26+0,26+0,26+0,23+0,24+0,28+0,28+0,28+0,24+0,24+0,30+0,30+0,30+0,24+0,73+0,Male
19,39,79,Germany,VfB Stuttgart,GK,87,VfB Stuttgart 2010,190.5,87.072,Right,8,,,€0,€0,€0,2010,90,21,21,21,27,,128,21,,10,75,22,171,49,54,,68,,179,29,,55,74,21,291,78,81.0,58.0,,74,,42,21,21,,323,77,82,75,88,76,1224,450,3 ★,1★,,,4 ★,77,82,75,76,52,88,3,31+0,31+0,31+0,29+0,31+0,31+0,31+0,29+0,35+0,35+0,35+0,29+0,38+0,38+0,38+0,29+0,29+0,32+0,32+0,32+0,29+0,31+0,30+0,30+0,30+0,31+0,81+0,Male


Therefore, we can drop columns which show each player's score across all positions as long as we keep their 'ova' and 'bp'.

In [91]:
df=df.drop(columns=['w/f', 'sm', 'a/w', 'd/w', 'ir', 'pac', 'sho', 'pas',
 'dri', 'def', 'phy', 'hits', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam',
 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb',
 'lcb', 'cb', 'rcb', 'rb', 'gk'])

## 4.6. Getting each player's contract information:

We drop team_&_contract as we have the same data in two other separated columns:

In [92]:
df=df.drop(columns=['team_&_contract'])

We drop 'joined' as we do not think this info is crucial to the player's value:


In [93]:
df=df.drop(columns=['joined'])

We need to clean all the columns that contain info regarding sums of money:
- release_clause
- value
- wage

In [94]:
def clean_monetary(x):
    if (x.startswith('€') and x.endswith('K') and (x.find('.')>=0)):
        return x.replace('€', '').replace('K','').replace('.','')+'00'
    elif (x.startswith('€') and x.endswith('K')):
        return x.replace('€', '').replace('K','').replace('.','')+'000'
    if (x.startswith('€') and x.endswith('M') and (x.find('.')>=0)):
        return x.replace('€', '').replace('M','').replace('.','')+'00000'
    elif (x.startswith('€') and x.endswith('M')):
        return x.replace('€', '').replace('M','').replace('.','')+'000000'
    else:
        return x.replace('€','')

df['value'] = pd.to_numeric(df['value'].apply(clean_monetary))
df['wage'] = pd.to_numeric(df['wage'].apply(clean_monetary))
df['release_clause'] = pd.to_numeric(df['release_clause'].apply(clean_monetary))

We also need to know the player's contract duration, stored in 'contract'

In [95]:
def contract_to_num(x):
    years = [int(i) for i in x.split() if i.isdigit()]
    if (('loan') not in str(x).lower()):
        if len(years) ==2:
            x = years[1]-years[0]
            return int(x)
        elif len(years) == 1:
            x = 1
            return int(x)
        else:
            return ''
df['contract'] = df['contract'].apply(contract_to_num)
df['contract'] = df['contract'].fillna(0)

## 4.7. Dealing with null values

Let's see if any column has null values:

In [96]:
df.columns[df.isna().any()].tolist()

['club',
 'loan_date_end',
 'volleys',
 'curve',
 'agility',
 'balance',
 'jumping',
 'interceptions',
 'positioning',
 'vision',
 'composure',
 'sliding_tackle']

Lets see how many null values each column has

In [97]:
nulls = df.isnull().sum().to_frame()
for index, row in nulls.iterrows():
    if row[0]!=0:
        print(index, row[0])

club 23
loan_date_end 16215
volleys 58
curve 58
agility 58
balance 58
jumping 58
interceptions 7
positioning 7
vision 58
composure 423
sliding_tackle 58


We can select the rows in which we don't have null values for all columns except for loan_date_end (we will deal with them differently in the next steps). This way we are skipping rows with null values.

In [98]:
df = df[df['volleys'].notna()]
df = df[df['club'].notna()]
df = df[df['composure'].notna()]

Dealing with loan_date_end Null values:

The relevant information is wether the player is loaned or not. Accordingly, we create a new column to portray that information:'loaned?'

In the new column 'loaned?', we store null values from 'loan_date_end' as "not loaned", since null values refer to players that are not loaned.
We also stored Non null values as "loaned"

In [99]:
df['loan_date_end'].fillna('not loaned', inplace=True)

loan_list = []
def create_loaned_col(x):
    if x == 'not loaned':
        loan_list.append(x)
    else:
        loan_list.append('loaned')
df['loan_date_end'].apply(create_loaned_col)
df['loaned?'] = loan_list

df=df.drop(columns=['loan_date_end'])

Finally, let's check if we have any null values left:


In [100]:
df.columns[df.isna().any()].tolist()

[]

## 4.8. Analysing numerical values

In [101]:
df_num = df.select_dtypes(include=np.number)
df_num = df_num.round(decimals=1)

In [102]:
df_num.describe(include='all').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,16687.0,25.15269,4.854647,16.0,21.0,25.0,29.0,53.0
ova,16687.0,66.91394,6.824825,45.0,62.0,67.0,72.0,93.0
pot,16687.0,72.49901,5.717089,47.0,69.0,72.0,76.0,95.0
height,16687.0,181.2382,6.881953,154.9,175.3,180.3,185.4,205.7
weight,16687.0,75.08882,7.16412,49.9,69.8,74.8,79.8,110.2
growth,16687.0,5.585066,5.802655,0.0,0.0,4.0,10.0,26.0
value,16687.0,2610607.0,5428192.0,0.0,375000.0,800000.0,2500000.0,105500000.0
wage,16687.0,10399.78,21084.03,0.0,1000.0,3000.0,10000.0,560000.0
release_clause,16687.0,4694515.0,10446330.0,0.0,560000.0,1300000.0,4100000.0,203100000.0
attacking,16687.0,258.1592,72.16343,42.0,231.0,271.0,305.0,437.0


## 4.9 Analysing categoricals

I drop 'nationality' and 'club' because they contain really fragmented values. There are no 5-10 values covering around 80% of the data.

In [103]:
df=df.drop(columns=['nationality', 'club'])

Lets group bp by the different categories:
- Defense
- Midfielder
- Forward
- Goalkeeper

In [104]:
defense = ['CB', 'LB', 'RB', 'RWB', 'LWB']
midfielder = ['CAM', 'RM', 'CDM', 'CM', 'LM', 'RW', 'LW']
forward = ['ST', 'CF']
goalkeeper = ['GK']

def classify_bp(x):
    if x in defense:
        return 'defense'
    elif x in midfielder:
        return 'midfielder'
    elif x in forward:
        return 'forward'
    elif x in goalkeeper:
        return 'goalkeeper'
    else:
        return "other"
df['bp'] = df['bp'].apply(classify_bp)


In [105]:
df['bp'].value_counts()

midfielder    7050
defense       5441
forward       2657
goalkeeper    1539
Name: bp, dtype: int64

In [106]:
df_cat = df.select_dtypes('object')
df_cat

Unnamed: 0,bp,foot,contract,gender,loaned?
1,midfielder,Right,5,Male,not loaned
4,midfielder,Right,3,Male,not loaned
6,midfielder,Left,1,Male,not loaned
12,forward,Right,11,Male,not loaned
26,goalkeeper,Right,2,Male,not loaned
...,...,...,...,...,...
17120,defense,Right,0,Male,loaned
17121,midfielder,Right,1,Male,not loaned
17122,defense,Right,2,Male,not loaned
17123,defense,Right,3,Male,not loaned


# 5. Visualizing our data

## 5.1. Visualizing Numericals

In [107]:
#after visualizing, we skip it so as to make code more fast
"""
for column in df.select_dtypes(np.number).columns:
    sns.distplot(df[column])
    plt.show()
"""

'\nfor column in df.select_dtypes(np.number).columns:\n    sns.distplot(df[column])\n    plt.show()\n'

# 6. Preparing our df for the Linear Regression Model

In the df we have a set of skills that help us rate each player. Nevertheless, the skills that suit GoalKeepers are not the same as the ones that work for players.

Therefore, we will create two df: one for GoalKeepers and another one for players.

- In df_players we will drop these columns:
    ['standing_tackle' 'sliding_tackle' 'goalkeeping' 'gk_diving' 'gk_handling' 'gk_kicking' 'gk_positioning' 'gk_reflexes']

- In df_gk we will drop these columns:
    ['attacking','crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
 'skill', 'dribbling', 'curve', 'fk_accuracy', 'long_passing', 'ball_control',
 'movement', 'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance',
 'power', 'shot_power', 'jumping', 'strength', 'long_shots', 'interceptions', 'positioning', 'vision',
 'penalties', 'composure', 'defending', 'marking', 'standing_tackle',
 'sliding_tackle']

In [108]:
df_players=df.drop(columns=['standing_tackle', 'sliding_tackle', 'goalkeeping', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes'])
df_players = df_players[df_players.bp!='goalkeeper']

In [109]:
df_gk=df.drop(columns=['attacking','crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
 'skill', 'dribbling', 'curve', 'fk_accuracy', 'long_passing', 'ball_control',
 'movement', 'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance',
 'power', 'shot_power', 'jumping', 'strength', 'long_shots', 'interceptions', 'positioning',
 'penalties', 'composure', 'defending', 'marking', 'standing_tackle',
 'sliding_tackle'])
df_gk = df_gk[df_gk.bp=='goalkeeper']

# 7. Starting the model for the df_players

## 7.1. X-y split

In [110]:
y_players = df_players['value']
X_players = df_players.drop(['value'], axis=1)

## 7.2. Numerical & categorical

In [111]:
X_players_cat = X_players.select_dtypes('object')
X_players_num = X_players.select_dtypes('number')

## 7.3. Encoding categorical

In [112]:
X_players_cat_enc = pd.get_dummies(X_players_cat, drop_first=True)

## 7.3. Normalizing numerical

In [113]:
def boxcox_transform(df):
    numeric_cols = df.select_dtypes(np.number).columns
    _ci = {column: None for column in numeric_cols}
    for column in numeric_cols:
        df[column] = np.where(df[column]<=0, np.NAN, df[column]) 
        df[column] = df[column].fillna(df[column].mean())
        transformed_data, ci = stats.boxcox(df[column])
        df[column] = transformed_data
        _ci[column] = [ci] 
    return df, _ci

X_players_num_normalized, _ci = boxcox_transform(X_players_num)
X_players_num_normalized.shape, X_players_cat_enc.shape

((15148, 43), (15148, 27))

## 7.4. Concatenating data

In [114]:
X = np.concatenate([X_players_num_normalized, X_players_cat_enc], axis=1)
X

array([[ 2.47764805, 53.29020232,  9.09361361, ...,  0.        ,
         0.        ,  1.        ],
       [ 2.46538926, 60.27583599,  9.61949685, ...,  0.        ,
         0.        ,  1.        ],
       [ 2.48950799, 57.49068435,  9.41475537, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 2.12891282, 42.65304989,  8.86911818, ...,  0.        ,
         0.        ,  1.        ],
       [ 2.18347849, 46.93316899,  9.20264465, ...,  0.        ,
         0.        ,  1.        ],
       [ 2.12891282, 44.7976191 ,  9.25639933, ...,  0.        ,
         0.        ,  1.        ]])

## 7.5. Splitting data

In [115]:
y = y_players
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)


In [116]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [117]:
predictions = model.predict(X_test)
r2 = r2_score(y_test, predictions)
print(r2)

0.6221536751950691


# 8. Starting the model for the df_gk

## 8.1. Numerical & categorical

In [118]:
y_gk = df_gk['value']
X_gk = df_gk.drop(['value'], axis=1)

In [119]:
X_gk_cat = X_gk.select_dtypes('object')
X_gk_num = X_gk.select_dtypes('number')

## 8.2. Encoding categorical

In [120]:
X_gk_cat_enc = pd.get_dummies(X_gk_cat, drop_first=True) # i'm lazy

## 8.3. Normalizing numerical

In [121]:
def boxcox_transform(df):
    numeric_cols = df.select_dtypes(np.number).columns
    _ci = {column: None for column in numeric_cols}
    for column in numeric_cols:
        df[column] = np.where(df[column]<=0, np.NAN, df[column]) 
        df[column] = df[column].fillna(df[column].mean())
        transformed_data, ci = stats.boxcox(df[column])
        df[column] = transformed_data
        _ci[column] = [ci] 
    return df, _ci

X_gk_num_normalized, _ci = boxcox_transform(X_gk_num)
X_gk_num_normalized.shape


(1539, 20)

## 8.4. Concatenating data

In [122]:
X = np.concatenate([X_gk_num_normalized, X_gk_cat_enc], axis=1)
X

array([[ 3.92819944, 50.66903846, 60.71796657, ...,  0.        ,
         0.        ,  1.        ],
       [ 3.78854301, 45.17099574, 53.77683497, ...,  0.        ,
         0.        ,  1.        ],
       [ 3.75841586, 46.83084616, 55.86721347, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 3.06184527, 37.29048015, 51.67911898, ...,  0.        ,
         0.        ,  1.        ],
       [ 3.268858  , 44.05914812, 57.9505723 , ...,  0.        ,
         0.        ,  1.        ],
       [ 3.06184527, 38.43104823, 53.77683497, ...,  0.        ,
         0.        ,  1.        ]])

## 8.5. Splitting data

In [123]:
y = y_gk
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)


## 8.6. Results

In [124]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [125]:
predictions = model.predict(X_test)
r2 = r2_score(y_test, predictions)
print(r2)

0.6410773107567982


# 9. Doing it for the whole df because of poor performance

As it hasn't proven to be very effective splitting the df into players and gk, we will do the model for de unsplitted

## 9.1. Removing outliers and storing in another df

In [126]:
def remove_outliers(df, threshold=1.5, in_columns=df.select_dtypes(np.number).columns, skip_columns=[]):
    for column in in_columns:
        if column not in skip_columns:
            upper = np.percentile(df[column],75)
            lower = np.percentile(df[column],25)
            iqr = upper - lower
            upper_limit = upper + (threshold * iqr)
            lower_limit = lower - (threshold * iqr)
            df = df[(df[column]>lower_limit) & (df[column]<upper_limit)]
    return df

In [127]:
df1 = df.copy()

df1 = remove_outliers(df1, threshold=1.5)

## 9.2. Normalizing numericals

In [128]:
def boxcox_transform(df):
    numeric_cols = df.select_dtypes(np.number).columns
    _ci = {column: None for column in numeric_cols}
    for column in numeric_cols:
        df[column] = np.where(df[column]<=0, np.NAN, df[column]) 
        df[column] = df[column].fillna(df[column].mean())
        transformed_data, ci = stats.boxcox(df[column])
        df[column] = transformed_data
        _ci[column] = [ci] 
    return df, _ci

df1_normalized, _ci = boxcox_transform(df1)
df1_normalized.shape

(7234, 57)

## 9.3. X-y Splitting and separating cat from num

In [129]:
y = df1['value']
X = df1.drop(['value'], axis=1)

In [130]:
X_cat = X.select_dtypes('object')
X_num = X.select_dtypes('number')

## 9.4. Encoding categoricals

In [131]:
X_cat_enc = pd.get_dummies(X_cat, drop_first=True) # i'm lazy

## 9.5. Concatenating 

In [132]:
X = np.concatenate([X_num, X_cat_enc], axis=1)

## 9.6. Splitting Test vs train data

In [133]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

## 9.7. Starting the model

In [134]:
model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)


## 9.8. Results

In [135]:
r2 = r2_score(y_test, predictions)
print(r2)
print(mean_absolute_error(y_test, predictions))
print(mean_squared_error(y_test, predictions, squared=False))

0.930247298211031
1.6300909594797977
2.476222482875241


In [136]:
model.coef_

array([-1.04198732e+02,  8.70706864e-06,  5.53675287e+00,  1.74123561e-02,
        1.77022246e-01, -2.39764788e-01,  1.67686720e-01,  8.42923885e-02,
        8.55241713e-03, -8.38819381e-04, -3.62267630e-04, -1.09370560e-02,
       -4.35519123e-05, -4.01830312e-02,  5.19689702e-02, -1.69693842e-05,
       -5.50083441e-02, -8.79699730e-01, -2.96385253e-04, -4.27099087e-05,
        4.20646738e-02, -4.01502015e-02, -7.91798355e-02, -7.42447888e-02,
       -2.45326892e-02, -5.23337331e-02, -1.29254309e-02,  1.55889439e-03,
        5.07174126e-02,  4.43396324e-02,  1.06182104e-02,  3.37994853e-03,
        1.78972402e-01, -9.91253550e-03, -2.55941851e-02, -3.00564828e-04,
       -2.01788980e-03, -5.81176025e-02,  3.59289903e-04, -1.46205644e-02,
        1.08867008e-02,  9.06736392e-03, -2.46143403e-03, -5.24191456e-01,
        7.29304578e-01,  6.21086891e-01,  7.28414905e-01,  5.89187354e-01,
        6.77269161e-01,  2.09837589e-02, -7.36752834e-04,  4.41017876e-01,
        5.39920837e-01, -

In [137]:
predictions = inv_boxcox(predictions, _ci['value'])
mean_absolute_error(y_test, predictions), mean_squared_error(y_test, predictions, squared=False)

(740657.388478455, 871386.7526600357)

In [138]:
results = pd.DataFrame()
results['true'] = inv_boxcox(y_test, _ci['value'])
results['pred'] = predictions
results['resid'] = results.apply(lambda x: abs(x['true'] - x['pred']), axis=1)
results

Unnamed: 0,true,pred,resid
10875,650000.0,5.805612e+05,69438.828261
14797,190000.0,3.731369e+05,183136.912128
14034,275000.0,2.462180e+05,28782.037259
7818,2100000.0,1.961863e+06,138137.335567
9512,425000.0,3.894050e+05,35595.016750
...,...,...,...
7474,1800000.0,1.478032e+06,321968.219381
10001,425000.0,3.575152e+05,67484.837225
11696,650000.0,7.170571e+05,67057.113135
5188,1100000.0,1.332637e+06,232636.557954


In [139]:
results.describe()

Unnamed: 0,true,pred,resid
count,2171.0,2171.0,2171.0
mean,750574.4,740721.9,96193.77
std,527146.6,459180.4,186717.8
min,35000.0,61602.84,66.88103
25%,400000.0,382253.1,19518.56
50%,650000.0,635036.8,47753.24
75%,975000.0,1007169.0,101543.4
max,4800000.0,2582602.0,2567714.0


# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Questions

## 1. Does the release_clause of a player and its value have any correlation?

- There is a correlation between these two factors. Nevertheless, it is interesting to see that this correlation is not linear since when the release_clause is 0, the value of the player is not 0.

In [141]:
fig = px.scatter(df, x='release_clause', y="value", color="bp")
fig.show()

- Bonus: we can see that something simmilar happens with wage.

In [142]:
fig = px.scatter(df, x='release_clause', y="wage", color="bp")
fig.show()

## 2. How does stamina and height affect the sprint_speed of a player?

- We can see that there is a positive correlation between stamina and sprint_speed. Additionally, we can see thanks to our colour gradient that slow players have lower stamina and are higher. The opposite happens for fast players.

In [154]:
fig = px.scatter(df, x='sprint_speed', y="stamina", color="height")
fig.show()

## # Failed question:

- In this case I wanted to show the average value for each age and the distribution of bp across each age. 

I haven't been able to find a way to show the average value though.

In [159]:
fig = px.bar(df, x="age", y=df["value"], color="bp", title="Long-Form Input")
fig.show()