In [2]:
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.axes import Axes as ax

import sklearn
from sklearn.model_selection import train_test_split

In [3]:
# Display data for easy reference 

fifa = pd.read_csv('/work/players_fifa23.csv')
fifa.head() 
fifa['test'] = fifa['Overall'] - fifa['Potential']
fifa

Unnamed: 0,ID,Name,FullName,Age,Height,Weight,PhotoUrl,Nationality,Overall,Potential,...,CMRating,RMRating,LWBRating,CDMRating,RWBRating,LBRating,CBRating,RBRating,GKRating,test
0,158023,L. Messi,Lionel Messi,35,169,67,https://cdn.sofifa.net/players/158/023/23_60.png,Argentina,91,91,...,88,91,67,66,67,62,53,62,22,0
1,165153,K. Benzema,Karim Benzema,34,185,81,https://cdn.sofifa.net/players/165/153/23_60.png,France,91,91,...,84,89,67,67,67,63,58,63,21,0
2,188545,R. Lewandowski,Robert Lewandowski,33,185,81,https://cdn.sofifa.net/players/188/545/23_60.png,Poland,91,91,...,83,86,67,69,67,64,63,64,22,0
3,192985,K. De Bruyne,Kevin De Bruyne,31,181,70,https://cdn.sofifa.net/players/192/985/23_60.png,Belgium,91,91,...,91,91,82,82,82,78,72,78,24,0
4,231747,K. Mbappé,Kylian Mbappé,23,182,73,https://cdn.sofifa.net/players/231/747/23_60.png,France,91,95,...,84,92,70,66,70,66,57,66,21,-4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18534,243725,D. Collins,Darren Collins,21,174,68,https://cdn.sofifa.net/players/243/725/23_60.png,Republic of Ireland,47,56,...,44,50,41,38,41,40,36,40,15,-9
18535,261933,Yang Dejiang,Dejiang Yang,17,175,60,https://cdn.sofifa.net/players/261/933/23_60.png,China PR,47,57,...,45,45,47,48,47,49,49,49,15,-10
18536,267823,L. Mullan,Liam Mullan,18,170,65,https://cdn.sofifa.net/players/267/823/23_60.png,Northern Ireland,47,67,...,49,52,46,44,46,46,42,46,17,-20
18537,267824,D. McCallion,Daithí McCallion,17,178,65,https://cdn.sofifa.net/players/267/824/23_60.png,Republic of Ireland,47,61,...,33,33,44,42,44,47,49,47,15,-14


In [41]:
# Replace missing values (NaN) with column mean; drop rows where value/wage/releaseClause info are unavailable  

fifa['Age'].fillna(fifa['Age'].mean(), inplace = True)
fifa['Height'].fillna(fifa['Height'].mean(), inplace = True)
fifa['Weight'].fillna(fifa['Weight'].mean(), inplace = True)
fifa['ValueEUR'].fillna(fifa['ValueEUR'].mean(), inplace = True)
fifa['WageEUR'].fillna(fifa['WageEUR'].mean(), inplace = True)
fifa['ReleaseClause'].fillna(fifa['ReleaseClause'].mean(), inplace = True)

fifa = fifa.drop(fifa[fifa['ValueEUR']==0].index,axis=0)
fifa = fifa.drop(fifa[fifa['ReleaseClause']==0].index,axis=0)
fifa = fifa.drop(fifa[fifa['WageEUR']==0].index,axis=0)

# Clean and organize 'age' column into <30 and > 30

def under_30(df):
    return (df['Age']<30)
temp = pd.get_dummies(under_30(fifa))
fifa['under_30'],fifa['above_30'] = temp.iloc[:,1],temp.iloc[:,0]

fifa['Age under 30'] = fifa['under_30']*fifa['Age']
fifa['Age above 30'] = fifa['above_30']*fifa['Age']

# Clean dataset, create new column 'ContractLen' to replace two redundant columns

def contract_len(df):
    return df['ContractUntil']-df['ClubJoined']
fifa['ContractLen'] = contract_len(fifa)

# Replace player attributes with log values as well as using other forms of transformation

fifa['log ShootingTotal'] = fifa['ShootingTotal']
fifa['log PassingTotal'] = fifa['PassingTotal']
fifa['log DribblingTotal'] = fifa['DribblingTotal']
fifa['log DefendingTotal'] = fifa['DefendingTotal']

fifa['log HeadingAccuracy'] =fifa['HeadingAccuracy']
fifa['log Acceleration'] = fifa['Acceleration']
fifa['log Strength'] = fifa['Strength']
fifa['log Positioning'] = fifa['Positioning']

fifa['log Overall'] = fifa['Overall']
fifa['log BaseStats'] = fifa['BaseStats']
fifa['log Reactions'] = fifa['Reactions']
fifa['log Potential'] = fifa['Potential']
fifa['augmented IntReputation'] = fifa['IntReputation']

fifa['log ReleaseClause'] = fifa['ReleaseClause']
fifa['log ValueEUR'] = fifa['ValueEUR']

In [42]:
# Finally, clean the dataset, select necessary columns, and get ready to put them into the model

df = pd.get_dummies(fifa[['WageEUR', 'log ValueEUR', 'log ReleaseClause','augmented IntReputation', 
'log Overall', 'log BaseStats', 'log Potential', 'log Reactions',
'log HeadingAccuracy', 'log Acceleration','log Strength', 'log Positioning',
'log ShootingTotal','log PassingTotal', 'log DribblingTotal', 'log DefendingTotal',
]])

df.head()

Unnamed: 0,WageEUR,log ValueEUR,log ReleaseClause,augmented IntReputation,log Overall,log BaseStats,log Potential,log Reactions,log HeadingAccuracy,log Acceleration,log Strength,log Positioning,log ShootingTotal,log PassingTotal,log DribblingTotal,log DefendingTotal
0,195000,54000000,99900000,5,91,452,91,92,70,87,68,93,89,90,94,34
1,450000,64000000,131199999,4,91,455,91,92,90,79,82,92,88,83,87,39
2,420000,84000000,172200000,5,91,458,91,93,91,76,87,94,91,79,86,44
3,350000,107500000,198900000,4,91,483,91,91,55,76,74,88,88,93,87,64
4,230000,190500000,366700000,4,91,470,95,93,72,97,76,92,89,80,92,36


In [43]:
columns = df.columns
corr_list = []
column_needed = []
column_needed_2 = []
#y = fifa['WageEUR']
for col in columns:
    for col2 in columns:
        #if type(df[col][0]) == np.int64:
        corr = np.corrcoef(df[col],df[col2])
        corr_list.append(np.round(corr[0][1], 3))
        column_needed.append(col)
        column_needed_2.append(col2)
data = {"Column": column_needed, "Column2": column_needed_2, "Correlation Score": corr_list}
correlation_score = pd.DataFrame(data)
correlation_score = correlation_score[correlation_score['Correlation Score'] != 1.0]
correlation_score = correlation_score[correlation_score['Correlation Score'] > 0.7]
correlation_score.sort_values(by=['Correlation Score'], ascending=False)

Unnamed: 0,Column,Column2,Correlation Score
18,log ValueEUR,log ReleaseClause,0.995
33,log ReleaseClause,log ValueEUR,0.995
71,log Overall,log Reactions,0.874
116,log Reactions,log Overall,0.874
93,log BaseStats,log PassingTotal,0.853
213,log PassingTotal,log BaseStats,0.853
237,log DribblingTotal,log PassingTotal,0.846
222,log PassingTotal,log DribblingTotal,0.846
69,log Overall,log BaseStats,0.845
84,log BaseStats,log Overall,0.845


In [44]:
# Recursive correlation pruning

df = df.drop(columns=['log ValueEUR', 'log Potential', 'log Reactions', 'log BaseStats',
'log Positioning', 'log PassingTotal', 'log DribblingTotal'])


In [45]:
df.head()

Unnamed: 0,WageEUR,log ReleaseClause,augmented IntReputation,log Overall,log HeadingAccuracy,log Acceleration,log Strength,log ShootingTotal,log DefendingTotal
0,195000,99900000,5,91,70,87,68,89,34
1,450000,131199999,4,91,90,79,82,88,39
2,420000,172200000,5,91,91,76,87,91,44
3,350000,198900000,4,91,55,76,74,88,64
4,230000,366700000,4,91,72,97,76,89,36


In [46]:
columns = df.columns
corr_list = []
column_needed = []
column_needed_2 = []
#y = fifa['WageEUR']
for col in columns:
    for col2 in columns:
        #if type(df[col][0]) == np.int64:
        corr = np.corrcoef(df[col],df[col2])
        corr_list.append(np.round(corr[0][1], 3))
        column_needed.append(col)
        column_needed_2.append(col2)
data = {"Column": column_needed, "Column2": column_needed_2, "Correlation Score": corr_list}
correlation_score = pd.DataFrame(data)
correlation_score = correlation_score[correlation_score['Correlation Score'] != 1.0]
correlation_score = correlation_score[correlation_score['Correlation Score'] > 0.7]
correlation_score.sort_values(by=['Correlation Score'], ascending=False)

Unnamed: 0,Column,Column2,Correlation Score
1,WageEUR,log ReleaseClause,0.826
9,log ReleaseClause,WageEUR,0.826


In [76]:
# Fix multicollinearity 
#df['fixing'] =  df['log ReleaseClause'] * df['log Overall']
#df = df.drop(columns=['log ReleaseClause', 'log Overall'])

from sklearn.preprocessing import StandardScaler

# Train test split
training, testing = train_test_split(df, test_size=0.2, random_state=41)

from sklearn.decomposition import PCA
# Make an instance of the Model
pca = PCA()

# Separating out the features
df_train = training.drop(columns=['WageEUR'])
# Separating out the target
y_train = training.loc[:,['WageEUR']]
# Standardizing the features
x_train = StandardScaler().fit_transform(df_train)

#Fit PCA on training set
pca.fit(x_train)
x_train = pca.transform(x_train)

df_test = testing.drop(columns=['WageEUR'])
# Separating out the target
y_test = testing.loc[:,['WageEUR']]
# Standardizing the features
x_test = StandardScaler().fit_transform(df_test)

x_test = pca.transform(x_test)

# scaled_features_df = pd.DataFrame(x_train_pca, index=df_train.index, columns=df_train.columns)
# scaled_features_df

In [77]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
model = lm.fit(x_train, y_train)
#model.predict(x_test)
model.score(x_train, y_train), model.score(x_test, y_test)

(0.7430052363536447, 0.7419042296081753)

In [78]:
from sklearn.tree import DecisionTreeRegressor 
from sklearn.model_selection import train_test_split
rf = DecisionTreeRegressor(random_state = 42, criterion='friedman_mse', max_depth=6)
rf.fit(x_train, y_train)
rf.score(x_train, y_train), rf.score(x_test, y_test)

(0.8294752162101734, 0.6267856650330088)

In [79]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
rf = RandomForestRegressor(n_estimators = 200, random_state = 42, max_depth=6)
rf.fit(x_train, y_train)
rf.score(x_train, y_train), rf.score(x_test, y_test)

  rf.fit(x_train, y_train)


(0.8452721108913498, 0.7625394084917598)

In [80]:
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42, max_depth=8, max_features=1.0)
rf.fit(x_train, y_train)#
rf.score(x_train, y_train), rf.score(x_test, y_test)

  rf.fit(x_train, y_train)#


(0.8881673190237986, 0.7688026867776019)

In [270]:

from sklearn.svm import LinearSVR
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

model_1 = LinearRegression()
model_3 = RandomForestRegressor(n_estimators = 600, random_state = 42, max_depth=8)
model_4 = MLPRegressor(activation='relu', hidden_layer_sizes=(10, 10, 10, 10, 10, 10), max_iter=200, random_state=42)
model_5 = DecisionTreeRegressor(random_state = 42, criterion='friedman_mse', max_depth=6)
model_6 = KNeighborsRegressor(n_neighbors=6)
model_9 = GradientBoostingRegressor(learning_rate=0.1, random_state=42)

estimators = [('lrr', RidgeCV()), ('lasso', LassoCV(random_state=42)), 
                ('lr', model_1), ('rf', model_3), ('mlp', model_4), ('dt', model_5),
                ('gb', model_9)]
reg = StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor(n_estimators=10))
reg.fit(x_train, y_train)
reg.score(x_test, y_test)

  y = column_or_1d(y, warn=True)


In [139]:
mlp = MLPRegressor(activation='relu', hidden_layer_sizes=(10, 10, 10, 10, 10, 10), max_iter=200, random_state=42)
mlp.fit(x_train,y_train)
mlp.score(x_train,y_train), mlp.score(x_test,y_test)

  y = column_or_1d(y, warn=True)


(0.7258378447890301, 0.7314600896970136)

In [None]:
model_4 = MLPRegressor(activation='relu', hidden_layer_sizes=10, max_iter=50, random_state=42)
model_4.fit(x_train,y_train)
model_4.score(x_train,y_train), model_4.score(x_test,y_test)

  y = column_or_1d(y, warn=True)


(0.012476578877074762, 0.012982372060126535)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cc80a5ab-5a4e-4539-9976-15d1bdddd134' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>