In [173]:
import pandas as pd
import numpy as np
import seaborn as sns

In [174]:
df_fifa = pd.read_csv('fifa21_training.csv')

## Cleaning

In [175]:
def clean_data(df, column):
    df = df.drop([column], axis=1)
    cols = []
    for c in df.columns:
        cols.append(c.lower())
    df.columns = cols
    perc = 25.0
    min_count =  int(((100-perc)/100)*df.shape[0] + 1)
    df = df.dropna( axis=1, 
                thresh=min_count)
    return df

df_fifa = clean_data(df_fifa,'Unnamed: 0')

In [176]:
def nan_values(df):
    numerical = df.select_dtypes(include= np.number)
    for c in numerical:
        if numerical[c].isna().sum() > 0:
            numerical[c] = numerical[c].fillna(np.mean(numerical[c]))
    categorical = df.select_dtypes(include= np.object)
    for col in categorical.columns:
        if categorical[col].isna().sum() > 0:
            categorical[col].fillna(categorical[col].mode()[0], inplace=True)
            df = pd.concat([numerical, categorical], axis=1)
    return df

df_fifa = nan_values(df_fifa)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numerical[c] = numerical[c].fillna(np.mean(numerical[c]))
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical = df.select_dtypes(include= np.object)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


## Preprocessing & creating the training model

When building the training model we tried out several options to generate the highest possible R2 score. We first started with only the numerical data, which gave us already a R2 score of 0.908. We then tried to add in some categorical columns that we found interesting based on FIFA research on the internet. We also looked at categorical values that did not have to many different values counts, as this would make the model very lengthy, complicated and therefore noisy. We tried for example 'foot', 'a/w', 'd/w' and the ratings in 'ir'. However, adding these categorical columns made the R2 less high or it remained the same. To reduce the noise in the dataset we decided not to add any categorical values. Next, when we wanted to predict the Overall Rating for the new data set we got. Unfortunately, many of the numerical columns in this dataset did not match the numerical columns in our training dataset. We therefore decided to delete a few columns in both datasets to create identical columns on which we could base our predicitions. We found a FIFA overall rating calculator on the internet and all the variables there are also in both the datasets. In previous analysis we also found that these columns correlated moderately to highly with our target 'Overall rating'. With this knowledge we dropped all the columns in both data sets that are not in the calculator (except for 'age' and 'id') to create a working training model.

In [223]:
df_fifa = df_fifa.drop(['growth','defending','attacking','skill','movement','power','mentality','goalkeeping','total stats','base stats','pac','sho','pas','dri','def','phy'],axis=1)
df_fifa_num = df_fifa.select_dtypes(include= np.number)


In [179]:
def MinMaxtransform_encoder(df, column):
    from sklearn.preprocessing import MinMaxScaler
    y = df[column]
    X = df.drop([column], axis=1)
    X_num = X.select_dtypes(include= np.number)
    MinMaxTransformer = MinMaxScaler().fit(X_num)
    X_normalized = MinMaxTransformer.transform(X_num)
    X = pd.DataFrame(X_normalized,columns=X_num.columns)
    return X, MinMaxTransformer

X, MinMaxTransformer = MinMaxtransform_encoder(df_fifa, 'ova')

In [184]:
def predictions(df, column):
    from sklearn import linear_model
    from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
    import math
    from sklearn.model_selection import train_test_split
    y = df[column]
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)
    lm = linear_model.LinearRegression()
    model = lm.fit(X_train, y_train)
    predictions = lm.predict(X_test)
    predictions_df = pd.concat([df[column],pd.Series(predictions)],axis=1).head()
    return predictions_df, lm, X_test, y_test

predictions_df, lm, X_test, y_test = predictions(df_fifa, 'ova')

In [212]:
import pickle
filename = 'training_model_final.sav'
pickle.dump(lm, open(filename, 'wb'))

In [202]:
def scores(model, test_x, test_y):
    from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
    from math import sqrt
    predictions = model.predict(test_x)

    r2 = r2_score(predictions, test_y)
    mse = mean_squared_error(predictions, test_y)
    mae = mean_absolute_error(predictions, test_y)
    rmse = sqrt(mse)

    return r2, mse, mae, rmse

r2, mse, mae, rmse = scores(lm, X_test, y_test)



As the score below show, we created an accurate model with a R2 socre of 0.85. With a RMSE of 2.44 our predictions will only we 2.44 rating points away from the actual overall rating.

In [208]:
print('R2 Score:', r2)
print('Mean Squared Error:', mse)
print('Mean aboslute Error:', mae)
print('Root Mean Squared Error:',rmse)

R2 Score: 0.8585125083353262
Mean Squared Error: 5.95761074466989
Mean aboslute Error: 1.903567656504039
Root Mean Squared Error: 2.4408217355370074


## Making predictions

Firstly, we used the above defined functions to clean the data. Secondly, we deleted the columns that did not match the columns from our training data set, to make sure we can predict the correct overall rating. Then by using our previously defined MinMaxTransformer and Linear Model we have been able to make predictions for the Overall Rating for the new data set. 

In [190]:
fifa_new = pd.read_csv('fifa_new_data.csv')

In [191]:
def clean_data(df, column):
    df = df.drop([column], axis=1)
    cols = []
    for c in df.columns:
        cols.append(c.lower())
    df.columns = cols
    perc = 25.0
    min_count =  int(((100-perc)/100)*df.shape[0] + 1)
    df = df.dropna( axis=1, 
                thresh=min_count)
    return df

fifa_new = clean_data(fifa_new,'Unnamed: 0')

In [192]:
def nan_values(df):
    numerical = df.select_dtypes(include= np.number)
    for c in numerical:
        if numerical[c].isna().sum() > 0:
            numerical[c] = numerical[c].fillna(np.mean(numerical[c]))
    categorical = df.select_dtypes(include= np.object)
    for col in categorical.columns:
        if categorical[col].isna().sum() > 0:
            categorical[col].fillna(categorical[col].mode()[0], inplace=True)
            df = pd.concat([numerical, categorical], axis=1)
    return df

fifa_new = nan_values(fifa_new)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numerical[c] = numerical[c].fillna(np.mean(numerical[c]))
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical = df.select_dtypes(include= np.object)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [209]:
# X = df_fifa.drop(['ova','value'], axis=1)
# X_num = X.select_dtypes(include= np.number)
# MinMaxTransformer = MinMaxScaler().fit(X_num)
predict_fifa_num = fifa_new.select_dtypes(include=np.number)
predict_fifa_num = predict_fifa_num.drop(['value_eur', 'height_cm', 'potential', 'wage_eur', 'international_reputation','weak_foot','skill_moves','release_clause_eur','team_jersey_number','contract_valid_until','pace','shooting','passing','dribbling','defending','physic','weight_kg'],axis=1)


In [194]:
from sklearn.preprocessing import MinMaxScaler
predict_num_normalized = MinMaxTransformer.transform(predict_fifa_num)
X = pd.DataFrame(predict_num_normalized, columns=predict_fifa_num.columns)
#X = pd.DataFrame(X_normalized,columns=X_num.columns)
X

Unnamed: 0,sofifa_id,age,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,...,mentality_penalties,mentality_composure,defending_marking,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
0,0.936694,0.129032,0.284091,0.228261,0.522727,0.348837,0.279070,0.351648,0.222222,0.247191,...,0.344828,0.357143,0.483516,0.583333,0.571429,0.089888,0.055556,0.054348,0.120879,0.101124
1,0.759977,0.483871,0.136364,0.086957,0.068182,0.174419,0.127907,0.054945,0.088889,0.067416,...,0.172414,0.416667,0.197802,0.059524,0.059524,0.662921,0.655556,0.663043,0.670330,0.662921
2,0.969688,0.225806,0.522727,0.554348,0.375000,0.593023,0.430233,0.615385,0.533333,0.460674,...,0.482759,0.488095,0.373626,0.321429,0.309524,0.101124,0.088889,0.076087,0.043956,0.089888
3,0.941859,0.129032,0.352273,0.304348,0.625000,0.441860,0.348837,0.230769,0.188889,0.224719,...,0.310345,0.369048,0.527473,0.654762,0.571429,0.123596,0.088889,0.076087,0.054945,0.157303
4,0.926975,0.193548,0.602273,0.554348,0.386364,0.709302,0.430233,0.659341,0.633333,0.584270,...,0.436782,0.607143,0.560440,0.571429,0.547619,0.067416,0.088889,0.119565,0.098901,0.134831
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.803949,0.354839,0.306818,0.228261,0.738636,0.441860,0.290698,0.274725,0.300000,0.303371,...,0.379310,0.607143,0.714286,0.821429,0.738095,0.157303,0.088889,0.130435,0.054945,0.157303
996,0.693544,0.451613,0.784091,0.500000,0.738636,0.767442,0.697674,0.758242,0.611111,0.707865,...,0.678161,0.714286,0.670330,0.809524,0.797619,0.112360,0.088889,0.086957,0.065934,0.168539
997,0.809858,0.290323,0.102273,0.173913,0.056818,0.325581,0.174419,0.120879,0.122222,0.146067,...,0.275862,0.214286,0.076923,0.107143,0.095238,0.808989,0.733333,0.663043,0.725275,0.831461
998,0.747409,0.354839,0.670455,0.260870,0.375000,0.651163,0.255814,0.637363,0.511111,0.438202,...,0.402299,0.476190,0.571429,0.678571,0.642857,0.089888,0.133333,0.054348,0.076923,0.067416


In [227]:
fifa_predict = lm.predict(X)


[51.88773293 67.58766956 58.70531672 60.10422673 66.12060927 68.32307766
 61.72688111 71.02244502 65.77327919 62.21622802 58.38597431 69.07264085
 55.52080595 68.60754541 71.29798765 67.73191763 72.08184353 58.1429433
 73.23104464 66.56010387 74.88564067 65.45664182 71.0133761  54.57151625
 77.41035332 55.90950984 65.41425544 60.28470678 65.73010284 65.71475788
 62.13898215 69.97891241 57.56168187 63.27788325 67.38734446 70.87472761
 71.47029386 65.52883745 65.36572346 63.87307297 56.34558638 58.31796934
 67.10359069 66.21114549 53.45069262 56.81816442 58.35172685 61.53152685
 67.84266372 64.91783081 70.55167053 71.4375317  68.10949004 56.746102
 67.25605401 59.50975398 71.40189294 74.67999507 66.99804691 75.39031305
 63.22993194 64.89214054 79.29847867 68.50618876 57.50030966 69.75863031
 73.93517553 63.76628747 66.09555529 67.65316506 76.44196836 66.34484554
 58.45495944 58.20895896 70.61345213 72.29514807 66.67586132 78.38905072
 70.35393535 56.49230761 70.01997304 73.20509735 57.94

AttributeError: 'NoneType' object has no attribute 'head'

In [211]:
predictions = pd.concat([fifa_new['short_name'],pd.Series(fifa_predict)],axis=1)
predictions.head(100)

Unnamed: 0,short_name,0
0,S. Graves,51.887733
1,D. Russo,67.587670
2,A. Carlone,58.705317
3,S. Belkahia,60.104227
4,Edu Cortina,66.120609
...,...,...
95,C. Fejér,63.159039
96,J. Demetriou,68.132883
97,W. Dutoit,69.774855
98,R. Iio,65.037489


## Checking the predictions with the actual values

As the results below show, the predictions are very close to the actual values. The R2 score only went down by 0.01 compared to the training model and the RMSE went up by 0.1. With this information we can conclude that the current model accurately predicts the Overall Rating for the FIFA players. 

In [226]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt
actual_score = [52, 62, 57, 59, 66, 65, 62, 73, 70, 62, 58, 72, 52, 68, 69, 67, 71, 61, 72, 70, 73, 64, 71, 56, 76, 56, 68, 63, 66, 67, 62, 70, 64, 62, 66, 70, 70, 66, 66, 70, 57, 55, 64, 64, 51, 58, 62, 60, 68, 67, 68, 71, 66, 56, 65, 60, 71, 72, 67, 75, 63, 67, 78, 66, 61, 69, 73, 64, 64, 67, 78, 64, 60, 58, 69, 68, 68, 82, 71, 54, 68, 70, 56, 69, 64, 62, 70, 69, 65, 79, 63, 61, 62, 63, 60, 63, 66, 71, 64, 68, 74, 71, 62, 65, 74, 57, 66, 71, 64, 64, 65, 65, 49, 75, 62, 67, 63, 75, 61, 69, 61, 72, 65, 60, 65, 61, 66, 67, 65, 70, 65, 64, 76, 76, 62, 67, 62, 70, 70, 60, 60, 68, 54, 71, 69, 72, 64, 66, 64, 67, 59, 59, 71, 56, 85, 79, 72, 65, 57, 73, 76, 67, 53, 66, 65, 67, 72, 75, 63, 70, 58, 66, 69, 88, 67, 62, 63, 72, 68, 69, 62, 64, 54, 75, 61, 58, 67, 77, 59, 49, 64, 68, 70, 72, 55, 62, 70, 77, 65, 81, 55, 61, 59, 70, 59, 64, 58, 64, 69, 63, 56, 58, 64, 67, 50, 67, 64, 51, 68, 56, 54, 64, 67, 60, 70, 78, 66, 72, 72, 66, 70, 76, 65, 62, 73, 68, 61, 72, 58, 56, 71, 60, 73, 65, 65, 74, 73, 67, 69, 65, 73, 69, 71, 74, 66, 74, 75, 79, 72, 69, 64, 66, 65, 72, 63, 65, 67, 69, 78, 51, 65, 75, 60, 67, 69, 64, 57, 63, 76, 64, 72, 59, 73, 65, 70, 80, 72, 59, 88, 61, 69, 61, 65, 69, 66, 70, 66, 60, 67, 68, 75, 62, 52, 69, 65, 81, 86, 72, 72, 66, 52, 56, 78, 62, 71, 78, 58, 60, 68, 64, 73, 53, 75, 68, 60, 77, 66, 73, 63, 71, 67, 65, 80, 77, 70, 66, 69, 65, 53, 52, 74, 65, 65, 67, 69, 70, 60, 59, 69, 68, 54, 82, 68, 75, 69, 71, 70, 79, 67, 58, 79, 64, 57, 68, 68, 66, 66, 73, 64, 81, 60, 69, 52, 59, 57, 68, 67, 55, 72, 76, 75, 64, 74, 65, 67, 59, 65, 66, 72, 73, 51, 67, 64, 62, 66, 68, 52, 56, 64, 70, 78, 71, 59, 69, 61, 62, 66, 64, 66, 67, 61, 72, 66, 64, 70, 70, 54, 74, 68, 64, 69, 65, 62, 76, 63, 66, 61, 72, 69, 76, 65, 76, 61, 52, 80, 67, 63, 60, 68, 66, 67, 59, 67, 72, 60, 51, 62, 81, 71, 69, 56, 67, 68, 69, 63, 65, 69, 62, 65, 71, 67, 66, 61, 73, 61, 51, 62, 61, 75, 65, 76, 68, 69, 65, 62, 64, 64, 73, 75, 71, 69, 58, 67, 60, 52, 65, 58, 77, 50, 80, 70, 68, 66, 69, 69, 60, 69, 61, 68, 80, 77, 67, 65, 74, 66, 65, 68, 78, 62, 80, 60, 87, 83, 68, 54, 67, 68, 60, 67, 61, 64, 59, 66, 73, 72, 58, 73, 72, 60, 53, 71, 65, 74, 73, 56, 75, 71, 64, 64, 66, 64, 70, 73, 78, 75, 53, 79, 73, 63, 67, 56, 73, 62, 54, 67, 63, 71, 69, 74, 74, 76, 68, 68, 64, 58, 62, 60, 63, 68, 68, 66, 75, 54, 70, 74, 62, 60, 67, 74, 73, 74, 55, 79, 67, 60, 68, 64, 50, 75, 63, 72, 57, 65, 66, 71, 59, 63, 57, 55, 68, 57, 67, 73, 52, 66, 68, 67, 56, 70, 69, 66, 63, 73, 65, 51, 61, 61, 78, 68, 65, 55, 64, 60, 62, 66, 67, 53, 67, 72, 64, 52, 65, 59, 70, 65, 79, 67, 75, 65, 61, 77, 63, 53, 61, 71, 69, 78, 48, 69, 63, 53, 67, 64, 76, 76, 60, 63, 66, 62, 67, 66, 67, 69, 68, 78, 62, 74, 72, 72, 65, 63, 59, 71, 68, 67, 70, 71, 65, 62, 58, 84, 68, 63, 62, 71, 68, 62, 78, 82, 67, 72, 79, 68, 69, 68, 60, 61, 76, 69, 72, 66, 68, 77, 62, 57, 66, 53, 62, 63, 63, 59, 74, 70, 72, 61, 66, 82, 69, 58, 70, 65, 69, 72, 67, 66, 76, 70, 71, 71, 66, 77, 59, 74, 68, 68, 83, 67, 63, 63, 69, 64, 67, 63, 64, 62, 61, 48, 72, 53, 49, 59, 77, 74, 67, 66, 60, 74, 58, 91, 71, 60, 84, 73, 68, 71, 66, 54, 65, 64, 64, 63, 69, 75, 71, 68, 62, 70, 65, 79, 57, 65, 65, 70, 66, 61, 61, 74, 59, 54, 59, 64, 75, 50, 66, 63, 69, 70, 66, 63, 57, 66, 76, 54, 73, 68, 62, 64, 62, 64, 67, 70, 75, 68, 57, 70, 64, 63, 51, 74, 80, 72, 65, 72, 53, 65, 77, 72, 63, 69, 57, 69, 65, 65, 67, 70, 79, 69, 55, 73, 66, 61, 77, 68, 66, 70, 73, 71, 70, 72, 66, 66, 64, 60, 67, 63, 58, 51, 61, 71, 65, 80, 75, 74, 64, 69, 62, 73, 65, 66, 72, 64, 68, 62, 56, 78, 78, 70, 73, 52, 68, 61, 72, 61, 60, 67, 69, 79, 66, 65, 76, 66, 74, 63, 71, 66, 71, 66, 54, 70, 71, 65, 64, 68, 66, 71, 66, 79, 64, 78, 64, 54, 70, 59, 59, 76, 70, 78, 65, 61, 68, 71, 63, 65, 67, 71, 64, 65, 62, 77, 48, 75, 67, 75, 68, 64, 67, 62, 64, 59, 64, 73, 58, 85, 63, 65, 62, 69, 72, 75, 59, 71, 55, 70, 70, 64, 66, 61, 64, 54, 72, 54, 66, 62, 63, 76, 69, 69, 73, 67, 74, 71, 56, 71, 63, 67, 68, 65, 80, 60, 58, 65, 75, 66, 70, 63, 69, 61, 69, 72, 67, 65, 67, 70, 72, 68, 67, 73, 72, 59, 68]
r2_new = r2_score(fifa_predict, actual_score)
mse_new = mean_squared_error(fifa_predict, actual_score)
mae_new = mean_absolute_error(fifa_predict, actual_score)
rmse_new = sqrt(mse_new)

print('R2 Score:', r2_new)
print('Mean Squared Error:', mse_new)
print('Mean aboslute Error:', mae_new)
print('Root Mean Squared Error:',rmse_new)

R2 Score: 0.8446881487823947
Mean Squared Error: 6.328733527029574
Mean aboslute Error: 1.9936893757028347
Root Mean Squared Error: 2.5156974235844767
