In [43]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder

In [44]:
# Load the raw training table from 'fifa21_train.csv' (relative path, so it is
# resolved against the notebook's working directory).
FIFA = pd.read_csv('fifa21_train.csv')

In [45]:
def cleanHightWeightFoot(df):
    """Convert the ``height``, ``weight`` and ``foot`` columns to numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``height`` as feet/inches strings such as ``5'9"``,
        ``weight`` as strings ending in a three-character unit suffix such as
        ``161lbs``, and ``foot`` holding ``'Right'`` / ``'Left'``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``height`` in centimetres, ``weight`` in kilograms and
        ``foot`` encoded as ``Right -> 0`` / ``Left -> 1``.  ``df`` itself is
        not modified.
    """
    def feet_to_cm(text):
        # Feet come before the apostrophe, inches after it; the final
        # character (the inch mark) is dropped before parsing.
        parts = text.split("'")
        return float(parts[0]) * 30.48 + float(parts[1][:-1]) * 2.54

    def pounds_to_kg(text):
        # Strip the trailing three-character unit before parsing.
        return float(text[:-3]) * 0.45359237

    # One copy is enough to protect the caller's frame.
    cleaned = df.copy()
    cleaned['height'] = cleaned['height'].map(feet_to_cm)
    cleaned['weight'] = cleaned['weight'].map(pounds_to_kg)
    # replace() leaves any value other than 'Right' / 'Left' as it is.
    cleaned = cleaned.replace({'foot': {'Right': 0, 'Left': 1}})
    return cleaned

In [46]:
def clean_money(df):
    """Convert the monetary columns from strings to floats.

    The columns ``release clause``, ``wage`` and ``value`` are expected to hold
    strings made of one leading currency character, a number, and an optional
    ``K`` (thousands) or ``M`` (millions) suffix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three monetary columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the three columns are float amounts.
    """
    def parse_amount(text):
        # Drop the leading currency character, then expand the suffix.
        amount = text[1:]
        if 'K' in amount:
            return 1000 * float(amount[:-1])
        if 'M' in amount:
            return 1000000 * float(amount[:-1])
        # No suffix: the remaining text is the amount itself.
        return float(amount)

    data = df.copy()
    for column in ('release clause', 'wage', 'value'):
        data[column] = data[column].map(parse_amount).astype(float)
    return data

In [47]:
def clean_player(x):
    """Clean the player-information columns of the raw table.

    Steps, in order: drop ``Loan Date End``; lower-case every column name;
    fill missing ``position`` from ``bp`` and missing ``club`` from
    ``team & contract``; drop ``joined`` and ``team & contract``; then convert
    height/weight/foot and the monetary columns to numbers.

    Parameters
    ----------
    x : pandas.DataFrame
        Raw player table with the original (mixed-case) column names.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.  The input frame is not modified.
    """
    x = x.drop(['Loan Date End'], axis=1)
    x.columns = [name.lower() for name in x.columns]

    # Assign the filled columns back instead of calling fillna(inplace=True)
    # on an attribute-accessed column: that chained in-place form only updates
    # a temporary (a silent no-op) when pandas Copy-on-Write is active.
    x['position'] = x['position'].fillna(x['bp'])
    x['club'] = x['club'].fillna(x['team & contract'])
    x = x.drop(['joined', 'team & contract'], axis=1)

    x = cleanHightWeightFoot(x)
    x = clean_money(x)

    return x

In [48]:
def clean_skills(df):
    """Drop the listed skill columns and every incomplete row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``attacking``, ``skill``, ``movement``,
        ``power``, ``mentality``, ``defending``, ``goalkeeping`` and
        ``composure``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the seven listed columns, without any row that has
        a missing value, re-indexed from 0, and with ``composure`` rounded to
        one decimal.  ``df`` itself is not modified.
    """
    dropped_columns = ['attacking', 'skill', 'movement', 'power',
                       'mentality', 'defending', 'goalkeeping']
    skills = df.drop(dropped_columns, axis=1)

    # Rows with any missing value are removed, so nothing is left to impute.
    # The former fillna(skills.mean()) after this step could never fill a
    # value, and DataFrame.mean() fails on the string columns in newer pandas.
    skills = skills.dropna().reset_index(drop=True)

    skills['composure'] = skills['composure'].round(decimals=1)
    return skills

In [49]:
def cleanPlayerBaseStats(df):
    """Convert the base-stat columns of a player table to numbers.

    Column glossary, as recorded in the original notes:
        W/F - Weak Foot
        SM  - Skill Moves
        A/W - Attacking Work Rate
        D/W - Defending Work Rate
        IR  - unclear (noted as "Interception rate?") - TODO confirm
        PAC - Pace (player speed)
        SHO - Shoot
        DRI - Dribbling
        DEF - Defense
        PHY - Physical
        Hits - meaning not recorded

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``w/f``, ``sm`` and ``ir`` as strings with a trailing
        star marker, ``a/w`` and ``d/w`` as ``'Low'`` / ``'Medium'`` /
        ``'High'``, and ``hits`` as numeric strings with an optional ``K``
        suffix.

    Returns
    -------
    pandas.DataFrame
        A new frame with incomplete rows dropped, the index reset to 0..n-1,
        the star columns and ``hits`` as floats, and the work rates encoded as
        ``Low -> 0``, ``Medium -> 1``, ``High -> 2``.  ``df`` is not modified.
    """
    def parse_hits(text):
        # A trailing 'K' means thousands; otherwise the text is the number.
        if 'K' in text:
            return 1000 * float(text[:-1])
        return float(text)

    # The reset_index result has to be kept: the call is not in-place.
    stats = df.dropna().reset_index(drop=True)

    # 'w/f' and 'ir' lose their last two characters, 'sm' only its last one,
    # leaving the numeric part of the star rating.
    stats['w/f'] = stats['w/f'].map(lambda text: text[:-2]).astype(float)
    stats['sm'] = stats['sm'].map(lambda text: text[:-1]).astype(float)
    stats['ir'] = stats['ir'].map(lambda text: text[:-2]).astype(float)

    # Work rates are ordered categories, so they become ordinal codes.
    work_rate_codes = {'Low': 0, 'Medium': 1, 'High': 2}
    stats = stats.replace({'a/w': work_rate_codes, 'd/w': work_rate_codes})

    stats['hits'] = stats['hits'].map(parse_hits).astype(float)

    return stats

In [50]:
def clean_positions(df):
    """Split the kept position-rating columns into numeric base and bonus parts.

    Each position column holds strings of the form ``'<base>+<add>'``.  Only a
    subset of the positions is kept (the original notes say the selection was
    made with respect to multicollinearity); for each kept position the text
    before the first ``'+'`` stays under the position name and the text after
    it goes to ``<position>_add``.  Anything that cannot be parsed as a number
    becomes NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all 27 position columns plus ``ova``.

    Returns
    -------
    pandas.DataFrame
        All non-position columns of ``df``, followed by the kept position
        columns, their ``_add`` columns, and ``ova`` (converted with
        ``pd.to_numeric``).  ``df`` itself is not modified.
    """
    position_columns = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam',
                        'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb',
                        'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb',
                        'rb', 'gk']
    kept_positions = ['st', 'lw', 'cf', 'cam', 'lm', 'cm', 'lwb', 'cdm', 'lb',
                      'lcb', 'gk']

    # Every position column (and 'ova') is removed from the passthrough part;
    # the kept ones are re-attached below in parsed form.
    df_remaining = df.drop(position_columns + ['ova'], axis=1)

    base_parts = {}
    add_parts = {}
    for column in kept_positions:
        # partition() always yields three columns (before, separator, after),
        # so a value without '+' gives a NaN bonus instead of an error.
        pieces = df[column].str.partition('+')
        base_parts[column] = pd.to_numeric(pieces[0], errors='coerce')
        add_parts[column + '_add'] = pd.to_numeric(pieces[2], errors='coerce')

    # Column order: bases, then bonuses, then the overall rating.
    parsed = dict(base_parts)
    parsed.update(add_parts)
    parsed['ova'] = pd.to_numeric(df['ova'], errors='coerce')
    pos_spl_final = pd.DataFrame(parsed)

    return pd.concat([df_remaining, pos_spl_final], axis=1)

In [51]:
def preprocess(df):
    """Run every cleaning stage, in order, on a copy of the raw table.

    The stages are ``clean_player``, ``clean_skills``,
    ``cleanPlayerBaseStats`` and ``clean_positions``; each receives the output
    of the previous one.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw player table.

    Returns
    -------
    pandas.DataFrame
        The fully cleaned table.
    """
    stages = (clean_player, clean_skills, cleanPlayerBaseStats, clean_positions)

    result = df.copy()
    for stage in stages:
        result = stage(result)
    return result

In [52]:
# Show the raw table's (rows, columns), then run the whole cleaning pipeline.
print(FIFA.shape)
FIFA_preprocessed = preprocess(FIFA)

(11701, 101)


  skills = skills.fillna(skills.mean())


In [53]:
# Shape after cleaning, followed by a preview of the first rows.
print(FIFA_preprocessed.shape)
FIFA_preprocessed.head()

(11422, 86)


Unnamed: 0,id,name,age,nationality,club,bp,position,height,weight,foot,...,cf_add,cam_add,lm_add,cm_add,lwb_add,cdm_add,lb_add,lcb_add,gk_add,ova
0,184383,A. Pasche,26,Switzerland,FC Lausanne-Sport,CM,CM CDM,175.26,73.028372,0,...,0,1,1,1,1,1,1,1,1,64
1,188044,Alan Carvalho,30,China PR,Beijing Sinobo Guoan FC,ST,ST LW LM,182.88,72.121187,0,...,0,1,1,2,2,2,2,2,2,77
2,184431,S. Giovinco,33,Italy,Al Hilal,CAM,CAM CF,162.56,60.781378,0,...,0,0,1,2,2,2,2,2,2,80
3,233796,J. Evans,22,Wales,Swansea City,CDM,CDM CM,177.8,68.94604,0,...,0,2,2,2,2,2,2,2,2,59
4,234799,Y. Demoncy,23,France,US Orléans Loiret Football,CDM,CDM CM,180.34,68.038855,0,...,0,2,2,2,2,2,2,2,2,65


In [77]:
# Target: the 'ova' column, kept as a one-column frame.
y = FIFA_preprocessed[['ova']]
# Features: every cleaned column except the target and 'name', 'club', 'bp'.
X = FIFA_preprocessed.drop(columns=['ova', 'name', 'club', 'bp'])

In [62]:
# #       · Numerical Features.
# numerical = X.select_dtypes(include=[np.number])
# numerical.columns

Index(['id', 'age', 'height', 'weight', 'foot', 'growth', 'value', 'wage',
       'release clause', 'crossing', 'finishing', 'heading accuracy',
       'short passing', 'volleys', 'dribbling', 'curve', 'fk accuracy',
       'long passing', 'ball control', 'acceleration', 'sprint speed',
       'agility', 'reactions', 'balance', 'shot power', 'jumping', 'stamina',
       'strength', 'long shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'composure', 'marking', 'standing tackle',
       'sliding tackle', 'gk diving', 'gk handling', 'gk kicking',
       'gk positioning', 'gk reflexes', 'total stats', 'base stats', 'w/f',
       'sm', 'a/w', 'd/w', 'ir', 'pac', 'sho', 'pas', 'dri', 'def', 'phy',
       'hits', 'st', 'lw', 'cf', 'cam', 'lm', 'cm', 'lwb', 'cdm', 'lb', 'lcb',
       'gk', 'st_add', 'lw_add', 'cf_add', 'cam_add', 'lm_add', 'cm_add',
       'lwb_add', 'cdm_add', 'lb_add', 'lcb_add', 'gk_add'],
      dtype='object')

In [65]:
# from sklearn.preprocessing import MinMaxScaler 
# transformer = MinMaxScaler().fit(numerical)
# x_normalized = transformer.transform(numerical)
# print(x_normalized.shape)
# x_normalized = pd.DataFrame(x_normalized, columns=numerical.columns)

(11422, 79)


In [73]:
# #       · Categorical Features.
# categorical = X.select_dtypes(include=[object])
# categorical.columns

Index(['nationality', 'club', 'bp', 'position', 'contract'], dtype='object')

In [74]:
# #       · Encoding Categorical Data.
# def categoricalEncoder(df):
#     cat = df.copy()

#     # We define the function that we will use in OneHotColum
#     def oneHotColumn(col_transform):
#         enc = OneHotEncoder(handle_unknown='ignore')
#         enc.fit(col_transform)
#         onehotlabels = enc.transform(col_transform).toarray()
#         return pd.DataFrame(onehotlabels,columns = enc.categories_)

#     # We apply OneHotColum to the other features of the frame
#     concat_categorical = pd.DataFrame()# categorical[['Customer']]
#     for col in cat.columns:
#           df_from_column = oneHotColumn(cat[[col]])
#           concat_categorical = pd.concat([concat_categorical,df_from_column], axis=1)#
        
#     new_columns = []
#     for column in concat_categorical.columns:
#         new_columns.append(column[0])
#     concat_categorical.columns = new_columns
    
#     return concat_categorical

In [72]:
# categorical2 = categorical.drop(['club','position','contract'], axis=1)
# categorical_encoded = categoricalEncoder(categorical2)
# categorical_encoded.columns

Index(['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Antigua &amp; Barbuda',
       'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria',
       ...
       'GK', 'LB', 'LM', 'LW', 'LWB', 'RB', 'RM', 'RW', 'RWB', 'ST'],
      dtype='object', length=173)

In [66]:
# Disabled: `categorical_encoded` and `x_normalized` are produced only by cells
# that are now commented out, so this line raises NameError on a fresh kernel,
# and `X_tot` is not read by any later cell.  Re-enable it together with the
# whole-dataset encoding / scaling cells if that path is needed again.
# X_tot = pd.concat([categorical_encoded, x_normalized], axis=1)

In [81]:
#       · Splitting into train set and test set.
# train_test_split is imported in the notebook's import cell.
# 33% of the rows are held out for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=100)

In [99]:
# #       · Categorical Features.
# Object-dtype (string) feature columns of each split.
categorical_train = X_train.select_dtypes(include=[object])
categorical_test = X_test.select_dtypes(include=[object])
# (rows, columns) of the categorical part of the training split.
categorical_train.shape

(7652, 3)

In [100]:
# One-hot encode 'nationality'.  The encoder is fitted on the training split
# only (fit learns the unique values of the column); handle_unknown='ignore'
# encodes a nationality seen only in the test split as an all-zero row instead
# of raising.  OneHotEncoder is already imported in the notebook's import cell.
encoder_nationality = OneHotEncoder(handle_unknown='ignore').fit(categorical_train[['nationality']])
print(encoder_nationality.categories_)
encoded_train_nationality = encoder_nationality.transform(categorical_train[['nationality']]).toarray()
encoded_test_nationality = encoder_nationality.transform(categorical_test[['nationality']]).toarray()

# categories_ is a list with one array per encoded column: take that array so
# the column labels are flat strings, and reuse each split's row labels so the
# encoded frames stay aligned with X_train / X_test and y_train / y_test.
nationality_labels = encoder_nationality.categories_[0]
nationality_encoded_train = pd.DataFrame(encoded_train_nationality,
                                         columns=nationality_labels,
                                         index=categorical_train.index)
nationality_encoded_test = pd.DataFrame(encoded_test_nationality,
                                        columns=nationality_labels,
                                        index=categorical_test.index)

# 'club' and 'bp' are dropped when X is built, so nationality is the only
# column encoded here.
categorical_encoded_train = pd.concat([nationality_encoded_train], axis=1)
categorical_encoded_test = pd.concat([nationality_encoded_test], axis=1)


[array(['Afghanistan', 'Albania', 'Algeria', 'Angola',
       'Antigua &amp; Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Barbados', 'Belarus', 'Belgium', 'Benin',
       'Bolivia', 'Bosnia Herzegovina', 'Brazil', 'Bulgaria',
       'Burkina Faso', 'Burundi', 'Cameroon', 'Canada', 'Cape Verde',
       'Central African Republic', 'Chad', 'Chile', 'China PR',
       'Chinese Taipei', 'Colombia', 'Comoros', 'Congo', 'Costa Rica',
       'Croatia', 'Cuba', 'Curacao', 'Cyprus', 'Czech Republic',
       'DR Congo', 'Denmark', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'England', 'Equatorial Guinea', 'Estonia',
       'Faroe Islands', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia',
       'Germany', 'Ghana', 'Gibraltar', 'Greece', 'Grenada', 'Guam',
       'Guinea', 'Guinea Bissau', 'Guyana', 'Haiti', 'Honduras',
       'Hungary', 'Iceland', 'India', 'Iran', 'Iraq', 'Israel', 'Italy',
       'Ivory Coast', 'Jamaica', 'Japan', 'Kazakhs

In [None]:
# One-hot encode each wanted categorical column that is present in the splits.
# 'club' and 'bp' are dropped when X is built, so with the current feature set
# only 'nationality' is encoded; absent columns are skipped instead of raising
# KeyError.  Each encoder is fitted on the training split only, and
# handle_unknown='ignore' encodes categories seen only in the test split as
# all-zero rows.  OneHotEncoder is already imported in the import cell.
encoders = {}
encoded_train_parts = []
encoded_test_parts = []
for column in ['nationality', 'club', 'bp']:
    if column not in categorical_train.columns:
        continue
    encoders[column] = OneHotEncoder(handle_unknown='ignore').fit(categorical_train[[column]])
    print(encoders[column].categories_)
    # categories_ holds one array per fitted column; use it as flat labels and
    # keep each split's row labels so the result aligns with X_* / y_*.
    labels = encoders[column].categories_[0]
    encoded_train_parts.append(pd.DataFrame(
        encoders[column].transform(categorical_train[[column]]).toarray(),
        columns=labels,
        index=categorical_train.index,
    ))
    encoded_test_parts.append(pd.DataFrame(
        encoders[column].transform(categorical_test[[column]]).toarray(),
        columns=labels,
        index=categorical_test.index,
    ))

categorical_encoded_train = pd.concat(encoded_train_parts, axis=1)
categorical_encoded_test = pd.concat(encoded_test_parts, axis=1)


In [68]:
#       · Apply model.
# LinearRegression needs a purely numeric matrix, but X_train still contains
# object-dtype (string) columns, so the model is fitted on the numeric
# features only.  LinearRegression is imported in the notebook's import cell.
reg = LinearRegression().fit(X_train.select_dtypes(include=[np.number]), y_train)

In [69]:
# Model Validation
#       · R2.
#       · MSE.
#       · RMSE.
#       · MAE.
# If we compare with the train set:
# Metrics come from the notebook's import cell.  Predictions use the numeric
# feature columns, matching what the model is fitted on.
y_pred = reg.predict(X_train.select_dtypes(include=[np.number]))
y_true = y_train
print("The R2 is: ",r2_score(y_true, y_pred))
print("The MSE is: ",mean_squared_error(y_true, y_pred))
# RMSE is the square root of the MSE; computing it directly avoids the
# `squared=` keyword, which newer scikit-learn releases no longer accept.
print("The RMSE is: ",np.sqrt(mean_squared_error(y_true, y_pred)))
print("The MAE is: ",mean_absolute_error(y_true, y_pred))

The R2 is:  0.9306079325918577
The MSE is:  3.2097672733161318
The RMSE is:  1.7915823378555984
The MAE is:  1.392315223961056


In [94]:
# If we compare with the test set:
# Predictions use the numeric feature columns, matching what the model is
# fitted on.
y_pred = reg.predict(X_test.select_dtypes(include=[np.number]))
y_true = y_test
print("The R2 is: ",r2_score(y_true, y_pred))
print("The MSE is: ",mean_squared_error(y_true, y_pred))
# RMSE is the square root of the MSE; computing it directly avoids the
# `squared=` keyword, which newer scikit-learn releases no longer accept.
print("The RMSE is: ",np.sqrt(mean_squared_error(y_true, y_pred)))
print("The MAE is: ",mean_absolute_error(y_true, y_pred))

The R2 is:  0.9289241691524726
The MSE is:  3.4155555049067474
The RMSE is:  1.8481221563811054
The MAE is:  1.4499067382887532
