In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

fifa = pd.read_csv('Datasets/fifa_cleaned.csv')
fifa.head()

Unnamed: 0,name_club,short_name,long_name,age,nationality,club,overall,potential,value_eur,wage_eur,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,attack_work_rate,defense_work_rate
0,"L. Messi, FC Barcelona",L. Messi,Lionel Andres Messi Cuccittini,32,Argentina,FC Barcelona,94,94,95500000,565000,...,66,66,68,63,52,52,52,63,2,1
1,"Cristiano Ronaldo, Juventus",Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,Portugal,Juventus,93,93,58500000,405000,...,61,61,65,61,53,53,53,61,3,1
2,"Neymar Jr, Paris Saint-Germain",Neymar Jr,Neymar da Silva Santos Junior,27,Brazil,Paris Saint-Germain,92,92,105500000,290000,...,61,61,66,61,46,46,46,61,3,2
3,"J. Oblak, Atlético Madrid",J. Oblak,Jan Oblak,26,Slovenia,Atlético Madrid,91,93,77500000,125000,...,0,0,0,0,0,0,0,0,2,2
4,"E. Hazard, Real Madrid",E. Hazard,Eden Hazard,28,Belgium,Real Madrid,91,91,90000000,470000,...,63,63,66,61,49,49,49,61,3,2


In [2]:
print(fifa.columns)
print(fifa.shape[1])

Index(['name_club', 'short_name', 'long_name', 'age', 'nationality', 'club',
       'overall', 'potential', 'value_eur', 'wage_eur', 'player_positions',
       'preferred_foot', 'weak_foot', 'work_rate', 'release_clause_eur',
       'player_tags', 'team_position', 'contract_valid_until', 'pace',
       'shooting', 'passing', 'dribbling', 'defending', 'physic', 'gk_diving',
       'gk_handling', 'gk_kicking', 'gk_reflexes', 'gk_speed',
       'gk_positioning', 'player_traits', 'attacking_crossing',
       'attacking_finishing', 'attacking_heading_accuracy',
       'attacking_short_passing', 'attacking_volleys', 'skill_dribbling',
       'skill_curve', 'skill_fk_accuracy', 'skill_long_passing',
       'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed',
       'movement_agility', 'movement_reactions', 'movement_balance',
       'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength',
       'power_long_shots', 'mentality_aggression', 'mentality_intercept

# Step 1: Quick and Basic Recommender

In [3]:
# seperate and classify the different features first

basic_attr = ['overall', 'potential', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']

pos_scores = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm',
       'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb',
       'rcb', 'rb']

gk_attr = ['gk_diving', 'gk_handling', 'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning']

attack_attr = ['attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys']
skill_attr = ['skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control']
movement_attr = ['movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance']
power_attr = ['power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots']
defend_attr = ['defending_standing_tackle', 'defending_sliding_tackle', 'defending_marking']

technical_attr = attack_attr + skill_attr + movement_attr + power_attr + defend_attr + gk_attr
technical_attr.append('weak_foot')

mental_attr = ['mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties', 'mentality_composure']

positions = ['player_positions', 'team_position']

att_def_rate = ['attack_work_rate', 'defense_work_rate']

other_var = ['name_club', 'short_name', 'long_name', 'age', 'club', 'nationality', 'work_rate', 'preferred_foot',
             'value_eur', 'wage_eur', 'release_clause_eur', 'player_tags', 'player_traits', 'contract_valid_until']

print(len(pos_scores) + len(mental_attr) + len(att_def_rate) + len(basic_attr) + len(technical_attr) + len(positions))

print(len(other_var))

print(len(fifa.columns))

74
14
88


In [4]:
attr_features = basic_attr + technical_attr + mental_attr + pos_scores + att_def_rate
attr_features.insert(0, 'name_club')
fifa_RS = fifa[attr_features]

In [5]:
fifa_RS.head()

Unnamed: 0,name_club,overall,potential,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,attack_work_rate,defense_work_rate
0,"L. Messi, FC Barcelona",94,94,87,92,92,96,39,66,88,...,66,66,68,63,52,52,52,63,2,1
1,"Cristiano Ronaldo, Juventus",93,93,90,93,82,89,35,78,84,...,61,61,65,61,53,53,53,61,3,1
2,"Neymar Jr, Paris Saint-Germain",92,92,91,85,87,95,32,58,87,...,61,61,66,61,46,46,46,61,3,2
3,"J. Oblak, Atlético Madrid",91,93,0,0,0,0,0,0,13,...,0,0,0,0,0,0,0,0,2,2
4,"E. Hazard, Real Madrid",91,91,91,83,86,94,35,66,81,...,63,63,66,61,49,49,49,61,3,2


In [6]:
fifa_RS = fifa_RS.set_index("name_club")
fifa_RS.head()

Unnamed: 0_level_0,overall,potential,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,attack_work_rate,defense_work_rate
name_club,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"L. Messi, FC Barcelona",94,94,87,92,92,96,39,66,88,95,...,66,66,68,63,52,52,52,63,2,1
"Cristiano Ronaldo, Juventus",93,93,90,93,82,89,35,78,84,94,...,61,61,65,61,53,53,53,61,3,1
"Neymar Jr, Paris Saint-Germain",92,92,91,85,87,95,32,58,87,87,...,61,61,66,61,46,46,46,61,3,2
"J. Oblak, Atlético Madrid",91,93,0,0,0,0,0,0,13,11,...,0,0,0,0,0,0,0,0,2,2
"E. Hazard, Real Madrid",91,91,91,83,86,94,35,66,81,84,...,63,63,66,61,49,49,49,61,3,2


In [7]:
scaler = StandardScaler()

In [8]:
fifa_scaled = scaler.fit_transform(fifa_RS)

In [9]:
fifa_scaled = pd.DataFrame(fifa_scaled, index = fifa_RS.index, columns = fifa_RS.columns)
fifa_scaled.head()

Unnamed: 0_level_0,overall,potential,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,attack_work_rate,defense_work_rate
name_club,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"L. Messi, FC Barcelona",3.993662,3.657156,1.127098,2.1567,2.006332,1.84356,-0.303796,0.372952,2.089048,2.52167,...,0.769503,0.769503,0.851397,0.632295,0.130714,0.130714,0.130714,0.632295,-0.418305,-2.158629
"Cristiano Ronaldo, Juventus",3.849772,3.494276,1.253076,2.204072,1.518666,1.524399,-0.482213,0.908933,1.870766,2.470634,...,0.522648,0.522648,0.701125,0.532251,0.179059,0.179059,0.179059,0.532251,1.506128,-2.158629
"Neymar Jr, Paris Saint-Germain",3.705883,3.331397,1.295069,1.825099,1.762499,1.797965,-0.616026,0.015632,2.034477,2.113383,...,0.522648,0.522648,0.751215,0.532251,-0.159354,-0.159354,-0.159354,0.532251,1.506128,-0.165327
"J. Oblak, Atlético Madrid",3.561993,3.494276,-2.526264,-2.20149,-2.480193,-2.533508,-2.043363,-2.574941,-2.003744,-1.765341,...,-2.488987,-2.488987,-2.554771,-2.519079,-2.383209,-2.383209,-2.383209,-2.519079,-0.418305,-0.165327
"E. Hazard, Real Madrid",3.561993,3.168517,1.295069,1.730355,1.713733,1.752371,-0.482213,0.372952,1.707054,1.960276,...,0.62139,0.62139,0.751215,0.532251,-0.01432,-0.01432,-0.01432,0.532251,1.506128,-0.165327


In [10]:
def mean_center_rows(df):
    return (df.T - df.mean(axis=1)).T

In [11]:
fifa_mc = mean_center_rows(fifa_scaled)
fifa_mc.head()

Unnamed: 0_level_0,overall,potential,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,attack_work_rate,defense_work_rate
name_club,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"L. Messi, FC Barcelona",2.682445,2.345939,-0.184119,0.845483,0.695115,0.532343,-1.615013,-0.938264,0.777831,1.210453,...,-0.541714,-0.541714,-0.45982,-0.678922,-1.180503,-1.180503,-1.180503,-0.678922,-1.729521,-3.469846
"Cristiano Ronaldo, Juventus",2.584294,2.228798,-0.012403,0.938593,0.253188,0.25892,-1.747692,-0.356545,0.605287,1.205156,...,-0.742831,-0.742831,-0.564354,-0.733227,-1.08642,-1.08642,-1.08642,-0.733227,0.24065,-3.424107
"Neymar Jr, Paris Saint-Germain",2.50352,2.129034,0.092706,0.622736,0.560137,0.595603,-1.818389,-1.186731,0.832115,0.91102,...,-0.679715,-0.679715,-0.451147,-0.670111,-1.361716,-1.361716,-1.361716,-0.670111,0.303766,-1.367689
"J. Oblak, Atlético Madrid",4.626806,4.55909,-1.46145,-1.136677,-1.41538,-1.468695,-0.97855,-1.510127,-0.93893,-0.700527,...,-1.424173,-1.424173,-1.489958,-1.454265,-1.318395,-1.318395,-1.318395,-1.454265,0.646509,0.899487
"E. Hazard, Real Madrid",2.377648,1.984172,0.110723,0.54601,0.529387,0.568026,-1.666559,-0.811393,0.522709,0.77593,...,-0.562955,-0.562955,-0.43313,-0.652094,-1.198665,-1.198665,-1.198665,-0.652094,0.321783,-1.349672


In [12]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
sim_matrix = cosine_similarity(fifa_mc)
RS_basic = pd.DataFrame(sim_matrix, columns=fifa_mc.index, index=fifa_mc.index)
RS_basic.head()

name_club,"L. Messi, FC Barcelona","Cristiano Ronaldo, Juventus","Neymar Jr, Paris Saint-Germain","J. Oblak, Atlético Madrid","E. Hazard, Real Madrid","K. De Bruyne, Manchester City","M. ter Stegen, FC Barcelona","V. van Dijk, Liverpool","L. Modric, Real Madrid","M. Salah, Liverpool",...,"M. Gallagher, Finn Harps","Huang Jiahui, Dalian YiFang FC","M. Sagaf, Carlisle United","E. Tweed, Derry City","P. Martin, Waterford FC","Shao Shuai, Beijing Renhe FC","Xiao Mingjie, Shanghai SIPG FC","Zhang Wei, Hebei China Fortune FC","Wang Haijian, Shanghai Greenland Shenhua FC","Pan Ximing, Hebei China Fortune FC"
name_club,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"L. Messi, FC Barcelona",1.0,0.896573,0.9159,-0.142076,0.933145,0.771246,-0.120092,0.181368,0.694323,0.912573,...,-0.323194,-0.681896,-0.453848,-0.498819,-0.530844,-0.822635,-0.739237,-0.495347,-0.288415,-0.55107
"Cristiano Ronaldo, Juventus",0.896573,1.0,0.852771,-0.086353,0.855155,0.694625,-0.066091,0.233136,0.536628,0.882091,...,-0.377835,-0.583486,-0.444496,-0.577589,-0.49329,-0.787299,-0.741393,-0.617615,-0.40554,-0.625046
"Neymar Jr, Paris Saint-Germain",0.9159,0.852771,1.0,-0.120332,0.97371,0.82949,-0.091161,0.017201,0.741658,0.894033,...,-0.262558,-0.602697,-0.375494,-0.42962,-0.49276,-0.690813,-0.723557,-0.60892,-0.467094,-0.558156
"J. Oblak, Atlético Madrid",-0.142076,-0.086353,-0.120332,1.0,-0.147594,-0.213112,0.977434,-0.092657,-0.268677,-0.189497,...,-0.392526,-0.079116,-0.411995,-0.369461,0.746311,-0.095939,-0.188303,-0.400601,-0.385994,-0.362083
"E. Hazard, Real Madrid",0.933145,0.855155,0.97371,-0.147594,1.0,0.821508,-0.128156,0.043079,0.746002,0.945973,...,-0.303959,-0.657347,-0.426457,-0.478795,-0.515555,-0.73282,-0.73386,-0.59782,-0.457434,-0.567585


In [14]:
# masking for faster computational operations

RS_basic = RS_basic.where(np.tril(np.ones(RS_basic.shape)).astype(np.bool))
RS_basic.head()

name_club,"L. Messi, FC Barcelona","Cristiano Ronaldo, Juventus","Neymar Jr, Paris Saint-Germain","J. Oblak, Atlético Madrid","E. Hazard, Real Madrid","K. De Bruyne, Manchester City","M. ter Stegen, FC Barcelona","V. van Dijk, Liverpool","L. Modric, Real Madrid","M. Salah, Liverpool",...,"M. Gallagher, Finn Harps","Huang Jiahui, Dalian YiFang FC","M. Sagaf, Carlisle United","E. Tweed, Derry City","P. Martin, Waterford FC","Shao Shuai, Beijing Renhe FC","Xiao Mingjie, Shanghai SIPG FC","Zhang Wei, Hebei China Fortune FC","Wang Haijian, Shanghai Greenland Shenhua FC","Pan Ximing, Hebei China Fortune FC"
name_club,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"L. Messi, FC Barcelona",1.0,,,,,,,,,,...,,,,,,,,,,
"Cristiano Ronaldo, Juventus",0.896573,1.0,,,,,,,,,...,,,,,,,,,,
"Neymar Jr, Paris Saint-Germain",0.9159,0.852771,1.0,,,,,,,,...,,,,,,,,,,
"J. Oblak, Atlético Madrid",-0.142076,-0.086353,-0.120332,1.0,,,,,,,...,,,,,,,,,,
"E. Hazard, Real Madrid",0.933145,0.855155,0.97371,-0.147594,1.0,,,,,,...,,,,,,,,,,


In [18]:
RS_basic['L. Messi, FC Barcelona'].sort_values(ascending=False).head(10)

name_club
L. Messi, FC Barcelona          1.000000
J. Sildero, Uruguay             0.957726
K. Coman, FC Bayern München     0.948896
A. Januzaj, Real Sociedad       0.943269
P. Dybala, Juventus             0.941714
J. Brandt, Borussia Dortmund    0.937238
R. Mahrez, Manchester City      0.933639
E. Hazard, Real Madrid          0.933145
K. Benzema, Real Madrid         0.923474
M. Reus, Borussia Dortmund      0.921396
Name: L. Messi, FC Barcelona, dtype: float64

In [23]:
# first, create function to return top 5 similar players by asking only for player's short name (which is what people usually input):

fifa_shortname = [name for name in fifa.short_name]

def get_sim_players(player_name, RS):
    if fifa_shortname.count(player_name) == 1:
    # for players with unique name 
        player_index = int(fifa.loc[fifa.short_name == player_name].index.values)
        player_nameclub = fifa.iloc[player_index]['name_club']
    elif fifa_shortname.count(player_name) > 1:
    # for players with non-unique name
        dups_count = fifa_shortname.count(player_name)
        dups_indexes = list(fifa.loc[fifa.short_name == player_name].index.values)
        dups_nameclub = fifa.iloc[dups_indexes]['name_club']
        
        print(f"There are a total of {dups_count} counts for {player_name}.")
        print("")
        print(f"{dups_nameclub}")
        print("")
        print("Which of the following players above are you actually looking for?")
        exact_dup_index = int(input("Please key in the corresponding index beside them:"))
        print("")
        
        player_nameclub = fifa.iloc[exact_dup_index]['name_club']
        
    else:
        # for players whom the user keyed in wrongly (but where the last name still matches)
        player_namesplit = player_name.split(". ")
        for name in fifa_shortname:
            if player_namesplit[1] in name:
                print(name)
                # show list of players with same last name
        print("\n")
        print(f"The player entered, {player_name}, does not exist. Do you actually mean the following player(s) above?")
        print("")
        
        actual_player_name = str(input("If so, please enter the exact name as above:"))
        print("")
                
        player_index = int(fifa.loc[fifa.short_name == actual_player_name].index.values)
        player_nameclub = fifa.iloc[player_index]['name_club']
        
    print(f"Here are the top 5 players that are similar to {player_nameclub}.")
    df = pd.DataFrame(RS[player_nameclub].sort_values(ascending=False).head(6))
    df = df.drop([player_nameclub])
    print("The distances below represent the similarity.")
    print(f"The higher it is, the more similar the player is to {player_name}.")
    return df

In [24]:
get_sim_players('L. Messi', RS_basic)

Here are the top 5 players that are similar to L. Messi, FC Barcelona.
The distances below represent the similarity.
The higher it is, the more similar the player is to L. Messi.


Unnamed: 0_level_0,"L. Messi, FC Barcelona"
name_club,Unnamed: 1_level_1
"J. Sildero, Uruguay",0.957726
"K. Coman, FC Bayern München",0.948896
"A. Januzaj, Real Sociedad",0.943269
"P. Dybala, Juventus",0.941714
"J. Brandt, Borussia Dortmund",0.937238


In [25]:
get_sim_players('C. Ronaldo', RS_basic)

Cristiano Ronaldo
Ronaldo Cabrais
Ronaldo Esler
Ronaldo Vieira


The player entered, C. Ronaldo, does not exist. Do you actually mean the following player(s) above?

If so, please enter the exact name as above:Cristiano Ronaldo

Here are the top 5 players that are similar to Cristiano Ronaldo, Juventus.
The distances below represent the similarity.
The higher it is, the more similar the player is to C. Ronaldo.


Unnamed: 0_level_0,"Cristiano Ronaldo, Juventus"
name_club,Unnamed: 1_level_1
"K. Mbappe, Paris Saint-Germain",0.953139
"S. Aguero, Manchester City",0.942917
"P. Aubameyang, Arsenal",0.941189
"S. Mane, Liverpool",0.937856
"K. Benzema, Real Madrid",0.936133


The basic recommender system works!

... but can I do better? Yes!

# Step 2: Finetune Recommender by Feature Engineering 'Positions' and 'Traits/Tags"

There exists two other features that can be further feature engineered from to improve the recommender - positions (`player_positions`, `team_position`) and players' styles/characteristics (`player_tags`, `player_traits`).

`player_positions` and `team_position` might be abit vague/similar. This would be better explained by the following Messi example (index [0] is Messi):

In [15]:
print(fifa.iloc[0][pos_scores])

print(fifa.iloc[0]['player_positions'])
print(fifa.iloc[0]['team_position'])

ls     89
st     89
rs     89
lw     93
lf     93
cf     93
rf     93
rw     93
lam    93
cam    93
ram    93
lm     92
lcm    87
cm     87
rcm    87
rm     92
lwb    68
ldm    66
cdm    66
rdm    66
rwb    68
lb     63
lcb    52
cb     52
rcb    52
rb     63
Name: 0, dtype: object
RW, CF, ST
RW


In [16]:
fifa.team_position.unique()

array(['RW', 'LW', 'CAM', 'GK', 'RCM', 'LCB', 'ST', 'CDM', 'LDM', 'RM',
       'RCB', 'LCM', 'LM', 'CF', 'SUB', 'LB', 'LS', 'RB', 'RDM', 'RES',
       'RAM', 'RS', 'RF', 'CM', 'CB', 'LF', 'LAM', nan, 'RWB', 'LWB'],
      dtype=object)

In [17]:
fifa.player_positions.unique()

array(['RW, CF, ST', 'ST, LW', 'LW, CAM', 'GK', 'LW, CF', 'CAM, CM', 'CB',
       'CM', 'RW, ST', 'ST, RW', 'ST', 'CDM, CM', 'CF, ST, LW', 'CAM, RW',
       'CM, CDM', 'RW, LW', 'CAM, LM, ST', 'ST, LM', 'LW, LM', 'CB, LB',
       'RW, CAM, CM', 'CDM', 'CF, LM', 'CF, ST', 'LB', 'CM, CAM, CDM',
       'CF, LW, ST', 'LW', 'CB, CDM', 'RB, CM, CDM', 'CAM, CM, LW',
       'CF, ST, CAM', 'LW, CM', 'CAM, RM, RW', 'CM, CAM', 'CM, LM, RM',
       'LB, CB', 'RB', 'CAM, CF, ST', 'RW, LW, ST', 'LB, LM',
       'RM, LM, CM', 'CAM, CM, RM', 'RM, LM', 'CAM, RM', 'CF, LW, CAM',
       'CAM, LM, RM', 'LM, RM, LW', 'RM, LM, LW', 'CAM', 'CAM, CM, CF',
       'LM', 'CDM, CB', 'RB, CB', 'RM, RW', 'LM, RW, LW', 'RM, CM',
       'CAM, LW, ST', 'RW, RM', 'CM, CDM, CAM', 'CM, CAM, CF',
       'LW, ST, LM', 'LM, ST', 'RM, RW, ST', 'LM, CAM, RM', 'LW, RW',
       'CF, LM, LW', 'RM, CAM', 'CF, RM, LM', 'RW, LW, CAM',
       'CDM, CM, CAM', 'CDM, CB, LB', 'ST, CAM, LW', 'ST, CF', 'RW, CAM',
       'LW, LM, RW', 'RW

As above, Messi, like every other player in FIFA, plays a specific singular position in the team (hence, only single entries in all of `team_position` feature). However, he is also able to play a few other positions such as CF and ST, in addition to RW. Generally, most players are able to play at least 2 positions, resulting in the `player_positions` having entries of more than 1 position stated (except for goalkeeper, which is a very specialised position).

Hence, a potential replacement's `player_positions` should also be taken into consideration and compared in terms of similarity with the other (departing) player.

As for positional scores, these are scores that every player possess with respect to every position on the field, whether or not that position is their natural/preferred position.

### 2a) Feature Engineering: Playable Positions

First, define a function that will create a Euclidean Distance dataframe:

In [11]:
def euc_dist_df(df):
    df = mean_center_rows(df)
    sim_matrix = euclidean_distances(df)
    df_RecSys = pd.DataFrame(sim_matrix, columns=df.index, index=df.index)
    df_RecSys = df_RecSys.where(np.tril(np.ones(df_RecSys.shape)).astype(np.bool))
    return df_RecSys

In [12]:
fifa_player_pos = fifa[['name_club', 'player_positions', 'team_position']]
fifa_player_pos.head()

Unnamed: 0,name_club,player_positions,team_position
0,"L. Messi, FC Barcelona","RW, CF, ST",RW
1,"Cristiano Ronaldo, Juventus","ST, LW",LW
2,"Neymar Jr, Paris Saint-Germain","LW, CAM",CAM
3,"J. Oblak, Atlético Madrid",GK,GK
4,"E. Hazard, Real Madrid","LW, CF",LW


In [13]:
# concatenate both columns, then perform feature extraction (of positions) later:

fifa_player_pos['all_positions'] = fifa_player_pos['player_positions'] + ', ' + fifa_player_pos['team_position']

fifa_player_pos.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name_club,player_positions,team_position,all_positions
0,"L. Messi, FC Barcelona","RW, CF, ST",RW,"RW, CF, ST, RW"
1,"Cristiano Ronaldo, Juventus","ST, LW",LW,"ST, LW, LW"
2,"Neymar Jr, Paris Saint-Germain","LW, CAM",CAM,"LW, CAM, CAM"
3,"J. Oblak, Atlético Madrid",GK,GK,"GK, GK"
4,"E. Hazard, Real Madrid","LW, CF",LW,"LW, CF, LW"


In [14]:
fifa_player_pos.drop(columns = ['player_positions', 'team_position'], inplace=True)
fifa_player_pos.head()

Unnamed: 0,name_club,all_positions
0,"L. Messi, FC Barcelona","RW, CF, ST, RW"
1,"Cristiano Ronaldo, Juventus","ST, LW, LW"
2,"Neymar Jr, Paris Saint-Germain","LW, CAM, CAM"
3,"J. Oblak, Atlético Madrid","GK, GK"
4,"E. Hazard, Real Madrid","LW, CF, LW"


In [15]:
fifa_player_pos.loc[fifa_player_pos['all_positions'].isnull()]

Unnamed: 0,name_club,all_positions
327,"E. Schetino, Uruguay",
328,"J. Sildero, Uruguay",
407,"J. Frendado, Uruguay",
408,"S. Ardero, Uruguay",
409,"L. Dalves, Uruguay",
...,...,...
16353,"A. Varkay, India",
16354,"C. Palan, India",
16356,"H. Ivanov, Bulgaria",
16600,"E. Ira Tape, Côte d'Ivoire",


These are the same players that are free agents and do not have existing clubs (and hence, positions).

Fill up these null values with "None":

In [16]:
fifa_player_pos.all_positions = fifa_player_pos.all_positions.fillna('None')
fifa_player_pos.dtypes

name_club        object
all_positions    object
dtype: object

In [17]:
fifa_player_pos['all_positions'] = [word.lower() for word in fifa_player_pos['all_positions']]
fifa_player_pos.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,name_club,all_positions
0,"L. Messi, FC Barcelona","rw, cf, st, rw"
1,"Cristiano Ronaldo, Juventus","st, lw, lw"
2,"Neymar Jr, Paris Saint-Germain","lw, cam, cam"
3,"J. Oblak, Atlético Madrid","gk, gk"
4,"E. Hazard, Real Madrid","lw, cf, lw"


In [18]:
pos_scores

['ls',
 'st',
 'rs',
 'lw',
 'lf',
 'cf',
 'rf',
 'rw',
 'lam',
 'cam',
 'ram',
 'lm',
 'lcm',
 'cm',
 'rcm',
 'rm',
 'lwb',
 'ldm',
 'cdm',
 'rdm',
 'rwb',
 'lb',
 'lcb',
 'cb',
 'rcb',
 'rb']

In [19]:
pos_scores.insert(0, 'name_club')
pos_scores.append('gk')

In [20]:
# create dataframe to perform one-hot encoding of player's positions

fifa_player_pos = pd.concat([fifa_player_pos, pd.DataFrame(columns=pos_scores)])
print(fifa_player_pos.shape)
print(len(pos_scores))

(18278, 29)
28


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [21]:
fifa_player_pos = fifa_player_pos.fillna(0)
fifa_player_pos.head()

Unnamed: 0,all_positions,cam,cb,cdm,cf,cm,gk,lam,lb,lcb,...,rb,rcb,rcm,rdm,rf,rm,rs,rw,rwb,st
0,"rw, cf, st, rw",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"st, lw, lw",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"lw, cam, cam",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"gk, gk",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"lw, cf, lw",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# rearrange columns, followed by splitting the positions into elements of a list:

fifa_player_pos = fifa_player_pos[['all_positions'] + [col for col in pos_scores]]
fifa_player_pos['all_positions'] = [word.split(', ') for word in fifa_player_pos['all_positions']]
fifa_player_pos.head()

Unnamed: 0,all_positions,name_club,ls,st,rs,lw,lf,cf,rf,rw,...,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk
0,"[rw, cf, st, rw]","L. Messi, FC Barcelona",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[st, lw, lw]","Cristiano Ronaldo, Juventus",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"[lw, cam, cam]","Neymar Jr, Paris Saint-Germain",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[gk, gk]","J. Oblak, Atlético Madrid",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"[lw, cf, lw]","E. Hazard, Real Madrid",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Rather than performing a simple one-hot encoding, I decide to give a higher weightage/score if player's team position is the same as his playable positions. This may seem redundant, since all players will definitely play in one of their playable positions for the team... but consider the following case:

In [23]:
fifa.loc[fifa.short_name == 'C. Omoigui']

Unnamed: 0,name_club,short_name,long_name,age,nationality,club,overall,potential,value_eur,wage_eur,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,attack_work_rate,defense_work_rate
9167,"C. Omoigui, Royal Excel Mouscron",C. Omoigui,Cedric Omoigui,24,Nigeria,Royal Excel Mouscron,66,71,875000,4000,...,39,39,44,42,39,39,39,42,3,2


In [24]:
print("Messi's player positions:")
print(fifa.iloc[0].player_positions)
print("Messi's actual team position:")
print(fifa.iloc[0].team_position)
print("\n")
print("Omoigui's player positions:")
print(fifa.iloc[9167].player_positions)
print("Omoigui's actual team position:")
print(fifa.iloc[9167].team_position)

Messi's player positions:
RW, CF, ST
Messi's actual team position:
RW


Omoigui's player positions:
ST, RW, CF
Omoigui's actual team position:
SUB


Based purely on player's playable positions, Omoigui would get a very low distance score from Messi. But he is merely a substitute for his team!

By giving a higher score to those who actually play regularly in a position for their team, we "reward" these regular players and penalize those that don't play that often (only a striker/midfielder/defender on paper).

To put it in another way, if there is a another player who has the same playable positions as Messi, and who ALSO actually regularly plays the same team position as Messi, then we can more confidently say that this player can better mirror (and potentially replace) Messi, due to extremely close positional similarities.

In [25]:
for num, pos_list in enumerate(fifa_player_pos.all_positions):
    for pos in pos_list:
        if (pos_list.count(pos) > 1) and (pos != 'gk'):
            # need to make exception for goalkeepers, since their team position and player position are always the same
            # which also means, upon concatenation as above, goalkeeper's 'all_positions' will have double 'gk' entries
            fifa_player_pos.set_value(num, pos, 2)
        elif (pos_list.count(pos) > 1) and (pos == 'gk'):
            fifa_player_pos.set_value(num, pos, 1)
        else:
            fifa_player_pos.set_value(num, pos, 1)

  
  # Remove the CWD from sys.path while we load stuff.
  


In [26]:
fifa_player_pos.head()

Unnamed: 0,all_positions,name_club,ls,st,rs,lw,lf,cf,rf,rw,...,rwb,lb,lcb,cb,rcb,rb,gk,sub,res,none
0,"[rw, cf, st, rw]","L. Messi, FC Barcelona",0,1,0,0,0,1,0,2,...,0,0,0,0,0,0,0,,,
1,"[st, lw, lw]","Cristiano Ronaldo, Juventus",0,1,0,2,0,0,0,0,...,0,0,0,0,0,0,0,,,
2,"[lw, cam, cam]","Neymar Jr, Paris Saint-Germain",0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,,,
3,"[gk, gk]","J. Oblak, Atlético Madrid",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,,,
4,"[lw, cf, lw]","E. Hazard, Real Madrid",0,0,0,2,0,1,0,0,...,0,0,0,0,0,0,0,,,


In [27]:
fifa_player_pos = fifa_player_pos.fillna(0)
fifa_player_pos.isnull().sum()

all_positions    0
name_club        0
ls               0
st               0
rs               0
lw               0
lf               0
cf               0
rf               0
rw               0
lam              0
cam              0
ram              0
lm               0
lcm              0
cm               0
rcm              0
rm               0
lwb              0
ldm              0
cdm              0
rdm              0
rwb              0
lb               0
lcb              0
cb               0
rcb              0
rb               0
gk               0
sub              0
res              0
none             0
dtype: int64

In [28]:
# rearrange dataframe, as per before, to prepare for cosine similiarity transformation:

fifa_player_pos = fifa_player_pos.set_index("name_club")
fifa_player_pos.drop(columns = 'all_positions', inplace=True)
fifa_player_pos.head()

Unnamed: 0_level_0,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,...,rwb,lb,lcb,cb,rcb,rb,gk,sub,res,none
name_club,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"L. Messi, FC Barcelona",0,1,0,0,0,1,0,2,0,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
"Cristiano Ronaldo, Juventus",0,1,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
"Neymar Jr, Paris Saint-Germain",0,0,0,1,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0.0,0.0,0.0
"J. Oblak, Atlético Madrid",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0.0,0.0,0.0
"E. Hazard, Real Madrid",0,0,0,2,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0


In [29]:
positions = ["position_" + col for col in fifa_player_pos.columns]
positions

['position_ls',
 'position_st',
 'position_rs',
 'position_lw',
 'position_lf',
 'position_cf',
 'position_rf',
 'position_rw',
 'position_lam',
 'position_cam',
 'position_ram',
 'position_lm',
 'position_lcm',
 'position_cm',
 'position_rcm',
 'position_rm',
 'position_lwb',
 'position_ldm',
 'position_cdm',
 'position_rdm',
 'position_rwb',
 'position_lb',
 'position_lcb',
 'position_cb',
 'position_rcb',
 'position_rb',
 'position_gk',
 'position_sub',
 'position_res',
 'position_none']

In [30]:
fifa_player_pos.columns = positions
fifa_player_pos.head()

Unnamed: 0_level_0,position_ls,position_st,position_rs,position_lw,position_lf,position_cf,position_rf,position_rw,position_lam,position_cam,...,position_rwb,position_lb,position_lcb,position_cb,position_rcb,position_rb,position_gk,position_sub,position_res,position_none
name_club,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"L. Messi, FC Barcelona",0,1,0,0,0,1,0,2,0,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
"Cristiano Ronaldo, Juventus",0,1,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
"Neymar Jr, Paris Saint-Germain",0,0,0,1,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0.0,0.0,0.0
"J. Oblak, Atlético Madrid",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0.0,0.0,0.0
"E. Hazard, Real Madrid",0,0,0,2,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0


### 2b) Feature Engineering: Player's Styles/Characteristics

To engineer this feature, similar to player's positions before this, I will extract features from dataset's `player_tags` and `player_traits` columns.

In [4]:
fifa_players_TraitsTags = fifa[['name_club', 'player_tags', 'player_traits']]
fifa_players_TraitsTags.head()

Unnamed: 0,name_club,player_tags,player_traits
0,"L. Messi, FC Barcelona","#Dribbler, #Distance Shooter, #Crosser, #FK Sp...","Beat Offside Trap, Argues with Officials, Earl..."
1,"Cristiano Ronaldo, Juventus","#Speedster, #Dribbler, #Distance Shooter, #Acr...","Long Throw-in, Selfish, Argues with Officials,..."
2,"Neymar Jr, Paris Saint-Germain","#Speedster, #Dribbler, #Playmaker , #Crosser,...","Power Free-Kick, Injury Free, Selfish, Early C..."
3,"J. Oblak, Atlético Madrid",,"Flair, Acrobatic Clearance"
4,"E. Hazard, Real Madrid","#Speedster, #Dribbler, #Acrobat","Beat Offside Trap, Selfish, Finesse Shot, Spee..."


In [5]:
# concatenate both columns, then perform feature extraction later:

fifa_players_TraitsTags['player_traits_tags'] = fifa_players_TraitsTags['player_tags'] + ', ' + fifa_players_TraitsTags['player_traits']
fifa_players_TraitsTags.drop(columns = ['player_tags', 'player_traits'], inplace = True)
fifa_players_TraitsTags.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,name_club,player_traits_tags
0,"L. Messi, FC Barcelona","#Dribbler, #Distance Shooter, #Crosser, #FK Sp..."
1,"Cristiano Ronaldo, Juventus","#Speedster, #Dribbler, #Distance Shooter, #Acr..."
2,"Neymar Jr, Paris Saint-Germain","#Speedster, #Dribbler, #Playmaker , #Crosser,..."
3,"J. Oblak, Atlético Madrid",
4,"E. Hazard, Real Madrid","#Speedster, #Dribbler, #Acrobat, Beat Offside ..."


In [6]:
fifa_players_TraitsTags = fifa_players_TraitsTags.fillna('None')
fifa_players_TraitsTags.head()

Unnamed: 0,name_club,player_traits_tags
0,"L. Messi, FC Barcelona","#Dribbler, #Distance Shooter, #Crosser, #FK Sp..."
1,"Cristiano Ronaldo, Juventus","#Speedster, #Dribbler, #Distance Shooter, #Acr..."
2,"Neymar Jr, Paris Saint-Germain","#Speedster, #Dribbler, #Playmaker , #Crosser,..."
3,"J. Oblak, Atlético Madrid",
4,"E. Hazard, Real Madrid","#Speedster, #Dribbler, #Acrobat, Beat Offside ..."


In [7]:
traits_tags = [word for word in fifa_players_TraitsTags['player_traits_tags']]
traits_tags[:10]

['#Dribbler, #Distance Shooter, #Crosser, #FK Specialist, #Acrobat, #Clinical Finisher, #Complete Forward, Beat Offside Trap, Argues with Officials, Early Crosser, Finesse Shot, Speed Dribbler (CPU AI Only), 1-on-1 Rush, Giant Throw-in, Outside Foot Shot',
 '#Speedster, #Dribbler, #Distance Shooter, #Acrobat, #Clinical Finisher, #Complete Forward, Long Throw-in, Selfish, Argues with Officials, Early Crosser, Speed Dribbler (CPU AI Only), Skilled Dribbling',
 '#Speedster, #Dribbler, #Playmaker\xa0 , #Crosser, #FK Specialist, #Acrobat, #Clinical Finisher, #Complete Midfielder, #Complete Forward, Power Free-Kick, Injury Free, Selfish, Early Crosser, Speed Dribbler (CPU AI Only), Crowd Favourite',
 'None',
 '#Speedster, #Dribbler, #Acrobat, Beat Offside Trap, Selfish, Finesse Shot, Speed Dribbler (CPU AI Only), Crowd Favourite',
 '#Dribbler, #Playmaker\xa0 , #Engine, #Distance Shooter, #Crosser, #Complete Midfielder, Power Free-Kick, Avoids Using Weaker Foot, Dives Into Tackles (CPU AI Onl

In [8]:
# clean the words in the feature first before extracting

fifa_players_TraitsTags['player_traits_tags'] = [word.replace('#', '') for word in fifa_players_TraitsTags.player_traits_tags]
# remove '#'s
fifa_players_TraitsTags['player_traits_tags'] = [word.replace('\xa0', '') for word in fifa_players_TraitsTags.player_traits_tags]
# remove the extra whitespaces denoted by '\xa0'
fifa_players_TraitsTags.head()

Unnamed: 0,name_club,player_traits_tags
0,"L. Messi, FC Barcelona","Dribbler, Distance Shooter, Crosser, FK Specia..."
1,"Cristiano Ronaldo, Juventus","Speedster, Dribbler, Distance Shooter, Acrobat..."
2,"Neymar Jr, Paris Saint-Germain","Speedster, Dribbler, Playmaker , Crosser, FK S..."
3,"J. Oblak, Atlético Madrid",
4,"E. Hazard, Real Madrid","Speedster, Dribbler, Acrobat, Beat Offside Tra..."


In [9]:
fifa_players_TraitsTags['player_traits_tags'] = [word.split(', ') for word in fifa_players_TraitsTags['player_traits_tags']]
fifa_players_TraitsTags.head()

Unnamed: 0,name_club,player_traits_tags
0,"L. Messi, FC Barcelona","[Dribbler, Distance Shooter, Crosser, FK Speci..."
1,"Cristiano Ronaldo, Juventus","[Speedster, Dribbler, Distance Shooter, Acroba..."
2,"Neymar Jr, Paris Saint-Germain","[Speedster, Dribbler, Playmaker , Crosser, FK ..."
3,"J. Oblak, Atlético Madrid",[None]
4,"E. Hazard, Real Madrid","[Speedster, Dribbler, Acrobat, Beat Offside Tr..."


In [10]:
# remove traits and tags related to game (those with 'CPU AI Only')

for item in fifa_players_TraitsTags.player_traits_tags:
    for elem in item:
        if 'CPU' in elem:
            item.remove(elem)
            print(item)

['Dribbler', 'Distance Shooter', 'Crosser', 'FK Specialist', 'Acrobat', 'Clinical Finisher', 'Complete Forward', 'Beat Offside Trap', 'Argues with Officials', 'Early Crosser', 'Finesse Shot', '1-on-1 Rush', 'Giant Throw-in', 'Outside Foot Shot']
['Speedster', 'Dribbler', 'Distance Shooter', 'Acrobat', 'Clinical Finisher', 'Complete Forward', 'Long Throw-in', 'Selfish', 'Argues with Officials', 'Early Crosser', 'Skilled Dribbling']
['Speedster', 'Dribbler', 'Playmaker ', 'Crosser', 'FK Specialist', 'Acrobat', 'Clinical Finisher', 'Complete Midfielder', 'Complete Forward', 'Power Free-Kick', 'Injury Free', 'Selfish', 'Early Crosser', 'Crowd Favourite']
['Speedster', 'Dribbler', 'Acrobat', 'Beat Offside Trap', 'Selfish', 'Finesse Shot', 'Crowd Favourite']
['Dribbler', 'Playmaker ', 'Engine', 'Distance Shooter', 'Crosser', 'Complete Midfielder', 'Power Free-Kick', 'Avoids Using Weaker Foot', 'Leadership', 'Argues with Officials', 'Finesse Shot']
['Tackling', 'Tactician', 'Strength', 'Compl

In [11]:
# feature extract a unique set of players' traits/tags
# define a function to create a unique set

def unique_set(series):

    unique_set = []
    for item in series:
        if type(item) == list:
            unique_set += item
    unique_set = list(set(unique_set))
    return unique_set

In [12]:
unique_set(fifa_players_TraitsTags.player_traits_tags)

['Leadership',
 'Speedster',
 'Injury Free',
 'Complete Midfielder',
 'Diver',
 'Finesse Shot',
 'Speed Dribbler (CPU AI Only)',
 'Acrobat',
 'Inflexible',
 'Beat Offside Trap',
 'Strength',
 'Distance Shooter',
 'Long Throw-in',
 'Aerial Threat',
 'Outside Foot Shot',
 'Dribbler',
 'Argues with Officials',
 'Early Crosser',
 'Clinical Finisher',
 'Giant Throw-in',
 'Avoids Using Weaker Foot',
 'Long Passer (CPU AI Only)',
 'Poacher',
 'Engine',
 'FK Specialist',
 'Tackling',
 'None',
 'Complete Forward',
 'Injury Prone',
 '1-on-1 Rush',
 'Crosser',
 'Power Free-Kick',
 'Tactician',
 'Playmaker ',
 'Crowd Favourite',
 'Complete Defender',
 'Skilled Dribbling',
 'Selfish',
 'Long Shot Taker (CPU AI Only)']

In [13]:
# still has 'CPU AI' traits and tags, so remove them once more and extract unique set of features

for item in fifa_players_TraitsTags.player_traits_tags:
    for elem in item:
        if 'CPU' in elem:
            item.remove(elem)
            print(item)
            
unique_set(fifa_players_TraitsTags.player_traits_tags)

['Tackling', 'Tactician', 'Avoids Using Weaker Foot', 'Leadership', 'Finesse Shot']
['Poacher', 'Diver']
['Poacher', 'Aerial Threat', 'Distance Shooter', 'Strength', 'Clinical Finisher', 'Complete Forward', 'Selfish', 'Argues with Officials', 'Crowd Favourite']
['Aerial Threat', 'Strength', 'Injury Prone', 'Selfish', 'Skilled Dribbling']
['Strength']
['Acrobat', 'Selfish', 'Early Crosser']
['Strength', 'Beat Offside Trap', 'Giant Throw-in']
['Strength', 'Power Free-Kick', 'Avoids Using Weaker Foot', 'Beat Offside Trap']
['Strength', 'Long Throw-in', 'Power Free-Kick']
['Strength']
['Engine', 'Power Free-Kick']
['Strength']
['Aerial Threat', 'Strength', 'Power Free-Kick']
['Strength', 'Argues with Officials']


['Leadership',
 'Speedster',
 'Injury Free',
 'Complete Midfielder',
 'Diver',
 'Finesse Shot',
 'Acrobat',
 'Inflexible',
 'Beat Offside Trap',
 'Strength',
 'Distance Shooter',
 'Long Throw-in',
 'Aerial Threat',
 'Outside Foot Shot',
 'Dribbler',
 'Argues with Officials',
 'Early Crosser',
 'Clinical Finisher',
 'Giant Throw-in',
 'Avoids Using Weaker Foot',
 'Poacher',
 'Engine',
 'FK Specialist',
 'Tackling',
 'None',
 'Complete Forward',
 'Injury Prone',
 '1-on-1 Rush',
 'Crosser',
 'Power Free-Kick',
 'Tactician',
 'Playmaker ',
 'Crowd Favourite',
 'Complete Defender',
 'Skilled Dribbling',
 'Selfish']

In [14]:
unique_traits_tags = unique_set(fifa_players_TraitsTags.player_traits_tags)

In [15]:
# create dataframe to perform one-hot encoding of player's traits/tags

fifa_players_TraitsTags = pd.concat([fifa_players_TraitsTags, pd.DataFrame(columns=unique_traits_tags)])
fifa_players_TraitsTags = fifa_players_TraitsTags.fillna(0)
print(fifa_players_TraitsTags.shape)
print(len(unique_traits_tags))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


(18278, 38)
36


In [16]:
fifa_players_TraitsTags.head()

Unnamed: 0,1-on-1 Rush,Acrobat,Aerial Threat,Argues with Officials,Avoids Using Weaker Foot,Beat Offside Trap,Clinical Finisher,Complete Defender,Complete Forward,Complete Midfielder,...,Poacher,Power Free-Kick,Selfish,Skilled Dribbling,Speedster,Strength,Tackling,Tactician,name_club,player_traits_tags
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"L. Messi, FC Barcelona","[Dribbler, Distance Shooter, Crosser, FK Speci..."
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"Cristiano Ronaldo, Juventus","[Speedster, Dribbler, Distance Shooter, Acroba..."
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"Neymar Jr, Paris Saint-Germain","[Speedster, Dribbler, Playmaker , Crosser, FK ..."
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"J. Oblak, Atlético Madrid",[None]
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"E. Hazard, Real Madrid","[Speedster, Dribbler, Acrobat, Beat Offside Tr..."


In [17]:
# rearrange columns, followed by splitting the positions into elements of a list:

fifa_players_TraitsTags = fifa_players_TraitsTags[['name_club'] + ['player_traits_tags'] + [col for col in unique_traits_tags]]
fifa_players_TraitsTags.head()

Unnamed: 0,name_club,player_traits_tags,Leadership,Speedster,Injury Free,Complete Midfielder,Diver,Finesse Shot,Acrobat,Inflexible,...,Injury Prone,1-on-1 Rush,Crosser,Power Free-Kick,Tactician,Playmaker,Crowd Favourite,Complete Defender,Skilled Dribbling,Selfish
0,"L. Messi, FC Barcelona","[Dribbler, Distance Shooter, Crosser, FK Speci...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Cristiano Ronaldo, Juventus","[Speedster, Dribbler, Distance Shooter, Acroba...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Neymar Jr, Paris Saint-Germain","[Speedster, Dribbler, Playmaker , Crosser, FK ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"J. Oblak, Atlético Madrid",[None],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"E. Hazard, Real Madrid","[Speedster, Dribbler, Acrobat, Beat Offside Tr...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# perform one-hot encoding:

for num, traits_tags_list in enumerate(fifa_players_TraitsTags.player_traits_tags):
    for traits_tags in traits_tags_list:
        if traits_tags in unique_traits_tags:
            fifa_players_TraitsTags.set_value(num, traits_tags, 1)

  


In [19]:
fifa_players_TraitsTags.head()

Unnamed: 0,name_club,player_traits_tags,Leadership,Speedster,Injury Free,Complete Midfielder,Diver,Finesse Shot,Acrobat,Inflexible,...,Injury Prone,1-on-1 Rush,Crosser,Power Free-Kick,Tactician,Playmaker,Crowd Favourite,Complete Defender,Skilled Dribbling,Selfish
0,"L. Messi, FC Barcelona","[Dribbler, Distance Shooter, Crosser, FK Speci...",0,0,0,0,0,1,1,0,...,0,1,1,0,0,0,0,0,0,0
1,"Cristiano Ronaldo, Juventus","[Speedster, Dribbler, Distance Shooter, Acroba...",0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,1
2,"Neymar Jr, Paris Saint-Germain","[Speedster, Dribbler, Playmaker , Crosser, FK ...",0,1,1,1,0,0,1,0,...,0,0,1,1,0,1,1,0,0,1
3,"J. Oblak, Atlético Madrid",[None],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"E. Hazard, Real Madrid","[Speedster, Dribbler, Acrobat, Beat Offside Tr...",0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,1,0,0,1


In [20]:
# do a quick check on Messi:

fifa_players_TraitsTags.iloc[0].player_traits_tags

['Dribbler',
 'Distance Shooter',
 'Crosser',
 'FK Specialist',
 'Acrobat',
 'Clinical Finisher',
 'Complete Forward',
 'Beat Offside Trap',
 'Argues with Officials',
 'Early Crosser',
 'Finesse Shot',
 '1-on-1 Rush',
 'Giant Throw-in',
 'Outside Foot Shot']

In [21]:
fifa_players_TraitsTags.iloc[0]

name_club                                              L. Messi, FC Barcelona
player_traits_tags          [Dribbler, Distance Shooter, Crosser, FK Speci...
Leadership                                                                  0
Speedster                                                                   0
Injury Free                                                                 0
Complete Midfielder                                                         0
Diver                                                                       0
Finesse Shot                                                                1
Acrobat                                                                     1
Inflexible                                                                  0
Beat Offside Trap                                                           1
Strength                                                                    0
Distance Shooter                                                

In [22]:
# transform dataframe to perform cosine similarity algorithm

fifa_players_TraitsTags = fifa_players_TraitsTags.set_index("name_club")
fifa_players_TraitsTags.head()

Unnamed: 0_level_0,player_traits_tags,Leadership,Speedster,Injury Free,Complete Midfielder,Diver,Finesse Shot,Acrobat,Inflexible,Beat Offside Trap,...,Injury Prone,1-on-1 Rush,Crosser,Power Free-Kick,Tactician,Playmaker,Crowd Favourite,Complete Defender,Skilled Dribbling,Selfish
name_club,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"L. Messi, FC Barcelona","[Dribbler, Distance Shooter, Crosser, FK Speci...",0,0,0,0,0,1,1,0,1,...,0,1,1,0,0,0,0,0,0,0
"Cristiano Ronaldo, Juventus","[Speedster, Dribbler, Distance Shooter, Acroba...",0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
"Neymar Jr, Paris Saint-Germain","[Speedster, Dribbler, Playmaker , Crosser, FK ...",0,1,1,1,0,0,1,0,0,...,0,0,1,1,0,1,1,0,0,1
"J. Oblak, Atlético Madrid",[None],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"E. Hazard, Real Madrid","[Speedster, Dribbler, Acrobat, Beat Offside Tr...",0,1,0,0,0,1,1,0,1,...,0,0,0,0,0,0,1,0,0,1


In [None]:
fifa_players_TraitsTags.to_csv('../../../Desktop/Capstone/Datasets/fifa_players_TraitsTags.csv')

### 2c) Combine New Features into Dataset and Run Algorithm

In [62]:
fifa_mc.head()

Unnamed: 0_level_0,overall,potential,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,attack_work_rate,defense_work_rate
name_club,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"L. Messi, FC Barcelona",2.682445,2.345939,-0.184119,0.845483,0.695115,0.532343,-1.615013,-0.938264,0.777831,1.210453,...,-0.541714,-0.541714,-0.45982,-0.678922,-1.180503,-1.180503,-1.180503,-0.678922,-1.729521,-3.469846
"Cristiano Ronaldo, Juventus",2.584294,2.228798,-0.012403,0.938593,0.253188,0.25892,-1.747692,-0.356545,0.605287,1.205156,...,-0.742831,-0.742831,-0.564354,-0.733227,-1.08642,-1.08642,-1.08642,-0.733227,0.24065,-3.424107
"Neymar Jr, Paris Saint-Germain",2.50352,2.129034,0.092706,0.622736,0.560137,0.595603,-1.818389,-1.186731,0.832115,0.91102,...,-0.679715,-0.679715,-0.451147,-0.670111,-1.361716,-1.361716,-1.361716,-0.670111,0.303766,-1.367689
"J. Oblak, Atlético Madrid",4.626806,4.55909,-1.46145,-1.136677,-1.41538,-1.468695,-0.97855,-1.510127,-0.93893,-0.700527,...,-1.424173,-1.424173,-1.489958,-1.454265,-1.318395,-1.318395,-1.318395,-1.454265,0.646509,0.899487
"E. Hazard, Real Madrid",2.377648,1.984172,0.110723,0.54601,0.529387,0.568026,-1.666559,-0.811393,0.522709,0.77593,...,-0.562955,-0.562955,-0.43313,-0.652094,-1.198665,-1.198665,-1.198665,-0.652094,0.321783,-1.349672


In [175]:
print(fifa_mc.columns)
print(len(fifa_mc.columns))

Index(['overall', 'potential', 'pace', 'shooting', 'passing', 'dribbling',
       'defending', 'physic', 'attacking_crossing', 'attacking_finishing',
       'attacking_heading_accuracy', 'attacking_short_passing',
       'attacking_volleys', 'skill_dribbling', 'skill_curve',
       'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control',
       'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
       'movement_reactions', 'movement_balance', 'power_shot_power',
       'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots',
       'defending_standing_tackle', 'defending_sliding_tackle',
       'defending_marking', 'gk_diving', 'gk_handling', 'gk_kicking',
       'gk_reflexes', 'gk_speed', 'gk_positioning', 'weak_foot',
       'mentality_aggression', 'mentality_interceptions',
       'mentality_positioning', 'mentality_vision', 'mentality_penalties',
       'mentality_composure', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
       'lam', 'ca

In [176]:
print(fifa_players_TraitsTags.columns)
print(len(fifa_players_TraitsTags.columns))

Index(['None', '1-on-1 Rush', 'Acrobat', 'Complete Midfielder',
       'Outside Foot Shot', 'Speedster', 'Complete Defender', 'Crosser',
       'Inflexible', 'Avoids Using Weaker Foot', 'Beat Offside Trap',
       'FK Specialist', 'Aerial Threat', 'Diver', 'Playmaker ',
       'Distance Shooter', 'Injury Prone', 'Long Throw-in', 'Dribbler',
       'Giant Throw-in', 'Argues with Officials', 'Skilled Dribbling',
       'Crowd Favourite', 'Selfish', 'Tackling', 'Tactician', 'Poacher',
       'Clinical Finisher', 'Leadership', 'Complete Forward',
       'Power Free-Kick', 'Engine', 'Finesse Shot', 'Strength',
       'Early Crosser', 'Injury Free'],
      dtype='object')
36


In [177]:
print(fifa_player_pos.columns)
print(len(fifa_player_pos.columns))

Index(['position_ls', 'position_st', 'position_rs', 'position_lw',
       'position_lf', 'position_cf', 'position_rf', 'position_rw',
       'position_lam', 'position_cam', 'position_ram', 'position_lm',
       'position_lcm', 'position_cm', 'position_rcm', 'position_rm',
       'position_lwb', 'position_ldm', 'position_cdm', 'position_rdm',
       'position_rwb', 'position_lb', 'position_lcb', 'position_cb',
       'position_rcb', 'position_rb', 'position_gk', 'position_sub',
       'position_res', 'position_none'],
      dtype='object')
30


In [63]:
fifa_new_feats = fifa_mc.merge(fifa_players_TraitsTags, left_index = True, right_index = True)
print(fifa_mc.shape[1])
print(fifa_new_feats.shape[1])

72
108


In [64]:
fifa_new_feats = fifa_new_feats.merge(fifa_player_pos, left_index = True, right_index = True)
print(fifa_player_pos.shape[1])
print(fifa_new_feats.shape[1])

30
138


In [65]:
fifa_new_feats.head()

Unnamed: 0_level_0,overall,potential,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,...,position_rwb,position_lb,position_lcb,position_cb,position_rcb,position_rb,position_gk,position_sub,position_res,position_none
name_club,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"L. Messi, FC Barcelona",2.682445,2.345939,-0.184119,0.845483,0.695115,0.532343,-1.615013,-0.938264,0.777831,1.210453,...,0,0,0,0,0,0,0,0.0,0.0,0.0
"Cristiano Ronaldo, Juventus",2.584294,2.228798,-0.012403,0.938593,0.253188,0.25892,-1.747692,-0.356545,0.605287,1.205156,...,0,0,0,0,0,0,0,0.0,0.0,0.0
"Neymar Jr, Paris Saint-Germain",2.50352,2.129034,0.092706,0.622736,0.560137,0.595603,-1.818389,-1.186731,0.832115,0.91102,...,0,0,0,0,0,0,0,0.0,0.0,0.0
"J. Oblak, Atlético Madrid",4.626806,4.55909,-1.46145,-1.136677,-1.41538,-1.468695,-0.97855,-1.510127,-0.93893,-0.700527,...,0,0,0,0,0,0,1,0.0,0.0,0.0
"E. Hazard, Real Madrid",2.377648,1.984172,0.110723,0.54601,0.529387,0.568026,-1.666559,-0.811393,0.522709,0.77593,...,0,0,0,0,0,0,0,0.0,0.0,0.0


Upon further pondering, I decide to transform using Euclidean Distance instead, as for my case, **magnitude matters**.

A player with an ability of (10, 10) would have the same distance as a player of ability (100, 100), if cosine similiarity metric was used for the transformation instead.

In [1]:
from sklearn.metrics.pairwise import euclidean_distances

In [183]:
sim_matrix = euclidean_distances(fifa_new_feats)
RS_new = pd.DataFrame(sim_matrix, columns=fifa_new_feats.index, index=fifa_new_feats.index)
RS_new = RS_new.where(np.tril(np.ones(RS_new.shape)).astype(np.bool))
RS_new.head()

name_club,"L. Messi, FC Barcelona","Cristiano Ronaldo, Juventus","Neymar Jr, Paris Saint-Germain","J. Oblak, Atlético Madrid","E. Hazard, Real Madrid","K. De Bruyne, Manchester City","M. ter Stegen, FC Barcelona","V. van Dijk, Liverpool","L. Modric, Real Madrid","M. Salah, Liverpool",...,"M. Gallagher, Finn Harps","Huang Jiahui, Dalian YiFang FC","M. Sagaf, Carlisle United","E. Tweed, Derry City","P. Martin, Waterford FC","Shao Shuai, Beijing Renhe FC","Xiao Mingjie, Shanghai SIPG FC","Zhang Wei, Hebei China Fortune FC","Wang Haijian, Shanghai Greenland Shenhua FC","Pan Ximing, Hebei China Fortune FC"
name_club,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"L. Messi, FC Barcelona",0.0,,,,,,,,,,...,,,,,,,,,,
"Cristiano Ronaldo, Juventus",6.458527,0.0,,,,,,,,,...,,,,,,,,,,
"Neymar Jr, Paris Saint-Germain",6.542744,6.792744,0.0,,,,,,,,...,,,,,,,,,,
"J. Oblak, Atlético Madrid",22.099849,21.22799,21.641074,0.0,,,,,,,...,,,,,,,,,,
"E. Hazard, Real Madrid",6.043428,6.222539,4.75033,21.206458,0.0,,,,,,...,,,,,,,,,,


### 2d) Save all dataframes, before performing operations on them

As the dataframes are huge (shape: 18k x 18k), I will transform them into numpy arrays before performing any operations on them, to lower the computational efforts.

In [66]:
fifa_new_feats.to_csv('../../../Desktop/Capstone/Datasets/fifa_new_feats.csv')

In [184]:
np_RS_basic = RS_basic.to_numpy()
np_RS_new = RS_new.to_numpy()

In [185]:
from numpy import savez_compressed

In [186]:
# save all numpy arrays first before proceeding

savez_compressed('../../../Desktop/Capstone/Datasets/RecSys/Compressed Arrays/RS_basic.npz', np_RS_basic)
savez_compressed('../../../Desktop/Capstone/Datasets/RecSys/Compressed Arrays/RS_new.npz', np_RS_new)

In [187]:
import pickle

In [189]:
RS_basic.to_pickle("../../../Desktop/Capstone/Datasets/RecSys/RS_basic.pkl")