In [54]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from pathlib import Path
from pprint import pprint

import sys
import os
import pandas as pd
import numpy as np

In [3]:
REPO_PATH = Path('/home','shawn','Soccer-Player-Attributes-Comparison')
sys.path.append(os.fspath(REPO_PATH))

In [4]:
from fifa_similarity_search.service.index import ExampleIndex

In [31]:
embeddings = pd.read_csv(Path(REPO_PATH, 'datasets','final_embeddings','dnn_embeddings_v3.csv'))

In [32]:
df = pd.read_csv(Path(REPO_PATH, 'datasets','cleaned_soccer_data_120K.csv'))

In [33]:
df = df.query('fifa==16')

In [34]:
embeddings = embeddings.merge(df[['sofifa_id','overall','potential']], how='left', on=['sofifa_id'])

In [35]:
embeddings = embeddings.rename(columns={"dob": "birthday", "short_name": "player_name","overall":"overall_rating","sofifa_id":"player_fifa_api_id"})

In [10]:
#embeddings.to_csv(Path(REPO_PATH, 'datasets','final_embeddings','dnn_embeddings_sh_v2.csv'), index=False)

In [11]:
embeddings = pd.read_csv(Path(REPO_PATH, 'datasets','final_embeddings','dnn_embeddings_sh_v2.csv'))

In [36]:
embeddings

Unnamed: 0,player_fifa_api_id,player_name,long_name,birthday,fifa,player_positions_general,player_positions,PC0,PC1,PC2,...,PC24,PC25,PC26,PC27,PC28,PC29,PC30,PC31,overall_rating,potential
0,158023,L. Messi,Lionel Andrés Messi Cuccittini,1987-06-24,16,ATK,"RW, CF",-2.820459,0.463528,-14.149214,...,-7.790683,1.493453,-1.563942,-0.325550,-1.689328,-8.339558,-0.498441,-11.954984,94,95
1,20801,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,1985-02-05,16,ATK,"LW, LM",-6.879275,-0.027998,-14.428118,...,-8.056432,1.180157,-3.449321,-0.729221,-4.749788,-10.166391,1.064353,-11.029340,93,93
2,9014,A. Robben,Arjen Robben,1984-01-23,16,MID,"RM, LM, RW",-1.214763,-0.025625,-11.304901,...,-6.153422,1.276017,-1.281750,0.319182,-0.955590,-6.504946,-0.260668,-8.885252,90,90
3,167495,M. Neuer,Manuel Neuer,1986-03-27,16,GK,GK,-20.294119,13.371469,4.543362,...,1.984269,10.765091,-14.294777,-11.988588,-19.039328,-8.704237,-7.056895,-0.351833,90,90
4,176580,L. Suárez,Luis Alberto Suárez Díaz,1987-01-24,16,ATK,ST,-5.768158,0.626190,-11.303053,...,-6.374952,1.721924,-2.405569,-1.052525,-3.861679,-8.001318,-0.152860,-8.865654,90,90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14876,224867,R. Horne,Ryan Horne,1995-11-02,16,MID,CM,0.423741,0.607692,0.048621,...,0.342504,0.010696,-2.454395,-0.168915,-3.610232,0.519966,-0.259955,-0.734067,44,57
14877,229852,C. Shephard,Corey Shephard,1997-12-28,16,MID,CM,0.472541,0.962732,-0.677146,...,-0.093138,0.293502,-2.360829,-0.814828,-3.576773,0.158617,-0.910374,-2.140381,44,56
14878,227910,L. Gooch,Liam Gooch,1997-11-25,16,GK,GK,-6.923523,7.113905,2.510575,...,1.565786,6.228837,-10.562727,-4.795074,-12.354741,-4.431404,-4.144162,1.331121,44,54
14879,225339,S. Warburton,Sam Warburton,1996-10-10,16,DEF,"LB, LM",0.109028,-0.682285,0.891646,...,0.013958,0.833154,0.838275,0.634556,0.705203,0.730203,0.028193,0.589721,44,51


In [37]:
embeddings_columns = []
for i in range(0,32):
    embeddings_columns.append(f'PC{i}')

In [38]:
def get_dataset(df, embeddings_columns):
    vectors = np.ascontiguousarray(df[embeddings_columns])
    labels = np.ascontiguousarray(df['player_fifa_api_id'])
    
    return vectors.astype('float32'), np.expand_dims(labels, axis=1)

In [39]:
class Service_Index():
    def __init__(self, df, embeddings_columns):
        vectors, labels = get_dataset(df, embeddings_columns)
        self.df = df
        self.vectors = vectors
        self.labels = labels

        self.index = ExampleIndex(vectors, labels)
        self.index.build()

        self.columns = ['player_name', 'player_positions', 'overall_rating', 'potential', 'birthday',
                        'player_fifa_api_id']

    def search_by_player_name(self, player_name):
        # player_name must be exact
        query_id = self.df[self.df.player_name == player_name].player_fifa_api_id.values[0]
        id_ = self.df[self.df.player_fifa_api_id == query_id].index.values[0]
        results, distances = self.index.query(np.expand_dims(self.vectors[id_], axis=0), k=11)
        response = []
        for ind, result_id in enumerate(results):
            if query_id != result_id:
                result_dict = self.df[self.df.player_fifa_api_id == result_id[0]][self.columns].iloc[0].to_dict()
                result_dict["similarity_score"] = 1/(1+distances[ind])
                response.append(result_dict)
        return response

    def search_by_vector(self, vector, query_id):
        results, distances = self.index.query(np.expand_dims(vector, axis=0), k=11)
        response = []
        for ind, result_id in enumerate(results):
            if query_id != result_id:
                result_dict = self.df[self.df.player_fifa_api_id == result_id[0]][self.columns].iloc[0].to_dict()
                result_dict["similarity_score"] = 1/(1+distances[ind])
                response.append(result_dict)
        return response

## Use Case 1: Search all in database

In [40]:
all_index = Service_Index(embeddings, embeddings_columns)

In [55]:
pprint(all_index.search_by_player_name('B. Richardson'))

[{'birthday': '1995-02-14',
  'overall_rating': 46,
  'player_fifa_api_id': 229393,
  'player_name': 'S. Hornby',
  'player_positions': 'GK',
  'potential': 55,
  'similarity_score': 0.10569022761313629},
 {'birthday': '1997-06-22',
  'overall_rating': 46,
  'player_fifa_api_id': 228155,
  'player_name': 'C. Johns',
  'player_positions': 'GK',
  'potential': 57,
  'similarity_score': 0.10478651282969634},
 {'birthday': '1997-07-06',
  'overall_rating': 45,
  'player_fifa_api_id': 215734,
  'player_name': 'E. Farrell',
  'player_positions': 'GK',
  'potential': 61,
  'similarity_score': 0.09459603923470901},
 {'birthday': '1998-06-12',
  'overall_rating': 46,
  'player_fifa_api_id': 226099,
  'player_name': 'A. Bishop',
  'player_positions': 'GK',
  'potential': 53,
  'similarity_score': 0.08879549031204292},
 {'birthday': '1997-08-02',
  'overall_rating': 48,
  'player_fifa_api_id': 225342,
  'player_name': 'J. Stevens',
  'player_positions': 'GK',
  'potential': 54,
  'similarity_scor

In [41]:
#First Name is search query
#The rest are search results

## Use Case 2: Filter-Search: only look at all the under-21s (youths)

Database is 2016. Those under 21 y-o would be born after 1995. Purpose of this is to get the potential next generation Cristiano Ronaldo

In [60]:
embeddings.birthday = pd.to_datetime(embeddings.birthday)
new_df = embeddings.query('birthday > 19950101')

In [61]:
player_name = 'Cristiano Ronaldo'
query_id = embeddings[embeddings.player_name == player_name].player_fifa_api_id.values[0]
query_vector = np.array(embeddings[embeddings.player_fifa_api_id == query_id][embeddings_columns])
query_vector = query_vector.reshape((32)).astype('float32')

In [62]:
index = Service_Index(new_df, embeddings_columns)
response = index.search_by_vector(query_vector, query_id)
pprint(response)

[{'birthday': Timestamp('1997-04-05 00:00:00'),
  'overall_rating': 64,
  'player_fifa_api_id': 228635,
  'player_name': 'Borja Mayoral',
  'player_positions': 'ST',
  'potential': 79,
  'similarity_score': 0.27580427548496017},
 {'birthday': Timestamp('1996-06-12 00:00:00'),
  'overall_rating': 68,
  'player_fifa_api_id': 222059,
  'player_name': 'Mosquito',
  'player_positions': 'ST',
  'potential': 83,
  'similarity_score': 0.23187068767120578},
 {'birthday': Timestamp('1997-01-08 00:00:00'),
  'overall_rating': 62,
  'player_fifa_api_id': 225019,
  'player_name': 'R. Cisneros',
  'player_positions': 'ST',
  'potential': 73,
  'similarity_score': 0.20609737694989047},
 {'birthday': Timestamp('1995-07-15 00:00:00'),
  'overall_rating': 70,
  'player_fifa_api_id': 211239,
  'player_name': 'C. Jean',
  'player_positions': 'ST, LM, RM',
  'potential': 82,
  'similarity_score': 0.1940049148049897},
 {'birthday': Timestamp('1995-06-11 00:00:00'),
  'overall_rating': 61,
  'player_fifa_api

## Use Case 3: Best-Search: Get the best player of all similar players

After getting the top K most similar players, which players are the best players out of them?

Using the earlier example (use case 2), we notice that each player has a "potential" field. Let's sort them by their potential

In [20]:
sorted(response, key=lambda response: response['potential'], reverse=True) 

[{'player_name': 'Munir',
  'player_positions': 'RW, LW, ST',
  'overall_rating': 73,
  'potential': 85,
  'birthday': Timestamp('1995-09-01 00:00:00'),
  'player_fifa_api_id': 220253,
  'similarity_score': 0.24818256723712764},
 {'player_name': 'B. Traoré',
  'player_positions': 'ST, RW',
  'overall_rating': 74,
  'potential': 85,
  'birthday': Timestamp('1995-09-06 00:00:00'),
  'player_fifa_api_id': 207948,
  'similarity_score': 0.14100624228455244},
 {'player_name': 'T. Werner',
  'player_positions': 'LM, ST',
  'overall_rating': 72,
  'potential': 84,
  'birthday': Timestamp('1996-03-06 00:00:00'),
  'player_fifa_api_id': 212188,
  'similarity_score': 0.1495583463483166},
 {'player_name': 'P. Roberts',
  'player_positions': 'RM',
  'overall_rating': 65,
  'potential': 81,
  'birthday': Timestamp('1997-02-05 00:00:00'),
  'player_fifa_api_id': 221982,
  'similarity_score': 0.13519795426862563},
 {'player_name': 'M. Simon',
  'player_positions': 'ST',
  'overall_rating': 70,
  'pote

We can do the same for Use Case 1, but lets sort by ability instead. We are finding the best player most similar to Messi.

In [21]:
all_index = Service_Index(embeddings, embeddings_columns)
response = all_index.search_by_player_name('L. Messi')

In [22]:
sorted(response, key=lambda response: response['potential'], reverse=True) 

[{'player_name': 'A. Robben',
  'player_positions': 'RM, LM, RW',
  'overall_rating': 90,
  'potential': 90,
  'birthday': Timestamp('1984-01-23 00:00:00'),
  'player_fifa_api_id': 9014,
  'similarity_score': 0.3158182371135131},
 {'player_name': 'P. Dybala',
  'player_positions': 'ST',
  'overall_rating': 78,
  'potential': 88,
  'birthday': Timestamp('1993-11-15 00:00:00'),
  'player_fifa_api_id': 211110,
  'similarity_score': 0.576062518992101},
 {'player_name': 'M. Özil',
  'player_positions': 'CAM, LW',
  'overall_rating': 87,
  'potential': 88,
  'birthday': Timestamp('1988-10-15 00:00:00'),
  'player_fifa_api_id': 176635,
  'similarity_score': 0.3395830491197915},
 {'player_name': 'N. Fékir',
  'player_positions': 'ST, CAM, RW, LW',
  'overall_rating': 79,
  'potential': 85,
  'birthday': Timestamp('1993-07-18 00:00:00'),
  'player_fifa_api_id': 216594,
  'similarity_score': 0.2593831580882039},
 {'player_name': 'Munir',
  'player_positions': 'RW, LW, ST',
  'overall_rating': 73