In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [2]:
from pathlib import Path
from pprint import pprint

import sys
import os
import pandas as pd
import numpy as np

In [3]:
# Root of the project checkout. Defaults to the original author's location;
# set the SOCCER_REPO_PATH environment variable to run the notebook elsewhere.
REPO_PATH = Path(
    os.environ.get(
        'SOCCER_REPO_PATH',
        os.fspath(Path('/home', 'shawn', 'Soccer-Player-Attributes-Comparison')),
    )
)
# Make the project package importable; guard the append so re-running this
# cell does not keep adding duplicate entries to sys.path.
if os.fspath(REPO_PATH) not in sys.path:
    sys.path.append(os.fspath(REPO_PATH))

In [4]:
from fifa_similarity_search.service.index import ExampleIndex

In [5]:
# Load the pre-computed player embeddings (PC1..PC14 plus player metadata).
embeddings = pd.read_csv(REPO_PATH / 'datasets' / 'soccer_player_embeddings_v1.csv')

In [6]:
# Preview the first five rows (head() defaults to n=5).
embeddings.head()

Unnamed: 0,id,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,PC12,PC13,PC14,player_fifa_api_id,player_api_id,overall_rating,potential,player_name,birthday,player_positions
0,97455,-0.020058,0.683692,-0.25527,-0.329416,-0.594869,0.124635,-0.209664,0.304167,-0.082228,...,0.047019,-0.091479,0.273118,119152,30831,78.0,78.0,Kolo Toure,1981-03-19 00:00:00,CB
1,42881,0.068503,0.188564,-0.228424,-0.542123,-0.304828,0.015172,0.001972,-0.151031,-0.113127,...,-0.188535,0.25828,-0.195703,199284,133126,63.0,63.0,Dennis Hediger,1986-09-22 00:00:00,"CDM, CM, CAM"
2,97483,2.236177,-0.317256,1.209209,0.559709,0.488686,0.023748,-0.034294,-0.239571,-0.111753,...,-0.245468,0.054037,0.010629,201169,206641,58.0,63.0,Konrad Forenc,1992-07-17 00:00:00,GK
3,97505,-0.553902,0.108279,-0.700631,1.268173,0.137731,-0.259513,0.327372,-0.145581,-0.180327,...,-0.004284,-0.028598,-0.068586,183559,11242,67.0,67.0,Konstantin Engel,1988-07-27 00:00:00,"RB, LB"
4,42402,0.235236,-0.260011,1.173912,-0.533751,0.475334,-0.042503,-0.011437,-0.537734,0.061248,...,0.065391,-0.188249,0.083385,225951,664962,64.0,75.0,Denis Bouanga,1994-11-11 00:00:00,"RW, ST"


In [7]:
# Names of the embedding columns, 'PC1' through 'PC14'.
# A comprehension avoids leaking a loop variable into the notebook namespace.
embeddings_columns = [f'PC{component}' for component in range(1, 15)]

In [8]:
def get_dataset(df, embeddings_columns):
    """Split a player frame into an embedding matrix and a column of ids.

    Args:
        df: DataFrame containing the embedding columns and a
            'player_fifa_api_id' column.
        embeddings_columns: names of the columns that make up each vector.

    Returns:
        A tuple ``(vectors, labels)``: ``vectors`` is the selected columns as
        a float32 array, ``labels`` is the 'player_fifa_api_id' values with an
        extra trailing axis (one id per row).
    """
    feature_matrix = np.ascontiguousarray(df[embeddings_columns]).astype('float32')
    # Add a second axis so every row carries its id as a length-1 vector.
    id_column = np.ascontiguousarray(df['player_fifa_api_id'])[:, np.newaxis]
    return feature_matrix, id_column

In [9]:
class Service_Index():
    """Similarity-search helper that returns player records for index hits.

    Builds an ``ExampleIndex`` from the embedding columns of ``df`` and maps
    the ids returned by its ``query`` method back to rows of ``df``.
    """

    def __init__(self, df, embeddings_columns):
        """Build the index over ``df``.

        Args:
            df: DataFrame with the embedding columns plus every column listed
                in ``self.columns``.
            embeddings_columns: names of the columns forming each vector.
        """
        vectors, labels = get_dataset(df, embeddings_columns)
        self.df = df
        self.vectors = vectors
        self.labels = labels

        self.index = ExampleIndex(vectors, labels)
        self.index.build()

        # Fields reported for every search hit.
        self.columns = ['player_name', 'player_positions', 'overall_rating', 'potential', 'birthday', 'player_fifa_api_id']

    def _to_records(self, results):
        """Turn raw index results into a list of player dicts.

        The first element of each result is treated as a
        'player_fifa_api_id'; the first matching row of ``self.df`` is
        reported, restricted to ``self.columns``.
        """
        response = []
        for result_id in results:
            matches = self.df[self.df.player_fifa_api_id == result_id[0]]
            response.append(matches[self.columns].iloc[0].to_dict())
        return response

    def search_by_player_name(self, player_name, k=10):
        """Search using the stored vector of the player called ``player_name``.

        Args:
            player_name: exact 'player_name' value to look up.
            k: number of results requested from the index (default 10).

        Returns:
            A list of dicts, one per hit, with the keys in ``self.columns``.

        Raises:
            IndexError: if no row has the given ``player_name``.
        """
        name_matches = self.df[self.df.player_name == player_name]
        if name_matches.empty:
            raise IndexError(f'player_name {player_name!r} not found in the indexed data')
        query_id = name_matches.player_fifa_api_id.values[0]
        # self.vectors is ordered by row *position*, not by the DataFrame's
        # index labels, so locate the row positionally. Using the label here
        # breaks on frames whose index is not 0..n-1 (e.g. after filtering).
        id_mask = (self.df.player_fifa_api_id == query_id).to_numpy()
        row_position = np.flatnonzero(id_mask)[0]
        return self.search_by_vector(self.vectors[row_position], k=k)

    def search_by_vector(self, vector, k=10):
        """Search using an arbitrary query vector.

        Args:
            vector: 1-D query vector; a leading batch axis is added before
                it is passed to the index.
            k: number of results requested from the index (default 10).

        Returns:
            A list of dicts, one per hit, with the keys in ``self.columns``.
        """
        results = self.index.query(np.expand_dims(vector, axis=0), k=k)
        return self._to_records(results)

## Use Case 1: Search all in database

In [10]:
# Index every player in the dataset.
all_index = Service_Index(df=embeddings, embeddings_columns=embeddings_columns)

In [11]:
# Look up the players closest to Cristiano Ronaldo across the whole dataset.
pprint(all_index.search_by_player_name(player_name='Cristiano Ronaldo'))

[{'birthday': '1985-02-05 00:00:00',
  'overall_rating': 93.0,
  'player_fifa_api_id': 20801,
  'player_name': 'Cristiano Ronaldo',
  'player_positions': 'LW, LM',
  'potential': 93.0},
 {'birthday': '1989-06-18 00:00:00',
  'overall_rating': 84.0,
  'player_fifa_api_id': 188567,
  'player_name': 'Pierre-Emerick Aubameyang',
  'player_positions': 'ST, RM, RW',
  'potential': 85.0},
 {'birthday': '1994-02-13 00:00:00',
  'overall_rating': 79.0,
  'player_fifa_api_id': 202556,
  'player_name': 'Memphis Depay',
  'player_positions': 'LW, CAM',
  'potential': 86.0},
 {'birthday': '1992-01-04 00:00:00',
  'overall_rating': 80.0,
  'player_fifa_api_id': 208808,
  'player_name': 'Quincy Promes',
  'player_positions': 'RW, RM, LW',
  'potential': 86.0},
 {'birthday': '1981-12-03 00:00:00',
  'overall_rating': 81.0,
  'player_fifa_api_id': 113422,
  'player_name': 'David Villa',
  'player_positions': 'ST',
  'potential': 81.0},
 {'birthday': '1980-07-08 00:00:00',
  'overall_rating': 80.0,
  'p

In [12]:
# The first entry in the output above is the query player himself;
# the remaining entries are the search results.

## Use Case 2: Filter-Search: only look at all the under-21s (youths)

The database is from 2016, so players who were under 21 years old at the time were born from 1995 onwards. The purpose of this search is to find the potential next-generation Cristiano Ronaldo.

In [13]:
# Parse the birthday strings into datetimes so they can be compared as dates.
# Note: this overwrites the 'birthday' column of `embeddings` in place.
embeddings.birthday = pd.to_datetime(embeddings.birthday)
# Players born after this date were under 21 in the 2016 dataset. An explicit
# Timestamp avoids relying on a bare integer (19950101) being coerced to a date.
U21_BIRTHDAY_CUTOFF = pd.Timestamp('1995-01-01')
new_df = embeddings.query('birthday > @U21_BIRTHDAY_CUTOFF')

In [14]:
# Pull the query player's embedding out of the full dataset so it can be used
# to search an index that does not contain him.
player_name = 'Cristiano Ronaldo'
query_id = embeddings[embeddings.player_name == player_name].player_fifa_api_id.values[0]
query_vector = np.array(embeddings[embeddings.player_fifa_api_id == query_id][embeddings_columns])
# Flatten the single matching row to 1-D; the length is taken from
# embeddings_columns instead of repeating the dimension as a magic number.
query_vector = query_vector.reshape(len(embeddings_columns)).astype('float32')

In [21]:
# Index only the filtered (under-21) players, then search it with the
# query player's vector.
index = Service_Index(df=new_df, embeddings_columns=embeddings_columns)
response = index.search_by_vector(vector=query_vector)
pprint(response)

[{'birthday': Timestamp('1995-03-08 00:00:00'),
  'overall_rating': 77.0,
  'player_fifa_api_id': 215785,
  'player_name': 'Balde Diao Keita',
  'player_positions': 'LW, RW, ST',
  'potential': 85.0},
 {'birthday': Timestamp('1996-12-01 00:00:00'),
  'overall_rating': 74.0,
  'player_fifa_api_id': 224458,
  'player_name': 'Diogo Jota',
  'player_positions': 'ST',
  'potential': 87.0},
 {'birthday': Timestamp('1995-02-13 00:00:00'),
  'overall_rating': 75.0,
  'player_fifa_api_id': 219732,
  'player_name': "Georges N'Koudou",
  'player_positions': 'LM, RM',
  'potential': 82.0},
 {'birthday': Timestamp('1995-02-07 00:00:00'),
  'overall_rating': 68.0,
  'player_fifa_api_id': 215598,
  'player_name': 'Shani Tarashaj',
  'player_positions': 'CAM, ST',
  'potential': 80.0},
 {'birthday': Timestamp('1995-07-15 00:00:00'),
  'overall_rating': 70.0,
  'player_fifa_api_id': 211239,
  'player_name': 'Corentin Jean',
  'player_positions': 'ST, LM, RM',
  'potential': 82.0},
 {'birthday': Timesta

Anthony Martial is the 6th nearest player. Back in 2016, he was touted as the next CR7:

https://www.express.co.uk/sport/football/650539/Anthony-Martial-Manchester-United-Next-Cristiano-Ronaldo-Louis-Saha
"Louis Saha: Why Anthony Martial can be Manchester United's new Cristiano Ronaldo"

## Use Case 3: Best-Search: Get the best player of all similar players

After getting the top K most similar players, which players are the best players out of them?

Using the earlier example (use case 2), we notice that each player has a "potential" field. Let's sort them by their potential

In [22]:
# Rank the hits from highest to lowest potential.
sorted(response, key=lambda player: player['potential'], reverse=True)

[{'player_name': 'Breel Embolo',
  'player_positions': 'ST, RM',
  'overall_rating': 76.0,
  'potential': 88.0,
  'birthday': Timestamp('1997-02-14 00:00:00'),
  'player_fifa_api_id': 222357},
 {'player_name': 'Diogo Jota',
  'player_positions': 'ST',
  'overall_rating': 74.0,
  'potential': 87.0,
  'birthday': Timestamp('1996-12-01 00:00:00'),
  'player_fifa_api_id': 224458},
 {'player_name': 'Anthony Martial',
  'player_positions': 'ST',
  'overall_rating': 78.0,
  'potential': 86.0,
  'birthday': Timestamp('1995-12-05 00:00:00'),
  'player_fifa_api_id': 211300},
 {'player_name': 'Balde Diao Keita',
  'player_positions': 'LW, RW, ST',
  'overall_rating': 77.0,
  'potential': 85.0,
  'birthday': Timestamp('1995-03-08 00:00:00'),
  'player_fifa_api_id': 215785},
 {'player_name': 'Anwar El-Ghazi',
  'player_positions': 'RW',
  'overall_rating': 76.0,
  'potential': 84.0,
  'birthday': Timestamp('1995-05-03 00:00:00'),
  'player_fifa_api_id': 216433},
 {'player_name': "Georges N'Koudou",

Anthony Martial now ranked 3rd.

We can do the same for Use Case 1, but let's sort by current ability (overall rating) instead. This finds the best players among those most similar to Ronaldo.

In [23]:
# Rebuild the full-dataset index and fetch the players closest to Ronaldo.
all_index = Service_Index(df=embeddings, embeddings_columns=embeddings_columns)
response = all_index.search_by_player_name(player_name='Cristiano Ronaldo')

In [25]:
# Rank the hits from highest to lowest overall rating.
sorted(response, key=lambda player: player['overall_rating'], reverse=True)

[{'player_name': 'Cristiano Ronaldo',
  'player_positions': 'LW, LM',
  'overall_rating': 93.0,
  'potential': 93.0,
  'birthday': Timestamp('1985-02-05 00:00:00'),
  'player_fifa_api_id': 20801},
 {'player_name': 'Pierre-Emerick Aubameyang',
  'player_positions': 'ST, RM, RW',
  'overall_rating': 84.0,
  'potential': 85.0,
  'birthday': Timestamp('1989-06-18 00:00:00'),
  'player_fifa_api_id': 188567},
 {'player_name': 'David Villa',
  'player_positions': 'ST',
  'overall_rating': 81.0,
  'potential': 81.0,
  'birthday': Timestamp('1981-12-03 00:00:00'),
  'player_fifa_api_id': 113422},
 {'player_name': 'Quincy Promes',
  'player_positions': 'RW, RM, LW',
  'overall_rating': 80.0,
  'potential': 86.0,
  'birthday': Timestamp('1992-01-04 00:00:00'),
  'player_fifa_api_id': 208808},
 {'player_name': 'Robbie Keane',
  'player_positions': 'ST',
  'overall_rating': 80.0,
  'potential': 80.0,
  'birthday': Timestamp('1980-07-08 00:00:00'),
  'player_fifa_api_id': 330},
 {'player_name': 'Mem