In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from pprint import pprint

import sys
import os
import pandas as pd
import numpy as np

In [3]:
REPO_PATH = Path('/home','shawn','Soccer-Player-Attributes-Comparison')
sys.path.append(os.fspath(REPO_PATH))

In [4]:
from fifa_similarity_search.service.index import ExampleIndex

In [5]:
embeddings = pd.read_csv(Path(REPO_PATH, 'datasets','final_embeddings','dnn_embeddings_sh.csv'))

In [6]:
df = pd.read_csv(Path(REPO_PATH, 'datasets','cleaned_soccer_data_120K.csv'))

In [7]:
df = df.query('fifa==16')

In [8]:
embeddings = embeddings.merge(df[['sofifa_id','overall','potential']], how='left', on=['sofifa_id'])

In [9]:
embeddings = embeddings.rename(columns={"dob": "birthday", "short_name": "player_name","overall":"overall_rating","sofifa_id":"player_fifa_api_id"})

In [10]:
embeddings.to_csv(Path(REPO_PATH, 'datasets','final_embeddings','dnn_embeddings_sh_v2.csv'), index=False)

In [11]:
embeddings_columns = []
for i in range(1,15):
    embeddings_columns.append(f'PC_{i}')

In [12]:
def get_dataset(df, embeddings_columns):
    vectors = np.ascontiguousarray(df[embeddings_columns])
    labels = np.ascontiguousarray(df['player_fifa_api_id'])
    
    return vectors.astype('float32'), np.expand_dims(labels, axis=1)

In [13]:
class Service_Index():
    def __init__(self, df, embeddings_columns):
        vectors, labels = get_dataset(df, embeddings_columns)
        self.df = df
        self.vectors = vectors
        self.labels = labels

        self.index = ExampleIndex(vectors, labels)
        self.index.build()

        self.columns = ['player_name', 'player_positions', 'overall_rating', 'potential', 'birthday',
                        'player_fifa_api_id']

    def search_by_player_name(self, player_name):
        # player_name must be exact
        query_id = self.df[self.df.player_name == player_name].player_fifa_api_id.values[0]
        id_ = self.df[self.df.player_fifa_api_id == query_id].index.values[0]
        results, distances = self.index.query(np.expand_dims(self.vectors[id_], axis=0), k=11)
        response = []
        for ind, result_id in enumerate(results):
            if query_id != result_id:
                result_dict = self.df[self.df.player_fifa_api_id == result_id[0]][self.columns].iloc[0].to_dict()
                result_dict["similarity_score"] = 1/(1+distances[ind])
                response.append(result_dict)
        return response

    def search_by_vector(self, vector, query_id):
        results, distances = self.index.query(np.expand_dims(vector, axis=0), k=11)
        response = []
        for ind, result_id in enumerate(results):
            if query_id != result_id:
                result_dict = self.df[self.df.player_fifa_api_id == result_id[0]][self.columns].iloc[0].to_dict()
                result_dict["similarity_score"] = 1/(1+distances[ind])
                response.append(result_dict)
        return response

## Use Case 1: Search all in database

In [14]:
all_index = Service_Index(embeddings, embeddings_columns)

In [15]:
pprint(all_index.search_by_player_name('L. Messi'))

[{'birthday': '1987-02-01',
  'overall_rating': 80,
  'player_fifa_api_id': 162409,
  'player_name': 'G. Rossi',
  'player_positions': 'ST',
  'potential': 80,
  'similarity_score': 0.5864794747935401},
 {'birthday': '1993-11-15',
  'overall_rating': 78,
  'player_fifa_api_id': 211110,
  'player_name': 'P. Dybala',
  'player_positions': 'ST',
  'potential': 88,
  'similarity_score': 0.576062518992101},
 {'birthday': '1988-10-15',
  'overall_rating': 87,
  'player_fifa_api_id': 176635,
  'player_name': 'M. Özil',
  'player_positions': 'CAM, LW',
  'potential': 88,
  'similarity_score': 0.3395830491197915},
 {'birthday': '1984-01-23',
  'overall_rating': 90,
  'player_fifa_api_id': 9014,
  'player_name': 'A. Robben',
  'player_positions': 'RM, LM, RW',
  'potential': 90,
  'similarity_score': 0.3158182371135131},
 {'birthday': '1986-01-16',
  'overall_rating': 69,
  'player_fifa_api_id': 196871,
  'player_name': 'M. Fernández',
  'player_positions': 'RM, ST',
  'potential': 69,
  'simila

In [16]:
#First Name is search query
#The rest are search results

## Use Case 2: Filter-Search: only look at all the under-21s (youths)

Database is 2016. Those under 21 y-o would be born after 1995. Purpose of this is to get the potential next generation Cristiano Ronaldo

In [17]:
embeddings.birthday = pd.to_datetime(embeddings.birthday)
new_df = embeddings.query('birthday > 19950101')

In [18]:
player_name = 'L. Messi'
query_id = embeddings[embeddings.player_name == player_name].player_fifa_api_id.values[0]
query_vector = np.array(embeddings[embeddings.player_fifa_api_id == query_id][embeddings_columns])
query_vector = query_vector.reshape((14)).astype('float32')

In [19]:
index = Service_Index(new_df, embeddings_columns)
response = index.search_by_vector(query_vector, query_id)
pprint(response)

[{'birthday': Timestamp('1995-09-01 00:00:00'),
  'overall_rating': 73,
  'player_fifa_api_id': 220253,
  'player_name': 'Munir',
  'player_positions': 'RW, LW, ST',
  'potential': 85,
  'similarity_score': 0.24818256723712764},
 {'birthday': Timestamp('1996-06-13 00:00:00'),
  'overall_rating': 69,
  'player_fifa_api_id': 213345,
  'player_name': 'K. Coman',
  'player_positions': 'ST, LW',
  'potential': 79,
  'similarity_score': 0.17972101983643746},
 {'birthday': Timestamp('1995-11-20 00:00:00'),
  'overall_rating': 69,
  'player_fifa_api_id': 229147,
  'player_name': 'João Vigário',
  'player_positions': 'RW',
  'potential': 77,
  'similarity_score': 0.16666375271275863},
 {'birthday': Timestamp('1995-05-09 00:00:00'),
  'overall_rating': 68,
  'player_fifa_api_id': 216295,
  'player_name': 'T. Dierckx',
  'player_positions': 'RW',
  'potential': 78,
  'similarity_score': 0.1583054079533606},
 {'birthday': Timestamp('1995-02-13 00:00:00'),
  'overall_rating': 67,
  'player_fifa_api

## Use Case 3: Best-Search: Get the best player of all similar players

After getting the top K most similar players, which players are the best players out of them?

Using the earlier example (use case 2), we notice that each player has a "potential" field. Let's sort them by their potential

In [20]:
sorted(response, key=lambda response: response['potential'], reverse=True) 

[{'player_name': 'Munir',
  'player_positions': 'RW, LW, ST',
  'overall_rating': 73,
  'potential': 85,
  'birthday': Timestamp('1995-09-01 00:00:00'),
  'player_fifa_api_id': 220253,
  'similarity_score': 0.24818256723712764},
 {'player_name': 'B. Traoré',
  'player_positions': 'ST, RW',
  'overall_rating': 74,
  'potential': 85,
  'birthday': Timestamp('1995-09-06 00:00:00'),
  'player_fifa_api_id': 207948,
  'similarity_score': 0.14100624228455244},
 {'player_name': 'T. Werner',
  'player_positions': 'LM, ST',
  'overall_rating': 72,
  'potential': 84,
  'birthday': Timestamp('1996-03-06 00:00:00'),
  'player_fifa_api_id': 212188,
  'similarity_score': 0.1495583463483166},
 {'player_name': 'P. Roberts',
  'player_positions': 'RM',
  'overall_rating': 65,
  'potential': 81,
  'birthday': Timestamp('1997-02-05 00:00:00'),
  'player_fifa_api_id': 221982,
  'similarity_score': 0.13519795426862563},
 {'player_name': 'M. Simon',
  'player_positions': 'ST',
  'overall_rating': 70,
  'pote

We can do the same for Use Case 1, but lets sort by ability instead. We are finding the best player most similar to Messi.

In [21]:
all_index = Service_Index(embeddings, embeddings_columns)
response = all_index.search_by_player_name('L. Messi')

In [22]:
sorted(response, key=lambda response: response['potential'], reverse=True) 

[{'player_name': 'A. Robben',
  'player_positions': 'RM, LM, RW',
  'overall_rating': 90,
  'potential': 90,
  'birthday': Timestamp('1984-01-23 00:00:00'),
  'player_fifa_api_id': 9014,
  'similarity_score': 0.3158182371135131},
 {'player_name': 'P. Dybala',
  'player_positions': 'ST',
  'overall_rating': 78,
  'potential': 88,
  'birthday': Timestamp('1993-11-15 00:00:00'),
  'player_fifa_api_id': 211110,
  'similarity_score': 0.576062518992101},
 {'player_name': 'M. Özil',
  'player_positions': 'CAM, LW',
  'overall_rating': 87,
  'potential': 88,
  'birthday': Timestamp('1988-10-15 00:00:00'),
  'player_fifa_api_id': 176635,
  'similarity_score': 0.3395830491197915},
 {'player_name': 'N. Fékir',
  'player_positions': 'ST, CAM, RW, LW',
  'overall_rating': 79,
  'potential': 85,
  'birthday': Timestamp('1993-07-18 00:00:00'),
  'player_fifa_api_id': 216594,
  'similarity_score': 0.2593831580882039},
 {'player_name': 'Munir',
  'player_positions': 'RW, LW, ST',
  'overall_rating': 73