In [None]:
import pandas as pd
import numpy as np
import random
import pathlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from pyclustering.cluster.cure import cure
from pyclustering.utils import read_sample
from sklearn.neighbors import NearestNeighbors

from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
import sys

# sys.path entries must be directories (or zip archives), not module files.
# The original appended the literal file name "clustering_eval.py", which the
# import system ignores.  The relative file name implies the module sits in the
# working directory, so register that directory instead.
# NOTE(review): confirm clustering_eval.py really lives next to this notebook.
if "." not in sys.path:
    sys.path.append(".")

# Provides the plotting helpers used by ScoreGenerator.plot_best
# (labels_in_cluster, cosine_matrix, silhouette_blob).
from clustering_eval import *

In [None]:
def remove_GK(samples):
    """Return the row indices of the larger of two KMeans clusters.

    The samples are split into two clusters (KMeans, ``random_state=42``) and
    the indices of the rows belonging to the more populated cluster are
    returned.  Presumably the smaller cluster corresponds to the goalkeepers
    (per the function name) -- verify against the data.

    Parameters
    ----------
    samples : array-like of shape (n_samples, n_features)

    Returns
    -------
    numpy.ndarray
        1-D integer array of indices into ``samples``.
    """
    cluster_ids = KMeans(n_clusters=2, random_state=42).fit(samples).labels_
    values, counts = np.unique(cluster_ids, return_counts=True)
    majority = values[counts.argmax()]
    return np.flatnonzero(cluster_ids == majority)

def group_samples(total_samples, nonGK_ind, nonGK_labels):
    """Expand labels of a subset of samples to a label vector for all samples.

    Every position listed in ``nonGK_ind`` receives the matching entry of
    ``nonGK_labels``; all remaining positions receive one extra label equal to
    ``max(nonGK_labels) + 1``.

    Parameters
    ----------
    total_samples : int
        Length of the returned label vector.
    nonGK_ind : array-like of int
        Indices (into the full sample set) of the labelled subset.
    nonGK_labels : array-like
        Cluster labels of the subset, aligned with ``nonGK_ind``.

    Returns
    -------
    numpy.ndarray
        Label vector of length ``total_samples``.
    """
    # Label reserved for every sample that is not part of the subset.
    filler = np.amax(nonGK_labels) + 1
    merged = np.full((total_samples), filler)
    merged[nonGK_ind] = nonGK_labels
    return merged

In [None]:
class ScoreGenerator():
    """Grid-search clustering algorithms on one embedding and rank them by silhouette score.

    For a given sample matrix the class evaluates DBSCAN, KMeans, CURE and
    agglomerative clustering over a small parameter grid, stores one result
    table per algorithm as ``<embedding_name>/<Algorithm>_results.csv`` and a
    combined top-20 table as ``<embedding_name>/Overall_results.csv``.  A table
    whose CSV already exists is loaded instead of being recomputed.

    Attributes
    ----------
    embedding : str
        Name of the embedding; also the directory the CSV files live in.
    dbscan, kmeans, cure, aggClustering : pandas.DataFrame
        Per-algorithm results sorted by descending silhouette score.
    overall : pandas.DataFrame
        Best 20 configurations across all algorithms.

    Notes
    -----
    Relies on module-level names imported by the notebook (``KMeans``,
    ``AgglomerativeClustering``, ``DBSCAN``, ``cure``, ``NearestNeighbors``,
    ``silhouette_score``, ``train_test_split``) and on the plotting helpers
    from ``clustering_eval``.
    """

    # Columns shared by every per-algorithm table; used for the combined ranking.
    _OVERALL_COLUMNS = ['Title', 'Embeddings', 'Algorithm', 'Num Clusters', 'Silhouette Score']

    def __init__(self, samples, embedding_name):
        """Load cached result tables or compute them from ``samples``.

        Parameters
        ----------
        samples : numpy.ndarray of shape (n_samples, n_features)
            Data to cluster.  Only accessed for tables that are not cached.
        embedding_name : str
            Label of the embedding and directory used for the CSV cache.
        """
        self.embedding = embedding_name

        # to_csv does not create missing directories; make sure the cache
        # directory exists before any (expensive) computation is attempted.
        pathlib.Path(embedding_name).mkdir(parents=True, exist_ok=True)

        self.dbscan = self._load_or_compute('DBSCAN', self.try_DBSCAN, samples)
        # The original checked for 'KMeans_results.csv' but read
        # 'Kmeans_results.csv', which fails on case-sensitive filesystems.
        self.kmeans = self._load_or_compute('KMeans', self.try_KMeans, samples)
        self.cure = self._load_or_compute('CURE', self.try_CURE, samples)
        self.aggClustering = self._load_or_compute('AggClustering', self.try_AggClustering, samples)

        overall_path = self._results_path('Overall')
        if overall_path.exists():
            print("Using preexisting Overall results")
            self.overall = pd.read_csv(overall_path)
        else:
            self.overall = self.combine_results(samples.shape[0])

    def _results_path(self, name):
        """Return the path of the CSV cache file for the table called ``name``."""
        return pathlib.Path(self.embedding) / '{}_results.csv'.format(name)

    def _load_or_compute(self, name, compute, samples):
        """Read the cached table ``name`` if present, otherwise run ``compute(samples)``."""
        path = self._results_path(name)
        if path.exists():
            print("Using preexisting {} results".format(name))
            return pd.read_csv(path)
        return compute(samples)

    def _rank_and_save(self, records, columns, name):
        """Build a table from ``records``, sort it by silhouette score and cache it as CSV."""
        # Built in one go: DataFrame.append was removed in pandas 2.0 and
        # row-by-row appending is quadratic anyway.
        table = pd.DataFrame(records, columns=columns)
        table = table.sort_values('Silhouette Score', ascending=False).reset_index(drop=True)
        table.to_csv(self._results_path(name), index=False)
        return table

    @staticmethod
    def _make_agglomerative(n_clusters, affinity, linkage):
        """Create an AgglomerativeClustering estimator across scikit-learn versions.

        scikit-learn 1.2 renamed the ``affinity`` parameter to ``metric`` and
        1.4 removed the old name; older releases only know ``affinity``.
        """
        try:
            return AgglomerativeClustering(n_clusters=n_clusters, metric=affinity, linkage=linkage)
        except TypeError:
            return AgglomerativeClustering(n_clusters=n_clusters, affinity=affinity, linkage=linkage)

    def try_KMeans(self, samples):
        """Score KMeans for 6, 8, ..., 20 clusters.

        Returns
        -------
        pandas.DataFrame
            One row per cluster count, sorted by descending silhouette score;
            also written to ``<embedding>/KMeans_results.csv``.
        """
        columns = ['Title', 'Embeddings', 'Algorithm', 'Num Clusters', 'Silhouette Score']
        records = []

        for nc in range(6, 21, 2):
            kmeans = KMeans(n_clusters=nc, random_state=42).fit(samples)
            records.append({
                'Title': "{}_KMeans_nc:{}".format(self.embedding, nc),
                'Embeddings': self.embedding,
                'Algorithm': 'KMeans',
                'Num Clusters': nc,
                'Silhouette Score': silhouette_score(samples, kmeans.labels_),
            })

        return self._rank_and_save(records, columns, 'KMeans')

    def try_AggClustering(self, samples):
        """Score agglomerative clustering over linkage x affinity x cluster count.

        Ward linkage is evaluated with the euclidean affinity only (the only
        one it supports); the other linkages are evaluated with l1, l2 and
        cosine.

        Returns
        -------
        pandas.DataFrame
            Sorted results, also written to ``<embedding>/AggClustering_results.csv``.
        """
        AFFINITIES = ["l1", "l2", "cosine"]
        LINKAGE = ["ward", "complete", "average", "single"]
        columns = ['Title', 'Embeddings', 'Algorithm', 'Num Clusters', 'Linkage', 'Affinity', 'Silhouette Score']
        records = []

        for link in LINKAGE:
            # The original skipped ward unless the affinity was "euclidean",
            # which never appeared in AFFINITIES, so ward was silently never
            # evaluated.
            affinities = ["euclidean"] if link == "ward" else AFFINITIES
            for aff in affinities:
                for nc in range(6, 21, 2):
                    hc = self._make_agglomerative(nc, aff, link)
                    hc.fit(samples)
                    records.append({
                        'Title': "{}_AggClustering_nc:{}_link:{}_aff:{}".format(self.embedding, nc, link, aff),
                        'Embeddings': self.embedding,
                        'Algorithm': 'Agg_Clustering',
                        'Num Clusters': nc,
                        'Linkage': link,
                        'Affinity': aff,
                        'Silhouette Score': silhouette_score(samples, hc.labels_),
                    })

        return self._rank_and_save(records, columns, 'AggClustering')

    def try_CURE(self, samples):
        """Score pyclustering's CURE for 6, 8, ..., 20 clusters.

        Returns
        -------
        pandas.DataFrame
            Sorted results, also written to ``<embedding>/CURE_results.csv``.
        """
        columns = ['Title', 'Embeddings', 'Algorithm', 'Num Clusters', 'Silhouette Score']
        records = []

        for nc in range(6, 21, 2):
            cure_instance = cure(samples, nc)
            cure_instance.process()
            # get_clusters() yields one list of sample indices per cluster;
            # convert that to a flat label vector.
            labels_ = np.zeros(shape=(samples.shape[0],))
            for i, pts in enumerate(cure_instance.get_clusters()):
                labels_[pts] = i

            records.append({
                'Title': "{}_CURE_nc:{}".format(self.embedding, nc),
                'Embeddings': self.embedding,
                'Algorithm': 'CURE',
                'Num Clusters': nc,
                'Silhouette Score': silhouette_score(samples, labels_),
            })

        return self._rank_and_save(records, columns, 'CURE')

    def find_optimal_eps(self, k, X, metric):
        """Pick a DBSCAN ``eps`` from the knee of the sorted k-distance curve.

        Parameters
        ----------
        k : int
            Number of neighbours requested from NearestNeighbors.
        X : numpy.ndarray of shape (n_samples, n_features)
        metric : str
            Distance metric passed to NearestNeighbors.

        Returns
        -------
        (idx, eps)
            Position in the sorted distance curve with the largest second
            difference, and the distance at that position.
        """
        neigh = NearestNeighbors(n_neighbors=k, metric=metric)
        neigh.fit(X)
        dist, _ = neigh.kneighbors(X)
        distances = np.sort(dist[:, k - 1])

        # Find optimal eps from k-distance graph: largest second difference,
        # ignoring the first 1% and the last 5% of the curve.
        sec_drv = np.diff(distances, 2)
        start, end = int(X.shape[0]*0.01), int(-X.shape[0]*0.05)
        # For small inputs `end` rounds to 0, which would slice to an empty
        # window and make argmax fail; search up to the end in that case.
        window = sec_drv[start:end] if end < 0 else sec_drv[start:]
        idx = np.argmax(window) + start
        eps = distances[idx]

        return idx, eps

    def try_DBSCAN(self, X):
        """Score DBSCAN over four metrics and up to ten ``min_samples`` values.

        ``min_samples`` candidates are drawn from
        ``[n_features / 2, 2 * n_features)``; ``eps`` is chosen per
        configuration by :meth:`find_optimal_eps`.  Noise points (label -1)
        are excluded before the silhouette score is computed; configurations
        with fewer than two clusters get a score of -1.

        Returns
        -------
        pandas.DataFrame
            Sorted results, also written to ``<embedding>/DBSCAN_results.csv``.
        """
        METRICS = ['l1', 'l2', 'cosine', 'correlation']
        columns = ['Title', 'Embeddings', 'Algorithm', 'Num Clusters',
                   'Metric', 'MinPts', 'NumNoise', 'Silhouette Score']
        records = []

        n_features = X.shape[1]
        candidates = range(int(n_features/2), n_features*2)
        # Seeded local RNG so the grid is reproducible without touching the
        # global `random` state; capped so narrow embeddings do not make
        # sample() raise.
        MIN_PTS = random.Random(42).sample(candidates, min(10, len(candidates)))

        for metric in METRICS:
            for mp in MIN_PTS:
                title = "{}_DBSCAN_metric:{}_minPts:{}".format(self.embedding, metric, mp)
                idx, eps = self.find_optimal_eps(mp, X, metric)

                # Do DBScan
                db = DBSCAN(eps=eps, min_samples=mp, metric=metric).fit(X)
                labels = db.labels_

                # Drop noise points before scoring.
                keep = labels != -1
                num_noise = int(np.count_nonzero(~keep))
                nc = len(set(labels[keep]))

                # silhouette_score needs at least two clusters; nc == 0
                # (everything is noise) previously fell through and raised.
                if nc < 2:
                    print("Bad Clustering")
                    score = -1
                else:
                    score = silhouette_score(X[keep], labels[keep])

                records.append({
                    'Title': title,
                    'Embeddings': self.embedding,
                    'Algorithm': 'DBScan',
                    'Num Clusters': nc,
                    'Metric': metric,
                    'MinPts': mp,
                    'NumNoise': num_noise,
                    'Silhouette Score': score,
                })

        return self._rank_and_save(records, columns, 'DBSCAN')

    def combine_results(self, num_samples):
        """Merge the per-algorithm tables into a single top-20 ranking.

        DBSCAN rows are only considered when fewer than 20% of
        ``num_samples`` were flagged as noise, and only configurations with at
        least 6 clusters are kept.

        Returns
        -------
        pandas.DataFrame
            Up to 20 rows sorted by descending silhouette score; also written
            to ``<embedding>/Overall_results.csv``.
        """
        columns = self._OVERALL_COLUMNS

        low_noise_dbscan = self.dbscan.loc[self.dbscan['NumNoise'] < num_samples*0.2]
        # Concatenating with an empty placeholder frame is unnecessary and
        # triggers a FutureWarning on recent pandas; every part already
        # carries exactly `columns`.
        Overall_df = pd.concat([
            self.kmeans[columns],
            self.aggClustering[columns],
            self.cure[columns],
            low_noise_dbscan[columns],
        ])
        Overall_df = Overall_df.loc[Overall_df['Num Clusters'] >= 6]

        Overall_df = (
            Overall_df
            .sort_values('Silhouette Score', ascending=False)
            .reset_index(drop=True)
            .head(20)
        )

        Overall_df.to_csv(self._results_path('Overall'), index=False)
        return Overall_df

    def _fit_ranked(self, algorithm, i, subset_samples):
        """Re-fit the ``i``-th ranked configuration of ``algorithm``.

        Returns
        -------
        (labels, title)
            Cluster labels for ``subset_samples`` and the configuration title.

        Raises
        ------
        ValueError
            If ``algorithm`` is not one of "KMeans", "AggClustering", "CURE"
            or "DBSCAN".
        """
        # Values read back from CSV are numpy integers; cast to plain int
        # before handing them to the estimators.
        if algorithm == "KMeans":
            n_clusters = int(self.kmeans.at[i, "Num Clusters"])
            kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(subset_samples)
            return kmeans.labels_, self.kmeans.at[i, "Title"]

        if algorithm == "AggClustering":
            hc = self._make_agglomerative(int(self.aggClustering.at[i, "Num Clusters"]),
                                          self.aggClustering.at[i, "Affinity"],
                                          self.aggClustering.at[i, "Linkage"])
            hc.fit(subset_samples)
            return hc.labels_, self.aggClustering.at[i, "Title"]

        if algorithm == "CURE":
            cure_instance = cure(subset_samples, int(self.cure.at[i, "Num Clusters"]))
            cure_instance.process()
            subset_labels = np.zeros(shape=(subset_samples.shape[0],))
            for c, pts in enumerate(cure_instance.get_clusters()):
                subset_labels[pts] = c
            return subset_labels, self.cure.at[i, "Title"]

        if algorithm == "DBSCAN":
            min_pts = int(self.dbscan.at[i, "MinPts"])
            metric = self.dbscan.at[i, "Metric"]
            idx, eps = self.find_optimal_eps(min_pts, subset_samples, metric)

            # Do DBScan
            db = DBSCAN(eps=eps, min_samples=min_pts, metric=metric).fit(subset_samples)
            return db.labels_, self.dbscan.at[i, "Title"]

        # Previously an unknown name reused the labels of the preceding
        # iteration (or raised NameError on the first one).
        raise ValueError("Unknown algorithm: {!r}".format(algorithm))

    def plot_best(self, samples, positions, algDict, gk_present=True):
        """Re-fit and visualise the top-ranked configurations of each algorithm.

        Parameters
        ----------
        samples : numpy.ndarray of shape (n_samples, n_features)
            Full sample matrix.
        positions : array-like
            Per-sample position annotation forwarded to ``labels_in_cluster``;
            must be row-aligned with ``samples``.
        algDict : dict[str, int]
            Maps an algorithm name ("KMeans", "AggClustering", "CURE",
            "DBSCAN") to the number of its best configurations to plot.
        gk_present : bool, default True
            When True, the rows selected by ``remove_GK`` are clustered and
            all other rows are put into one extra cluster via
            ``group_samples``.

        Raises
        ------
        ValueError
            If ``algDict`` contains an unknown algorithm name.
        """
        if gk_present:
            nonGK_ind = remove_GK(samples)
            subset_samples = samples[nonGK_ind]
        else:
            subset_samples = samples

        img_dir = self.embedding + "/img"

        for algorithm, best_n in algDict.items():
            for i in range(best_n):
                subset_labels, title = self._fit_ranked(algorithm, i, subset_samples)

                if gk_present:
                    labels = group_samples(len(samples), nonGK_ind, subset_labels)
                else:
                    labels = subset_labels

                # Plotting helpers come from clustering_eval; their exact
                # output is not visible here.
                labels_in_cluster(labels, positions, do_mining=False, title=title, save_link=img_dir)
                labels_in_cluster(labels, positions, title=title, save_link=img_dir)
                if len(samples) > 8000:
                    # Use a stratified 8000-sample subset for the cosine matrix.
                    _, subset_sampl, _, subset_lab = train_test_split(samples, labels,
                                                                        test_size=8000, random_state=42,
                                                                        stratify=labels)
                    cosine_matrix(subset_sampl, subset_lab, title=title, save_link=img_dir)
                else:
                    cosine_matrix(samples, labels, title=title, save_link=img_dir)
                silhouette_blob(samples, labels, title=title, save_link=img_dir)

# Datasets

## 0 OG Dataset

In [None]:
# Load the cleaned 2016 player data (first CSV column is the index) and drop
# the identifier, date, name and position columns so only the remaining
# feature columns are left for clustering.
df = pd.read_csv('../../datasets/cleaned_soccer_data_2016_v2.csv', index_col=0)
df = df.drop(["id", "player_fifa_api_id", "player_api_id", "date", "player_name", 'player_positions'], axis=1)
df

In [None]:
# Rescale every feature to [0, 1] and shuffle the rows in place.
min_max_scaler = MinMaxScaler()
X = min_max_scaler.fit_transform(df)
# Seeded generator: the previous unseeded np.random.shuffle produced a
# different row order (and therefore potentially different clusterings) on
# every run of the notebook.
np.random.default_rng(42).shuffle(X)

In [None]:
# Keep only the rows of the majority KMeans(2) cluster returned by remove_GK
# (presumably the non-goalkeepers, per the helper's name).
nonGK_ind = remove_GK(X)
X = X[nonGK_ind]

In [None]:
# Run (or load from the "OG" directory) the clustering grid search.
og = ScoreGenerator(X, "OG")

In [None]:
# Top configurations across all algorithms for this dataset.
og.overall

## 1 PCA-with-label

In [None]:
# Load the PCA-with-label embedding file (first CSV column is the index).
df = pd.read_csv("../../datasets/final_embeddings/soccer_player_embeddings_feature_labels.csv", index_col=0)
df

In [None]:
# Columns 2..11 are used as the sample matrix.
# NOTE(review): presumably these are the 10 embedding dimensions -- confirm against the file.
X = df.iloc[:, 2:12].values
X

In [None]:
# Keep only the rows of the majority KMeans(2) cluster (see remove_GK).
nonGK_ind = remove_GK(X)
X = X[nonGK_ind]

In [None]:
# Run (or load from the "PCA-with-label" directory) the clustering grid search.
pca_with_label = ScoreGenerator(X, "PCA-with-label")

In [None]:
# Top configurations across all algorithms for this embedding.
pca_with_label.overall

## 2 PCA-no-label

In [None]:
# Load the PCA-no-label embedding file (first CSV column is the index).
df = pd.read_csv("../../datasets/final_embeddings/soccer_player_embeddings_feature_no_labels.csv", index_col=0)
df

In [None]:
# Columns 2..15 are used as the sample matrix.
# NOTE(review): presumably these are the 14 embedding dimensions -- confirm against the file.
X = df.iloc[:, 2:16].values
X

In [None]:
# Keep only the rows of the majority KMeans(2) cluster (see remove_GK).
nonGK_ind = remove_GK(X)
X = X[nonGK_ind]

In [None]:
# Run (or load from the "PCA-no-label" directory) the clustering grid search.
pca_no_label = ScoreGenerator(X, "PCA-no-label")

In [None]:
# Top configurations across all algorithms for this embedding.
pca_no_label.overall

## 3 LDA

In [None]:
# Load the LDA embedding file (first CSV column is the index).
df = pd.read_csv("../../datasets/final_embeddings/soccer_player_embeddings_feature_no_labels_LDA.csv", index_col=0)
df

In [None]:
# The first 42 columns are used as the sample matrix.
X = df.iloc[:, 0:42].values
X

In [None]:
# Keep only the rows of the majority KMeans(2) cluster (see remove_GK).
nonGK_ind = remove_GK(X)
X = X[nonGK_ind]

In [None]:
# Run (or load from the "LDA" directory) the clustering grid search.
lda = ScoreGenerator(X, "LDA")

In [None]:
# Top configurations across all algorithms for this embedding.
lda.overall

## 4 DNN_(5,36,32)

In [None]:
# Load the DNN (5,36,32) embedding file (first CSV column is the index).
df = pd.read_csv("../../datasets/final_embeddings/DNN_player_embeddings_(5,36,32).csv", index_col=0)
df

In [None]:
# The first 32 columns are used as the sample matrix.
X = df.iloc[:, 0:32].values
X

In [None]:
# Keep only the rows of the majority KMeans(2) cluster (see remove_GK).
nonGK_ind = remove_GK(X)
X = X[nonGK_ind]

In [None]:
# Run (or load from the "DNN_5_36_32" directory) the clustering grid search.
dnn_shaped = ScoreGenerator(X, "DNN_5_36_32")

In [None]:
# Top configurations across all algorithms for this embedding.
dnn_shaped.overall

## 5 DNN v2

In [None]:
# Load the DNN v2 embedding file (first CSV column is the index).
df = pd.read_csv("../../datasets/final_embeddings/DNN_player_embeddings_v2.csv", index_col=0)
df

In [None]:
# The first 32 columns are used as the sample matrix.
X = df.iloc[:, 0:32].values
X

In [None]:
# Keep only the rows of the majority KMeans(2) cluster (see remove_GK).
nonGK_ind = remove_GK(X)
X = X[nonGK_ind]

In [None]:
# Run (or load from the "DNN_v2" directory) the clustering grid search and
# show the top configurations.
dnn_v2= ScoreGenerator(X, "DNN_v2")
dnn_v2.overall

## 6 PCA 0 Mean

In [None]:
# Load the zero-mean PCA embedding file (no index column is declared here).
df = pd.read_csv("../../datasets/final_embeddings/embeddings_PCA_zero_mean.csv")
df

In [None]:
# The first 16 columns are used as the sample matrix.
X = df.iloc[:, 0:16].values
X

In [None]:
# Keep only the rows of the majority KMeans(2) cluster (see remove_GK).
nonGK_ind = remove_GK(X)
X = X[nonGK_ind]

In [None]:
# Run (or load from the "PCA-0-mean" directory) the clustering grid search and
# show the top configurations.
pca_0_mean = ScoreGenerator(X, "PCA-0-mean")
pca_0_mean.overall

## 7 DNN SH

In [None]:
# Load the DNN "sh" embedding file (no index column is declared here).
df = pd.read_csv("../../datasets/final_embeddings/dnn_embeddings_sh.csv")
df

In [None]:
# Everything from column 7 onwards is used as the sample matrix.
# NOTE(review): presumably the first 7 columns are metadata -- confirm against the file.
X = df.iloc[:, 7:].values
X

In [None]:
# Keep only the rows of the majority KMeans(2) cluster (see remove_GK).
nonGK_ind = remove_GK(X)
X = X[nonGK_ind]

In [None]:
# Run (or load from the "DNN_sh" directory) the clustering grid search and
# show the top configurations.
dnn_sh = ScoreGenerator(X, "DNN_sh")
dnn_sh.overall

## 8 DNN lr

In [None]:
# Load the DNN "lr" embedding file (no index column is declared here).
# NOTE(review): the file name suggests goalkeepers were already removed, which
# would explain why remove_GK is not applied in this section -- confirm.
df = pd.read_csv("../../datasets/final_embeddings/dnn_embeddings_lr_no_gk.csv")
df

In [None]:
# The first 26 columns are used as the sample matrix.
X = df.iloc[:, :26].values
X

In [None]:
# Run (or load from the "DNN_lr" directory) the clustering grid search and
# show the top configurations.
dnn_lr = ScoreGenerator(X, "DNN_lr")
dnn_lr.overall

## Combine best of all models

In [None]:
# Stack the per-embedding top-20 tables, rank every configuration by
# silhouette score (best first) and persist the combined ranking.
combined_df = (
    pd.concat(
        [
            og.overall,
            pca_with_label.overall,
            pca_no_label.overall,
            lda.overall,
            dnn_shaped.overall,
            dnn_v2.overall,
            pca_0_mean.overall,
            dnn_sh.overall,
            dnn_lr.overall,
        ],
        ignore_index=True,
    )
    .sort_values('Silhouette Score', ascending=False)
    .reset_index(drop=True)
)
combined_df.to_csv('Combined_results.csv', index=False)
combined_df

In [None]:
# Swarm plot of silhouette score per embedding, coloured by algorithm.
sns.set_theme(style="whitegrid", palette="muted")
f = sns.catplot(data=combined_df, x="Silhouette Score", y="Embeddings", hue="Algorithm", kind="swarm")
# Resize the figure to 12 x 7 inches (width x height).
f.fig.set_size_inches(12, 7)

# Visualise best models

In [None]:
# Reload the original dataset, this time keeping the player positions so the
# clusters can be compared against them.
df = pd.read_csv('../../datasets/cleaned_soccer_data_2016_v2.csv', index_col=0)
# Plain array, like the `pos` passed to plot_best for the other embeddings.
pos = df["player_positions"].values
df = df.drop(["id", "player_fifa_api_id", "player_api_id", "date", "player_name", 'player_positions'], axis=1)
min_max_scaler = MinMaxScaler()
X = min_max_scaler.fit_transform(df)
# Shuffle X and pos with the SAME (seeded) permutation.  Previously only X was
# shuffled in place, so row i of X no longer belonged to pos[i] and every
# position-based plot paired clusters with the wrong players.
perm = np.random.default_rng(42).permutation(len(X))
X = X[perm]
pos = pos[perm]

In [None]:
# Number of top-ranked configurations to plot per algorithm
# (plot_best re-fits rows 0..n-1 of each algorithm's sorted result table).
printDict = {
    "DBSCAN": 2,
    "AggClustering": 2,
    "KMeans": 1,
    "CURE": 1
}

# gk_present defaults to True, so plot_best applies remove_GK to X itself.
og.plot_best(X, pos, printDict)

In [None]:
# Reload the zero-mean PCA embeddings (all rows) together with the player
# positions, which stay row-aligned with X.
df = pd.read_csv("../../datasets/final_embeddings/embeddings_PCA_zero_mean.csv")
X = df.iloc[:, 0:16].values
pos = df["player_positions"].values

In [None]:
# Number of top-ranked configurations to plot per algorithm.
printDict = {
    "DBSCAN": 3,
    "KMeans": 1,
    "AggClustering": 1,
    "CURE": 1
}

# gk_present defaults to True, so plot_best applies remove_GK to X itself.
pca_0_mean.plot_best(X, pos, printDict)

In [None]:
# Reload the DNN "lr" embeddings together with the player positions, which
# stay row-aligned with X.
df = pd.read_csv("../../datasets/final_embeddings/dnn_embeddings_lr_no_gk.csv")
X = df.iloc[:, :26].values
pos = df["player_positions"].values

In [None]:
# Number of top-ranked configurations to plot per algorithm.
printDict = {
    "KMeans": 2,
    "AggClustering": 2,
    "CURE": 1,
    "DBSCAN": 1
}

# gk_present=False: plot_best clusters all rows and skips remove_GK
# (the file name suggests goalkeepers were already removed -- TODO confirm).
dnn_lr.plot_best(X, pos, printDict, gk_present=False)

In [None]:
# Reload the DNN "sh" embeddings (all rows) together with the player
# positions, which stay row-aligned with X.
df = pd.read_csv("../../datasets/final_embeddings/dnn_embeddings_sh.csv")
X = df.iloc[:, 7:].values
pos = df["player_positions"].values

In [None]:
# Number of top-ranked configurations to plot per algorithm.
printDict = {
    "KMeans": 2,
    "AggClustering": 2,
    "CURE": 1,
    "DBSCAN": 1
}

# gk_present defaults to True, so plot_best applies remove_GK to X itself.
dnn_sh.plot_best(X, pos, printDict)

In [None]:
# Reload the DNN (5,36,32) embeddings (all rows) together with the player
# positions, which stay row-aligned with X.
df = pd.read_csv("../../datasets/final_embeddings/DNN_player_embeddings_(5,36,32).csv", index_col=0)
X = df.iloc[:, 0:32].values
pos = df["player_positions"].values

In [None]:
# Number of top-ranked configurations to plot per algorithm.
printDict = {
    "AggClustering": 2,
    "CURE": 2,
    "KMeans": 1,
    "DBSCAN": 1
}

# gk_present defaults to True, so plot_best applies remove_GK to X itself.
dnn_shaped.plot_best(X, pos, printDict)

In [None]:
# Reload the PCA-no-label embeddings (all rows) together with the player
# positions, which stay row-aligned with X.
df = pd.read_csv("../../datasets/final_embeddings/soccer_player_embeddings_feature_no_labels.csv", index_col=0)
X = df.iloc[:, 2:16].values
pos = df["player_positions"].values

In [None]:
# Number of top-ranked configurations to plot per algorithm.
printDict = {
    "DBSCAN": 2,
    "KMeans": 1,
    "AggClustering": 1,
    "CURE": 1,
}

# gk_present defaults to True, so plot_best applies remove_GK to X itself.
pca_no_label.plot_best(X, pos, printDict)

In [None]:
# Reload the PCA-with-label embeddings (all rows) together with the player
# positions, which stay row-aligned with X.
df = pd.read_csv("../../datasets/final_embeddings/soccer_player_embeddings_feature_labels.csv", index_col=0)
X = df.iloc[:, 2:12].values
pos = df["player_positions"].values

In [None]:
# Number of top-ranked configurations to plot per algorithm.
printDict = {
    "DBSCAN": 2,
    "KMeans": 2,
    "AggClustering": 1,
    "CURE": 1
}

# gk_present defaults to True, so plot_best applies remove_GK to X itself.
pca_with_label.plot_best(X, pos, printDict)

In [None]:
# Reload the DNN v2 embeddings (all rows) together with the player positions,
# which stay row-aligned with X.
df = pd.read_csv("../../datasets/final_embeddings/DNN_player_embeddings_v2.csv", index_col=0)
X = df.iloc[:, 0:32].values
pos = df["player_positions"].values

In [None]:
# Number of top-ranked configurations to plot per algorithm.
printDict = {
    "KMeans": 1,
    "AggClustering": 1,
    "CURE": 1,
    "DBSCAN": 1
}

# gk_present defaults to True, so plot_best applies remove_GK to X itself.
dnn_v2.plot_best(X, pos, printDict)

In [None]:
# Reload the LDA embeddings (all rows) together with the player positions,
# which stay row-aligned with X.
df = pd.read_csv("../../datasets/final_embeddings/soccer_player_embeddings_feature_no_labels_LDA.csv", index_col=0)
X = df.iloc[:, :42].values
pos = df["player_positions"].values

In [None]:
# Number of top-ranked configurations to plot per algorithm.
printDict = {
    "KMeans": 1,
    "AggClustering": 1,
    "CURE": 1,
    "DBSCAN": 1
}

# gk_present defaults to True, so plot_best applies remove_GK to X itself.
lda.plot_best(X, pos, printDict)