In [26]:
import inspect
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyclustering.cluster.cure import cure
from pyclustering.utils import read_sample
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors

In [38]:
def try_KMeans(samples, embedding_name):
    """Sweep KMeans over even cluster counts (2..20) and rank them by silhouette score.

    Parameters
    ----------
    samples : array-like of shape (n_samples, n_features)
        Embedding vectors to cluster.
    embedding_name : str
        Label stored in every result row; also the directory that receives
        ``KMeans_results.csv`` (created if it does not exist yet).

    Returns
    -------
    pandas.DataFrame
        One row per cluster count, sorted by 'Silhouette Score' descending.
    """
    columns = ['Title', 'Embeddings', 'Algorithm', 'Num Clusters', 'Silhouette Score']

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    records = []
    for nc in range(2, 21, 2):
        kmeans = KMeans(n_clusters=nc, random_state=42).fit(samples)
        records.append({
            'Title': "{}_KMeans_nc:{}".format(embedding_name, nc),
            'Embeddings': embedding_name,
            'Algorithm': 'KMeans',
            'Num Clusters': nc,
            'Silhouette Score': silhouette_score(samples, kmeans.labels_),
        })

    KMeans_df = pd.DataFrame(records, columns=columns).sort_values(
        'Silhouette Score', ascending=False)

    # to_csv does not create missing directories.
    os.makedirs(embedding_name, exist_ok=True)
    KMeans_df.to_csv('{}/KMeans_results.csv'.format(embedding_name))
    return KMeans_df

In [39]:
def try_AggClustering(samples, embedding_name):
    """Grid-search agglomerative clustering and rank the runs by silhouette score.

    Every combination of linkage, distance ("affinity") and even cluster count
    (2..20) is fitted, except that ward linkage is only paired with the
    euclidean distance.

    Parameters
    ----------
    samples : array-like of shape (n_samples, n_features)
        Embedding vectors to cluster.
    embedding_name : str
        Label stored in every result row; also the directory that receives
        ``AggClustering_results.csv`` (created if it does not exist yet).

    Returns
    -------
    pandas.DataFrame
        One row per fitted configuration, sorted by 'Silhouette Score' descending.
    """
    AFFINITIES = ["euclidean", "l1", "l2", "manhattan", "cosine"]
    LINKAGE = ["ward", "complete", "average", "single"]
    columns = ['Title', 'Embeddings', 'Algorithm', 'Num Clusters',
               'Linkage', 'Affinity', 'Silhouette Score']

    # scikit-learn 1.2 deprecated ``affinity`` in favour of ``metric`` and later
    # dropped it; use whichever keyword the installed estimator accepts.
    accepted = inspect.signature(AgglomerativeClustering).parameters
    distance_kw = 'metric' if 'metric' in accepted else 'affinity'

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    records = []
    for link in LINKAGE:
        for aff in AFFINITIES:
            # Skip unsupported ward pairings before entering the nc loop.
            if link == "ward" and aff != "euclidean":
                continue

            for nc in range(2, 21, 2):
                hc = AgglomerativeClustering(n_clusters=nc, linkage=link, **{distance_kw: aff})
                hc.fit(samples)
                records.append({
                    'Title': "{}_AggClustering_nc:{}_link:{}_aff:{}".format(embedding_name, nc, link, aff),
                    'Embeddings': embedding_name,
                    'Algorithm': 'Agg_Clustering',
                    'Num Clusters': nc,
                    'Linkage': link,
                    'Affinity': aff,
                    # Scored with silhouette_score's default distance for every
                    # configuration, as in the original sweep.
                    'Silhouette Score': silhouette_score(samples, hc.labels_),
                })

    AggClustering_df = pd.DataFrame(records, columns=columns).sort_values(
        'Silhouette Score', ascending=False)

    # to_csv does not create missing directories.
    os.makedirs(embedding_name, exist_ok=True)
    AggClustering_df.to_csv('{}/AggClustering_results.csv'.format(embedding_name))
    return AggClustering_df

In [40]:
def try_CURE(samples, embedding_name):
    """Sweep CURE over even cluster counts (2..20) and rank them by silhouette score.

    Parameters
    ----------
    samples : array-like of shape (n_samples, n_features)
        Embedding vectors to cluster.
    embedding_name : str
        Label stored in every result row; also the directory that receives
        ``CURE_results.csv`` (created if it does not exist yet).

    Returns
    -------
    pandas.DataFrame
        One row per cluster count, sorted by 'Silhouette Score' descending.
    """
    columns = ['Title', 'Embeddings', 'Algorithm', 'Num Clusters', 'Silhouette Score']
    n_samples = len(samples)

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    records = []
    for nc in range(2, 21, 2):
        cure_instance = cure(samples, nc)
        cure_instance.process()

        # get_clusters() yields one list of sample indices per cluster; flatten
        # it into a per-sample integer label vector for silhouette_score.
        # Samples absent from every list keep label 0, as before.
        labels_ = np.zeros(n_samples, dtype=int)
        for i, pts in enumerate(cure_instance.get_clusters()):
            labels_[pts] = i

        records.append({
            'Title': "{}_CURE_nc:{}".format(embedding_name, nc),
            'Embeddings': embedding_name,
            'Algorithm': 'CURE',
            'Num Clusters': nc,
            'Silhouette Score': silhouette_score(samples, labels_),
        })

    CURE_df = pd.DataFrame(records, columns=columns).sort_values(
        'Silhouette Score', ascending=False)

    # to_csv does not create missing directories.
    os.makedirs(embedding_name, exist_ok=True)
    CURE_df.to_csv('{}/CURE_results.csv'.format(embedding_name))
    return CURE_df

**Wanqi**: please fill in DBSCAN below. We need to loop over multiple `eps` and `minPts` values, but I don't know how you defined either. You can refer to the `try_AggClustering()` function above to see how the loops can be structured. Also, don't worry about formatting the results into the DataFrame; I will do it.

In [None]:
def find_optimal_eps(k, X, metric='euclidean', start=100, end=-300):
    """Estimate a DBSCAN ``eps`` from the knee of the sorted k-distance curve.

    Parameters
    ----------
    k : int
        Number of neighbours requested from NearestNeighbors; column ``k - 1``
        of the returned distance matrix is used as each sample's k-distance.
    X : array-like of shape (n_samples, n_features)
        Samples to analyse.
    metric : str, default 'euclidean'
        Distance metric for the neighbour search. Pass the same metric DBSCAN
        will use so that ``eps`` is expressed in the right units.
    start, end : int, default 100 and -300
        Slice bounds (on the second-difference array) inside which the knee is
        searched. If that window is empty for the given data size, the whole
        curve is searched instead.

    Returns
    -------
    (int, float)
        Index of the knee in the sorted k-distance curve and the distance there.

    Raises
    ------
    ValueError
        If fewer than three samples are given (no second difference exists).
    """
    neigh = NearestNeighbors(n_neighbors=k, metric=metric)
    neigh.fit(X)
    dist, _ = neigh.kneighbors(X)

    # Sorted k-distance curve: one value per sample.
    distances = np.sort(np.asarray(dist)[:, k - 1])

    # Find optimal eps from k-distance graph: the knee is where the discrete
    # second derivative of the *sorted* curve peaks.
    sec_drv = np.diff(distances, 2)
    if sec_drv.size == 0:
        raise ValueError("find_optimal_eps needs at least 3 samples")

    # Resolve the (possibly negative) bounds; fall back to the full curve when
    # the requested window does not fit the data.
    lo, hi, _ = slice(start, end).indices(sec_drv.size)
    if hi <= lo:
        lo, hi = 0, sec_drv.size

    # sec_drv[i] is centred on distances[i + 1], hence the + 1.
    idx = int(np.argmax(sec_drv[lo:hi])) + lo + 1
    eps = float(distances[idx])

    return idx, eps


def _run_dbscan(samples, min_pts, metric):
    """Fit DBSCAN with a knee-derived eps; return (labels, eps, n_clusters, score) or None."""
    _, eps = find_optimal_eps(min_pts, samples, metric=metric)
    if eps <= 0:
        # DBSCAN rejects a non-positive eps (e.g. heavily duplicated samples).
        return None

    labels = DBSCAN(eps=eps, min_samples=min_pts, metric=metric).fit(samples).labels_
    distinct = set(labels.tolist())
    nc = len(distinct) - (1 if -1 in distinct else 0)

    # silhouette_score needs 2 <= n_labels <= n_samples - 1; noise (-1) is
    # scored as its own label, as in the original code.
    if 2 <= len(distinct) < len(labels):
        score = silhouette_score(samples, labels)
    else:
        score = np.nan
    return labels, eps, nc, score


def try_DBSCAN(samples, embedding_name):
    """Grid-search DBSCAN over metrics and minPts and rank runs by silhouette score.

    ``minPts`` ranges over ``2 * n_features - 5 .. 2 * n_features + 5`` (never
    below 2) and ``eps`` is derived per configuration from the k-distance knee.
    When a run yields exactly two clusters, cluster 0 is clustered once more and
    that refined run is what gets recorded.

    Parameters
    ----------
    samples : pandas.DataFrame or numpy.ndarray of shape (n_samples, n_features)
        Embedding vectors to cluster.
    embedding_name : str
        Label stored in every result row; also the directory that receives
        ``DBSCAN_results.csv`` (created if it does not exist yet).

    Returns
    -------
    pandas.DataFrame
        One row per configuration, sorted by 'Silhouette Score' descending
        (configurations without a defined score sort last).
    """
    METRICS = ['euclidean', 'l1', 'l2', 'manhattan', 'cosine', 'correlation']
    columns = ['Title', 'Embeddings', 'Algorithm', 'Num Clusters',
               'Eps', 'MinPts', 'Metric', 'Silhouette Score']

    # Works for both DataFrames and ndarrays (``samples.columns`` did not).
    n_features = np.asarray(samples).shape[1]
    # k = 1 would only measure each point's distance to itself, so start at 2.
    MIN_PTS = range(max(2, n_features * 2 - 5), n_features * 2 + 6)

    records = []
    for metric in METRICS:
        for mp in MIN_PTS:
            first_pass = _run_dbscan(samples, mp, metric)
            if first_pass is None:
                continue
            labels, eps, nc, score = first_pass
            title = "{}_DBSCAN_metric:{}_minPts:{}".format(embedding_name, metric, mp)

            if nc == 2:
                # Re-cluster cluster 0 on its own.
                # NOTE(review): the original stored the refined score next to
                # the first-pass nc/eps; the row now reports the refined run
                # consistently. Confirm this matches the intended analysis.
                subset = samples[labels == 0]
                if len(subset) >= max(mp, 3):
                    refined = _run_dbscan(subset, mp, metric)
                    if refined is not None:
                        _, eps, nc, score = refined
                        title += "_refined"

            records.append({
                'Title': title,
                'Embeddings': embedding_name,
                'Algorithm': 'DBScan',
                'Num Clusters': nc,
                'Eps': eps,
                'MinPts': mp,
                'Metric': metric,
                'Silhouette Score': score,
            })

    Dbscan_df = pd.DataFrame(records, columns=columns).sort_values(
        'Silhouette Score', ascending=False)

    # Persist like the other try_* helpers; to_csv does not create directories.
    os.makedirs(embedding_name, exist_ok=True)
    Dbscan_df.to_csv('{}/DBSCAN_results.csv'.format(embedding_name))
    return Dbscan_df

In [28]:
# Load the "feature_labels" PCA embedding table. `embedding_name` tags result
# rows and is also used by the try_* helpers as the output directory name.
df = pd.read_csv("../../datasets/final_embeddings/soccer_player_embeddings_feature_labels.csv")
embedding_name = "PCA-with-label"
df

Unnamed: 0.1,Unnamed: 0,player_positions,player_fifa_api_id,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,player_positions_general
0,0,CB,218353,0.232763,-0.672787,-0.228158,-0.305577,-0.407377,0.189798,0.084585,-0.138398,-0.113971,0.152152,DEF
1,1,LB,189615,-1.099358,0.106072,1.203658,0.733224,-0.393621,-0.237771,0.361361,0.183376,0.022748,-0.037838,DEF
2,2,"CB, RB",140161,0.326359,-0.965983,-0.234747,-0.201358,-0.437487,0.305188,-0.120059,0.082468,-0.126675,0.261866,DEF
3,3,CB,17725,0.376258,-1.065433,-0.231514,-0.151791,-0.467773,0.292523,0.020891,0.022543,-0.113073,0.245690,DEF
4,4,"CAM, CM, LM",158138,-0.938991,0.463336,1.045653,-0.850755,0.240846,-0.229985,-0.399875,0.129875,0.151130,-0.056541,MID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6353,6353,"RM, LM, CAM",179712,-0.536263,0.689018,1.081778,-0.848373,0.278421,-0.348142,-0.003771,-0.184443,0.216812,-0.046247,MID
6354,6354,ST,219943,0.426899,-0.153158,-0.265021,-0.451604,-0.276636,0.272468,-0.064036,-0.453398,-0.210569,-0.035855,ATK
6355,6355,RM,105454,-1.091925,1.313828,1.128123,0.287005,-0.097212,-0.352417,-0.092048,0.053856,0.137433,-0.105834,MID
6356,6356,"LM, LW",195840,0.034705,0.378481,1.073720,-0.633337,0.256512,0.048677,-0.187749,-0.443787,-0.044613,-0.026683,ATK


In [29]:
# Positional slice: columns 3..12 of df as a NumPy array.
# Presumably these are PC1..PC10 — verify against the CSV column layout.
X = df.iloc[:, 3:13].values

In [37]:
# Run the CURE sweep on the current X; results go to '<embedding_name>/CURE_results.csv'.
try_CURE(X, embedding_name)

In [41]:
# Load the "feature_no_labels" PCA embedding table; this rebinds `df` and
# `embedding_name`, so cells below operate on this dataset.
df = pd.read_csv("../../datasets/final_embeddings/soccer_player_embeddings_feature_no_labels.csv")
embedding_name = "PCA-no-label"
df

Unnamed: 0.1,Unnamed: 0,player_positions,player_fifa_api_id,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,player_positions_general
0,0,CB,218353,0.378084,0.493866,-0.218586,-0.238711,-0.500447,0.219325,0.125196,-0.119898,-0.141964,0.142511,-0.056706,0.011622,0.019336,0.023303,DEF
1,1,LB,189615,-1.046989,-0.190204,1.120495,0.563388,-0.440816,-0.198932,0.338791,0.223150,0.041387,-0.025337,0.229595,0.055748,-0.099781,0.020387,DEF
2,2,"CB, RB",140161,0.594404,0.605691,-0.234075,-0.077747,-0.538758,0.315616,-0.124424,0.063273,-0.152510,0.248130,0.055065,0.101277,-0.090169,-0.209208,DEF
3,3,CB,17725,0.643853,0.650631,-0.236806,-0.021595,-0.580045,0.301078,0.030522,0.030255,-0.134572,0.236830,0.041646,0.433323,-0.002238,-0.071184,DEF
4,4,"CAM, CM, LM",158138,-0.530779,0.024310,1.111265,-0.766894,0.325830,-0.241209,-0.423695,0.052389,0.165501,-0.052863,0.004543,0.038298,-0.022417,-0.046864,MID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6353,6353,"RM, LM, CAM",179712,-0.390017,-0.158299,1.155698,-0.784450,0.405633,-0.345112,0.008903,-0.210231,0.217657,-0.050972,-0.024585,0.020848,-0.068654,-0.015761,MID
6354,6354,ST,219943,0.538714,0.073519,-0.208930,-0.374983,-0.233859,0.236014,0.014948,-0.430333,-0.226670,-0.048914,0.050747,-0.013027,-0.169855,0.185591,ATK
6355,6355,RM,105454,-1.181839,-0.834783,1.126125,0.124608,-0.005613,-0.326030,-0.066737,0.061923,0.136190,-0.100342,0.062356,0.099283,-0.009242,0.017854,MID
6356,6356,"LM, LW",195840,0.041128,-0.016171,1.131554,-0.475397,0.332072,0.022145,-0.129566,-0.460448,-0.062958,-0.039493,-0.001819,-0.245802,0.165231,0.309484,ATK


In [44]:
# Positional slice: columns 3..16 of df, kept as a DataFrame (no `.values` here).
# Presumably these are PC1..PC14 — verify against the CSV column layout.
X = df.iloc[:, 3:17]

In [46]:
# Load the LDA embedding table; this rebinds `df` and `embedding_name`.
df = pd.read_csv("../../datasets/final_embeddings/soccer_player_embeddings_feature_no_labels_LDA.csv")
embedding_name = "LDA"
df

Unnamed: 0.1,Unnamed: 0,LDA_PC1,LDA_PC2,LDA_PC3,LDA_PC4,LDA_PC5,LDA_PC6,LDA_PC7,LDA_PC8,LDA_PC9,...,LDA_PC36,LDA_PC37,LDA_PC38,LDA_PC39,LDA_PC40,LDA_PC41,LDA_PC42,player_fifa_api_id,player_positions,player_positions_general
0,0,-2.719920,-3.231187,-2.064467,-0.070135,-2.153782,-0.467022,-1.415772,-1.066637,-0.392626,...,-0.032899,0.023300,-0.607117,1.884155,0.418476,0.100440,-0.682479,218353,CB,DEF
1,1,-2.836651,-2.672179,2.376146,3.147144,-0.044590,1.308416,-0.822004,0.446822,-0.281568,...,0.861496,-0.214361,0.721046,0.127230,0.847701,-0.241239,0.023072,189615,LB,DEF
2,2,-1.019822,-4.523343,-1.828925,-0.106660,-1.443644,-0.873780,0.530965,1.182204,-1.141854,...,2.689023,-1.574170,-0.720701,0.848310,-2.254750,-0.798258,-0.956001,140161,"CB, RB",DEF
3,3,-1.617575,-4.856155,-2.945825,-0.353930,-0.510797,-0.427046,-0.072336,0.513706,-0.184841,...,2.241386,0.369035,0.307744,-0.278059,-0.283471,0.025885,-1.795962,17725,CB,DEF
4,4,-1.398834,3.657507,1.204130,-0.463391,0.459694,2.169088,1.198253,-0.867736,-0.452581,...,-0.914054,-0.660580,0.508823,0.448629,0.510699,0.396636,-0.001646,158138,"CAM, CM, LM",MID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6353,6353,-2.568563,4.038766,1.120707,0.426766,-1.862563,2.337459,0.332486,-0.168584,0.626593,...,1.214361,0.192009,-0.065067,0.435947,-0.511096,-0.243167,-0.187067,179712,"RM, LM, CAM",MID
6354,6354,-2.236150,0.561127,-2.037762,-0.894163,2.206501,0.023568,-1.081539,-0.589792,-0.355477,...,-0.419688,0.347494,0.628497,1.008669,-0.332957,0.836211,-1.021299,219943,ST,ATK
6355,6355,-2.146690,3.815578,1.211419,0.764462,-1.474408,2.178364,-1.507989,0.813233,0.616119,...,-0.206656,-0.349856,0.161889,-1.687243,0.261756,-0.554413,-0.324061,105454,RM,MID
6356,6356,-1.840281,3.618222,0.713696,3.347161,2.049061,0.867121,2.610342,1.117139,3.607444,...,0.542816,1.316894,-1.287097,0.272812,0.850898,0.676544,-1.870341,195840,"LM, LW",ATK


In [50]:
# Positional slice: columns 1..42 of df, kept as a DataFrame.
# Presumably these are LDA_PC1..LDA_PC42 — verify against the CSV column layout.
X = df.iloc[:, 1:43]

In [52]:
# Load the DNN "(5,36,32)" embedding table; this rebinds `df` and `embedding_name`.
df = pd.read_csv("../../datasets/final_embeddings/DNN_player_embeddings_(5,36,32).csv")
embedding_name = "DNN_(5,36,32)"
df

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,29,30,31,id,player_fifa_api_id,player_api_id,date,player_name,player_positions,player_positions_general
0,0,0.098888,-0.018730,-0.132960,0.008572,-0.072785,0.113407,0.040542,-0.073642,0.047720,...,-0.110338,0.047237,-0.148190,1,218353,505942,2016-02-18,Aaron Appindangoye,CB,DEF
1,1,0.150163,0.058801,-0.146037,0.074920,0.062421,0.119012,0.071093,-0.110336,0.086393,...,-0.073713,0.099040,-0.145002,6,189615,155782,2016-04-21,Aaron Cresswell,LB,DEF
2,2,0.109585,-0.045788,-0.141302,0.008358,-0.087123,0.154844,0.036263,-0.066735,0.042441,...,-0.167139,0.043497,-0.119312,65,140161,30572,2016-04-21,Aaron Galindo,"CB, RB",DEF
3,3,0.095147,-0.090690,-0.130869,-0.021235,-0.136777,0.172417,0.023485,-0.094813,0.036122,...,-0.175876,0.030490,-0.179021,88,17725,23780,2015-12-24,Aaron Hughes,CB,DEF
4,4,0.119317,0.097978,-0.131563,0.096871,0.087272,0.023734,0.082145,-0.052841,0.071903,...,-0.018879,0.085028,-0.090481,113,158138,27316,2016-04-28,Aaron Hunt,"CAM, CM, LM",MID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6353,6353,0.074431,0.116289,-0.175340,0.108144,0.103742,-0.053632,0.072020,-0.023802,0.051137,...,-0.035590,0.081411,-0.030351,183823,179712,107281,2016-01-21,Zoltan Stieber,"RM, LM, CAM",MID
6354,6354,0.039375,-0.029534,-0.204848,-0.018328,-0.115096,0.008530,0.018655,-0.025226,0.009211,...,-0.199369,0.026830,-0.074051,183857,219943,491794,2015-11-12,Zoran Josipovic,ST,ATK
6355,6355,0.128044,0.099773,-0.181800,0.101307,0.090423,0.021981,0.082250,-0.105097,0.082158,...,-0.052014,0.097500,-0.128483,183873,105454,99031,2016-05-05,Zoran Tosic,RM,MID
6356,6356,0.037696,0.112264,-0.236330,0.092926,0.084303,-0.115713,0.056856,0.006905,0.016320,...,-0.126222,0.064923,0.027929,183896,195840,192132,2016-01-21,Zouhaier Dhaouadhi,"LM, LW",ATK


In [55]:
# Positional slice: columns 1..32 of df, kept as a DataFrame.
# Presumably these are the embedding dimensions "0".."31" — verify against the CSV column layout.
X = df.iloc[:, 1:33]

In [56]:
# Load the DNN v2 embedding table; this rebinds `df` and `embedding_name`.
df = pd.read_csv("../../datasets/final_embeddings/DNN_player_embeddings_v2.csv")
embedding_name = "DNN_v2"
df

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,29,30,31,id,player_fifa_api_id,player_api_id,date,player_name,player_positions,player_positions_general
0,0,1.409394,-0.644381,-0.690456,-0.797573,0.199402,-0.089646,-0.410039,-0.336073,-0.423743,...,-0.604337,-0.135848,-0.575796,1,218353,505942,2016-02-18,Aaron Appindangoye,CB,DEF
1,1,1.540995,-0.637391,-0.747262,-0.865316,0.261265,-0.305409,-0.783761,-0.702415,-0.676207,...,-0.902550,-0.446335,-0.808141,6,189615,155782,2016-04-21,Aaron Cresswell,LB,DEF
2,2,1.484467,-0.705997,-0.785929,-0.973436,0.108547,-0.118075,-0.303505,-0.328585,-0.421469,...,-0.605547,-0.203093,-0.620154,65,140161,30572,2016-04-21,Aaron Galindo,"CB, RB",DEF
3,3,1.473089,-0.617511,-0.762134,-0.931697,0.055465,-0.134311,-0.329057,-0.326673,-0.351196,...,-0.598318,-0.251248,-0.539769,88,17725,23780,2015-12-24,Aaron Hughes,CB,DEF
4,4,1.297807,-0.397530,-0.745224,-0.551820,0.593255,-0.237985,-0.859844,-0.495693,-0.497671,...,-0.851617,-0.248346,-0.869322,113,158138,27316,2016-04-28,Aaron Hunt,"CAM, CM, LM",MID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6353,6353,1.136542,-0.343924,-0.635094,-0.464421,0.534967,-0.268051,-0.904608,-0.512822,-0.519205,...,-0.801973,-0.310293,-0.748810,183823,179712,107281,2016-01-21,Zoltan Stieber,"RM, LM, CAM",MID
6354,6354,1.077179,-0.647516,-0.705758,-0.595332,0.307480,-0.047721,-0.548778,-0.152842,-0.261911,...,-0.662276,-0.053952,-0.575016,183857,219943,491794,2015-11-12,Zoran Josipovic,ST,ATK
6355,6355,1.272975,-0.629640,-0.679687,-0.568447,0.595411,-0.291497,-1.094056,-0.577804,-0.667612,...,-1.137547,-0.351278,-0.954436,183873,105454,99031,2016-05-05,Zoran Tosic,RM,MID
6356,6356,0.960799,-0.252555,-0.456112,-0.301157,0.485259,-0.173997,-0.699809,-0.344446,-0.478135,...,-0.656439,-0.179624,-0.623041,183896,195840,192132,2016-01-21,Zouhaier Dhaouadhi,"LM, LW",ATK


In [57]:
# Positional slice: columns 1..32 of df, kept as a DataFrame.
# Presumably these are the embedding dimensions "0".."31" — verify against the CSV column layout.
X = df.iloc[:, 1:33]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,1.409394,-0.644381,-0.690456,-0.797573,0.199402,-0.089646,-0.410039,-0.336073,-0.423743,0.239713,...,-0.827737,-0.730887,-0.435432,-0.797773,-0.520826,-0.249745,-0.110783,-0.604337,-0.135848,-0.575796
1,1.540995,-0.637391,-0.747262,-0.865316,0.261265,-0.305409,-0.783761,-0.702415,-0.676207,0.568637,...,-0.865606,-1.181979,-0.619250,-1.040805,-0.561156,-0.334559,-0.156191,-0.902550,-0.446335,-0.808141
2,1.484467,-0.705997,-0.785929,-0.973436,0.108547,-0.118075,-0.303505,-0.328585,-0.421469,0.139994,...,-0.906856,-0.667299,-0.413151,-0.850948,-0.538736,-0.371803,-0.023204,-0.605547,-0.203093,-0.620154
3,1.473089,-0.617511,-0.762134,-0.931697,0.055465,-0.134311,-0.329057,-0.326673,-0.351196,0.146744,...,-0.894332,-0.702894,-0.467794,-0.734323,-0.447292,-0.205359,-0.045407,-0.598318,-0.251248,-0.539769
4,1.297807,-0.397530,-0.745224,-0.551820,0.593255,-0.237985,-0.859844,-0.495693,-0.497671,0.707045,...,-0.589504,-0.973829,-0.377364,-1.129830,-0.620835,-0.170006,-0.208947,-0.851617,-0.248346,-0.869322
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6353,1.136542,-0.343924,-0.635094,-0.464421,0.534967,-0.268051,-0.904608,-0.512822,-0.519205,0.788661,...,-0.461430,-1.036482,-0.338974,-1.002939,-0.531023,-0.169505,-0.222472,-0.801973,-0.310293,-0.748810
6354,1.077179,-0.647516,-0.705758,-0.595332,0.307480,-0.047721,-0.548778,-0.152842,-0.261911,0.288206,...,-0.609686,-0.683326,-0.231183,-0.673606,-0.498344,-0.187587,-0.219697,-0.662276,-0.053952,-0.575016
6355,1.272975,-0.629640,-0.679687,-0.568447,0.595411,-0.291497,-1.094056,-0.577804,-0.667612,0.788621,...,-0.520666,-1.151020,-0.406648,-1.100529,-0.611898,-0.276970,-0.216061,-1.137547,-0.351278,-0.954436
6356,0.960799,-0.252555,-0.456112,-0.301157,0.485259,-0.173997,-0.699809,-0.344446,-0.478135,0.663295,...,-0.563338,-0.914220,-0.300697,-1.100941,-0.475234,-0.005453,-0.435495,-0.656439,-0.179624,-0.623041
