# Exercise 2 Unsupervised Learning

In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

Our working set is the same as previously, and the pre-processing is similar

In [2]:
df = pd.read_csv('CrowdstormingDataJuly1st.csv', index_col='playerShort')
df.sample(10)

Unnamed: 0_level_0,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
luuk-de-jong,Luuk de Jong,Bor. Mönchengladbach,Germany,27.08.1990,188.0,86.0,Center Forward,1,0,0,...,0.0,2169,44,ENGL,0.32669,44791.0,1e-05,0.356446,46916.0,3.7e-05
kevin-monnet-paquet,Kévin Monnet-Paquet,FC Lorient,France,19.08.1988,182.0,71.0,Right Winger,4,2,1,...,,2796,7,FRA,0.334684,2882.0,0.000151,0.336101,3011.0,0.000586
philipp-klement,Philipp Klement,1. FC Nürnberg,Germany,09.09.1992,174.0,69.0,Center Midfielder,2,0,1,...,0.0,2896,8,DEU,0.336628,7749.0,5.5e-05,0.335967,7974.0,0.000225
ronny-garbuschewski,Ronny Garbuschewski,Fortuna Düsseldorf,Germany,23.02.1986,178.0,75.0,Right Midfielder,1,1,0,...,0.25,1799,8,DEU,0.336628,7749.0,5.5e-05,0.335967,7974.0,0.000225
adam-szalai,Ádám Szalai,1. FSV Mainz 05,Germany,09.12.1987,193.0,87.0,Center Forward,1,1,0,...,0.25,375,8,DEU,0.336628,7749.0,5.5e-05,0.335967,7974.0,0.000225
bernd-leno,Bernd Leno,Bayer Leverkusen,Germany,04.03.1992,190.0,79.0,Goalkeeper,1,0,1,...,0.0,110,52,RUS,0.398174,526.0,0.000809,1.212727,550.0,0.004521
marc-hornschuh,Marc Hornschuh,Borussia Dortmund,Germany,02.03.1991,188.0,76.0,Center Back,3,1,0,...,0.25,2920,8,DEU,0.336628,7749.0,5.5e-05,0.335967,7974.0,0.000225
pirmin-schwegler,Pirmin Schwegler,Eintracht Frankfurt,Germany,09.03.1987,178.0,68.0,Defensive Midfielder,4,1,2,...,0.0,1646,8,DEU,0.336628,7749.0,5.5e-05,0.335967,7974.0,0.000225
alexey-bosetti,Alexey Bosetti,OGC Nice,France,23.04.1993,172.0,65.0,,5,0,3,...,,2619,7,FRA,0.334684,2882.0,0.000151,0.336101,3011.0,0.000586
peter-odemwingie,Peter Odemwingie,West Bromwich Albion,England,15.07.1981,182.0,75.0,Center Forward,3,3,0,...,0.5,3045,52,RUS,0.398174,526.0,0.000809,1.212727,550.0,0.004521


We can only work with the soccer players that have been rated by the 2 raters and we define the 'target' to be the average of the raters' ratings (that will not be part of the features)

In [3]:
# Remove rows with no rating
df = df.dropna(subset=['rater1', 'rater2'])
df['target'] = (df.rater1 + df.rater2)/2

In [4]:
sdf = df.reset_index().groupby('playerShort')

In [5]:
#functions to apply for aggregation
feature_function_mapper={
    'height': np.average,
    'weight': np.average,
    'victories': np.sum,
    'ties': np.sum,
    'defeats': np.sum,
    'goals': np.sum,
    'yellowCards': np.sum,
    'yellowReds': np.sum,
    'redCards': np.sum,
    'target': np.average,
    'meanIAT': np.average,
    'nIAT': np.average,
    'seIAT': lambda x: np.sqrt(np.average(np.square(x))),
    'meanExp': np.average,
    'nExp': np.average,
    'seExp': lambda x: np.sqrt(np.average(np.square(x))),
}

sdf_agg=sdf.agg(feature_function_mapper)
sdf_agg.sample(10)

Unnamed: 0_level_0,meanIAT,ties,redCards,defeats,yellowCards,height,yellowReds,meanExp,weight,victories,nIAT,seExp,seIAT,nExp,target,goals
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
romelu-lukaku,0.349331,45,0,63,12,193.0,1,0.4965,95.0,103,14774.345679,0.008675,0.00128,15467.407407,0.75,79
steve-harper,0.336581,58,0,65,1,188.0,0,0.435982,82.0,97,34573.139241,0.001378,0.000272,36215.088608,0.125,0
mike-pollitt,0.32669,50,3,100,5,193.0,0,0.356446,93.0,55,44791.0,3.7e-05,1e-05,46916.0,0.25,0
gabi,0.362223,85,0,112,112,180.0,7,0.538952,74.0,186,6477.987342,0.001407,0.000315,6795.227848,0.0,25
lkay-guendogan,0.341748,30,0,44,12,180.0,0,0.389297,79.0,94,6820.850746,0.002916,0.000577,7057.402985,0.25,31
baba-diawara,0.384629,41,0,65,12,179.0,1,0.696257,76.0,57,2126.960784,0.005326,0.000941,2232.294118,1.0,47
kassim-abdallah,0.339774,49,3,41,14,185.0,1,0.370064,74.0,48,3465.254902,0.002153,0.000449,3622.843137,1.0,2
nemanja-pejcinovic,0.341417,34,1,54,18,185.0,0,0.37859,84.0,44,4006.736842,0.002498,0.0005,4150.385965,0.25,7
ammar-jemal,,27,0,31,19,186.0,2,,81.0,41,,,,,0.625,12
sebastien-bassong,0.336094,67,3,98,17,187.0,1,0.412868,84.0,66,18766.885714,0.02849,0.00538,19649.228571,1.0,6


Now that we have our set cleaned, let's have a "blind" clustering technique with all features

In [6]:
features=['redCards', 'weight', 'meanExp', 'nExp', 'defeats', 'yellowCards', 'seExp', 'victories', 'seIAT', 'height', 'nIAT', 'goals', 'ties']
X_unsup=sdf_agg[features]

#Clustering process cannot be done with NaN values
X_unsup.fillna(X_unsup.mean(axis=0), inplace=True).head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Unnamed: 0_level_0,redCards,weight,meanExp,nExp,defeats,yellowCards,seExp,victories,seIAT,height,nIAT,goals,ties
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
aaron-hughes,0,71.0,0.494575,20637.277108,228,19,0.027536,247,0.003315,182.0,19710.156627,9,179
aaron-hunt,1,73.0,0.44922,26864.454545,122,42,0.002506,141,0.000543,183.0,26104.292929,62,73
aaron-lennon,0,63.0,0.491482,22238.742574,115,11,0.008914,200,0.001268,165.0,21234.861386,31,97
aaron-ramsey,1,76.0,0.514693,39719.980769,68,31,0.104541,150,0.028116,178.0,38285.826923,39,42
abdelhamid-el-kaoutari,2,73.0,0.335587,2953.837838,43,8,0.023426,41,0.006309,180.0,2832.351351,1,40


Let's use KMeans as clustering algorithm, with 2 clusters

In [7]:
km=KMeans(n_clusters=2).fit(X_unsup)
km.predict(X_unsup)

array([0, 0, 0, ..., 0, 0, 0])

In [8]:
print('Silhouette score: {}'.format(silhouette_score(X_unsup, km.labels_, metric='euclidean')))

Silhouette score: 0.8995915023031196


The silhouette score is pretty good but how good is the clustering compared to the predicted target?
We cannot know in advance what cluster corresponds to the white/black skin colour, but there should be a cluster K with target values 0 to 0.5 and the other 0.5 to 1.

In [9]:
X_unsup['K']=km.labels_
X_unsup['target']=sdf_agg['target']
X_unsup.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0_level_0,redCards,weight,meanExp,nExp,defeats,yellowCards,seExp,victories,seIAT,height,nIAT,goals,ties,K,target
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
aaron-hughes,0,71.0,0.494575,20637.277108,228,19,0.027536,247,0.003315,182.0,19710.156627,9,179,0,0.125
aaron-hunt,1,73.0,0.44922,26864.454545,122,42,0.002506,141,0.000543,183.0,26104.292929,62,73,0,0.125
aaron-lennon,0,63.0,0.491482,22238.742574,115,11,0.008914,200,0.001268,165.0,21234.861386,31,97,0,0.25
aaron-ramsey,1,76.0,0.514693,39719.980769,68,31,0.104541,150,0.028116,178.0,38285.826923,39,42,0,0.0
abdelhamid-el-kaoutari,2,73.0,0.335587,2953.837838,43,8,0.023426,41,0.006309,180.0,2832.351351,1,40,0,0.25


In [10]:
#X_unsup.target <= 0.5 : white skin
df1 = X_unsup[X_unsup.target <= 0.5]
df2 = df1[df1.K == 0]
white_in_clust0 = len(df2)
clust0_size = len(X_unsup[X_unsup.K == 0])
white_in_clust0 / clust0_size

0.8321633694958519

In [11]:
#X_unsup.target <= 0.5 : white skin
df3 = X_unsup[X_unsup.target <= 0.5]
df4 = df3[df3.K == 1]
white_in_clust1 = len(df4)
clust1_size = len(X_unsup[X_unsup.K == 1])
white_in_clust1 / clust1_size

0.6666666666666666

Perfect results should be 0% in one cluster and 100% in ther other, so we can conclude that our first result is not good

Let's try to be a bit more clever and keep only the features related to race:

In [12]:
features = ['meanExp', 'meanIAT', 'nIAT', 'nExp', 'seExp', 'seIAT']
X_unsup = sdf_agg[features]
X_unsup.sample(10)

Unnamed: 0_level_0,meanExp,meanIAT,nIAT,nExp,seExp,seIAT
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
david-cobeno,0.584075,0.368464,1909.818182,2024.4,0.001003,0.000228
predrag-stevanovic,0.335967,0.336628,7749.0,7974.0,0.000225,5.5e-05
chris-mavinga,0.59344,0.361852,2977.111111,3105.611111,0.00326,0.000636
paul-verhaegh,0.407955,0.346578,6355.5,6544.419355,0.000391,8.8e-05
julian-korb,0.340854,0.337254,7624.946237,7846.946237,0.000297,6.9e-05
sercan-sararer,0.37189,0.341438,7450.435897,7680.089744,0.002061,0.000407
xabi-alonso,0.53581,0.352473,7995.116564,8371.239264,0.028202,0.010972
loris-karius,0.335967,0.336628,7749.0,7974.0,0.000225,5.5e-05
aranzubia,0.560017,0.36336,3048.148148,3209.962963,0.012749,0.003078
diego-perotti,0.565462,0.360078,4667.492537,4898.686567,0.046287,0.003444


In [13]:
X_unsup.fillna(X_unsup.mean(axis=0), inplace=True).head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Unnamed: 0_level_0,meanExp,meanIAT,nIAT,nExp,seExp,seIAT
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
aaron-hughes,0.494575,0.346459,19710.156627,20637.277108,0.027536,0.003315
aaron-hunt,0.44922,0.348818,26104.292929,26864.454545,0.002506,0.000543
aaron-lennon,0.491482,0.345893,21234.861386,22238.742574,0.008914,0.001268
aaron-ramsey,0.514693,0.346821,38285.826923,39719.980769,0.104541,0.028116
abdelhamid-el-kaoutari,0.335587,0.3316,2832.351351,2953.837838,0.023426,0.006309


In [14]:
km = KMeans(n_clusters=2).fit(X_unsup)
km.predict(X_unsup)
print('Silhouette score: {}'.format(silhouette_score(X_unsup, km.labels_, metric='euclidean')))
X_unsup['K']=km.labels_
X_unsup['target']=sdf_agg['target']

Silhouette score: 0.8996092291080771


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [15]:
X_unsup.head(15)

Unnamed: 0_level_0,meanExp,meanIAT,nIAT,nExp,seExp,seIAT,K,target
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
aaron-hughes,0.494575,0.346459,19710.156627,20637.277108,0.027536,0.003315,0,0.125
aaron-hunt,0.44922,0.348818,26104.292929,26864.454545,0.002506,0.000543,0,0.125
aaron-lennon,0.491482,0.345893,21234.861386,22238.742574,0.008914,0.001268,0,0.25
aaron-ramsey,0.514693,0.346821,38285.826923,39719.980769,0.104541,0.028116,0,0.0
abdelhamid-el-kaoutari,0.335587,0.3316,2832.351351,2953.837838,0.023426,0.006309,0,0.25
abdou-traore_2,0.296562,0.320079,4523.666667,4741.404762,0.029404,0.01216,0,0.75
abdoulaye-diallo_2,0.400818,0.341625,2646.7,2762.5,0.001499,0.000353,0,0.875
abdoulaye-keita_2,0.417225,0.355406,1728.0,1808.5,0.002139,0.000516,0,0.875
abdoulwhaid-sissoko,0.42963,0.348178,2470.196429,2593.214286,0.000771,0.000184,0,1.0
abdul-rahman-baba,0.361068,0.342072,6829.62963,7039.222222,0.002483,0.00043,0,0.875


In [16]:
df1 = X_unsup[X_unsup.target <= 0.5]
df2 = df1[df1.K == 0]
white_in_clust0 = len(df2)
clust0_size = len(X_unsup[X_unsup.K == 0])
print('Proportion of white players in cluster 0 : {}'.format(white_in_clust0 / clust0_size))


df3 = X_unsup[X_unsup.target <= 0.5]
df4 = df3[df3.K == 1]
white_in_clust1 = len(df4)
clust1_size = len(X_unsup[X_unsup.K == 1])
print('Proportion of white players in cluster 1 : {}'.format(white_in_clust1 / clust1_size))
print('Clustering score : {}'.format(np.abs(white_in_clust0 / clust0_size - white_in_clust1 / clust1_size)))

Proportion of white players in cluster 0 : 0.8321633694958519
Proportion of white players in cluster 1 : 0.6666666666666666
Clustering score : 0.16549670282918527


This is not optimal either... We want now to determine the feature to drop by algorithm

# Trying all possible combinations of features

In [17]:
'''
    Computes the clustering score, return a value between 0 and 1.
    1 means that all samples were assigned to the correct cluster,
    0 the opposite.
    i.e all white skin players are in the same cluster and 
    all black skin players are in the same cluster, which is different from the 
    one with the white players 
'''
def compute_cluster_score(table):
    #table.target > 0.5 : black skin
    df1=table[table.target > 0.5]
    df2=df1[df1.K == 0]
    black_in_clust0 = len(df2)
    clust0_size=len(table[table.K == 0])
    prop0 =black_in_clust0 / clust0_size
    
    df3=table[table.target > 0.5]
    df4=df3[df3.K == 1]
    black_in_clust1=len(df4)
    clust1_size=len(table[table.K == 1])
    prop1=black_in_clust1 / clust1_size
    
    return np.abs(prop0- prop1)

In [18]:
def compute_kmeans(table):
    km=KMeans(n_clusters=2).fit(table)
    predictions = km.predict(table)
    table['K']=km.labels_
    score=silhouette_score(table, predictions, metric='euclidean')
    prop=compute_cluster_score(table)
    
    return score, prop

We want to find all possible combinations of the features, from 1 to 15 features

In [19]:
#found on http://python.jpvweb.com/mesrecettespython/doku.php?id=combinaisons
def combinliste(seq, k):
    p = []
    i, imax = 0, 2**len(seq)-1
    while i<=imax:
        s = []
        j, jmax = 0, len(seq)-1
        while j<=jmax:
            if (i>>j)&1==1:
                s.append(seq[j])
            j += 1
        if len(s)==k:
            p.append(s)
        i += 1 
    return p

In [20]:
def generate_feature_combinations(feats):
    combinations = []
    for i in range(len(feats)):
        combinations.append(combinliste(feats, i))
        
    return combinations

Main algorithm : computes all scores of combinations and outputs them in a dataframe

In [21]:
def compute_all_combinations(combinations, table) :
    X_comb = table.copy()
    X_comb.fillna(X_comb.mean(axis=0), inplace=True)
    silhouettes =[]
    clust_scores = []
    features = []
    for i in range(len(combinations)):
        for j in range(len(combinations[i])):
            X_temp = X_comb.drop(combinations[i][j], axis=1)
            silhouette, clustering_score = compute_kmeans(X_temp)
            silhouettes.append(silhouette)
            clust_scores.append(clustering_score)
            features.append(combinations[i][j]) 
            
    kmean_stats = pd.DataFrame({'Silhouette': silhouettes,
                        'Clustering_scores' : clust_scores,
                        'Dropped Features' :features})
    kmean_stats.set_index('Clustering_scores', inplace=True)
    kmean_stats.sort_index(ascending=False, inplace=True)
    
    return kmean_stats

From all the available features, we choose to drop some features that we judged that have nothing to do with the skin colour to have a lighter computation process. 
So let's find the ones that maximize the score when removed.

In [22]:
drop_feats = ['goals','ties','height','victories','weight','defeats']
all_combinations = generate_feature_combinations(drop_feats)

result = compute_all_combinations(all_combinations, sdf_agg)

In [23]:
result.head(10)

Unnamed: 0_level_0,Dropped Features,Silhouette
Clustering_scores,Unnamed: 1_level_1,Unnamed: 2_level_1
0.165497,[],0.899591
0.165497,"[goals, ties, victories, defeats]",0.899607
0.165497,[goals],0.899594
0.165497,"[goals, height, defeats]",0.899597
0.165497,"[goals, victories, defeats]",0.899604
0.165497,"[goals, weight, defeats]",0.899597
0.165497,"[ties, weight, defeats]",0.899596
0.165497,"[height, weight, defeats]",0.899594
0.165497,"[victories, weight, defeats]",0.899602
0.165497,"[goals, ties, height, victories]",0.899603


Let's the find the best combination of features!

In [24]:
#The features we want to iteratively remove are now the following:
drop_feats = ['nIAT','meanExp','yellowCards','redCards','seExp','meanIAT','seIAT','nExp','yellowReds']
all_combinations = generate_feature_combinations(drop_feats)

In [25]:
noise_generating_features=result.max()['Dropped Features']
noiseless_table=sdf_agg.copy()
noiseless_table.drop(noise_generating_features, inplace=True, axis =1)
noiseless_table.sample(10)

Unnamed: 0_level_0,meanIAT,ties,redCards,yellowCards,height,yellowReds,meanExp,victories,nIAT,seExp,seIAT,nExp,target,goals
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
alexander-buettner,0.345416,26,0,18,174.0,1,0.446322,54,17221.285714,0.002749,0.000581,17986.571429,0.25,12
jesse-lingard,0.333823,12,1,1,168.0,0,0.395531,16,40388.366667,0.00235,0.000458,42306.0,0.5,10
allan-nyom,,36,0,42,186.0,1,,48,,,,,0.75,1
samuel-umtiti,0.347975,25,1,15,182.0,1,0.499077,46,4399.617021,0.005201,0.000987,4596.489362,1.0,4
jussi-jaeaeskelaeinen,0.338893,137,4,28,193.0,0,0.452891,171,25106.337349,0.003991,0.000834,26292.746988,0.0,0
kaka,,102,0,44,186.0,2,,318,,,,,0.0,161
moussa-sissoko,0.343987,73,0,39,187.0,1,0.41194,103,11157.379747,0.001939,0.000413,11682.683544,1.0,27
mario-gomez,0.35331,86,0,36,189.0,1,0.496693,254,6041.738806,0.003058,0.000613,6280.828358,0.25,251
arizmendi,0.365541,72,0,35,189.0,2,0.567017,100,1917.156627,0.002046,0.000434,2018.409639,0.25,36
ioannis-gelios,0.338574,22,1,1,190.0,0,0.350772,27,7255.522727,0.005804,0.000782,7466.704545,0.0,0


We found the feature(s) to drop, now we try dropping other features with first removing the ones we just found.

In [26]:
result1=compute_all_combinations(all_combinations, noiseless_table)

In [27]:
result1.head(10)

Unnamed: 0_level_0,Dropped Features,Silhouette
Clustering_scores,Unnamed: 1_level_1,Unnamed: 2_level_1
0.260004,"[nIAT, seExp, yellowReds]",0.932653
0.206325,"[nIAT, yellowCards, redCards, seExp]",0.928065
0.206325,"[nIAT, meanExp, redCards, seIAT]",0.928064
0.206325,"[yellowCards, seExp, seIAT, nExp, yellowReds]",0.928698
0.206325,"[meanExp, yellowCards, meanIAT]",0.928377
0.165497,"[nIAT, meanExp, seExp, meanIAT, seIAT]",0.899103
0.165497,"[meanExp, yellowCards, redCards, meanIAT, seIAT]",0.899595
0.165497,"[nIAT, yellowCards, redCards, meanIAT, seIAT]",0.899105
0.165497,"[nIAT, meanExp, redCards, seExp, seIAT]",0.899103
0.165497,"[nIAT, meanExp, yellowCards, seExp, seIAT]",0.899105


In [28]:
best_dropped_features=result1.iloc[0]['Dropped Features']
best_features=set(drop_feats) - set(best_dropped_features)
print('Best configuration of features {}'.format(best_features))
final_result=result1.reset_index()
best_clustering_score=final_result.iloc[0]['Clustering_scores']
best_silhouette_score=final_result.iloc[0]['Silhouette']
print('Best clustering score : {}, best silhouette score : {}'.format(best_clustering_score, best_silhouette_score))

Best configuration of features {'meanIAT', 'redCards', 'yellowCards', 'seIAT', 'meanExp', 'nExp'}
Best clustering score : 0.26000362122035126, best silhouette score : 0.932653161512224


As a conclusion, we found an optimal configuration of features with high silhouette score, however the cluster score is quite low.
We can then argue that the results are not very good and ask ourselves if the provided features were the best to assess the players' skin colours.