# Exercise 2 Unsupervised Learning

In [2]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

Our working set is the same as before, and the pre-processing is similar.

In [3]:
# Load the raw dataset, using the 'playerShort' column as the index
df = pd.read_csv(
    'CrowdstormingDataJuly1st.csv',
    index_col='playerShort',
)
# Peek at ten random rows
df.sample(10)

OSError: File b'CrowdstormingDataJuly1st.csv' does not exist

We can only work with the soccer players that have been rated by both raters. We define the 'target' as the average of the two raters' ratings (it will not be part of the features).

In [None]:
# Keep only the rows that carry both ratings, then define the target
# as the mean of the two ratings.
df = (
    df.dropna(subset=['rater1', 'rater2'])
      .assign(target=lambda rated: (rated['rater1'] + rated['rater2']) / 2)
)

In [None]:
# Group the rows by player: 'playerShort' is moved from the index back to
# a regular column so that it can be used as the grouping key.
sdf = (
    df.reset_index()
      .groupby('playerShort')
)

In [None]:
def root_mean_square(values):
    """Return the square root of the mean of the squared ``values``."""
    return np.sqrt(np.average(np.square(values)))


# Aggregation applied to each column when collapsing the rows of one player:
# physical attributes, target and IAT/Exp statistics are averaged, match
# counters are summed, and the two 'se*' columns use the root mean square.
feature_function_mapper = {
    # averaged columns
    'height': np.average,
    'weight': np.average,
    # summed counters
    'victories': np.sum,
    'ties': np.sum,
    'defeats': np.sum,
    'goals': np.sum,
    'yellowCards': np.sum,
    'yellowReds': np.sum,
    'redCards': np.sum,
    # label
    'target': np.average,
    # implicit / explicit bias statistics
    'meanIAT': np.average,
    'nIAT': np.average,
    'seIAT': root_mean_square,
    'meanExp': np.average,
    'nExp': np.average,
    'seExp': root_mean_square,
}

sdf_agg = sdf.agg(feature_function_mapper)
sdf_agg.sample(10)

Now that our data set is cleaned, let's try a "blind" clustering using all the features.

In [None]:
features = ['redCards', 'weight', 'meanExp', 'nExp', 'defeats', 'yellowCards', 'seExp',
            'victories', 'seIAT', 'height', 'nIAT', 'goals', 'ties']
X_unsup = sdf_agg[features]

# Clustering cannot be done with NaN values: replace them by the column mean.
# fillna(..., inplace=True) returns None (so chaining .head() on it fails);
# use the returned frame instead, which is also independent from sdf_agg.
X_unsup = X_unsup.fillna(X_unsup.mean(axis=0))
X_unsup.head()

Let's use KMeans as clustering algorithm, with 2 clusters

In [None]:
# KMeans starts from random centroids: fix the seed so that the cluster
# assignment (and every number derived from it below) is reproducible.
km = KMeans(n_clusters=2, random_state=0).fit(X_unsup)
km.predict(X_unsup)

In [None]:
# Silhouette of the fitted clustering, using Euclidean distances
sil_score = silhouette_score(X_unsup, km.labels_, metric='euclidean')
print('Silhouette score: {}'.format(sil_score))

The silhouette score is pretty good, but how well does the clustering match the target?
We cannot know in advance which cluster corresponds to white or black skin colour, but ideally one cluster K should contain the target values from 0 to 0.5 and the other those from 0.5 to 1.

In [None]:
# Attach the cluster label and the (index-aligned) target next to the features
X_unsup = X_unsup.assign(
    K=km.labels_,
    target=sdf_agg['target'],
)
X_unsup.head()

In [None]:
# Players with target <= 0.5 are considered white-skinned
df1 = X_unsup.loc[X_unsup['target'] <= 0.5]
df2 = df1.loc[df1['K'] == 0]
white_in_clust0 = df2.shape[0]
clust0_size = X_unsup.loc[X_unsup['K'] == 0].shape[0]
# Share of cluster 0 made of white-skinned players
white_in_clust0 / clust0_size

In [None]:
# Players with target <= 0.5 are considered white-skinned
df3 = X_unsup.loc[X_unsup['target'] <= 0.5]
df4 = df3.loc[df3['K'] == 1]
white_in_clust1 = df4.shape[0]
clust1_size = X_unsup.loc[X_unsup['K'] == 1].shape[0]
# Share of cluster 1 made of white-skinned players
white_in_clust1 / clust1_size

Perfect results would be 0% in one cluster and 100% in the other, so we can conclude that our first result is not good.

Let's try to be a bit more clever and keep only the features related to race:

In [None]:
# Restrict the feature matrix to the implicit / explicit bias statistics
features = [
    'meanExp', 'meanIAT',
    'nIAT', 'nExp',
    'seExp', 'seIAT',
]
X_unsup = sdf_agg.loc[:, features]
X_unsup.sample(10)

In [None]:
# Replace missing values by the column mean. fillna(..., inplace=True)
# returns None, so the result must be re-bound before calling .head().
X_unsup = X_unsup.fillna(X_unsup.mean(axis=0))
X_unsup.head()

In [None]:
# Seeded so that the clustering is reproducible from one run to the next
km = KMeans(n_clusters=2, random_state=0).fit(X_unsup)
# The silhouette is computed here, while X_unsup still holds features only
print('Silhouette score: {}'.format(silhouette_score(X_unsup, km.labels_, metric='euclidean')))
# Re-bind to a new frame instead of assigning columns on a slice of sdf_agg
X_unsup = X_unsup.assign(K=km.labels_, target=sdf_agg['target'])

In [None]:
# Display the first 15 rows of the clustered table
X_unsup.head(15)

In [None]:
# Cluster 0: share of white-skinned players (target <= 0.5)
df1 = X_unsup.loc[X_unsup['target'] <= 0.5]
df2 = df1.loc[df1['K'] == 0]
white_in_clust0 = df2.shape[0]
clust0_size = X_unsup.loc[X_unsup['K'] == 0].shape[0]
prop_white_clust0 = white_in_clust0 / clust0_size
print('Proportion of white players in cluster 0 : {}'.format(prop_white_clust0))


# Cluster 1: same computation
df3 = X_unsup.loc[X_unsup['target'] <= 0.5]
df4 = df3.loc[df3['K'] == 1]
white_in_clust1 = df4.shape[0]
clust1_size = X_unsup.loc[X_unsup['K'] == 1].shape[0]
prop_white_clust1 = white_in_clust1 / clust1_size
print('Proportion of white players in cluster 1 : {}'.format(prop_white_clust1))

# The score is the absolute gap between the two proportions
print('Clustering score : {}'.format(np.abs(prop_white_clust0 - prop_white_clust1)))

This is not optimal either... We now want to determine algorithmically which features to drop.

# Trying all possible combinations of features

In [None]:
def compute_cluster_score(table):
    """Measure how well the two clusters separate the skin-colour classes.

    A player is counted as dark-skinned when ``target > 0.5``. The score is
    the absolute difference between the proportion of dark-skinned players
    in cluster 0 and in cluster 1:

    * 1 means one cluster holds only dark-skinned players and the other
      none (which cluster is which does not matter);
    * 0 means both clusters hold the same proportion, i.e. the clustering
      carries no information about the target.

    Parameters
    ----------
    table : pandas.DataFrame
        Must expose a ``target`` column and a ``K`` column holding the
        cluster label (0 or 1).

    Returns
    -------
    float
        Score between 0 and 1. If one of the two clusters is empty there is
        no separation to measure and 0.0 is returned (instead of raising
        ``ZeroDivisionError``).
    """
    is_dark = table.target > 0.5

    proportions = []
    for cluster_id in (0, 1):
        in_cluster = table.K == cluster_id
        cluster_size = int(in_cluster.sum())
        if cluster_size == 0:
            # Everything ended up in a single cluster: nothing is separated
            return 0.0
        dark_in_cluster = int((is_dark & in_cluster).sum())
        proportions.append(dark_in_cluster / cluster_size)

    return np.abs(proportions[0] - proportions[1])

In [None]:
def compute_kmeans(table):
    """Cluster ``table`` into two groups and evaluate the result.

    The ``target`` column (the label the clustering is evaluated against)
    and any ``K`` column left over from a previous call are excluded from
    the data given to KMeans and to the silhouette score; otherwise the
    label and the cluster assignment themselves would be used as features.

    Parameters
    ----------
    table : pandas.DataFrame
        Feature columns plus a ``target`` column. A ``K`` column with the
        cluster labels is added to (or overwritten in) ``table``.

    Returns
    -------
    tuple
        ``(score, prop)``: the silhouette score (Euclidean metric) and the
        value returned by ``compute_cluster_score``.
    """
    non_feature_columns = [col for col in ('target', 'K') if col in table.columns]
    feature_matrix = table.drop(non_feature_columns, axis=1)

    km = KMeans(n_clusters=2).fit(feature_matrix)
    # Score on the features only, before the label column is written
    score = silhouette_score(feature_matrix, km.labels_, metric='euclidean')

    table['K'] = km.labels_
    prop = compute_cluster_score(table)

    return score, prop

We want to find all possible combinations of the features to drop, for every combination size from 0 up to one less than the number of candidate features.

In [None]:
#found on http://python.jpvweb.com/mesrecettespython/doku.php?id=combinaisons
def combinliste(seq, k):
    p = []
    i, imax = 0, 2**len(seq)-1
    while i<=imax:
        s = []
        j, jmax = 0, len(seq)-1
        while j<=jmax:
            if (i>>j)&1==1:
                s.append(seq[j])
            j += 1
        if len(s)==k:
            p.append(s)
        i += 1 
    return p

In [None]:
def generate_feature_combinations(feats):
    """Build the combinations of ``feats`` grouped by size.

    Element ``i`` of the returned list is ``combinliste(feats, i)``, for
    ``i`` from 0 to ``len(feats) - 1``: the empty combination is included,
    the combination containing every feature is not.
    """
    return [combinliste(feats, size) for size in range(len(feats))]

Main algorithm : computes all scores of combinations and outputs them in a dataframe

In [None]:
def compute_all_combinations(combinations, table) :
    """Run ``compute_kmeans`` once per combination of dropped columns.

    Parameters
    ----------
    combinations : list of list of list
        Groups of combinations; each innermost list holds the column names
        to drop from ``table`` for one run.
    table : pandas.DataFrame
        Input data. It is not modified: missing values are replaced by the
        column mean in a separate frame.

    Returns
    -------
    pandas.DataFrame
        One row per run with the columns 'Silhouette' and
        'Dropped Features', indexed by 'Clustering_scores' and sorted by
        that index in descending order.
    """
    filled = table.fillna(table.mean(axis=0))

    silhouettes, clust_scores, features = [], [], []
    for group in combinations:
        for dropped in group:
            reduced = filled.drop(dropped, axis=1)
            silhouette, clustering_score = compute_kmeans(reduced)
            silhouettes.append(silhouette)
            clust_scores.append(clustering_score)
            features.append(dropped)

    kmean_stats = pd.DataFrame({'Silhouette': silhouettes,
                                'Clustering_scores': clust_scores,
                                'Dropped Features': features})

    return kmean_stats.set_index('Clustering_scores').sort_index(ascending=False)

From all the available features, we first consider dropping those that we judged to have nothing to do with skin colour, in order to lighten the computation.
So let's find the ones that maximize the score when removed.

In [None]:
# First pass: the candidates for removal are the features judged
# unrelated to skin colour
drop_feats = [
    'goals', 'ties', 'height',
    'victories', 'weight', 'defeats',
]
all_combinations = generate_feature_combinations(feats=drop_feats)

result = compute_all_combinations(combinations=all_combinations, table=sdf_agg)

In [None]:
# First ten rows of the first search (compute_all_combinations returns the
# runs sorted by clustering score, highest first)
result.head(10)

Let's find the best combination of features!

In [None]:
#The features we want to iteratively remove are now the following:
drop_feats = ['nIAT','meanExp','yellowCards','redCards','seExp','meanIAT','seIAT','nExp','yellowReds']
all_combinations = generate_feature_combinations(drop_feats)

In [None]:
# `result` is sorted by clustering score in descending order, so its first
# row is the best run. (result.max() would instead take the maximum of each
# column independently, i.e. the "largest" list of names, unrelated to the
# best score.)
noise_generating_features = result.iloc[0]['Dropped Features']
# drop() returns a new frame: sdf_agg itself is left untouched
noiseless_table = sdf_agg.drop(noise_generating_features, axis=1)
noiseless_table.sample(10)

We found the feature(s) to drop; now we try dropping other features, after first removing the ones we just found.

In [None]:
# Second search: score every candidate combination on the reduced table
result1=compute_all_combinations(all_combinations, noiseless_table)

In [None]:
# First ten rows of the second search (sorted by clustering score, highest first)
result1.head(10)

In [None]:
# result1 is sorted by clustering score (descending): row 0 is the best run
final_result = result1.reset_index()
best_run = final_result.iloc[0]

# Candidate features that were NOT dropped in the best run
best_dropped_features = best_run['Dropped Features']
best_features = set(drop_feats).difference(best_dropped_features)
print('Best configuration of features {}'.format(best_features))

best_clustering_score = best_run['Clustering_scores']
best_silhouette_score = best_run['Silhouette']
print('Best clustering score : {}, best silhouette score : {}'.format(best_clustering_score, best_silhouette_score))

As a conclusion, we found an optimal configuration of features with high silhouette score, however the cluster score is quite low.
We can then argue that the results are not very good and ask ourselves if the provided features were the best to assess the players' skin colours.