In [None]:
import inspect
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

In [None]:
# Tag stored in the 'Embeddings' column of the results tables built further down.
EMBEDDINGS = "PCA"

# Player embeddings table, read from a path relative to this notebook.
df = pd.read_csv('../../datasets/soccer_player_embeddings_v1.csv')
df

In [None]:
# Feature matrix: the columns at positions 1 through 14 of df
# (presumably the embedding dimensions -- TODO confirm against the CSV layout).
X = df.iloc[:, 1:15]

In [None]:
# Position string of every player; the cells below split it on ', ' to handle
# players that list more than one position.
y = df['player_positions']

In [None]:
# Flatten the position strings into one list of individual positions.
# str.split returns the whole string as a single item when ', ' is absent,
# so single-position players need no special case.
labels = [position for entry in y.values for position in entry.split(", ")]

# From here on `labels` is the sorted array of distinct positions; later cells
# (e.g. labels_in_cluster) read it as a global.
labels, counts = np.unique(labels, return_counts=True)
for position, count in zip(labels, counts):
    print("Position: {}, count {}".format(position, count))

In [None]:
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
for split_features in (X_train, X_test):
    print(split_features.shape)

# Clustering Evaluation

In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.cm as cm

In [None]:
def silhouette_blob(samples, cluster_labels, cluster_centres=None, title=None, save_link=None):
    """Plot per-cluster silhouette bands beside a 2-D scatter of the clustering.

    Parameters
    ----------
    samples : array-like, shape (n_samples, n_features)
        Data that was clustered. The silhouette values use every column; the
        scatter plot draws only the first two columns.
    cluster_labels : array-like, shape (n_samples,)
        Cluster id of each sample. Bands are built for ids 0 .. n_clusters-1,
        so the ids are expected to be consecutive numbers starting at 0.
    cluster_centres : array-like, shape (n_centres, n_features), optional
        When given, each centre is circled and numbered on the scatter plot.
    title : str, optional
        Used in the figure title and in the saved file name.
    save_link : str, optional
        Folder in which the figure is saved as '<title> Silhouette Scores.png'.

    Returns
    -------
    float
        Mean silhouette score over all samples.
    """
    # Accept lists / DataFrames as well as ndarrays: the code below relies on
    # boolean masks and [:, k] indexing.
    samples = np.asarray(samples)
    cluster_labels = np.asarray(cluster_labels)
    n_clusters = len(np.unique(cluster_labels))

    # One row, two panels: silhouette bands (left) and scatter (right).
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 10)

    # The silhouette_score gives the average value for all the samples.
    silhouette_avg = silhouette_score(samples, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Silhouette value of every individual sample.
    sample_silhouette_values = silhouette_samples(samples, cluster_labels)

    # ---- Left panel: silhouette bands -------------------------------------
    # Silhouette coefficients range over [-1, 1]; anything below -0.1 falls
    # outside the visible x-range.
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters + 1) * 10 leaves blank space between the bands.
    ax1.set_ylim([0, len(samples) + (n_clusters + 1) * 10])

    y_lower = 10
    for i in range(n_clusters):
        # Sorted silhouette values of the samples assigned to cluster i.
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == i])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the band with its cluster number at mid-height.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Leave a 10-row gap before the next band.
        y_lower = y_upper + 10

    # Everything below is drawn once per figure, not once per cluster.
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Vertical line at the average silhouette score of all samples.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # ---- Right panel: the clustered samples --------------------------------
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(samples[:, 0], samples[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    if cluster_centres is not None:
        cluster_centres = np.asarray(cluster_centres)
        # Draw white circles at cluster centers
        ax2.scatter(cluster_centres[:, 0], cluster_centres[:, 1], marker='o',
                    c="white", alpha=1, s=200, edgecolor='k')

        # Write each centre's index inside its circle.
        for centre_idx, centre in enumerate(cluster_centres):
            ax2.scatter(centre[0], centre[1], marker='$%d$' % centre_idx, alpha=1,
                        s=50, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    # Same title/else pattern as the other plotting helpers, so a missing
    # title no longer renders as "... for None".
    if title:
        plt.suptitle(("Silhouette analysis for %s" % (title)),
                     fontsize=14, fontweight='bold')
    else:
        plt.suptitle("Silhouette analysis", fontsize=14, fontweight='bold')

    if save_link:
        plt.savefig('{}/{} Silhouette Scores.png'.format(save_link, title))
    plt.show()

    return silhouette_avg

In [None]:
def labels_in_cluster(given_cluster, num_clusters, title=None, save_link=None):
    """Plot a heatmap of how often each position label occurs in each cluster.

    Reads two notebook-level globals: ``y_train`` (position strings, looked up
    by the same positional index as ``given_cluster``) and ``labels`` (array of
    distinct positions, one heatmap column each). A player listing several
    positions separated by ', ' adds one count to each of them.

    Parameters
    ----------
    given_cluster : array of cluster ids, one per training row.
    num_clusters : number of heatmap rows; ids 0 .. num_clusters-1 are counted.
    title : optional text appended to the plot title and used in the file name.
    save_link : optional folder; when set the figure is saved there as
        '<title> Labels within Clusters.png'.
    """
    positions_per_row = y_train.values
    counts_matrix = np.zeros(shape=(num_clusters, len(labels)), dtype=int)

    for cluster_id in range(num_clusters):
        member_rows = np.argwhere(given_cluster == cluster_id).flatten()
        for row in member_rows:
            # split(', ') yields the string itself when it holds one position.
            for position in positions_per_row[row].split(', '):
                column, = np.where(labels == position)
                counts_matrix[cluster_id][column] += 1

    plt.figure(figsize=(15,5))
    heading = "Labels within clusters"
    if title:
        heading = "Labels within clusters by {}".format(title)
    plt.title(heading)
    sns.heatmap(counts_matrix, annot=True, cmap='Blues', fmt="d")
    # Centre each tick on its heatmap column.
    plt.xticks([(k+0.5) for k in np.arange(len(labels))], labels=labels)

    if save_link:
        plt.savefig('{}/{} Labels within Clusters.png'.format(save_link, title))
    plt.show()

In [None]:
def cosine_matrix(samples, labels, title=None, save_link=None):
    """Plot the pairwise cosine-similarity matrix with samples grouped by cluster.

    Rows and columns are reordered so that samples sharing a cluster label are
    adjacent; axis ticks sit at the middle of each cluster's block.

    Parameters
    ----------
    samples : array-like, shape (n_samples, n_features)
    labels : array-like, shape (n_samples,)
        Cluster label of each sample (this parameter shadows the notebook-level
        ``labels`` array inside the function).
    title : optional text appended to the plot title and used in the file name.
    save_link : optional folder; when set the figure is saved there as
        '<title> Cosine Similarity.png'.
    """
    # Positional fancy indexing below needs an ndarray, not a DataFrame/list.
    samples = np.asarray(samples)

    unique_labels, counts = np.unique(labels, return_counts=True)
    # Middle of each cluster's block on the sorted axis:
    # (end offset of the block) - (half the block size).
    tick_loc = np.cumsum(counts) - counts / 2

    sortedInd = np.argsort(labels)

    plt.figure(figsize=(20,15))
    if title:
        plt.title("Cosine Matrix by Clusters from {}".format(title))
    else:
        plt.title("Cosine Matrix by Clusters")
    sns.heatmap(cosine_similarity(samples[sortedInd]), cmap='Blues')
    plt.yticks(tick_loc, labels=unique_labels)
    plt.xticks(tick_loc, labels=unique_labels, rotation='horizontal')

    if save_link:
        plt.savefig('{}/{} Cosine Similarity.png'.format(save_link, title))
    plt.show()

In [None]:
# Quick check of cosine_matrix on its own small KMeans fit. At this point in the
# notebook the `kmeans` variable of the KMeans section has not been assigned
# yet, so relying on it fails on a fresh kernel (Restart & Run All).
# n_clusters=2 matches the first value of the KMeans sweep below.
preview_kmeans = KMeans(n_clusters=2, random_state=42).fit(X_train)
cosine_matrix(X_train.values, preview_kmeans.labels_)

# Clustering Techniques

## KMeans

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Results table for the KMeans sweep (one row per cluster count).
kMeans_df = pd.DataFrame(columns=['Algorithm', 'Embeddings', 'Num Clusters', 'Silhouette Score'])
# Folder that receives the figures and results.csv of this section.
save_link = "KMeans/PCA"
# Create it up front: savefig / to_csv fail when the target folder is missing.
os.makedirs(save_link, exist_ok=True)

In [None]:
# Sweep KMeans over 2, 4, ..., 20 clusters; plot and score every run.
# DataFrame.append was removed in pandas 2.0, so the rows are collected in a
# list and the table is built once. Re-running the cell also no longer
# duplicates rows.
kmeans_records = []
for nc in range(2,21,2):
    title = "kMeans {} clusters".format(nc)
    kmeans = KMeans(n_clusters=nc, random_state=42).fit(X_train)
    score = silhouette_blob(X_train.values, kmeans.labels_, kmeans.cluster_centers_, title, save_link)
    kmeans_records.append({'Algorithm' : 'KMeans', 'Embeddings' : EMBEDDINGS,
                           'Num Clusters' : nc, 'Silhouette Score' : score})
    labels_in_cluster(kmeans.labels_, nc, title, save_link)
    cosine_matrix(X_train.values, kmeans.labels_, title, save_link)

# Keep the column layout declared when kMeans_df was created.
kMeans_df = pd.DataFrame(kmeans_records, columns=kMeans_df.columns)

In [None]:
kMeans_df

In [None]:
# Write the KMeans results table to results.csv inside the save_link folder.
kMeans_df.to_csv('{}/results.csv'.format(save_link))

## Agglomerative Clustering

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

In [None]:
def find_elbow(linkageArr, startSearch=1, title=None):
    """Pick a cluster count from the last 20 merges of a linkage matrix.

    Plots the merge distances (column 2 of ``linkageArr``) of the final 20
    merges, latest merge first, together with their second difference, then
    chooses the k at which the second difference is largest.

    Parameters
    ----------
    linkageArr : linkage matrix; only its last 20 rows, column 2, are read.
    startSearch : number of leading entries of the reversed second-difference
        curve to skip, so the smallest k that can be returned is
        ``startSearch + 2``.
    title : optional text appended to the plot title.

    Returns
    -------
    (k, max_dist) : the chosen cluster count and the merge distance stored at
        position ``k - 2`` of the reversed distance list.
    """
    tail = linkageArr[-20:, 2]
    tail_desc = tail[::-1]
    positions = np.arange(1, len(tail) + 1)
    # Second difference of the distances, reversed to line up with tail_desc.
    accel_desc = np.diff(tail, 2)[::-1]

    plt.title("Elbow search for {}".format(title) if title else "Elbow search")
    plt.plot(positions, tail_desc, label="Distances")
    plt.plot(positions[:-2] + 1, accel_desc, label="2nd Derviv")
    plt.legend(loc="best")
    plt.show()

    # Index 0 of the reversed curve corresponds to 2 clusters, hence the + 2.
    k = accel_desc[startSearch:].argmax() + 2 + startSearch
    max_dist = tail_desc[k - 2]
    print("clusters: {} at max_dist: {}".format(k, max_dist))

    return k, max_dist

In [None]:
def fancy_dendrogram(*args, **kwargs):
    """Call ``dendrogram`` and annotate the merge heights on the current plot.

    Two extra keyword arguments are consumed here and not forwarded:

    max_d : when truthy, a horizontal black line is drawn at this height and,
        unless the caller passed ``color_threshold``, it is also forwarded as
        the colour threshold.
    annotate_above : merges are marked and labelled with their height only
        when that height exceeds this value (default 0).

    All other arguments go to ``dendrogram`` unchanged. With ``no_plot=True``
    nothing is drawn. Returns the dictionary produced by ``dendrogram``.
    """
    cut_height = kwargs.pop('max_d', None)
    if cut_height and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = cut_height
    min_annotated = kwargs.pop('annotate_above', 0)

    ddata = dendrogram(*args, **kwargs)

    # Nothing to decorate when the caller asked for data only.
    if kwargs.get('no_plot', False):
        return ddata

    plt.xlabel('sample index or (cluster size)')
    plt.ylabel('distance')
    for xs, ys, colour in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
        # Midpoint of the link's two inner x coordinates, at the link's height.
        x = 0.5 * sum(xs[1:3])
        y = ys[1]
        if y > min_annotated:
            plt.plot(x, y, 'o', c=colour)
            plt.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                         textcoords='offset points',
                         va='top', ha='center')
    if cut_height:
        plt.axhline(y=cut_height, c='k')
    return ddata

def plot_dendogram(linkageArr, numClusters, maxDist=None, title=None):
    """Show a truncated dendrogram of ``linkageArr`` in its own figure.

    Parameters
    ----------
    linkageArr : linkage matrix forwarded to ``fancy_dendrogram``.
    numClusters : forwarded as ``p`` together with ``truncate_mode='lastp'``.
    maxDist : optional height forwarded as ``max_d`` (cut line / colour threshold).
    title : optional text appended to the figure title.

    Merge heights are annotated only above 10 (``annotate_above=10``).
    """
    heading = 'Hierarchical Clustering Dendrogram (truncated)'
    if title:
        heading = 'Hierarchical Clustering Dendrogram (truncated) for {}'.format(title)

    plt.figure(figsize=(15,7))
    plt.title(heading)

    dendrogram_options = dict(
        truncate_mode='lastp',
        p=numClusters,
        leaf_rotation=90.,
        leaf_font_size=12.,
        show_contracted=True,
        annotate_above=10,
        max_d=maxDist,
    )
    fancy_dendrogram(linkageArr, **dendrogram_options)
    plt.show()

In [None]:
# Results table for the agglomerative grid (one row per linkage/affinity/k run).
aggClustering_df = pd.DataFrame(columns=['Algorithm', 'Embeddings', 'Linkage', 'Affinity', 'Num Clusters', 'Silhouette Score'])
# Folder that receives the figures of this section.
save_link = "Agg_Clustering/PCA"
# Create it up front: savefig fails when the target folder is missing.
os.makedirs(save_link, exist_ok=True)

In [None]:
# Distance metrics and linkage strategies swept by the grid in the next cell.
# That grid skips every "ward" combination except ward + euclidean.
AFFINITIES = ["euclidean", "l1", "l2", "manhattan", "cosine"]
LINKAGE = ["ward", "complete", "average", "single"]

In [None]:
# Grid over linkage x distance metric x cluster count; plot and score each run.

# scikit-learn renamed AgglomerativeClustering's `affinity` argument to `metric`
# (added in 1.2, `affinity` removed in 1.4). Use whichever name the installed
# version accepts so the cell runs on both old and new releases.
metric_kwarg = (
    "metric"
    if "metric" in inspect.signature(AgglomerativeClustering).parameters
    else "affinity"
)

# DataFrame.append was removed in pandas 2.0: collect rows, build the table once.
agg_records = []
for link in LINKAGE:
    for aff in AFFINITIES:
        # Ward linkage is only run with euclidean distances.
        if link == "ward" and aff != "euclidean":
            continue
        for nc in range(2,21,2):
            title = "{}-{} {} clusters".format(link, aff, nc)
            hc = AgglomerativeClustering(n_clusters=nc, linkage=link, **{metric_kwarg: aff})
            hc.fit(X_train)
            score = silhouette_blob(X_train.values, hc.labels_, title=title, save_link=save_link)
            labels_in_cluster(hc.labels_, nc, title, save_link=save_link)
            cosine_matrix(X_train.values, hc.labels_, title, save_link)

            agg_records.append(
                {'Algorithm' : 'Agg_Clustering',
                 'Embeddings' : EMBEDDINGS,
                 'Linkage': link,
                 'Affinity': aff,
                 'Num Clusters' : nc,
                 'Silhouette Score' : score})

# Keep the column layout declared when aggClustering_df was created.
aggClustering_df = pd.DataFrame(agg_records, columns=aggClustering_df.columns)

In [None]:
aggClustering_df

In [None]:
aggClustering_df.sort_values('Silhouette Score', ascending=False)

### Single Link

In [None]:
# Linkage matrix of the training embeddings using the 'single' method;
# the next cells feed it to find_elbow and plot_dendogram.
title = "Single link"
Z_single = linkage(X_train, 'single')

In [None]:
# startSearch=7 skips the first 7 entries of the reversed second-difference
# curve, so the smallest k find_elbow can return here is 9.
k_single, maxDist_single = find_elbow(Z_single, 7, title=title)

In [None]:
# Pass k + 10 as the truncation size and place the cut line 0.1 below the
# distance returned by find_elbow.
plot_dendogram(Z_single, numClusters=int(k_single)+10, maxDist=maxDist_single-10**(-1), title=title)

In [None]:
# Single-linkage clustering with the k chosen by the elbow search above.
# Euclidean distance is the estimator's default, so no affinity/metric keyword
# is passed (that keyword's name differs between scikit-learn versions).
single_hc = AgglomerativeClustering(n_clusters=int(k_single), linkage='single')
single_hc.fit(X_train)
# The title must go by keyword: the third positional parameter of
# silhouette_blob is cluster_centres, not title.
silhouette_blob(X_train.values, single_hc.labels_, title="Single Link")
# Summarise the single-link assignment itself, not the leftover `kmeans` / `nc`
# from the KMeans sweep.
labels_in_cluster(single_hc.labels_, k_single, "Single Link")

### Complete Link

In [None]:
# Linkage matrix of the training embeddings using the 'complete' method;
# the next cells feed it to find_elbow and plot_dendogram.
title = "Complete link"
Z_complete = linkage(X_train, 'complete')

In [None]:
# Default startSearch=1, so the smallest k find_elbow can return here is 3.
k_complete, maxDist_complete= find_elbow(Z_complete, title=title)

In [None]:
# Pass k + 10 as the truncation size and place the cut line 0.1 below the
# distance returned by find_elbow.
plot_dendogram(Z_complete, numClusters=int(k_complete)+10, maxDist=maxDist_complete-10**(-1), title=title)

In [None]:
# KMeans run with the cluster count suggested by the complete-linkage elbow.
# NOTE(review): the clustering here is KMeans, not complete-linkage
# agglomerative clustering -- confirm that this is intended.
complete_kMeans = KMeans(n_clusters=k_complete, random_state=42)
complete_kMeans.fit(X_train)
labels_in_cluster(complete_kMeans.labels_, k_complete, title)

In [None]:
# Pass the section title so the figure is not headed "Silhouette analysis for None".
silhouette_blob(X_train.values, complete_kMeans.labels_, complete_kMeans.cluster_centers_, title=title)

### Average Link

In [None]:
# Linkage matrix of the training embeddings using the 'average' method;
# the next cells feed it to find_elbow and plot_dendogram.
title = "Average link"
Z_average = linkage(X_train, 'average')

In [None]:
# startSearch=5 skips the first 5 entries of the reversed second-difference
# curve, so the smallest k find_elbow can return here is 7.
k_average, maxDist_average= find_elbow(Z_average, startSearch=5, title=title)

In [None]:
# Pass k + 10 as the truncation size and place the cut line 0.05 below the
# distance returned by find_elbow.
plot_dendogram(Z_average, numClusters=int(k_average)+10, maxDist=maxDist_average-5*10**(-2), title=title)

In [None]:
# KMeans run with the cluster count suggested by the average-linkage elbow.
# NOTE(review): the clustering here is KMeans, not average-linkage
# agglomerative clustering -- confirm that this is intended.
average_kMeans = KMeans(n_clusters=k_average, random_state=42)
average_kMeans.fit(X_train)
labels_in_cluster(average_kMeans.labels_, k_average, title)

In [None]:
# Pass the section title so the figure is not headed "Silhouette analysis for None".
silhouette_blob(X_train.values, average_kMeans.labels_, average_kMeans.cluster_centers_, title=title)

### Weighted Link

In [None]:
# Linkage matrix of the training embeddings using the 'weighted' method;
# the next cells feed it to find_elbow and plot_dendogram.
title = "Weighted link"
Z_weighted = linkage(X_train, 'weighted')

In [None]:
# Default startSearch=1, so the smallest k find_elbow can return here is 3.
k_weighted, maxDist_weighted= find_elbow(Z_weighted, title=title)

In [None]:
# Pass k + 10 as the truncation size and place the cut line 0.05 below the
# distance returned by find_elbow.
plot_dendogram(Z_weighted, numClusters=int(k_weighted)+10, maxDist=maxDist_weighted-5*10**(-2), title=title)

In [None]:
# KMeans run with the cluster count suggested by the weighted-linkage elbow.
# NOTE(review): the clustering here is KMeans, not weighted-linkage
# agglomerative clustering -- confirm that this is intended.
weighted_kMeans = KMeans(n_clusters=k_weighted, random_state=42)
weighted_kMeans.fit(X_train)
labels_in_cluster(weighted_kMeans.labels_, k_weighted, title)

In [None]:
# Pass the section title so the figure is not headed "Silhouette analysis for None".
silhouette_blob(X_train.values, weighted_kMeans.labels_, weighted_kMeans.cluster_centers_, title=title)

### Ward Link

In [None]:
# Linkage matrix of the training embeddings using the 'ward' method;
# the next cells feed it to find_elbow and plot_dendogram.
title = "Ward link"
Z_ward = linkage(X_train, 'ward')

In [None]:
# Default startSearch=1, so the smallest k find_elbow can return here is 3.
k_ward, maxDist_ward= find_elbow(Z_ward, title=title)

In [None]:
# Pass k + 10 as the truncation size and place the cut line 5 below the
# distance returned by find_elbow (a larger offset than in the other sections).
plot_dendogram(Z_ward, numClusters=int(k_ward)+10, maxDist=maxDist_ward-5, title=title)

In [None]:
# `agg_with_kmeans` is not defined anywhere in this notebook, so the call raised
# NameError. Spell out the same steps the other linkage sections use: KMeans
# with the elbow's k, then the label heatmap and the silhouette plot.
ward_kMeans = KMeans(n_clusters=k_ward, random_state=42).fit(X_train)
labels_in_cluster(ward_kMeans.labels_, k_ward, title)
silhouette_blob(X_train.values, ward_kMeans.labels_, ward_kMeans.cluster_centers_, title=title)

## CURE

In [None]:
from pyclustering.cluster import cluster_visualizer
from pyclustering.cluster.cure import cure
from pyclustering.utils import read_sample

In [None]:
# Run CURE on the training embeddings with 20 as its cluster-count argument
# (the same 20 is passed to labels_in_cluster further down).
cure_instance = cure(X_train.values, 20)
cure_instance.process()
clusters = cure_instance.get_clusters()

In [None]:
len(clusters[0])

In [None]:
title = "CURE"
# Turn CURE's per-cluster index collections into one label per training row:
# every entry of `clusters` is used as a set of row indices.
given_clusters = np.zeros(len(X_train))
for cluster_id, member_rows in enumerate(clusters):
    given_clusters[member_rows] = cluster_id

In [None]:
# 20 is the number of heatmap rows; it has to match the cluster count passed to
# cure(...) above.
labels_in_cluster(given_clusters, 20, title)

In [None]:
# Pass the section title so the figure is not headed "Silhouette analysis for None".
silhouette_blob(X_train.values, given_clusters, np.array(cure_instance.get_means()), title=title)

# OG Dataset

In [None]:
# Load the cleaned (non-embedded) player table and list its columns.
og_df = pd.read_csv('../../datasets/cleaned_soccer_data_2016_v2.csv')
og_df.columns

In [None]:
# Keep the position strings aside, then drop them together with the listed
# index/id/date/name columns from the feature table.
og_pos = og_df['player_positions']
og_df = og_df.drop(
    columns=[
        "Unnamed: 0",
        "id",
        "player_fifa_api_id",
        "player_api_id",
        "date",
        "player_name",
        'player_positions',
    ]
)
og_df

In [None]:
# One-hot encode the positions: one 0/1 column per entry of `labels`.
# Each column is built in one pass instead of writing single cells with .loc,
# which is far slower on a large table and depends on og_df having a default
# 0..n-1 index. Positions that do not appear in `labels` are ignored rather than
# silently creating extra, mostly-NaN columns.
position_sets = [set(p.split(', ')) for p in og_pos.values]
for position in labels:
    og_df[position] = [int(position in held) for held in position_sets]

og_df

In [None]:
# Rescale every column of og_df with a default-configured MinMaxScaler;
# fit_transform returns the scaled values as `og_scaled`.
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
og_scaled = min_max_scaler.fit_transform(og_df)

In [None]:
# Draw up to 5000 random rows. A seeded generator makes the subset repeatable,
# and indexing (instead of shuffling in place) leaves og_scaled untouched, so
# re-running the cell gives the same sample.
rng = np.random.default_rng(42)
sample_rows = rng.choice(len(og_scaled), size=min(5000, len(og_scaled)), replace=False)
samples = og_scaled[sample_rows]
len(samples)

In [None]:
# KMeans on the sampled original features for a handful of cluster counts.
for nc in [4,6,8,10,11,12,14,16]:
    kmeans = KMeans(n_clusters=nc, random_state=42).fit(samples)
    # Give every figure a title; without one it is headed
    # "Silhouette analysis for None".
    silhouette_blob(samples, kmeans.labels_, kmeans.cluster_centers_,
                    title="kMeans {} clusters (original features)".format(nc))

In [None]:
# Fit a 20-component PCA on the scaled original features; the next cell reads
# its explained_variance_ratio_.
from sklearn.decomposition import PCA
pca = PCA(n_components=20)
pca.fit(og_scaled)

In [None]:
pca.explained_variance_ratio_