In [122]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from IPython.display import clear_output
import imageio

In [16]:
# Load the player table from a CSV file located in the working directory.
players = pd.read_csv('players_22.csv')

## Data exploration and cleaning

In [21]:
# Preview the first rows of the raw table.
players.head()

### Subsetting data and removing missing values

In [17]:
# Columns the clustering will be run on.
features = ['overall', 'potential', 'wage_eur', 'value_eur', 'age']
# Copy the subset so later steps never write through to `players`.
players_dropped = players[features].copy()
# Discard every row that is missing a value in any selected column.
players_dropped_na = players_dropped.dropna()

In [22]:
# Work on a separate copy so `players_dropped_na` stays untouched.
data = players_dropped_na.copy()

In [20]:
# Show the row counts before and after the NA filter.
print(f'Data size before dropping NAs: {players_dropped.shape[0]}')
print(f'Data size after dropping NAs: {players_dropped_na.shape[0]}')

### Scaling features
- We will use `min-max scaling`, where $$x_{scaled}=\frac{x-x_{min}}{x_{max}-x_{min}}$$
- We will then multiply that by 9, to rescale everything from 0 to 9
- Finally, add 1 so that everything is on a 1-10 scale.

In [25]:
# Min-max scale every column to [0, 1], stretch it to [0, 9], then shift to [1, 10].
# Values therefore start at 1, so the logarithm taken in `new_centroids` stays finite.
data = ((data - data.min())/(data.max() - data.min())) * 9 + 1

In [27]:
# Summary statistics; the min/max rows show the range each column spans after scaling.
data.describe()

In [28]:
# Preview the first rows of the scaled table.
data.head()

## K-Means implementation
1. Initialize random centroids
2. Assign each datapoint to centroid based on proximity
3. Calculate new centroid of each cluster
4. Repeat last two steps until locations of centroids do not change

### Initialize random centroids

In [102]:
def random_centroids(data, k, random_state=None):
    """Build ``k`` random starting centroids from the values present in ``data``.

    Every coordinate of a centroid is drawn independently from the matching
    column, so a centroid is generally not an existing row but always lies
    inside each feature's observed range.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations in rows, features in columns.
    k : int
        Number of centroids to generate.
    random_state : int, optional
        Seed that makes the draws reproducible. When omitted, pandas falls
        back to NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        One row per feature and one column per centroid (labelled 0..k-1).
    """
    # One generator shared by every draw: re-seeding each column with the same
    # integer would select the same row position for all of them.
    rng = None if random_state is None else np.random.RandomState(random_state)

    def draw_value(column):
        # Extract the scalar with .iloc[0]; calling float() directly on a
        # one-element Series is deprecated in recent pandas releases.
        return float(column.sample(random_state=rng).iloc[0])

    centroids = [data.apply(draw_value, axis=0) for _ in range(k)]
    return pd.concat(centroids, axis=1)

In [103]:
# Draw four starting centroids and display them (one column per centroid).
centroids = random_centroids(data, 4)
centroids

### Assign each datapoint to centroid based on proximity

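We will use the Euclidean distance, which in two dimensions is $$\sqrt{(x_2-x_1)^2+(y_2-y_1)^2}$$ and generalizes to any number of features by summing the squared differences over every feature.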

In [104]:
def get_labels(data, centroids):
    """Return, for every row of ``data``, the label of its closest centroid.

    ``centroids`` holds one centroid per column, indexed by feature name.
    Closeness is the Euclidean distance across all features.
    """
    def distance_to(centroid):
        # Row-wise distance from every observation to this single centroid.
        squared_diff = (data - centroid) ** 2
        return np.sqrt(squared_diff.sum(axis=1))

    # One distance column per centroid; the column holding the smallest
    # value in a row is that row's cluster label.
    distance_table = centroids.apply(distance_to)
    return distance_table.idxmin(axis=1)

In [120]:
# Assign every row to its closest centroid, then count the rows per cluster.
labels = get_labels(data, centroids)
labels.value_counts()

### Calculate new centroids of each cluster

Each new centroid is the geometric mean of the points in its cluster, computed feature by feature as the exponential of the mean of the logarithms.

In [111]:
def new_centroids(data, labels):
    """Recompute each centroid as the geometric mean of its cluster.

    Rows of ``data`` are grouped by ``labels``. The result has one row per
    feature and one column per cluster label.
    """
    def geometric_mean(cluster):
        # exp(mean(log(x))) evaluated column by column.
        return np.exp(np.log(cluster).mean())

    per_cluster = data.groupby(labels).apply(geometric_mean)
    # Transpose so that centroids are columns.
    return per_cluster.T

### Iterative loop

In [129]:
def plot_clusters(data, labels, centroids, iteration):
    """Redraw the current clustering as a 2-D scatter plot.

    A two-component PCA is fitted on ``data``; the same projection is then
    applied to the centroids. The previous cell output is cleared first so
    that successive calls replace the figure instead of stacking new ones.
    """
    projector = PCA(n_components=2)
    points_2d = projector.fit_transform(data)
    # Centroids are stored one per column, so transpose them into rows.
    centers_2d = projector.transform(centroids.T)

    clear_output(wait=True)
    plt.title(f'Iteration {iteration}')

    # Observations, coloured by their cluster label.
    point_x, point_y = points_2d[:, 0], points_2d[:, 1]
    plt.scatter(x=point_x, y=point_y, c=labels)

    # Centroids drawn on top of the observations.
    center_x, center_y = centers_2d[:, 0], centers_2d[:, 1]
    plt.scatter(x=center_x, y=center_y)

    plt.show()

In [130]:
# Upper bound on the number of assignment/update passes, and cluster count.
max_iterations = 100
k = 3

centroids = random_centroids(data, k)
# Empty placeholder so the very first convergence check cannot succeed.
old_centroids = pd.DataFrame()
iteration = 1

# `iteration` is 1-based, so `<=` is needed to really allow `max_iterations`
# passes. The loop ends early once an update leaves the centroids unchanged.
while iteration <= max_iterations and not old_centroids.equals(centroids):
    old_centroids = centroids

    labels = get_labels(data, centroids)
    centroids = new_centroids(data, labels)
    plot_clusters(data, labels, centroids, iteration)
    iteration += 1