In [None]:
import numpy as np
from sklearn.cluster import KMeans

def kmeans_missing(X, n_clusters, max_iter=10, random_state=None):
    """Perform K-Means clustering on data with missing values.

    Missing entries (NaN or +/-inf) are first filled with the mean of the
    finite values of their column. K-Means is then fitted repeatedly; after
    every fit each missing entry is overwritten with the matching coordinate
    of the centroid its row was assigned to. The loop stops as soon as the
    labels no longer change, or after ``max_iter`` fits.

    Args:
      X: An [n_samples, n_features] array-like of data to cluster. It is
        converted to a float array and is not modified.
      n_clusters: Number of clusters to form.
      max_iter: Maximum number of EM iterations to perform (at least 1).
      random_state: Optional seed forwarded to ``KMeans`` so that the random
        initialisations of the first fit are reproducible.

    Returns:
      labels: An [n_samples] vector of integer labels.
      centroids: An [n_clusters, n_features] array of cluster centroids.
      X_hat: Copy of X with the missing values filled in.

    Raises:
      ValueError: If ``max_iter`` is smaller than 1, or if a column contains
        no finite value at all (its mean is undefined).

    From: http://stackoverflow.com/questions/35611465/python-scikit-learn-clustering-with-missing-data
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1, got %r" % (max_iter,))

    X = np.asarray(X, dtype=float)
    missing = ~np.isfinite(X)

    if missing.all(axis=0).any():
        raise ValueError("every column of X needs at least one finite value")

    # Initialize missing values to their column means. Non-finite entries are
    # masked to NaN first: nanmean skips NaN but not +/-inf, so a single inf
    # would otherwise turn the whole column mean into inf/NaN.
    mu = np.nanmean(np.where(missing, np.nan, X), axis=0, keepdims=True)
    X_hat = np.where(missing, mu, X)

    prev_labels = None
    prev_centroids = None

    for _ in range(max_iter):
        if prev_centroids is None:
            # first pass: many random initializations, keep the best one.
            # (the former n_jobs argument was removed in scikit-learn 1.0)
            cls = KMeans(n_clusters, n_init=100, random_state=random_state)
        else:
            # initialize KMeans with the previous set of centroids. this is
            # much faster and makes it easier to check convergence (since
            # labels won't be permuted on every iteration), but might be more
            # prone to getting stuck in local minima. With explicit centers
            # only a single run is meaningful, hence n_init=1.
            cls = KMeans(n_clusters, init=prev_centroids, n_init=1,
                         random_state=random_state)

        # perform clustering on the filled-in data
        labels = cls.fit_predict(X_hat)
        centroids = cls.cluster_centers_

        # fill in the missing values based on their cluster centroids
        X_hat[missing] = centroids[labels][missing]

        # when the labels have stopped changing then we have converged
        if prev_labels is not None and np.array_equal(labels, prev_labels):
            break

        prev_labels = labels
        prev_centroids = centroids

    return labels, centroids, X_hat

In [None]:
conda install tensorflow
pip install fancyimpute

In [None]:
# Work on a copy so that df_fixed itself is left untouched
df_kmeans = df_fixed.copy()

# Pull the selected columns out as a plain NumPy array
X = np.array(df_kmeans.loc[:, cols])

# Cluster the rows into 9 groups; the missing entries are imputed along the way
labels, centroids, kmeans_imputed = kmeans_missing(
    X,
    n_clusters=9,
    max_iter=100,
)

# Write the (now imputed) values back into the copied frame
df_kmeans.loc[:, cols] = kmeans_imputed

# Store the cluster index of every row in a new "label" column
df_kmeans["label"] = labels
df_kmeans.head()

In [None]:
# Compare the two imputations against the original values, using only the
# rows whose LABOUR_PRODUCTIVITY is NaN in df_fixed.

# Boolean mask selecting the rows with a missing productivity value
values = np.isnan(df_fixed["LABOUR_PRODUCTIVITY"])

# x axis: values from df_original; y axis: values from each imputed frame
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(
    df_original.loc[values, ["LABOUR_PRODUCTIVITY"]],
    df_kmeans.loc[values, ["LABOUR_PRODUCTIVITY"]],
    label="KMEANS",
)
ax.scatter(
    df_original.loc[values, ["LABOUR_PRODUCTIVITY"]],
    mean_inputed.loc[values, ["LABOUR_PRODUCTIVITY"]],
    color="r",
    label="MEAN",
)

# Diagonal reference: points on it were imputed exactly
ax.plot([0.75, 1.19], [0.75, 1.19], label="Perfect match near this line")

ax.set_xlim((0.75, 1.2))
ax.set_ylim((0.75, 1.2))
ax.legend(loc=0)
ax.set_xlabel("Original productivity")
ax.set_ylabel("Imputed productivity")
fig.savefig("figures/kmeans_vs_mean_worst_case.png")