In [1]:
import numpy as np
import pandas as pd

# Implémentation de l'algorithme k-modes

### Simulation de données

Il doit exister deux groupes : 
 * Les individus vivant à Marseille qui aiment le foot
 * Les individus vivant à New York qui aiment le basket
 * Le salaire, l'expérience sont uniformément distribués

In [17]:
# Parametres de la simulation
#
salaires_choices = np.array(["riche","moyen","pauvre"])
experience_choices = np.array(["expert","moyen","nul"])
hobby_choices = np.array(["foot","basket","tennis"])
location_choices = np.array(["Marseille","New York","Mexico"])
#
p_location = np.array([0.4,0.4,0.2])
p_marseille_hobby = np.array([0.8,0.1,0.1])
p_newyork_hobby = np.array([0.1,0.8,0.1])
p_experience = np.array([0.1,0.3,0.6])
p_salaire = np.array([0.1,0.3,0.6])

In [18]:
# Construction du dataframe I
data_ville = np.random.choice(location_choices,size = 100, p = p_location)
data_experience = np.random.choice(experience_choices,size = 100, p = p_experience)
data_salaire = np.random.choice(salaires_choices,size = 100, p = p_salaire)
dic = {"ville" : data_ville,"experience" : data_experience,"salaire" : data_salaire}
ville_pop = pd.Series(data_ville).value_counts()
df = pd.DataFrame.from_dict(data = dic, orient = "columns").sort_values(["ville"])
df.head()

In [19]:
# Construction du dataframe II
hobby_marseille = np.random.choice(hobby_choices,
                                   size = ville_pop["Marseille"],
                                   p = p_marseille_hobby)
hobby_mexico = np.random.choice(hobby_choices,size = ville_pop["Mexico"])
hobby_newyork = np.random.choice(hobby_choices,
                                   size = ville_pop["New York"],
                                   p = p_newyork_hobby)
df["hobby"] = np.hstack((hobby_marseille,hobby_mexico,hobby_newyork))
df = pd.get_dummies(df)
df.head()

### Les fonctions de l'implémenation k-modes quick and dirty

In [68]:
def pickup_centroids(df,k):
    centroids_idx = np.random.choice(a = df.index.values, replace = False, size = k)
    centroid_a = df.loc[centroids_idx[0]].values
    centroid_b = df.loc[centroids_idx[1]].values
    return(centroid_a,centroid_b)

In [69]:
def distance_mismatch(a,b):
    return (a != b).sum()

In [79]:
def compute_distances_to_centroids(centroid_a,centroid_b,df):
    
    dic_distances = {}
    for idx,row in df.iterrows():
        candidat = row.values
        distance_to_a = distance_mismatch(candidat,centroid_a)
        distance_to_b = distance_mismatch(candidat,centroid_b)
        affectation = np.argmin([distance_to_a,distance_to_b])
        dic_distances[idx] = {"distance_to_a" : distance_to_a,
                              "distance_to_b" : distance_to_b,
                              "affectation" : affectation}
        
    return dic_distances

In [90]:
def extract_assigned_data(dic_distances,df):
    alist = []
    blist = []
    for k,v in dic_distances.items():
        if v["affectation"] == 0:
            alist.append(df.loc[k].values)
        else:
            blist.append(df.loc[k].values)
    return np.vstack(alist),np.vstack(blist)

In [108]:
def compute_mode(array):
    return ((np.sum(array,axis = 0) - array.shape[0]/2) > 0).astype(int)

In [None]:
def performance(dic_distances):
    for k,v in dic_distances.items():
        if v["affectation"] == 0:
            distances_list.append(v[distance_to_a])
        else:
            distances_list.append(v[distance_to_b])
    return (np.array(distances_list)).sum()

In [184]:
def kmodes(df,k=2,threshold=1,iterations=10, verbose = True, n_clusterings = 5):
    
    # Initi centroids
    centroid_a,centroid_b = pickup_centroids(df,k)
    
    for i in range(iterations):
        if verbose:
            print("iteration : ",i)
            print(df.columns.values[centroid_a.astype(bool)])
            print(df.columns.values[centroid_b.astype(bool)])
        dic_distances = compute_distances_to_centroids(centroid_a,centroid_b,df)
        array_a,array_b = extract_assigned_data(dic_distances,df)
        
        if len(array_a)==0 or len(array_b)==0:
            continue

        futur_centroid_a = compute_mode(array_a)
        futur_centroid_b = compute_mode(array_b)
        d = distance_mismatch(futur_centroid_a,centroid_a) + distance_mismatch(futur_centroid_b,centroid_b)
        if verbose:
            print("distance parcourue : ", d)
            print("")

        if d<threshold:
            break
        centroid_a = futur_centroid_a
        centroid_b = futur_centroid_b

    clustering_df = pd.DataFrame.from_dict(dic_distances,orient="index")
    clustering_df.drop(['distance_to_a',"distance_to_b"], axis=1, inplace = True) 
    stats_value_cluster_a = np.sum(array_a,axis = 0) / array_a.shape[0]
    cluster_a_caracterisation = stats_value_cluster_a / (df.sum(axis = 0) / len(df))
    stats_value_cluster_b = np.sum(array_b,axis = 0) / array_b.shape[0]
    cluster_b_caracterisation = stats_value_cluster_b / (df.sum(axis = 0) / len(df))
    clusters_caracterisation = pd.concat([cluster_a_caracterisation,cluster_b_caracterisation],axis = 1)

    return clustering_df,clusters_caracterisation
    

### Test de la fonction kmodes

In [186]:
solution_clustering = kmodes(df)
solution_clustering[1]

iteration :  0
['ville_Marseille' 'experience_nul' 'salaire_pauvre' 'hobby_foot']
['ville_Marseille' 'experience_nul' 'salaire_moyen' 'hobby_basket']
distance parcourue :  4

iteration :  1
['experience_nul' 'salaire_pauvre' 'hobby_foot']
['ville_New York' 'salaire_moyen' 'hobby_basket']
distance parcourue :  1

iteration :  2
['ville_Marseille' 'experience_nul' 'salaire_pauvre' 'hobby_foot']
['ville_New York' 'salaire_moyen' 'hobby_basket']
distance parcourue :  2

iteration :  3
['ville_Marseille' 'experience_nul' 'salaire_pauvre' 'hobby_foot']
['ville_New York' 'salaire_pauvre' 'hobby_basket']
distance parcourue :  0



Unnamed: 0,0,1
ville_Marseille,1.971977,0.138058
ville_Mexico,1.241135,0.786164
ville_New York,0.090539,1.806503
experience_expert,0.490998,1.451379
experience_moyen,1.033435,0.97035
experience_nul,1.104746,0.907112
salaire_moyen,0.892244,1.095557
salaire_pauvre,0.990462,1.008458
salaire_riche,1.353965,0.686106
hobby_basket,0.130265,1.771275


{'distance_to_a': 1, 'distance_to_b': 8, 'affectation': 0}

In [178]:
stats_value_cluster = np.sum(array_a,axis = 0) / array_a.shape[0]
stats_label = df.columns.values
clusters_odds_ratio_representation = stats_value_cluster / (df.sum(axis = 0) / len(df))

In [179]:
clusters_odds_ratio_representation

ville_Marseille      1.897019
ville_Mexico         1.481481
ville_New York       0.094563
experience_expert    0.512821
experience_moyen     1.523810
experience_nul       0.769231
salaire_moyen        1.218638
salaire_pauvre       0.766284
salaire_riche        1.616162
hobby_basket         0.181406
hobby_foot           2.108262
hobby_tennis         0.740741
dtype: float64

ville_Marseille      0.41
ville_Mexico         0.12
ville_New York       0.47
experience_expert    0.13
experience_moyen     0.35
experience_nul       0.52
salaire_moyen        0.31
salaire_pauvre       0.58
salaire_riche        0.11
hobby_basket         0.49
hobby_foot           0.39
hobby_tennis         0.12
dtype: float64

In [165]:
df.sum(axis = 0)

ville_Marseille      41
ville_Mexico         12
ville_New York       47
experience_expert    13
experience_moyen     35
experience_nul       52
salaire_moyen        31
salaire_pauvre       58
salaire_riche        11
hobby_basket         49
hobby_foot           39
hobby_tennis         12
dtype: int64

In [159]:
pd.Series(stats_value,index = stats_label)

ville_Marseille      0.777778
ville_Mexico         0.177778
ville_New York       0.044444
experience_expert    0.066667
experience_moyen     0.533333
experience_nul       0.400000
salaire_moyen        0.377778
salaire_pauvre       0.444444
salaire_riche        0.177778
hobby_basket         0.088889
hobby_foot           0.822222
hobby_tennis         0.088889
dtype: float64