<a href="https://colab.research.google.com/github/josemage16/JDisplay/blob/main/01_Segmentacion_de_clientes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Segmentación de clientes usando K-means

**Autor:** Roberto Muñoz <br />
**Github:** <https://github.com/rpmunoz> <br />

Este notebook muestra cómo entrenar un modelo no supervisado para identificar los diferentes tipos de clientes de una empresa. Se muestra la lectura de los datos, el análisis exploratorio, el análisis de correlación y la construcción de un modelo de clustering usando el método de K-means.

Este notebook se apoya en múltiples librerías estándar de Python:

- numpy
- pandas
- matplotlib
- seaborn
- scikit-learn

Este dataset corresponde a las ventas realizadas durante el año 2019 por una fábrica de bebidas en Chile. Las columnas del archivo son las siguientes:

- ID cliente: Identificación única del cliente
- Volumen medio anual: Número total de cajas vendidas durante el año 2019
- Precio promedio por caja: Valor en pesos chilenos de una caja
- Dias de retraso: Número de días transcurridos entre la emisión de la orden de compra y el pago del cliente
- Periodos con compra promedio: Número de meses en el año en que el cliente realizó compras

In [1]:
import os
import requests
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

plt.rcParams.update({'font.size': 16})
pd.set_option('display.max_columns', None)

## 1. Lectura de los datos desde un archivo Excel a un DataFrame en Pandas

In [2]:
# Working folders: downloaded input goes in dataDir, generated figures in resultsDir.
dataDir = 'data'
resultsDir = 'results'

# makedirs(exist_ok=True) creates a folder only when it is missing, so the cell
# can be re-run safely and there is no gap between "check" and "create".
for folder in (dataDir, resultsDir):
    os.makedirs(folder, exist_ok=True)

In [None]:
dataURL = 'https://rmunoz-public.s3.amazonaws.com/ml/Base_de_clientes.xlsx'
# Build the target path from dataDir so the download always lands in the
# folder created for input data.
dataFile = os.path.join(dataDir, 'Base_de_clientes.xlsx')

# Download the workbook. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() aborts on an HTTP error instead
# of saving the error page as if it were the spreadsheet.
r = requests.get(dataURL, allow_redirects=True, timeout=60)
r.raise_for_status()

# The context manager flushes and closes the file before pandas reads it.
with open(dataFile, 'wb') as f:
    f.write(r.content)

# The first row of the sheet holds the column names.
dataDF = pd.read_excel(dataFile, header=0)
dataDF.head()

In [None]:
# Number of rows in the loaded table.
print(dataDF.shape[0])

In [None]:
figFile = os.path.join(resultsDir, "pairwise_plot_ALL.png")

# Seaborn look used for the grid of pairwise plots.
sns.set_style('darkgrid')
sns.set_context("talk")
g = sns.pairplot(dataDF)

# Read the x labels from the bottom row of panels and the y labels from the
# first column of panels.
xlabels = [panel.xaxis.get_label_text() for panel in g.axes[-1, :]]
ylabels = [panel.yaxis.get_label_text() for panel in g.axes[:, 0]]

# Write each column's x label on every panel of that column.
for col, text in enumerate(xlabels):
    for row in range(len(ylabels)):
        g.axes[row, col].xaxis.set_label_text(text)

print(xlabels)
print(ylabels)

# Adjust the margins and spacing, set the canvas size, then save the
# current figure to disk.
fig = g.fig
fig.subplots_adjust(top=0.98, bottom=0.05, right=0.98, wspace=0.1, hspace=0.1)
fig.set_size_inches(16, 16)

plt.savefig(figFile)

## 2. Análisis de correlación

In [None]:
figFile = os.path.join(resultsDir, "correlation_plot_ALL.png")

fig, ax = plt.subplots(figsize=(16,16))
sns.set_context("talk", rc={"font.size":18,"axes.titlesize":18,"axes.labelsize":24})

# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this cell working on pandas >= 2.0, where DataFrame.corr() no longer
# drops non-numeric columns silently and raises instead.
corrDF = dataDF.select_dtypes(include="number").corr()

# Annotated heatmap of the pairwise correlation coefficients.
g = sns.heatmap(corrDF, annot=True, annot_kws={"fontsize":24}, ax=ax)
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.tight_layout()
plt.savefig(figFile)

In [None]:
sns.set_context("talk", rc={"font.size":20,"axes.titlesize":20,"axes.labelsize":25})
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, while the keyword form also works on older releases.
g = sns.lmplot(x='Periodos con Compra promedio', y='Volumen Medio Anual', data=dataDF)
g.fig.set_size_inches(16,12)
plt.title('Volumen medio anual vs Periodos con compra')

plt.tight_layout()

## 3. Análisis de clustering usando el método K-means

In [None]:
import matplotlib.cm as cm
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin
from sklearn.metrics import silhouette_samples, silhouette_score

In [None]:
def plot_elbow(X, max_clusters=9, random_state=None):
    """Plot the K-means inertia for k = 1 .. max_clusters (elbow method).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster.
    max_clusters : int, default 9
        Largest number of clusters to try. It is capped at the number of
        samples, because K-means cannot fit more clusters than samples.
    random_state : int or None, default None
        Seed forwarded to ``KMeans``; pass an int to get the same curve on
        every run.

    Returns
    -------
    None
        The curve is drawn on a new matplotlib figure and shown.
    """
    largest_k = min(max_clusters, len(X))
    ks = list(range(1, largest_k + 1))

    # n_init is pinned to 10: recent scikit-learn releases warn when it is
    # left unset and later changed the default, which would alter the curve.
    inertias = [
        KMeans(n_clusters=k, max_iter=1000, n_init=10,
               random_state=random_state).fit(X).inertia_
        for k in ks
    ]

    plt.figure(figsize=(16,9))
    plt.plot(ks, inertias, marker='o')
    plt.xlabel("Number of cluster")
    plt.ylabel("Inertia (within-cluster sum of squares)")
    plt.show()

In [None]:
def find_clusters(X, n_clusters, rseed=2):
    """Cluster ``X`` with a plain implementation of Lloyd's K-means algorithm.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster; it is converted with ``np.asarray`` so a DataFrame
        works as well as an ndarray.
    n_clusters : int
        Number of clusters; must satisfy ``1 <= n_clusters <= n_samples``.
    rseed : int, default 2
        Seed of the random generator that picks the initial centers.

    Returns
    -------
    centers : ndarray of shape (n_clusters, n_features)
        Final cluster centers.
    labels : ndarray of shape (n_samples,)
        Index of the closest center for every sample.

    Raises
    ------
    ValueError
        If ``n_clusters`` is outside ``[1, n_samples]``.
    """
    X = np.asarray(X)
    n_samples = X.shape[0]
    if not 1 <= n_clusters <= n_samples:
        raise ValueError(
            "n_clusters must be between 1 and the number of samples "
            "(%d), got %r" % (n_samples, n_clusters))

    # 1. Use n_clusters distinct, randomly chosen samples as initial centers
    rng = np.random.RandomState(rseed)
    seed_rows = rng.permutation(n_samples)[:n_clusters]
    centers = X[seed_rows]

    while True:
        # 2a. Assign every sample to its closest center
        labels = pairwise_distances_argmin(X, centers)

        # 2b. Move every center to the mean of its members
        new_centers = []
        for k in range(n_clusters):
            members = X[labels == k]
            # An empty cluster has no mean (it would be NaN and break the
            # next distance computation), so its center stays where it is.
            new_centers.append(members.mean(axis=0) if len(members) else centers[k])
        new_centers = np.array(new_centers)

        # 2c. Stop once the centers no longer move
        if np.array_equal(centers, new_centers):
            break
        centers = new_centers

    return centers, labels

In [None]:
def _draw_silhouette_panel(ax, sample_values, cluster_labels, n_clusters, silhouette_avg):
    """Draw one horizontal band of sorted silhouette values per cluster on ``ax``."""
    # The silhouette coefficient lies in [-1, 1]; only [-0.1, 1] is displayed.
    ax.set_xlim([-0.1, 1])
    # (n_clusters + 1) * 10 leaves a blank gap of 10 rows between the bands
    # of consecutive clusters so they are clearly separated.
    ax.set_ylim([0, len(sample_values) + (n_clusters + 1) * 10])

    y_lower = 10
    for k in range(n_clusters):
        # Sorted silhouette values of the samples assigned to cluster k
        band = np.sort(sample_values[cluster_labels == k])
        band_size = band.shape[0]
        y_upper = y_lower + band_size

        color = cm.nipy_spectral(float(k) / n_clusters)
        ax.fill_betweenx(np.arange(y_lower, y_upper),
                         0, band,
                         facecolor=color, edgecolor=color, alpha=0.7)

        # Cluster number, written at the vertical middle of its band
        ax.text(-0.05, y_lower + 0.5 * band_size, str(k))

        # The next band starts 10 rows above this one
        y_lower = y_upper + 10

    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")

    # Vertical line at the average silhouette score of all samples
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax.set_yticks([])  # Clear the y-axis labels / ticks
    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])


def _draw_cluster_panel(ax, X, centers, cluster_labels, n_clusters):
    """Scatter the first two features of ``X`` coloured by cluster, plus the centers."""
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
               c=colors, edgecolor='k')

    # White circles at the cluster centers ...
    ax.scatter(centers[:, 0], centers[:, 1], marker='o',
               c="white", alpha=1, s=200, edgecolor='k')

    # ... with the cluster number drawn inside each circle
    for k, center in enumerate(centers):
        ax.scatter(center[0], center[1], marker='$%d$' % k, alpha=1,
                   s=50, edgecolor='k')

    ax.set_title("The visualization of the clustered data.")
    ax.set_xlabel("Feature space for the 1st feature")
    ax.set_ylabel("Feature space for the 2nd feature")


def plot_silhouette(X, cluster_range=range(2, 10), random_state=10):
    """Run a silhouette analysis of K-means for several numbers of clusters.

    For every value in ``cluster_range`` a figure with two panels is created:
    the per-sample silhouette values grouped by cluster, and a scatter plot of
    the first two features coloured by cluster. A final figure shows the
    average silhouette score against the number of clusters.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster; converted with ``np.asarray``. At least two features
        are needed for the scatter panel.
    cluster_range : iterable of int, default range(2, 10)
        Numbers of clusters to evaluate.
    random_state : int, default 10
        Seed forwarded to ``KMeans`` for reproducibility.

    Returns
    -------
    None
        The average score of every run is printed and the figures are shown.
    """
    X = np.asarray(X)
    avg_scores = {}

    for n_clusters in cluster_range:
        # One row, two columns: silhouette bands on the left, scatter on the right
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # n_init is pinned to 10: recent scikit-learn releases warn when it is
        # left unset and later changed the default, which would alter results.
        clusterer = KMeans(n_clusters=n_clusters, n_init=10,
                           random_state=random_state)
        cluster_labels = clusterer.fit_predict(X)

        # Average silhouette value over all samples
        silhouette_avg = silhouette_score(X, cluster_labels)
        avg_scores[n_clusters] = silhouette_avg
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Silhouette value of every individual sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        _draw_silhouette_panel(ax1, sample_silhouette_values, cluster_labels,
                               n_clusters, silhouette_avg)
        _draw_cluster_panel(ax2, X, clusterer.cluster_centers_,
                            cluster_labels, n_clusters)

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')

    plt.show()

    # Summary: average silhouette score for each number of clusters tried
    plt.figure(figsize=(16,9))
    plt.plot(list(avg_scores.keys()), list(avg_scores.values()), marker='o')
    plt.xlabel("Number of cluster")
    plt.ylabel("Average silhouette score")
    plt.show()

In [None]:
# Inspect the column labels of dataDF (shown as the cell output in a notebook).
dataDF.columns

In [None]:
# Positional indices of the columns used as clustering features (column 0 is left out).
maskColumns = [1,2,3,4]
# .copy() gives X its own data. Without it X is derived from a slice of dataDF
# and adding the 'cluster' column to it later can trigger pandas'
# SettingWithCopyWarning.
X = dataDF.iloc[:, maskColumns].copy()

# Standardize every feature to zero mean and unit variance so that columns
# measured on large scales do not dominate the distances used by K-means.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

## Gráfico de codo

In [None]:
# Elbow curve computed on the standardized features.
plot_elbow(X_std)

## Gráfico de silueta

In [None]:
# Silhouette analysis computed on the standardized features.
plot_silhouette(X_std)

## ¿Cuál es el número óptimo de segmentos? 

Lea este artículo y defina cuál es el número óptimo de clusters: https://towardsdatascience.com/silhouette-method-better-than-elbow-method-to-find-optimal-clusters-378d62ff6891

Reemplace la variable nClusters con el número óptimo de clusters



In [None]:
# Placeholder value: replace it with the number of clusters chosen from the
# elbow and silhouette plots.
nClusters = 1
# One colour per cluster, reused by every plot that is coloured by cluster.
current_palette = sns.color_palette("tab10", n_colors=nClusters)

# Cluster on the standardized features ...
centers, labels = find_clusters(X_std, nClusters)
# ... and attach the labels to the feature table. assign() returns a new
# DataFrame, which avoids SettingWithCopyWarning (X comes from a slice of
# dataDF) and simply overwrites the column when the cell is re-run.
X = X.assign(cluster=labels)

In [None]:
figFile = os.path.join(resultsDir, "pairwise_plot_CLUSTERS.png")

sns.set_style('darkgrid')
sns.set_context("talk")

# Pairwise plots of the four features, coloured by cluster.
g = sns.pairplot(X, hue='cluster',  vars=['Volumen Medio Anual', 'Precio Promedio por caja',
       'Dias de Retraso', 'Periodos con Compra promedio'],
       palette=current_palette, plot_kws={"alpha":1.0})

# Replace the legend seaborn draws with a framed one at the right edge of the
# figure. Dedicated names are used so the global `labels` (the cluster
# assignment of every client) is not overwritten by the legend entries.
# NOTE(review): _legend and _legend_data are private seaborn attributes.
g._legend.remove()
legend_handles = list(g._legend_data.values())
legend_labels = list(g._legend_data.keys())
g.fig.legend(handles=legend_handles, labels=legend_labels, loc='center right', ncol=1, frameon=True)

# Read the x labels from the bottom row of panels and the y labels from the
# first column of panels.
xlabels = [panel.xaxis.get_label_text() for panel in g.axes[-1, :]]
ylabels = [panel.yaxis.get_label_text() for panel in g.axes[:, 0]]

# Write each column's x label on every panel of that column.
for col, text in enumerate(xlabels):
    for row in range(len(ylabels)):
        g.axes[row, col].xaxis.set_label_text(text)

fig = g.fig
fig.subplots_adjust(top=0.98, bottom=0.05, right=0.98, wspace=0.1, hspace=0.1)

fig.set_size_inches(16,16)
plt.savefig(figFile)


In [None]:
sns.set_style('darkgrid')
sns.set_context("talk")

# --- Density of single features, one curve per cluster -----------------------
g = sns.FacetGrid(X, hue="cluster")
g.fig.set_size_inches(12,9)
g = g.map(sns.kdeplot, "Volumen Medio Anual")
plt.tight_layout()
plt.show()

g = sns.FacetGrid(X, hue="cluster")
g.fig.set_size_inches(12,9)
g = g.map(sns.kdeplot, "Precio Promedio por caja", linewidth=4)
plt.tight_layout()
figFile = os.path.join(resultsDir, "histogram_precio_promedio_caja.png")
plt.savefig(figFile)
plt.show()

g = sns.FacetGrid(X, hue="cluster")
g.fig.set_size_inches(8,6)
g = g.map(sns.kdeplot, "Periodos con Compra promedio")
plt.show()

# --- Scatter plots of feature pairs, coloured by cluster ---------------------
# x and y are passed by keyword in every scatterplot call: seaborn 0.12+ no
# longer accepts them positionally, while the keyword form also works on
# older releases.
fig, ax = plt.subplots(figsize=(12,9))
g = sns.scatterplot(x='Precio Promedio por caja', y='Volumen Medio Anual', data=X,
                    hue='cluster', s=80, palette=current_palette, ax=ax)
plt.tight_layout()
figFile = os.path.join(resultsDir, "scatter_precio_promedio_volumen_medio.png")
plt.savefig(figFile)
plt.show()

fig, ax = plt.subplots(figsize=(12,9))
g = sns.scatterplot(x='Periodos con Compra promedio', y='Volumen Medio Anual', data=X,
                    hue='cluster', s=80, palette=current_palette, ax=ax)
figFile = os.path.join(resultsDir, "scatter_periodos_compra_volumen_medio.png")
plt.savefig(figFile)
plt.show()


# The last two plots place the legend to the right of the axes.
fig, ax = plt.subplots(figsize=(8,6))
g = sns.scatterplot(x='Dias de Retraso', y='Volumen Medio Anual', data=X,
                    hue='cluster', s=50, palette=current_palette, ax=ax)
g.legend(loc='center left', bbox_to_anchor=(1.1, 0.5), ncol=1)
plt.show()

fig, ax = plt.subplots(figsize=(8,6))
g = sns.scatterplot(x='Periodos con Compra promedio', y='Precio Promedio por caja', data=X,
                    hue='cluster', s=50, palette=current_palette, ax=ax)
g.legend(loc='center left', bbox_to_anchor=(1.1, 0.5), ncol=1)
plt.show()

## 4. Interpretación de resultados

In [None]:
# Build one summary row per cluster (segment).
summaryList = []
for name, group in X.groupby("cluster"):
    tempMean = group.mean()

    # Sales of each client = boxes sold x price per box. The segment total is
    # the sum over its clients and the per-client figure is their mean.
    # (Previously the product of the two segment means was reported as the
    # total, and that value divided by the client count as the average.)
    clientSales = group['Volumen Medio Anual'] * group['Precio Promedio por caja']

    tempDict = {"Segmento": name, "N clientes": len(group),
                "Venta promedio por cliente": np.round(clientSales.mean()),
                "Periodos con compra": tempMean['Periodos con Compra promedio'],
                "Precio promedio por caja": tempMean['Precio Promedio por caja'],
                "Volumen cajas promedio": tempMean['Volumen Medio Anual'],
                "Ventas anuales": np.round(clientSales.sum())}
    summaryList.append(tempDict)

    print("\nGroup name: ", name)
    print("Numero de clientes: ", len(group))
    # Column means of the segment (already computed above, so reuse them)
    print(tempMean)

summaryDF = pd.DataFrame(summaryList)
print(summaryDF)


In [None]:
# One chart per summary metric, each plotted against "Periodos con compra".
# Marker size follows "Ventas anuales" and colour follows the segment.
# Each entry is (y column, axis limits); only the volume chart fixes its axes.
bubbleCharts = [
    ("Venta promedio por cliente", None),
    ("Volumen cajas promedio", ((1,12), (-1e3,15e3))),
    ("Precio promedio por caja", None),
]

for yColumn, axisLimits in bubbleCharts:
    fig, ax = plt.subplots(figsize=(12,9))
    sp = sns.scatterplot(x="Periodos con compra", y=yColumn, data=summaryDF, hue='Segmento',
                         size="Ventas anuales", sizes=(50, 10000), palette=current_palette, legend=None)
    if axisLimits is not None:
        xRange, yRange = axisLimits
        plt.xlim(xRange)
        plt.ylim(yRange)