Votre mission est d’aider les équipes d’Olist à comprendre les différents types d'utilisateurs. Vous utiliserez donc des méthodes non supervisées pour regrouper ensemble des clients de profils similaires. Ces catégories pourront être utilisées par l’équipe marketing pour mieux communiquer.

Votre client, Olist, a spécifié sa demande ainsi :
* La segmentation proposée doit être exploitable et facile d’utilisation pour l’équipe marketing.
* Vous évaluerez la fréquence à laquelle la segmentation doit être mise à jour, afin de pouvoir effectuer un devis de contrat de maintenance.
* Le code fourni doit respecter la convention PEP8, pour être utilisable par Olist.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
pd.set_option('display.max_columns', None)
from sklearn import preprocessing
from sklearn import decomposition
import plotly
import plotly.express as px
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
from sklearn.manifold import Isomap
from sklearn.manifold import LocallyLinearEmbedding
from scipy.stats import mannwhitneyu
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.cluster import KMeans

In [None]:
def compare_time(timeSerie, referenceTime=None):
    """
    Time elapsed between a reference instant and a series of dates.

    Arguments:
        timeSerie: datetime-like value(s) subtracted from the reference
        referenceTime: instant to measure from; when None the current
            time given by pd.Timestamp.today() is used
    Return
        the result of ``referenceTime - timeSerie`` (the subtraction is
        returned as is, it is not converted to a number of days)
    """
    reference = pd.Timestamp.today() if referenceTime is None else referenceTime
    return reference - timeSerie

In [None]:
def PCA_graph(df_X, pca, dim1, dim2, title=None, path=None):
    """
    Draw the loadings of two principal components inside the [-1, 1] square.

    Arguments:
        df_X: data frame whose column names label the segments
        pca: fitted object exposing a ``components_`` matrix
        dim1: row of ``components_`` drawn on the X axis (int)
        dim2: row of ``components_`` drawn on the Y axis (int)
        title: optional plot title
        path: optional file path; when given the figure is saved there
    """
    loadings = pca.components_
    f, ax = plt.subplots(figsize=(10, 10))
    x_loadings, y_loadings = loadings[dim1, :], loadings[dim2, :]
    for name_idx, (x, y) in enumerate(zip(x_loadings, y_loadings)):
        # segment from the origin to the point (x, y)
        plt.plot([0, x], [0, y], color='k')
        # name of the variable at the tip of the segment
        plt.text(x, y, df_X.columns[name_idx], fontsize='14')
    # dashed reference lines: y = 0 first, then x = 0
    for line_x, line_y in (([-1, 1], [0, 0]), ([0, 0], [-1, 1])):
        plt.plot(line_x, line_y, color='grey', ls='--')
    plt.xlim([-1, 1])
    plt.ylim([-1, 1])
    plt.xlabel(f"PC {dim1 + 1}")
    plt.ylabel(f"PC {dim2 + 1}")
    if title is not None:
        plt.title(title)
    if path is not None:
        plt.savefig(path)
    plt.show()

In [None]:
# Scatter plot of observations already projected on the principal components
def plot_composant(df_X, X_projected, dim1, dim2, label, limit=10, title=None, path=None):
    """
    Display projected observations on two components, coloured by a feature.

    Arguments:
        df_X: data frame with raw values (used for colorization)
        X_projected: matrix of projected observations (one column per
            component)
        dim1: column of X_projected drawn on the X axis (int)
        dim2: column of X_projected drawn on the Y axis (int)
        label: df_X feature used to colorize points; it is looked up with
            ``df_X.get``, so a missing column yields None
        limit: both axes span [-limit, limit]
        title: optional plot title
        path: optional file path; when given the figure is saved there
    """
    plt.xlim([-limit, limit])
    plt.ylim([-limit, limit])
    first_axis = X_projected[:, dim1]
    second_axis = X_projected[:, dim2]
    point_colors = df_X.get(label)
    plt.scatter(first_axis, second_axis, c=point_colors, alpha=0.2)
    # black axes through the origin: horizontal first, then vertical
    for line_x, line_y in (([-limit, limit], [0, 0]), ([0, 0], [-limit, limit])):
        plt.plot(line_x, line_y, color="black")
    cbar = plt.colorbar()
    color_axis = cbar.ax
    color_axis.get_yaxis().labelpad = 10
    color_axis.set_ylabel(label, rotation=90)
    if title is not None:
        plt.title(title, size=18)
    if path is not None:
        plt.savefig(path)
    plt.show()

In [None]:
def proj3D_PCA(df_X, pca, label):
    """
    Standardise df_X, project it with a fitted PCA and append one raw column.

    A new StandardScaler is fitted on df_X itself before the projection.
    Arguments:
        df_X: data frame
        pca: fitted PCA (sklearn) used to project the scaled values
        label: df_X column appended, unscaled, for colorization
    Return:
        data frame with the projected coordinates in integer-named columns
        and a last column named ``label``
    """
    # standardise, then project
    X_scaled = preprocessing.StandardScaler().fit_transform(df_X)
    X_projected = pca.transform(X_scaled)
    # raw values used for colorization, appended as the last column
    color = df_X[label].to_numpy()
    stacked = np.column_stack((X_projected, color))
    last_col = stacked.shape[1] - 1
    return pd.DataFrame(stacked).rename(columns={last_col: label})

In [None]:
def scatter_plot(dt, label1, label2, title, path=None):
    """
    Joint plot of two columns of a data frame.

    Arguments:
        dt: data frame holding both columns
        label1: column drawn on the X axis
        label2: column drawn on the Y axis
        title: plot title
        path: optional file path; when given the figure is saved there
    """
    # The former x / y arrays were extracted but never used: seaborn reads
    # the columns directly from ``dt``.
    sns.jointplot(data=dt, x=label1, y=label2, marker="+", s=100,
                  marginal_kws=dict(bins=60))
    plt.title(title, size=20)
    if path is not None:
        plt.savefig(path)
    plt.show()

In [None]:
def rootline_mannwhitneyu(X1, X2, alpha=0.05):
    """
    Non parametric test (Mann-Whitney U) comparing two distributions.

    Hypothesis:
        H0: Sample distributions are equal
        H1: Sample distributions are not equal
    Arguments:
        X1: first sample (1D array)
        X2: second sample (1D array)
        alpha: risk threshold; H0 is rejected when p <= alpha
    Prints the statistic, the p-value and the decision; returns None.
    """
    stat, p = mannwhitneyu(X1, X2)
    print('Statistics={0:.3f}, p={1}'.format(stat, p))
    # Use the caller's alpha: it used to be overwritten with 0.05 here, so
    # the argument had no effect.
    if p > alpha:
        print('Same distribution (fail to reject H0)')
    else:
        print('Different distribution (reject H0)')




# Importation des données

In [None]:
# The columns at positions 6 and 7 are parsed as dates
df = pd.read_csv("clean_achat_data.csv", infer_datetime_format = True, parse_dates = [6,7])
df.describe()

In [None]:
# Preview the first three rows
df.head(3)

# Normalisation des données

In [None]:
# Columns kept for the scaling and the PCA
data = df.loc[:, [
    "price", "freight_proportion_price", "payment_installments",
    "weight", "quantity", "delay", "days_since_last_command",
    "boleto", "credit_card", "voucher_debit_card",
]]

In [None]:
# Standardise the selected columns; the fitted scaler stays in std_scale
df_X = data
print(df_X.shape)
std_scale = preprocessing.StandardScaler()
X_scaled = std_scale.fit_transform(df_X)

In [None]:
# Distribution of each standardised variable
labels = data.columns
for i, label in enumerate(labels):
    # `bw` is a deprecated seaborn keyword; `bw_method` is its replacement
    # and takes the same value.
    sns.kdeplot(X_scaled[:, i], bw_method=0.5)
    plt.title(label)
    plt.show()


# Structure globale

## PCA

In [None]:
# Fit a 10-component PCA on the standardised matrix
pca = decomposition.PCA(n_components=10)
pca.fit(X_scaled)

In [None]:
# Share of variance of each component, then of the first five together
print(pca.explained_variance_ratio_)
print(np.sum(pca.explained_variance_ratio_[:5]))

### Variance expliquée

In [None]:
# Cumulative share of variance explained by the components
dimensions = np.arange(len(pca.explained_variance_ratio_))
contribution = list(np.cumsum(pca.explained_variance_ratio_))

In [None]:
# Cumulative explained variance against the number of components (1-based)
plt.plot(dimensions + 1, contribution, "-+")
plt.xlabel("Dimension", size=18)
plt.ylabel("Variance expliquée", size=18)
plt.ylim(0, 1.1)
plt.title("PCA", size=20)
plt.show()

5 composantes suffisent pour décrire 72% de la variabilité de notre jeu de données.
La première et seconde composantes expliquent respectivement 23.1% et 17.0% de la variabilité et 11.7%, 10.7% et 9.5% pour les composantes 3, 4 et 5.

# Visualisation PCA

## Composante 1 et 2

In [None]:
# Loadings of components 1 and 2
PCA_graph(df_X, pca, dim1=0, dim2=1,
          title="Projection des composantes 1 et 2")

In [None]:
# Observations on components 1 and 2, coloured by 'credit_card'
plot_composant(df_X, pca.transform(X_scaled), dim1=0, dim2=1,
               label='credit_card', title="ACP: composante 1 et 2")

In [None]:
# Observations on components 1 and 2, coloured by 'freight_proportion_price'
plot_composant(df_X, pca.transform(X_scaled), dim1=0, dim2=1,
               label='freight_proportion_price',
               title="ACP: composante 1 et 2")

Les deux premières composantes ne sont pas fortement corrélées avec une des variables utilisées.
La composante 1 explique 50% du paiement par carte de crédit.
Les commandes sont séparées le long de la diagonale des composantes, suivant leur prix et leurs frais de port.

## Composante 2 et 3

In [None]:
# Loadings of components 2 and 3
PCA_graph(df_X, pca, dim1=1, dim2=2)

In [None]:
# Observations on components 2 and 3, coloured by 'delay'
plot_composant(df_X, pca.transform(X_scaled), dim1=1, dim2=2,
               label='delay', title="ACP: composante 2 et 3", limit=17)

In [None]:
# 'review_score' is not one of the columns selected into df_X, so
# df_X.get('review_score') returns None and the points get no colour data.
# Look the column up in the full data frame instead: df_X was selected from
# df without filtering, so rows and order match.
# NOTE(review): assumes df has a 'review_score' column — confirm.
plot_composant(df, pca.transform(X_scaled), dim1=1, dim2=2,
               label='review_score', title="ACP: composante 2 et 3", limit=17)

La composante 3 est corrélée avec le prix

## Composante 3 et 4

In [None]:
# Loadings of components 3 and 4
PCA_graph(df_X, pca, dim1=2, dim2=3)

In [None]:
# Observations on components 3 and 4, coloured by 'voucher_debit_card'
plot_composant(df_X, pca.transform(X_scaled), dim1=2, dim2=3,
               label='voucher_debit_card', title="ACP: composante 3 et 4",
               limit=12)

La 4ième composante est en lien avec la variable **voucher debit card**.
97% des clients ont effectué des achats avec un autre moyen de paiement.

## Composante 4 et 5

In [None]:
# Loadings of components 4 and 5
PCA_graph(df_X, pca, dim1=3, dim2=4)

In [None]:
# Observations on components 4 and 5, coloured by 'quantity'
plot_composant(df_X, pca.transform(X_scaled), dim1=3, dim2=4,
               label='quantity', title="ACP: composante 4 et 5", limit=15)

La composante 5 est liée au nombre de jours écoulés depuis le dernier achat.

In [None]:
# Recap of the dimensions of interest (bare string expressions)
"price"
"credit_card"
"delay"
"voucher_debit_card"
"quantity"

## Projection 3D

In [None]:
# First row of the feature data frame
df_X.head(1)

### Composante 1, 2 et 3

In [None]:
# Project the data with the fitted PCA and get back a data frame holding the
# projected coordinates plus one extra column: the raw (non-standardised)
# values of the feature used for colouring.
proj_price = proj3D_PCA(df_X, pca, 'price')
fig = px.scatter_3d(
    proj_price, x=0, y=1, z=2,
    color='price', opacity=0.2, size_max=0.5,
)
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})

In [None]:
# Components 1, 2 and 3 coloured by the raw 'delay' values
proj_delay = proj3D_PCA(df_X, pca, 'delay')
fig = px.scatter_3d(
    proj_delay, x=0, y=1, z=2,
    color='delay', opacity=0.2, size_max=0.5,
)
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})

In [None]:
# Histogram of 'credit_card', then components 1, 2 and 3 coloured by it
sns.histplot(data=df, x="credit_card", kde=True)
plt.show()
proj_credit_card = proj3D_PCA(df_X, pca, 'credit_card')
fig = px.scatter_3d(
    proj_credit_card, x=0, y=1, z=2,
    color='credit_card', opacity=0.2, size_max=0.5,
)
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})


In [None]:
# Histogram of 'voucher_debit_card', then components 1, 2 and 3 coloured by it
sns.histplot(data=df, x="voucher_debit_card", kde=True)
plt.show()
proj_voucher = proj3D_PCA(df_X, pca, 'voucher_debit_card')
fig = px.scatter_3d(
    proj_voucher, x=0, y=1, z=2,
    color='voucher_debit_card', opacity=0.2, size_max=0.5,
)
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})

### Composante 1, 2 et 4

In [None]:
# Components 1, 2 and 4 coloured by 'voucher_debit_card'
fig = px.scatter_3d(
    proj_voucher, x=0, y=1, z=3,
    color='voucher_debit_card', opacity=0.2, size_max=0.5,
)
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})

### Composante 2, 3 et 4

In [None]:
# Components 2, 3 and 4 coloured by 'voucher_debit_card'
fig = px.scatter_3d(
    proj_voucher, x=1, y=2, z=3,
    color='voucher_debit_card', opacity=0.2, size_max=0.5,
)
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})

In [None]:
# Dimensions of interest (bare string expressions), then a preview of df_X
"price"
"credit_card"
"delay"
"voucher_debit_card"
"quantity"
df_X.head(3)

### Composante 2, 4 et 5

In [None]:
# Components 2, 4 and 5 coloured by 'voucher_debit_card'
fig = px.scatter_3d(
    proj_voucher, x=1, y=3, z=4,
    color='voucher_debit_card', opacity=0.2, size_max=0.5,
)
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})

In [None]:
# Components 2, 4 and 5 coloured by the raw 'days_since_last_command' values
proj_days = proj3D_PCA(df_X, pca, 'days_since_last_command')
fig = px.scatter_3d(
    proj_days, x=1, y=3, z=4,
    color='days_since_last_command', opacity=0.2, size_max=0.5,
)
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})

# Isomap

Find a low-dimensional representation of the data (here 2D) in which the distances between points closely match the distances in the original high-dimensional space.

In [None]:
# Preview the last ten columns of df
df.iloc[:,-10:].head(3)

In [None]:
# Isomap is run on a random sample of rows, using standardised values.
# The sample size is capped at the number of rows so that the draw without
# replacement cannot fail on a small data set, and the generator is seeded
# so that the sample (and the figures built from it) can be reproduced.
lignes = np.random.default_rng(42).choice(
    df.shape[0], min(9000, df.shape[0]), replace=False)
df_X = df.iloc[lignes, -10:]
std_scale = preprocessing.StandardScaler().fit(df_X)
X_scaled = std_scale.transform(df_X)
# dimensionality reduction
projection = Isomap(n_neighbors=15, n_components=2, n_jobs=-1).fit_transform(X_scaled)

In [None]:
# Density (KDE) of the points in the 2D Isomap projection
sns.jointplot(x=projection[:,0], y=projection[:,1], kind="kde")
plt.show()

In [None]:
def isomap_plot(projection, vect_color, title="Isomap",
                c_label=None, vmin=None, vmax=None,
                xlim=None, ylim=None, path=None):
    """
    Scatter plot of a 2D projection with a colour bar.

    Arguments:
        projection: matrix whose first two columns are plotted
        vect_color: values used to colour the points
        title: plot title
        c_label: label of the colour bar (optional)
        vmin: lowest value used for the colour scale
        vmax: highest value used for the colour scale
        xlim: X axis limits (2 elements); a message is printed and the
            limits are left untouched when the length is not 2
        ylim: Y axis limits (2 elements); same handling as xlim
        path: optional file path; when given the figure is saved there
    """
    figure = plt.figure(figsize=(10, 10))
    figure.add_subplot(111)
    plt.scatter(projection[:, 0], projection[:, 1], c=vect_color,
                alpha=0.1, vmin=vmin, vmax=vmax)
    plt.title(title)
    plt.axis('tight')
    cbar = plt.colorbar()
    if c_label is not None:
        cbar.ax.get_yaxis().labelpad = 14
        cbar.ax.set_ylabel(c_label, rotation=90)
    # X limits are handled first, then Y limits
    axis_limits = (
        (xlim, plt.xlim, "expect list of 2 elements for xlim"),
        (ylim, plt.ylim, "expect list of 2 elements for ylim"),
    )
    for bounds, set_limits, message in axis_limits:
        if bounds is None:
            continue
        if len(bounds) == 2:
            set_limits(np.min(bounds), np.max(bounds))
        else:
            print(message)
    if path is not None:
        plt.savefig(path)
    plt.show()

In [None]:
# Colour scale goes from 0 to the largest value of the column
vmax = df_X["days_since_last_command"].max()
vmin = 0
colors = df_X.get("days_since_last_command")
# vmin / vmax are passed by keyword: isomap_plot takes vmin before vmax, so
# the former positional "vmax, vmin" swapped the two bounds.
isomap_plot(projection, colors, "Isomap on days since last command",
            'Days since last command', vmin=vmin, vmax=vmax)

In [None]:
# Colour scale goes from 0 to the largest value of the column
vmax = df_X["payment_value"].max()
vmin = 0
colors = df_X.get("payment_value")
# vmin / vmax are passed by keyword: isomap_plot takes vmin before vmax, so
# the former positional "vmax, vmin" swapped the two bounds.
isomap_plot(projection, colors, "Isomap on number of payment value",
            "Payment value", vmin=vmin, vmax=vmax)

In [None]:
# Colour scale goes from 0 to the largest value of the column
vmax = df_X["voucher_debit_card"].max()
vmin = 0
colors = df_X.get("voucher_debit_card")
# vmin / vmax are passed by keyword: isomap_plot takes vmin before vmax, so
# the former positional "vmax, vmin" swapped the two bounds.
isomap_plot(projection, colors, "Isomap on payment type voucher/debit card",
            "Voucher dbit card", vmin=vmin, vmax=vmax)

Il y a 3 structures locales qui semblent se dégager. 1 correspondant à des paiements **voucher/debit card** (environ 3% des commandes) et deux structures qui correspondent à d'autres types de paiements.

In [None]:
# Relative frequency of each 'voucher_debit_card' value in df_X.
# The former leading `unique()` call was dropped: its result was discarded.
tmp = df_X["voucher_debit_card"].value_counts(normalize=True)
# round the values used as bar labels
tmp.index = np.round(tmp.index, 2)
tmp.plot.bar()
plt.title("Proportion of voucher/debit card in payment")
plt.xlabel("Ration in total payment")
plt.ylabel("Proportion")
plt.show()

In [None]:
# Common axis limits so that both figures share the same frame
xlim = [projection[:, 0].min() - 1, projection[:, 0].max() + 1]
ylim = [projection[:, 1].min() - 1, projection[:, 1].max() + 1]
# Position of the column used for colouring
col = np.where(df_X.columns == "voucher_debit_card")[0]
# First the payments made without voucher/debit card, then those made with it
subsets = (
    (df_X["voucher_debit_card"] == 0,
     "Isomap for command bought without voucher_debit_card (0.0)"),
    (df_X["voucher_debit_card"] > 0,
     "Isomap for command bought with voucher_debit_card (>0.0)"),
)
for tmp, subset_title in subsets:
    indices = np.where(tmp)[0]
    isomap_plot(projection[indices, :], df_X.iloc[indices, col].values,
                subset_title,
                vmin=0, vmax=1, xlim=xlim, ylim=ylim)

# Structure locale

## Locally Linear Embedding

In [None]:
def LLE_plot(projection, vect_color, title, vmin=None, vmax=None):
    """
    Scatter plot of a 2D projection with a colour bar.

    Arguments:
        projection: matrix whose first two columns are plotted
        vect_color: values used to colour the points
        title: plot title (str)
        vmin: lowest value used for the colour scale
        vmax: highest value used for the colour scale
    """
    figure = plt.figure(figsize=(10, 8))
    figure.add_subplot(111)
    first_axis, second_axis = projection[:, 0], projection[:, 1]
    plt.scatter(first_axis, second_axis, c=vect_color, alpha=0.2,
                vmin=vmin, vmax=vmax)
    plt.title(title)
    plt.colorbar()

In [None]:
# Preview the last ten columns of df
df.iloc[:,-10:].head(3)

In [None]:
# LLE is run on a random sample of rows, using standardised values.
# The sample size is capped at the number of rows so that the draw without
# replacement cannot fail on a small data set, and both the sampling and the
# embedding are seeded so that the figures can be reproduced.
lignes = np.random.default_rng(42).choice(
    df.shape[0], min(9000, df.shape[0]), replace=False)
df_X = df.iloc[lignes, -10:]
std_scale = preprocessing.StandardScaler().fit(df_X)
X_scaled = std_scale.transform(df_X)
# dimensionality reduction
embedding = LocallyLinearEmbedding(n_components=2, n_neighbors=15, method='modified',
                                   eigen_solver='dense', max_iter=10000, n_jobs=-1,
                                   random_state=42)
projection = embedding.fit_transform(X_scaled)

In [None]:
# Density (KDE) of the points in the 2D LLE projection
sns.jointplot(x=projection[:,0], y=projection[:,1], kind="kde")
plt.show()

In [None]:
# LLE projection coloured by 'payment_value'
LLE_plot(projection, vect_color=df_X.get("payment_value"),
         title="LLE on number of payment value")

In [None]:
# LLE projection coloured by 'days_since_last_command'
LLE_plot(projection, vect_color=df_X.get("days_since_last_command"),
         title="LLE on days since last command")

In [None]:
# LLE projection coloured by 'voucher_debit_card'
LLE_plot(projection, vect_color=df_X.get("voucher_debit_card"),
         title="LLE on voucher and debit card")

In [None]:
# Position of the column used for colouring
col = np.where(df_X.columns == "voucher_debit_card")[0]
# One figure per distinct value of voucher_debit_card
for val in pd.unique(df_X["voucher_debit_card"].values):
    tmp = df_X["voucher_debit_card"] == val
    indices = np.where(tmp)[0]
    # The projection comes from LocallyLinearEmbedding, so the title now
    # says LLE (it used to say Isomap).
    LLE_plot(projection[indices, :], df_X.iloc[indices, col].values,
             "LLE on voucher_debit_card", vmin=0, vmax=1)

## Conclusion

Il y a 3 structures globales et locales définies par la valeur prise par **voucher_debit_card**.
Cependant ce n'est pas suffisant pour séparer correctement les clients, puisque les deux premiers groupes représentent 97% des individus.