In [None]:
!pip install scikit-learn-extra

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.io import arff
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import k_means, AgglomerativeClustering
from sklearn.metrics import silhouette_score, silhouette_samples ,pairwise_distances
from sklearn_extra.cluster import KMedoids

In [2]:
# loadarff returns a tuple; its first element holds the records.
data = arff.loadarff('Dataset/PhishingData.arff')
df_base = pd.DataFrame(data[0])

# Work on every column but the last one
# (presumably the class label — TODO confirm against the ARFF header).
feature_columns = df_base.columns[:-1]
df = df_base[feature_columns]
df.head()

Unnamed: 0,SFH,popUpWidnow,SSLfinal_State,Request_URL,URL_of_Anchor,web_traffic,URL_Length,age_of_domain,having_IP_Address
0,b'1',b'-1',b'1',b'-1',b'-1',b'1',b'1',b'1',b'0'
1,b'-1',b'-1',b'-1',b'-1',b'-1',b'0',b'1',b'1',b'1'
2,b'1',b'-1',b'0',b'0',b'-1',b'0',b'-1',b'1',b'0'
3,b'1',b'0',b'1',b'-1',b'-1',b'0',b'1',b'1',b'0'
4,b'-1',b'-1',b'1',b'-1',b'0',b'0',b'-1',b'1',b'0'


In [3]:
# One-hot encode every column of the feature frame, then densify the
# encoder's output into a regular NumPy array.
one_hot_encoder = OneHotEncoder()
encoded_sparse = one_hot_encoder.fit(df).transform(df)
df_encoded = encoded_sparse.toarray()
# Optional experiment: restrict the analysis to the first 6 encoded columns.
#df_encoded = df_encoded[:, :6]

In [4]:
# Preview the first five rows of the encoded matrix.
pd.DataFrame(data=df_encoded).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0


In [5]:
def plot_elbow_and_silhouette_curves(elbow_values, silhouette_scores, start_n_clusters=1):
    """Draw two stacked line plots: elbow values on top, silhouette scores below.

    Parameters
    ----------
    elbow_values : sequence of numbers
        Values drawn on the top panel, one per consecutive cluster count.
    silhouette_scores : sequence of numbers
        Values drawn on the bottom panel, one per consecutive cluster count.
    start_n_clusters : int, default 1
        Cluster count associated with the first entry of each sequence; the
        x positions are consecutive integers starting from this value.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, (elbow_ax, silhouette_ax) = plt.subplots(2, 1, figsize=(20, 9))
    panels = (
        (elbow_ax, elbow_values, 'Variance Intraclasses'),
        (silhouette_ax, silhouette_scores, 'Silhouette'),
    )
    for axis, values, y_label in panels:
        # One x position per value, starting at start_n_clusters.
        cluster_counts = np.arange(start_n_clusters, len(values) + start_n_clusters)
        axis.set_ylabel(y_label)
        axis.set_xlabel('Nombre de cluster')
        axis.set_xticks(cluster_counts)
        axis.plot(cluster_counts, values, 'go-', linewidth=2, markersize=5)
    fig.tight_layout()
    plt.show()

# KMeans Clustering

In [None]:
%%time
k_means_elbow_values = []
k_means_silhouette_scores = []
for i in np.arange(1, 30):
    centroid, label, inertia, n_iter = k_means(df_encoded, n_clusters=i, return_n_iter=True)
    k_means_silhouette_scores.append(silhouette_score(df_encoded, label) if i>1 else 0)
    k_means_elbow_values.append(inertia)

In [None]:
# Elbow and silhouette curves of the K-Means sweep (x axis starts at the default, 1).
plot_elbow_and_silhouette_curves(
    elbow_values=k_means_elbow_values,
    silhouette_scores=k_means_silhouette_scores,
)

## Interprétation

### Avec toutes les variables
On remarque qu'il est assez difficile de choisir le nombre de clusters sur la base de la courbe d'Elbow, car son évolution dans l'intervalle \[3, 7\] est quasi-linéaire. De plus, l'évolution de la courbe des valeurs de silhouette n'indique pas une valeur raisonnable du nombre de classes à choisir.<br>
Il est important de remarquer aussi que les valeurs de silhouette ne vont pas au-delà de 0.16, ce qui est très bas. L'on peut conclure que KMeans n'est pas un algorithme approprié pour le clustering de ces données.

# PAM Clustering

In [None]:
def hamming_distance(x1, x2):
    """Return the number of positions at which ``x1`` and ``x2`` differ.

    The comparison is element-wise (``x1 != x2``) and the mismatches are
    summed, so the result is a count, not a fraction.
    """
    mismatches = x1 != x2
    return np.sum(mismatches)

In [None]:
%%time
#D_matrix = pairwise_distances(df, metric = hamming_distance)
D_matrix = pairwise_distances(df_encoded, metric = hamming_distance)

In [None]:
# Summary statistics over all pairwise distances, flattened into one column.
# reshape(-1, 1) lets NumPy infer the length from D_matrix instead of
# hard-coding the 1353 rows of this particular dataset.
pd.DataFrame(D_matrix.reshape(-1, 1), columns=['dists']).describe()

In [None]:
%%time
k_means_elbow_values = []
k_means_silhouette_scores = []
for i in np.arange(2, 30):
    kmedoids = KMedoids(n_clusters=i, init='build', method='pam', metric='precomputed').fit(D_matrix)
    k_means_silhouette_scores.append(silhouette_score(D_matrix, kmedoids.labels_, metric='precomputed') if i>1 else 0)
    k_means_elbow_values.append(kmedoids.inertia_)

In [None]:
# The PAM sweep starts at k = 2, so the x axis starts at 2 as well.
plot_elbow_and_silhouette_curves(
    elbow_values=k_means_elbow_values,
    silhouette_scores=k_means_silhouette_scores,
    start_n_clusters=2,
)

In [None]:
# Fit K-Medoids (PAM) for the chosen number of clusters on the precomputed
# distance matrix.
n_clusters = 2
kmedoids = KMedoids(n_clusters=n_clusters, init='build', method='pam', metric='precomputed')
kmedoids.fit(D_matrix)

# One silhouette coefficient per sample.
sample_silhouette_values = silhouette_samples(D_matrix, kmedoids.labels_, metric='precomputed')

fig, ax = plt.subplots(figsize=(7, 7))
ax.set(
    title="The silhouette plot for the various clusters.",
    xlabel="The silhouette coefficient values",
    ylabel="Cluster label",
)

# Each cluster is drawn as a horizontal band; y_lower tracks where the next
# band starts.
y_lower = 10
for i in range(n_clusters):
    # Sorted silhouette coefficients of the samples assigned to cluster i.
    in_cluster = kmedoids.labels_ == i
    ith_cluster_silhouette_values = np.sort(sample_silhouette_values[in_cluster])

    size_cluster_i = len(ith_cluster_silhouette_values)
    y_upper = y_lower + size_cluster_i

    color = cm.nipy_spectral(float(i) / n_clusters)
    ax.fill_betweenx(
        np.arange(y_lower, y_upper),
        0,
        ith_cluster_silhouette_values,
        facecolor=color,
        edgecolor=color,
        alpha=0.7,
    )

    # Cluster number, written at mid-height of its band.
    ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    # Leave a 10-unit gap before the next band.
    y_lower = y_upper + 10

# Dashed red vertical line at the average silhouette score over all samples.
average_silhouette = silhouette_score(D_matrix, kmedoids.labels_, metric='precomputed')
ax.axvline(x=average_silhouette, color="red", linestyle="--")

ax.set_yticks([])  # Clear the yaxis labels / ticks

# Interprétation

Il est important de noter que la métrique de distance que nous avons utilisée est celle de **Hamming**.

# CAH Clustering

In [None]:
def hamming_distance(x1, x2):
    """Return the number of positions at which ``x1`` and ``x2`` differ.

    Identical to the definition given in the PAM section of this notebook.
    The comparison is element-wise (``x1 != x2``) and the mismatches are
    summed, so the result is a count, not a fraction.
    """
    mismatches = x1 != x2
    return np.sum(mismatches)

In [None]:
%%time
D_matrix = pairwise_distances(df, metric = hamming_distance)
#D_matrix = pairwise_distances(df_encoded, metric = hamming_distance)

In [None]:
# Summary statistics over all pairwise distances, flattened into one column.
# reshape(-1, 1) lets NumPy infer the length from D_matrix instead of
# hard-coding the 1353 rows of this particular dataset.
pd.DataFrame(D_matrix.reshape(-1, 1), columns=['dists']).describe()

In [None]:
%%time
k_means_elbow_values = []
k_means_silhouette_scores = []
for i in np.arange(1, 30):
    cah = AgglomerativeClustering(n_clusters=i, affinity='precomputed', linkage='single').fit(D_matrix)
    k_means_silhouette_scores.append(silhouette_score(D_matrix, cah.labels_, metric='precomputed') if i>1 else 0)
    k_means_elbow_values.append(0)

In [None]:
# The CAH sweep starts at n_clusters = 1, so the x axis must start at 1 too;
# passing 2 labelled every point one cluster too high.
plot_elbow_and_silhouette_curves(k_means_elbow_values, k_means_silhouette_scores, 1)