In [None]:
import os

import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.preprocessing import scale, StandardScaler
from sklearn.cluster import AgglomerativeClustering

### Cargar los datos

In [None]:
# Location of the Titanic CSV. The default is the original author's local
# path; set the TITANIC_CSV environment variable to point at the file on
# another machine instead of editing the notebook.
TITANIC_CSV = os.environ.get(
    "TITANIC_CSV",
    "/home/silil/Documents/itam/mineria_datos_licenciatura/data/titanic/titanic.csv",
)
titanic = pd.read_csv(TITANIC_CSV)

In [None]:
# Preview the first rows of the data exactly as loaded from the CSV.
titanic.head()

In [None]:
# Normalize column names: lowercase them and replace '/' with '_'.
# The frame is rebound to the renamed result rather than mutated in place.
titanic = titanic.rename(columns=lambda col: col.lower().replace('/', "_"))
titanic.head()

### Transformación de variables

In [None]:
# Encode sex as an integer indicator: male -> 0, female -> 1.
# The lookup is applied to the 'sex' column itself (a Series). Calling
# DataFrame.mask on the whole frame returns a DataFrame, which cannot be
# stored correctly in a single column.
# The 0/1 keys pass already-encoded values through, so re-running this cell
# is harmless. Any other value becomes NaN and makes the int cast fail loudly.
sex_codes = {'male': 0, 'female': 1, 0: 0, 1: 1}
titanic['sex'] = titanic['sex'].map(sex_codes).astype('int')

titanic.head()

In [None]:
# Summary statistics of the frame after sex has been encoded as an integer.
titanic.describe()

In [None]:
# Copy of the data without the 'name' column.
dataset = titanic.drop(columns=['name'])
dataset.head()

In [None]:
# Part 2: keep only a few of the variables.

In [None]:
# Features used for clustering; this rebinds `dataset` to just these columns.
cluster_features = ['pclass', 'age']
dataset = titanic[cluster_features]

### Escalando los datos

2 formas diferentes de hacerlo

In [None]:
# Option 1: scale the features with the `scale` helper function.
dataset_scaled = scale(dataset)
# Peek at the first four scaled rows.
dataset_scaled[:4]

In [None]:
# Option 2: scale through a StandardScaler object. This rebinds
# `dataset_scaled`, replacing the array produced by `scale` above.
ss = StandardScaler()
dataset_scaled = ss.fit_transform(dataset)
# Peek at the first four scaled rows.
dataset_scaled[:4]

In [None]:
# Dimensions (rows, columns) of the scaled feature array.
dataset_scaled.shape

### Hclust

+ Single linkage

In [None]:
# A distance threshold of 0 ensures that the whole tree is built.
ac = AgglomerativeClustering(
    linkage="single",
    n_clusters=None,
    distance_threshold=0,
)

In [None]:
# Fit the single-linkage model on the scaled features.
m1 = ac.fit(dataset_scaled)

In [None]:
# Cluster labels assigned to the first five samples.
m1.labels_[:5]

In [None]:
# Number of clusters found by the full-tree model.
m1.n_clusters_

### Graficar el dendrograma

In [None]:
def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical-clustering model.

    A SciPy-style linkage matrix is assembled from the model's merge
    history and handed to ``scipy.cluster.hierarchy.dendrogram``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``children_`` (one row of two node ids per merge),
        ``distances_`` (one merge distance per row of ``children_``) and
        ``labels_`` (one entry per original sample).
    **kwargs
        Forwarded unchanged to ``dendrogram`` (e.g. ``truncate_mode``, ``p``).

    Returns
    -------
    None
        The value returned by ``dendrogram`` is discarded.
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # sizes[k] is the number of original samples under the node created by
    # merge k. Node ids below n_leaves are single samples; an id at or above
    # n_leaves refers to the cluster created by merge (id - n_leaves).
    sizes = np.zeros(merges.shape[0])
    for step, pair in enumerate(merges):
        sizes[step] = sum(
            1 if node < n_leaves else sizes[node - n_leaves]
            for node in pair
        )

    # Linkage layout: [child_a, child_b, merge_distance, sample_count].
    linkage_matrix = np.column_stack(
        [merges, model.distances_, sizes]
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)

In [None]:
# Show only the top three levels of the tree built by the full-tree model.
dendrogram_options = {'truncate_mode': 'level', 'p': 3}
plot_dendrogram(m1, **dendrogram_options)

plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
# Optional guide line at height 0.4 (left disabled):
#plt.axhline(y=0.4, color='grey', linestyle='--')
plt.show()

Ahora que sabemos en qué *threshold* queremos cortar el árbol, podemos pasarle este *threshold* a `hclust` (`AgglomerativeClustering`) mediante el parámetro `distance_threshold`.

In [None]:
# Same single-linkage setup as before, now with a distance threshold of 0.4.
ac2 = AgglomerativeClustering(
    linkage="single",
    n_clusters=None,
    distance_threshold=0.4,
)

In [None]:
# Fit the thresholded single-linkage model on the scaled features.
m2 = ac2.fit(dataset_scaled)

In [None]:
# Number of clusters obtained with the 0.4 distance threshold.
m2.n_clusters_

In [None]:
# Cluster label of every sample under the 0.4 distance threshold.
m2.labels_

In [None]:
# Attach each row's cluster label to the frame as a new 'grupo' column.
titanic['grupo'] = m2.labels_

In [None]:
# Preview the frame with the new 'grupo' column.
titanic.head()

In [None]:
# Number of non-null 'survived' entries per cluster, reported as 'count'.
(
    titanic
    .groupby(['grupo'], as_index=False)['survived']
    .count()
    .rename(columns={'survived': 'count'})
)

In [None]:
# Summary statistics for the rows assigned to cluster 0.
titanic.loc[titanic['grupo'] == 0].describe()

In [None]:
# Summary statistics for the rows assigned to cluster 1.
titanic.loc[titanic['grupo'] == 1].describe()

In [None]:
# Summary statistics for the rows assigned to cluster 2.
titanic.loc[titanic['grupo'] == 2].describe()

In [None]:
# Summary statistics for the rows assigned to cluster 3.
titanic.loc[titanic['grupo'] == 3].describe()

In [None]:
# Show the individual rows assigned to cluster 3.
titanic.loc[titanic['grupo'] == 3]

In [None]:
# Summary statistics for the rows assigned to cluster 4.
titanic.loc[titanic['grupo'] == 4].describe()

In [None]:
# Show the individual rows assigned to cluster 4.
titanic.loc[titanic['grupo'] == 4]

In [None]:
# Cast the cluster label to string (presumably so the plot below treats it as
# a discrete category). After this cell, comparisons such as `grupo == 0`
# no longer match, so the per-group cells above must run before it.
titanic['grupo'] = titanic['grupo'].astype(str)

In [None]:
# Inspect column dtypes after casting 'grupo' to string.
titanic.dtypes

In [None]:
# Scatter of age against the integer-encoded sex column, colored by cluster.
# NOTE(review): the clusters were fitted on pclass and age; confirm that sex
# is the intended y axis here.
sns.scatterplot(data=titanic, x='age', y='sex', hue="grupo")