In [8]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import pandas as pd
from functions import split_label
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score



In [6]:
# Load the Titanic dataset and split it into train (80%) and test (20%)
# partitions for classification, using 'Survived' as the class label.
# NOTE(review): the order of the returned tuple is taken from the unpacking
# below; confirm it against functions.split_label.
titanic = pd.read_csv('data/titanic_ml.csv')
(train_X, train_y,
 test_X, test_y) = split_label(titanic, 0.2, 'Survived')

In [9]:
# One-hot encoding stage: 'Embarked' is expanded into indicator columns and
# every other column is passed through unchanged.
# sparse_threshold=0 forces a dense result: OneHotEncoder emits a sparse matrix
# by default, and the MinMaxScaler applied to this output afterwards rejects
# sparse input, so the stacked result must never be sparse regardless of how
# dense the data happens to be.
ohe = ColumnTransformer(
    transformers=[("embarked_ohe", OneHotEncoder(categories='auto'), ['Embarked'])],
    remainder='passthrough',
    sparse_threshold=0,
)

In [11]:
# Run the one-hot encoding stage on its own: fit it on the full dataset and
# keep the encoded matrix.
titanic_1 = ohe.fit_transform(titanic)

In [12]:
# Stand-alone scaler; (0, 1) is MinMaxScaler's default range, spelled out here.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))

In [13]:
# Learn the per-column min/max from the encoded data, then rescale that same
# data with the fitted scaler.
titanic_2 = min_max_scaler.fit(titanic_1).transform(titanic_1)

In [14]:
# Scaling stage: rescales every feature to the [0, 1] range.
sca = MinMaxScaler()

# Clustering stage. KMeans starts from randomly chosen centroids, so the seed
# is pinned to make the centres, labels and scores reproducible across runs.
# n_init is set explicitly because its default differs between scikit-learn
# releases (10 in older versions, 'auto' in newer ones).
clu = KMeans(n_clusters=3, n_init=10, random_state=42)

# Pipeline: one-hot encoding -> scaling -> clustering.
pipe = Pipeline([('ohe', ohe), ('sca', sca), ('clu', clu)])

In [15]:


# Fit the whole pipeline (encoding, scaling and clustering) on the dataset.
pipe.fit(titanic)
kmeans = pipe.named_steps['clu']
print("Centros de los clústeres:\n", kmeans.cluster_centers_)

# Rebuild the matrix the clusterer was trained on by pushing the data through
# the pipeline's own fitted pre-processing steps (every step but the last).
# Scoring against a matrix produced by separately fitted transformers in other
# cells would silently go stale if the pipeline or the cell order changed.
features = titanic
for step_name, step in pipe.steps[:-1]:
    features = step.transform(features)

# Cluster quality metrics, computed in the same feature space as the labels.
print('silhouette_score:', silhouette_score(features, kmeans.labels_))
print('calinski_harabasz:', calinski_harabasz_score(features, kmeans.labels_))



Centros de los clústeres:
 [[-1.94289029e-16  3.79146919e-02  9.62085308e-01  9.90521327e-01
   4.95260664e-01  3.27014218e-01  3.43576883e-01  9.95260664e-02
   9.47867299e-02  7.78150129e-02]
 [ 1.00000000e+00  6.93889390e-18  6.66133815e-16  6.07692308e-01
   3.73076923e-01  5.30769231e-01  3.81939799e-01  8.46153846e-02
   6.92307692e-02  1.33306411e-01]
 [-2.49800181e-16  5.39083558e-02  9.46091644e-01  1.05471187e-15
   7.77628032e-01  8.49056604e-01  3.75477998e-01  1.11051213e-01
   6.01976640e-02  3.85185852e-02]]
silhouette_score: 0.403603358828354
calinski_harabasz: 369.21898121397425
