In [252]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import random
import os
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from itertools import cycle
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [253]:
# Load the training data; the path is relative to the kernel's working directory.
train = pd.read_csv('./train.csv')

In [254]:
# Preview the first rows of the frame exactly as loaded, before any columns are removed.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [255]:
# Per-column dtypes and non-null counts. In the recorded output below, Age, Cabin
# and Embarked have fewer non-null values than the 891 rows, i.e. they contain gaps.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [256]:
# The label plus the columns that are not used as features further down.
columns_to_drop = ['PassengerId', 'Survived', 'Ticket', 'Name', 'Cabin']

# Split only while the label is still present. The previous in-place drop removed
# 'Survived' from `train`, so running this cell a second time raised KeyError on
# `train['Survived']`. With the guard a re-run is a no-op and `target` keeps the
# value captured on the first run.
if 'Survived' in train.columns:
    target = train['Survived']
    # Rebind instead of mutating in place; `train` ends up with the same columns
    # as before (features only), so later cells are unaffected.
    train = train.drop(columns=columns_to_drop)

# Partition the remaining feature columns by dtype.
categorical_df = train.select_dtypes(include=['object'])
numeric_df = train.select_dtypes(exclude=['object'])
categorical_columns = list(categorical_df.columns)
numeric_columns = list(numeric_df.columns)

print("Categorical columns:\n", categorical_columns)
print("\nNumeric columns:\n", numeric_columns)

Categorical columns:
 ['Sex', 'Embarked']

Numeric columns:
 ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


In [257]:
# Keep every row, restricted to the numeric feature columns identified above.
numeric_df = train.loc[:, numeric_columns]
def definePCAPipeline(numeric_columns, n_components=2, whiten=True, random_state=7):
    """Build an unfitted impute -> scale -> PCA pipeline.

    Parameters
    ----------
    numeric_columns
        Columns of the input frame that are imputed, scaled and fed to PCA.
        Only these columns are listed in the ColumnTransformer, so with its
        default ``remainder`` setting no other column reaches the PCA step.
    n_components : int, default 2
        Number of principal components to keep.
    whiten : bool, default True
        Forwarded to ``PCA(whiten=...)``.
    random_state : int or None, default 7
        Forwarded to ``PCA(random_state=...)``. Depending on the scikit-learn
        version, the default solver choice may be the randomized SVD for data
        of this shape; fixing the seed keeps the projection reproducible in
        that case. The value matches the seed used for KMeans in this notebook.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The pipeline, not yet fitted; call ``fit_transform`` on a DataFrame
        that contains ``numeric_columns``.
    """
    # Missing values are replaced by the column mean before standardising.
    numeric_transformer = Pipeline(steps=[('imp', SimpleImputer(strategy='mean')),
                                          ('scaler', StandardScaler())])

    data_transformations = ColumnTransformer(
        transformers=[('num', numeric_transformer, numeric_columns)])

    pca_pipeline = Pipeline(steps=[('data_transformations', data_transformations),
                                   ('feature_transf', PCA(n_components=n_components,
                                                          whiten=whiten,
                                                          random_state=random_state))])

    return pca_pipeline

In [258]:
# Assemble the unfitted pipeline for the numeric features ...
pca_pipeline = definePCAPipeline(numeric_columns=numeric_columns)
# ... then fit it and transform the same frame in a single call.
pca_arr = pca_pipeline.fit_transform(X=numeric_df)


In [259]:
# Label the two projected coordinates and attach the outcome column
# (aligned on the row index) so the points can be coloured by survival.
pca_df = pd.DataFrame(data=pca_arr, columns=["PC1", "PC2"]).assign(Survived=target)
pca_df.head(n=10)

Unnamed: 0,PC1,PC2,Survived
0,-0.884668,-0.054166,0
1,1.35191,0.111857,1
2,-0.683747,-0.562316,1
3,1.113582,0.063975,1
4,-0.44161,-0.7351,0
5,-0.580039,-0.63053,0
6,1.699638,-0.74448,0
7,-1.510072,1.883549,0
8,-0.697097,0.640734,1
9,-0.274808,0.283845,1


In [260]:
# Split the projection by outcome so each group becomes its own legend entry.
survived_flag = pca_df['Survived']
pca_df_survivors = pca_df.loc[survived_flag == 1]
pca_df_nonsurvivors = pca_df.loc[survived_flag == 0]

scatter_obj_survs = go.Scatter(
    x=pca_df_survivors['PC1'],
    y=pca_df_survivors['PC2'],
    mode="markers",
    name='Survivors',
    marker={'color': 'forestgreen'},
)

scatter_obj_nonsurvs = go.Scatter(
    x=pca_df_nonsurvivors['PC1'],
    y=pca_df_nonsurvivors['PC2'],
    mode="markers",
    name='Non-survivors',
    marker={'color': 'darkred'},
)

data = [scatter_obj_survs, scatter_obj_nonsurvs]

layout = go.Layout(
    title='visualization of survivors and non-survivors',
    xaxis={'title': 'PC1'},
    yaxis={'title': 'PC2'},
)

fig = go.Figure(data=data, layout=layout)
fig.show()

In [261]:
# NOTE(review): `transformed_df` was not defined by any cell of this notebook, so
# this cell raised NameError on a fresh kernel. It is defined here as the two
# principal components, which is an assumption based on the clusters and centroids
# being plotted on the PC1/PC2 axes below. Confirm this matches the original input.
transformed_df = pca_df[['PC1', 'PC2']]

kmeans = KMeans(n_clusters=2, random_state=7)
kmeans_defined_clusters = kmeans.fit_predict(transformed_df)

# Store the labels as strings so the plots treat them as categories.
pca_df['KMeans_Defined_Clusters'] = kmeans_defined_clusters
pca_df['KMeans_Defined_Clusters'] = pca_df['KMeans_Defined_Clusters'].astype(str)
pca_df.head(10)

Unnamed: 0,PC1,PC2,Survived,KMeans_Defined_Clusters
0,-0.884668,-0.054166,0,1
1,1.35191,0.111857,1,0
2,-0.683747,-0.562316,1,1
3,1.113582,0.063975,1,0
4,-0.44161,-0.7351,0,1
5,-0.580039,-0.63053,0,1
6,1.699638,-0.74448,0,0
7,-1.510072,1.883549,0,1
8,-0.697097,0.640734,1,1
9,-0.274808,0.283845,1,1


In [262]:
colors = ['orange', 'steelblue']

# Cluster labels in first-seen row order; each label takes the next colour in the cycle.
kmeans_clusters = pca_df.KMeans_Defined_Clusters.unique()

# NOTE(review): `centroids_df` was not defined by any cell of this notebook, so
# this cell raised NameError on a fresh kernel. It is rebuilt here from the fitted
# estimator; row i of `cluster_centers_` is the centre of cluster label i.
# This assumes KMeans was fitted on exactly two columns (PC1, PC2). The DataFrame
# constructor raises if the centres have a different width. The legend names
# ('Centroid <label>') are new; adjust them if a different naming was intended.
centroids_df = pd.DataFrame(kmeans.cluster_centers_, columns=['X_coord', 'Y_coord'])
centroids_df['cluster_centroid'] = ['Centroid ' + str(label) for label in range(len(centroids_df))]
kmeans_centroids = centroids_df.cluster_centroid.unique()

data_kmeans = []

# One trace per cluster; the frame is filtered once per cluster.
for cluster, color in zip(kmeans_clusters, cycle(colors)):
    cluster_points = pca_df[pca_df['KMeans_Defined_Clusters'] == cluster]
    scatter_obj_cluster_kmeans = go.Scatter(x=cluster_points['PC1'],
                                            y=cluster_points['PC2'],
                                            mode='markers',
                                            name=cluster,
                                            marker_color=color)
    data_kmeans.append(scatter_obj_cluster_kmeans)

# One trace per centroid, drawn as a large black marker on top of the points.
for centroid in kmeans_centroids:
    centroid_rows = centroids_df[centroids_df['cluster_centroid'] == centroid]
    scatter_obj_centroid_kmeans = go.Scatter(x=centroid_rows['X_coord'],
                                             y=centroid_rows['Y_coord'],
                                             mode='markers',
                                             name=centroid,
                                             marker_size=12,
                                             marker_symbol='x-dot',
                                             marker_color='black')
    data_kmeans.append(scatter_obj_centroid_kmeans)

layout_kmeans = go.Layout(title='K-Means Algorithm',
                          xaxis=dict(title='PC1'), yaxis=dict(title='PC2'))

fig_kmeans = go.Figure(data=data_kmeans, layout=layout_kmeans)
fig_kmeans.show()

In [263]:
ac = AgglomerativeClustering(n_clusters=2)

# NOTE(review): the original call used `transformed_df`, which no cell of this
# notebook defines (NameError on a fresh kernel). The two principal components are
# selected explicitly instead, which is an assumption based on the clusters being
# plotted on the PC1/PC2 axes. Confirm this matches the original input.
ac_defined_clusters = ac.fit_predict(pca_df[['PC1', 'PC2']])

# Store the labels as strings so the plot treats them as categories.
pca_df['AgglomerativeClustering_Defined_Clusters'] = ac_defined_clusters
pca_df['AgglomerativeClustering_Defined_Clusters'] = pca_df['AgglomerativeClustering_Defined_Clusters'].astype(str)

pca_df.head(10)

Unnamed: 0,PC1,PC2,Survived,KMeans_Defined_Clusters,AgglomerativeClustering_Defined_Clusters
0,-0.884668,-0.054166,0,1,0
1,1.35191,0.111857,1,0,1
2,-0.683747,-0.562316,1,1,0
3,1.113582,0.063975,1,0,1
4,-0.44161,-0.7351,0,1,0
5,-0.580039,-0.63053,0,1,0
6,1.699638,-0.74448,0,0,1
7,-1.510072,1.883549,0,1,0
8,-0.697097,0.640734,1,1,0
9,-0.274808,0.283845,1,1,0


In [264]:
colors = ['orange', 'steelblue']
cyclecolors = cycle(colors)
color = next(cyclecolors)

# Labels in first-seen row order; every label gets its own trace and colour.
ac_label_column = pca_df['AgglomerativeClustering_Defined_Clusters']
ac_clusters = ac_label_column.unique()

data_ac = []
for cluster in ac_clusters:
    cluster_points = pca_df.loc[ac_label_column == cluster]
    scatter_obj_cluster_ac = go.Scatter(
        x=cluster_points['PC1'],
        y=cluster_points['PC2'],
        mode='markers',
        name=cluster,
        marker_color=color,
    )
    data_ac.append(scatter_obj_cluster_ac)
    # Advance to the colour for the next cluster.
    color = next(cyclecolors)

layout_ac = go.Layout(
    title='Agglomerative Clustering algorithm',
    xaxis={'title': 'PC1'},
    yaxis={'title': 'PC2'},
)

fig_ac = go.Figure(data=data_ac, layout=layout_ac)
fig_ac.show()