In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn import metrics

In [5]:
from sqlalchemy import create_engine
import warnings
warnings.filterwarnings("ignore")

In [8]:
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'heartdisease'

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
heartdisease_df = pd.read_sql_query('select * from heartdisease',con=engine)

# no need for an open connection, as we're only doing a single query
engine.dispose()

In [9]:
X = heartdisease_df.iloc[:, :13]
y = heartdisease_df.iloc[:, 13]

# Replace missing values (marked by `?`) with a `0`
X = X.replace(to_replace='?', value=0)

# Binarize y so that `1` means heart disease diagnosis and `0` means no diagnosis
y = np.where(y > 0, 0, 1)

In [10]:
X_std=StandardScaler().fit_transform(X)

Apply GMM to the heart disease dataset by setting n_components=2. Get ARI and silhouette scores for your solution and compare it with those of the k-means and hierarchical clustering solutions that you implemented in the previous checkpoint assignments. Which algorithm performs best?



In [11]:
gmm=GaussianMixture(n_components=2)
gmm.fit(X_std)

GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
                means_init=None, n_components=2, n_init=1, precisions_init=None,
                random_state=None, reg_covar=1e-06, tol=0.001, verbose=0,
                verbose_interval=10, warm_start=False, weights_init=None)

In [13]:
clusters = gmm.fit_predict(X_std)

In [16]:
print("Adjusted Rand Index of the GMM solution: {}"
      .format(metrics.adjusted_rand_score(y, clusters)))
print("The silhoutte score of the GMM solution: {}"
      .format(metrics.silhouette_score(X_std, clusters, metric='euclidean')))

Adjusted Rand Index of the GMM solution: 0.4207322145049338
The silhoutte score of the GMM solution: 0.16118591340148433


It does about equal to k means

GMM implementation of scikit-learn has a parameter called covariance_type. This parameter determines the type of covariance parameters to use. There are four types that you can specify:

* full: This is the default. Each component has its own general covariance matrix.
* tied: All components share the same general covariance matrix.
* diag: Each component has its own diagonal covariance matrix.
* spherical: Each component has its own single variance.

Try all of these. Which one performs best in terms of ARI and silhouette scores?



In [23]:
types=['full', 'tied', 'diag', 'spherical']
ns=[2,3,4]
for n in ns:
    for t in types:
        gmm=GaussianMixture(n_components=2, covariance_type=t)
        clusters = gmm.fit_predict(X_std)
        rand=metrics.adjusted_rand_score(y, clusters)
        print(f"Adjusted Rand Index of the {t} {n} clusters GMM solution: {rand}")
        sil=metrics.silhouette_score(X_std, clusters, metric='euclidean')
        print(f"The silhoutte score of the {t} {n} clusters GMM solution: {sil}")
    print('\n')

Adjusted Rand Index of the full 2 clusters GMM solution: 0.4207322145049338
The silhoutte score of the full 2 clusters GMM solution: 0.16118591340148433
Adjusted Rand Index of the tied 2 clusters GMM solution: 0.46482432589803474
The silhoutte score of the tied 2 clusters GMM solution: 0.16607012124631088
Adjusted Rand Index of the diag 2 clusters GMM solution: 0.18389186035089963
The silhoutte score of the diag 2 clusters GMM solution: 0.13628813153331445
Adjusted Rand Index of the spherical 2 clusters GMM solution: 0.20765243525722465
The silhoutte score of the spherical 2 clusters GMM solution: 0.12468753110276873


Adjusted Rand Index of the full 3 clusters GMM solution: 0.4207322145049338
The silhoutte score of the full 3 clusters GMM solution: 0.16118591340148433
Adjusted Rand Index of the tied 3 clusters GMM solution: 0.18389186035089963
The silhoutte score of the tied 3 clusters GMM solution: 0.13628813153331445
Adjusted Rand Index of the diag 3 clusters GMM solution: 0.3787884

full seems much better than the rest for ARI