In [37]:
Team Members: Prachitee Chouhan, Jay Singhvi, Minh Vu

# Clustering
# 1. Mean Shift Clustering

### A. Clustering with preprocessed data after dealing with outliers

In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import estimate_bandwidth,MeanShift

In [2]:
# Load the preprocessed dataset used for section A; the first CSV column becomes the index.
data = pd.read_csv("../Data/data_preprocessed.csv", index_col=0)

In [3]:
# Feature columns that are standardised and passed to the clustering models.
data_cols = [
    "ph",
    "Hardness",
    "Solids",
    "Chloramines",
    "Conductivity",
    "Trihalomethanes",
    "Organic_carbon",
    "Sulfate",
    "Turbidity",
]

# Fit the scaler on the selected columns, then standardise those same columns.
scaler = StandardScaler().fit(data[data_cols])
data_scaled = scaler.transform(data[data_cols])

# Per-column means learned by the scaler, in the order of `data_cols`.
print(scaler.mean_)

[7.07042021e+00 1.96560808e+02 2.15589512e+04 7.12207235e+00
 4.25249290e+02 6.64654481e+01 1.42836805e+01 3.33845625e+02
 3.96758734e+00]


In [6]:
# Sweep the quantile given to estimate_bandwidth over 0.1, 0.2, ..., 0.9 and report
# how many clusters Mean Shift finds on `data_scaled` at each resulting bandwidth.
for b_w in np.arange(0.1, 1, 0.1):
    # Bandwidth is estimated with n_samples=500 and a fixed random_state so the
    # sweep gives the same numbers on every run.
    bandwidth = estimate_bandwidth(
        data_scaled, quantile=b_w, n_samples=500, random_state=0
    )

    # fit() returns the estimator itself, so construction and fitting can be chained.
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(data_scaled)

    cluster_centers = ms.cluster_centers_
    labels = ms.labels_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)  # number of distinct cluster labels assigned

    print(
        "\nQuantile: %f" % b_w,
        "Bandwidth: %f" % bandwidth,
        "Number of clusters: %d" % n_clusters_,
        sep="\n",
    )


Quantile: 0.100000
Bandwidth: 3.010944
Number of clusters: 1

Quantile: 0.200000
Bandwidth: 3.379553
Number of clusters: 1

Quantile: 0.300000
Bandwidth: 3.652485
Number of clusters: 1

Quantile: 0.400000
Bandwidth: 3.888542
Number of clusters: 1

Quantile: 0.500000
Bandwidth: 4.112761
Number of clusters: 1

Quantile: 0.600000
Bandwidth: 4.340260
Number of clusters: 1

Quantile: 0.700000
Bandwidth: 4.583425
Number of clusters: 1

Quantile: 0.800000
Bandwidth: 4.869908
Number of clusters: 1

Quantile: 0.900000
Bandwidth: 5.265695
Number of clusters: 1


### B. Clustering with preprocessed data with outliers

In [7]:
# Load the dataset used for section B (outliers retained; the file name suggests
# MICE imputation -- confirm against the preprocessing notebook). The first CSV
# column becomes the index.
data_with_outliers = pd.read_csv("../Data/data_preprocessed_MICE.csv", index_col=0)

# NOTE: `scaler` is rebound here. From this cell onwards it holds the statistics
# of `data_with_outliers`, not of the dataset scaled earlier in the notebook.
scaler = StandardScaler()
data_with_outliers_scaled = scaler.fit_transform(data_with_outliers[data_cols])


In [8]:
# Sweep the quantile given to estimate_bandwidth over 0.1, 0.2, ..., 0.9 and report
# how many clusters Mean Shift finds on `data_with_outliers_scaled` at each bandwidth.
for b_w in np.arange(0.1, 1, 0.1):
    # Bandwidth is estimated with n_samples=500 and a fixed random_state so the
    # sweep gives the same numbers on every run.
    bandwidth = estimate_bandwidth(
        data_with_outliers_scaled, quantile=b_w, n_samples=500, random_state=0
    )

    # fit() returns the estimator itself, so construction and fitting can be chained.
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(data_with_outliers_scaled)

    cluster_centers = ms.cluster_centers_
    labels = ms.labels_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)  # number of distinct cluster labels assigned

    print(
        "\nQuantile: %f" % b_w,
        "Bandwidth: %f" % bandwidth,
        "Number of clusters: %d" % n_clusters_,
        sep="\n",
    )


Quantile: 0.100000
Bandwidth: 2.935282
Number of clusters: 10

Quantile: 0.200000
Bandwidth: 3.291729
Number of clusters: 3

Quantile: 0.300000
Bandwidth: 3.555617
Number of clusters: 2

Quantile: 0.400000
Bandwidth: 3.789651
Number of clusters: 1

Quantile: 0.500000
Bandwidth: 4.014613
Number of clusters: 1

Quantile: 0.600000
Bandwidth: 4.248310
Number of clusters: 1

Quantile: 0.700000
Bandwidth: 4.506218
Number of clusters: 1

Quantile: 0.800000
Bandwidth: 4.818700
Number of clusters: 1

Quantile: 0.900000
Bandwidth: 5.259018
Number of clusters: 1


In [20]:
# Refit Mean Shift on the with-outliers data at quantile 0.3, using the same
# n_samples / random_state settings as the quantile sweep.
bandwidth = estimate_bandwidth(
    data_with_outliers_scaled, quantile=0.3, n_samples=500, random_state=0
)

# fit() returns the estimator itself, so construction and fitting can be chained.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(data_with_outliers_scaled)

cluster_centers = ms.cluster_centers_
labels = ms.labels_
labels_unique = np.unique(labels)
n_clusters = len(labels_unique)  # number of distinct cluster labels assigned
print("Number of clusters: %d" % n_clusters)

Number of clusters: 2


Takeaway:
1. Two mean shift clustering models were fitted: one on the dataset after dealing with outliers and one on the
dataset before dealing with outliers. The model fitted on the dataset after dealing with outliers was not able to
correctly estimate the number of clusters: it returned a single cluster for every quantile tried. In contrast, the
model fitted on the latter dataset (outliers retained) varied the number of clusters across the range of bandwidths
(and hence quantiles): as the bandwidth (and quantile) increases, the number of clusters decreases. So, the
optimal quantile is 0.3, with a bandwidth of approximately 3.56.