### Imports

In [115]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder # for preprocessing the data 
from sklearn.compose import ColumnTransformer # for preprocessing the data with different types of features
from sklearn.pipeline import Pipeline # for building the pipeline to preprocess the data 
from sklearn.impute import SimpleImputer # for missing value imputation 
from sklearn.neighbors import NearestNeighbors # for outlier detection  
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score # for clustering evaluation     

### Loading The Dataset

In [116]:
# Read the raw CSV and, in the same expression, turn the '.' placeholder cells into NaN
# so that downstream imputers see them as missing values.
ds3 = pd.read_csv('../dataset3.csv').replace('.', np.nan)
ds3.head(40)

Unnamed: 0,COUNTY,HOSPITAL,LINKS TO COMMENT LETTERS,ESOPHAGEAL RESECTION (Risk-Adjusted Mortality Rate),ESOPHAGEAL RESECTION (# of Deaths),ESOPHAGEAL RESECTION (# of Cases),ESOPHAGEAL RESECTION (Outlier Ratings),PANCREATIC RESECTION (Risk-Adjusted Mortality Rate),PANCREATIC RESECTION (# of Deaths),PANCREATIC RESECTION (# of Cases),...,PNEUMONIA (# of Cases),PNEUMONIA (Outlier Ratings),PCI (Risk-Adjusted Mortality Rate),PCI (# of Deaths),PCI (# of Cases),PCI (Outlier Ratings),CAROTID ENDARTERECTOMY (Risk-Adjusted Mortality Rate),CAROTID ENDARTERECTOMY (# of Deaths),CAROTID ENDARTERECTOMY (# of Cases),CAROTID ENDARTERECTOMY (Outlier Ratings)
0,Alameda,Alameda County Medical Center - Highland Campus,,,,,,,,,...,212,,,,,,0.0,0.0,3.0,
1,Alameda,Alameda Hospital,,,,,,,,,...,150,,,,,,0.0,0.0,3.0,
2,Alameda,Alta Bates Summit Medical Center - Alta Bates ...,,,,,,,,,...,245,,2.6,5.0,95.0,,6.9,1.0,13.0,
3,Alameda,Alta Bates Summit Medical Center - Summit Camp...,,0.0,0.0,3.0,,0.0,0.0,3.0,...,371,,2.9,19.0,792.0,,7.2,1.0,21.0,
4,Alameda,Eden Medical Center,,,,,,,,,...,195,,,,,,0.0,0.0,7.0,
5,Alameda,Kaiser Foundation Hospital - Hayward,,,,,,12.2,1.0,10.0,...,378,,0.0,0.0,6.0,,0.0,0.0,38.0,
6,Alameda,Kaiser Foundation Hospital - Oakland Campus,,,,,,0.0,0.0,10.0,...,377,,,,,,0.0,0.0,32.0,
7,Alameda,Saint Rose Hospital,,,,,,,,,...,132,,3.4,7.0,113.0,,0.0,0.0,3.0,
8,Alameda,San Leandro Hospital,,,,,,,,,...,110,,,,,,0.0,0.0,23.0,
9,Alameda,Valleycare Medical Center,,,,,,,,,...,234,,2.8,4.0,125.0,,0.0,0.0,31.0,


### Separating the Dataset into Numerical and Categorical DataFrames:

In [117]:
# Procedures / conditions reported in the dataset, in the order their columns are listed.
_procedures = [
    'ESOPHAGEAL RESECTION',
    'PANCREATIC RESECTION',
    'AAA REPAIR',
    'CRANIOTOMY',
    'ACUTE MYOCARDIAL INFARCTION',
    'HEART FAILURE',
    'ACUTE STROKE',
    'GI HEMORRHAGE',
    'HIP FRACTURE',
    'PNEUMONIA',
    'PCI',
    'CAROTID ENDARTERECTOMY',
]

# Per-procedure numeric measurements; column names follow the "<PROCEDURE> (<metric>)" pattern.
_numeric_metrics = ['Risk-Adjusted Mortality Rate', '# of Deaths', '# of Cases']

# Three numeric columns per procedure, grouped by procedure.
numerical_cols = [
    f'{procedure} ({metric})'
    for procedure in _procedures
    for metric in _numeric_metrics
]

# One "(Outlier Ratings)" column per procedure, followed by the two identifier columns.
# NOTE(review): the first procedure (ESOPHAGEAL RESECTION) is deliberately skipped here to
# reproduce the original list, which did not contain its outlier-rating column -- confirm
# that this omission is intentional.
categorical_cols = [
    f'{procedure} (Outlier Ratings)' for procedure in _procedures[1:]
] + ['COUNTY', 'HOSPITAL']

# Split the frame column-wise: one view with the categorical columns, one with the numeric ones.
categorical_data = ds3.loc[:, categorical_cols]
numerical_data = ds3.loc[:, numerical_cols]

### Data Preprocessing

In [118]:
# Numeric branch: fill missing values with the column mean, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent value, then one-hot encode.
# handle_unknown='ignore' makes the encoder tolerate categories it did not see during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own branch.
column_branches = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

# Fit the preprocessing on the full dataset and transform it in one step.
ds3_preprocessed = preprocessor.fit_transform(ds3)

# NOTE(review): the result is probably a SciPy sparse matrix rather than a dense numpy array,
# because OneHotEncoder emits sparse output and ColumnTransformer keeps the stacked result
# sparse when its overall density is below `sparse_threshold` -- confirm with
# type(ds3_preprocessed). Either form exposes `.shape`, and the nearest-neighbours step can
# consume it without converting back to a DataFrame.
ds3_preprocessed.shape  # (rows, features after encoding)

(331, 441)

### k Nearest Neighbors clustering (KNN)

#### Unsupervised Clustering Based on Nearest Neighbors, Restricted to the "Outlier Ratings" Columns

In [134]:
# Keep only the columns whose name ends with "(Outlier Ratings)".
outlier_rating_cols = [name for name in ds3 if name.endswith("(Outlier Ratings)")]
outlier_data = ds3[outlier_rating_cols]

# One-hot encode the rating categories so distances can be computed on them.
encoder = OneHotEncoder()
outlier_data_encoded = encoder.fit_transform(outlier_data)

# Index the encoded rows for nearest-neighbour queries (12 neighbours per point; tune as needed).
neighbors = NearestNeighbors(n_neighbors=12).fit(outlier_data_encoded)

# Query the index with the same rows it was fitted on: row i of `distances` / `indices` holds the
# distances to, and positions of, the 12 nearest rows for point i. Because the query set is the
# training set, a point's own index can appear among its neighbours.
distances, indices = neighbors.kneighbors(outlier_data_encoded)

#### Custom Nearest Neighbors Clustering Based on Common Neighbors

In [145]:
# Function to find common neighbors
def find_common_neighbors(neighbors_indices, current_index, threshold):
    """Return the rows whose neighbour lists overlap enough with a given row's list.

    Parameters
    ----------
    neighbors_indices : sequence of sequences
        Row ``r`` lists the neighbour indices of point ``r``.
    current_index : int
        Row whose neighbour list is used as the reference.
    threshold : int
        Minimum number of shared neighbour indices required for a match.

    Returns
    -------
    set of int
        Indices of all rows other than ``current_index`` that share at least
        ``threshold`` neighbour indices with the reference row.
    """
    reference = set(neighbors_indices[current_index])
    return {
        row
        for row, candidate in enumerate(neighbors_indices)
        if row != current_index and len(reference.intersection(candidate)) >= threshold
    }

# Greedy clustering on shared neighbours.
#
# Every point starts unassigned (-1). Points are visited in order; each still-unassigned point
# seeds a new cluster and pulls in the still-unassigned points whose neighbour lists share at
# least `threshold` entries with its own. A point with no such partners forms its own cluster.
#
# The label vector and the loop bound are both derived from the encoded outlier-rating matrix,
# i.e. the same data `indices` was computed from, instead of borrowing the row count of an
# unrelated matrix.
n_samples = outlier_data_encoded.shape[0]
cluster_labels = np.full(n_samples, -1, dtype=int)
current_cluster = 0
threshold = 12  # Number of common neighbours required to put two points in the same cluster

for i in range(n_samples):
    if cluster_labels[i] != -1:
        continue  # already claimed by an earlier seed

    # Only claim points that are still unassigned, so a later seed never overwrites a label
    # given by an earlier one. With `threshold` equal to the number of neighbours per point
    # (12 here), matching points have identical neighbour sets, so this filter does not change
    # the resulting labels; it only matters if the threshold is lowered.
    members = [
        j for j in find_common_neighbors(indices, i, threshold)
        if cluster_labels[j] == -1
    ]
    if members:
        cluster_labels[members] = current_cluster

    # The seed always joins the cluster it opened; `cluster_labels` ends up holding one
    # cluster ID per point.
    cluster_labels[i] = current_cluster
    current_cluster += 1

### Evaluation of Clustering Performance
Metrics used: the Silhouette, Calinski-Harabasz, and Davies-Bouldin indices.

In [146]:
# The metrics below are evaluated on a dense copy of the encoded outlier-rating matrix.
outlier_data_dense = outlier_data_encoded.toarray()

# Label printed for each metric, paired with the function that computes it:
#   * Silhouette Score        - how similar each point is to its own cluster versus the others
#   * Calinski-Harabasz Index - ratio of between-cluster to within-cluster dispersion
#   * Davies-Bouldin Index    - average similarity of each cluster to its most similar one
_metric_table = (
    ("Silhouette Score: ", silhouette_score),
    ("Calinski-Harabasz Index: ", calinski_harabasz_score),
    ("Davies-Bouldin Index: ", davies_bouldin_score),
)

# Compute and report the metrics one at a time, in the order listed above.
_scores = []
for _label, _metric in _metric_table:
    _value = _metric(outlier_data_dense, cluster_labels)
    print(_label, _value)
    _scores.append(_value)

silhouette_avg, calinski_harabasz, davies_bouldin = _scores

# NOTE(review): a Calinski-Harabasz value of exactly 1.0 together with a Davies-Bouldin value of
# 0.0 suggests zero within-cluster dispersion (clusters made up of identical encoded rows) --
# confirm before reading these scores as evidence of a good clustering.


Silhouette Score:  0.9154078549848943
Calinski-Harabasz Index:  1.0
Davies-Bouldin Index:  0.0
