In [2]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
from sklearn.mixture import GaussianMixture

In [3]:
data = pd.read_csv('FinalDataframe_3.csv')

In [34]:
## have to also drop na
data = data.dropna()

In [35]:
#for X, or what we're clustering, we are removing outcomes y, such as case and death totals
#Also removing strings; can't k-means cluster on these
#Also removing columns Unnamed, they seem to be artifacts from data cleaning/combining multiple datasets?
X = data.drop(['Unnamed: 0', 'Unnamed: 0.1','NAME',  'cases_total', 'fips_final',
       'deaths_total', 'cases_per_capita', 'deaths_per_capita',
       'case_fatality_rate', 'Unnamed: 0.1.1', 'county', 'STATEFP', 'COUNTYFP',
       'CountyName', 'StateName', 'State',], axis=1)
y = data[["cases_total", "deaths_total", "cases_per_capita", "deaths_per_capita", "case_fatality_rate"]]

## K-Means

In [36]:
#set number of clusters
kmeans = KMeans(n_clusters=4, random_state=0).fit(X)

#Run the clustering algorithm
model = kmeans.fit(X)
model

#Generate cluster predictions and store in y_hat
y_hat = kmeans.predict(X)

In [37]:
#function that creates a dataframe with a column for cluster number
cluster_centers = kmeans.cluster_centers_

def kmean_col(X, cluster_centers):
        cols = list(X)
        cols.append('cluster')
        
# Zip with a column called 'cluster' (index)
        Z = [np.append(A, index) for index, A in enumerate(cluster_centers)]
    
# Convert to pandas data frame for plotting
        P = pd.DataFrame(Z, columns=cols)
        P['cluster'] = P['cluster'].astype(int)
        return P
    
P = kmean_col(X, cluster_centers)
P

Unnamed: 0,num_hospitals,num_licensed_beds,num_staffed_beds,num_icu_beds,adult_icu_beds,pedi_icu_beds,potential_increase_in_bed_capac,bed_utilization,avg_ventilator_usage,population,...,mob_parks,mob_transit,mob_workplace,mob_residential,0-17,18-44,45-64,65-74,75+,cluster
0,2.017104,269.880274,225.704675,24.503991,24.503991,6.161916,44.175599,0.442497,3.278873,96121.94,...,6.20386,-14.905899,-21.825665,8.931269,21424.81,32067.85,24560.351197,6907.238312,5678.288483,0
1,20.090909,5147.454545,4559.954545,509.636364,509.636364,234.590909,587.5,0.618309,7.065232,1752731.0,...,1.811044,-31.341742,-27.773535,12.114003,386508.3,608684.8,412511.0,101990.136364,91081.909091,1
2,50.5,12139.5,10313.25,1169.5,1169.5,484.25,1826.25,0.563698,5.786362,4381166.0,...,-6.294881,-28.645199,-27.770579,12.062153,1031159.0,1580324.0,975604.5,240716.25,200817.25,2
3,8.024194,1848.427419,1575.0,176.137097,176.137097,72.427419,273.427419,0.612851,5.918894,645810.6,...,12.85645,-22.821002,-25.929805,10.956429,144948.7,220163.0,158290.217742,39776.685484,35611.879032,3


## EM

In [38]:
## run the EM algorithm (note i have to increase the reg-covar here)
em_k = GaussianMixture(n_components=4, init_params='kmeans', random_state=10, reg_covar = 1e-4).fit(X)

In [39]:
#Generate cluster predictions and store in y_hat
y_hat = em_k.predict(X)

In [44]:
#combining cluster data with outcome data to look at the whole dataset...
X['label_2'] = y_hat # result for EM

X['label'] = kmeans.labels_ # result for K-means
whole = pd.concat([X, y], axis=1)

In [45]:
whole

Unnamed: 0,num_hospitals,num_licensed_beds,num_staffed_beds,num_icu_beds,adult_icu_beds,pedi_icu_beds,potential_increase_in_bed_capac,bed_utilization,avg_ventilator_usage,population,...,45-64,65-74,75+,label_2,label,cases_total,deaths_total,cases_per_capita,deaths_per_capita,case_fatality_rate
1,4,386,362,51,51,0,24,0.562203,2.0,223234,...,51456,17803,12765,2,0,188,4,0.000842,0.000018,0.021276596
3,1,35,25,4,4,0,10,0.430904,1.0,22394,...,6121,1723,1183,2,0,42,0,0.001876,0.000000,0
4,1,25,25,6,6,0,0,0.499069,2.0,57826,...,15361,5079,3360,2,0,40,0,0.000692,0.000000,0
6,1,72,44,7,7,0,28,0.187111,2.0,19448,...,5805,1829,1660,2,0,114,1,0.005862,0.000051,0.00877193
7,3,486,435,30,30,0,51,0.431563,5.0,113605,...,32095,9437,7553,2,0,105,3,0.000924,0.000026,0.028571429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2289,1,146,133,12,12,0,13,0.128829,2.0,39261,...,11249,3253,2552,2,0,128,4,0.003260,0.000102,0.03125
2292,1,25,22,4,4,0,3,0.275342,1.0,8445,...,2679,884,704,2,0,11,1,0.001303,0.000118,0.090909091
2293,1,206,170,15,15,0,36,0.513360,4.0,99500,...,25170,6441,5064,2,0,104,1,0.001045,0.000010,0.009615385
2294,2,38,35,8,8,0,3,0.212492,2.0,19830,...,5115,1372,869,2,0,6,0,0.000303,0.000000,0


In [60]:
#creates a dataframe with a column for cluster number
b = pd.DataFrame()
for col in X.drop(['label_2'],axis=1).columns: 
    a = whole.groupby('label_2', as_index=False).agg({col: "mean"})
    b = pd.concat([b, a], axis=1)

In [61]:
b

Unnamed: 0,label_2,num_hospitals,label_2.1,num_licensed_beds,label_2.2,num_staffed_beds,label_2.3,num_icu_beds,label_2.4,adult_icu_beds,...,label_2.5,18-44,label_2.6,45-64,label_2.7,65-74,label_2.8,75+,label_2.9,label
0,0,5.130952,0,1086.113095,0,927.324405,0,102.008929,0,102.008929,...,0,126212.4,0,90443.431548,0,23446.809524,0,20385.761905,0,1.107143
1,1,20.090909,1,5147.454545,1,4559.954545,1,509.636364,1,509.636364,...,1,608684.8,1,412511.0,1,101990.136364,1,91081.909091,1,1.0
2,2,1.56391,2,151.813534,2,122.8,2,13.618045,2,13.618045,...,2,19573.44,2,16208.153383,2,4679.442105,2,3828.745865,2,0.0
3,3,50.5,3,12139.5,3,10313.25,3,1169.5,3,1169.5,...,3,1580324.0,3,975604.5,3,240716.25,3,200817.25,3,2.0


In [62]:
## counts_1 is the label result for K-means, counts_2 is the label result for EM
counts_1 = whole['label'].value_counts()
counts_2 = whole['label_2'].value_counts()

In [63]:
counts_1

0    877
3    124
1     22
2      4
Name: label, dtype: int64

In [64]:
counts_2

2    665
0    336
1     22
3      4
Name: label_2, dtype: int64

In [65]:
whole.to_csv('FinalDataframe_with_cluster.csv')