In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score
# Raise pandas' display limits so frames of up to 500 rows / 500 columns are
# rendered in full instead of being truncated with "...".
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

In [3]:
# Load the 2018 measure table and discard the unnamed column.
data18 = pd.read_csv("data18_c_d.csv").drop(columns = "Unnamed: 0")

# Columns whose name does NOT contain "star", minus the identifier/descriptive columns.
data18_no_stars = (
    data18.loc[:, ~data18.columns.str.contains("star")]
    .drop(columns = ["contract_name", "id", "name", "organization_type", "parent_org", "spring"])
)

# Columns whose name contains "star", minus the two quality-improvement star columns.
data18_stars = (
    data18.loc[:, data18.columns.str.contains("star")]
    .drop(columns = ["c_health_plan_quality_improvement_star", "d_drug_plan_quality_improvement_star"])
)

# Cutpoint table read from the cleaned 2018 report card export (cp1252-encoded).
data18_cutpoints = pd.read_csv(
    "2018_cleaned/2018_Report_Card_Master_Table_2018_03_28_C_cutpoints.csv",
    encoding = "cp1252",
)

In [115]:
def cluster(column, n_clusters = 5, random_state = None): 
    """Fit a one-dimensional KMeans model to the values of a single column.

    Parameters
    ----------
    column : array-like
        Values to cluster; wrapped in a ``pd.Series``.
    n_clusters : int, default 5
        Number of clusters to fit.
    random_state : int or None, default None
        Forwarded to ``KMeans``. Leave as ``None`` when the caller relies on
        repeated calls producing different initialisations.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted estimator.
    """
    series = pd.Series(column)
    # -1 is treated as a sentinel and must be removed, otherwise the -1s would
    # form a cluster of their own. A boolean mask is used rather than
    # drop-by-label so that, with a non-unique index, valid rows that share a
    # label with a -1 row are not removed as well.
    valid = series[series != -1]
    km = KMeans(n_clusters = n_clusters, random_state = random_state)
    # KMeans expects a 2-D array of shape (n_samples, n_features).
    km.fit(valid.to_numpy().reshape(-1, 1))
    return km

# KMeans starts from different initial centroids on each run, so the final centroids can differ between runs.
# Repeat KMeans n_iters times (1000 by default) and take the mean of the centroids for each cluster.
def cluster_means(series, n_iters = 1000): 
    """Average the KMeans centroids of ``series`` over repeated runs.

    ``cluster(series)`` is called ``n_iters`` times. Within each run the
    centroids are sorted ascending, and the k-th lowest centroid is then
    averaged across runs.

    Parameters
    ----------
    series : array-like
        Values passed unchanged to ``cluster``.
    n_iters : int, default 1000
        Number of independent KMeans runs to average over.

    Returns
    -------
    dict
        Maps ``"1"``, ``"2"``, ... (lowest to highest centroid) to the mean
        centroid value as a Python float.

    Raises
    ------
    ValueError
        If ``n_iters`` is smaller than 1.
    """
    if n_iters < 1:
        raise ValueError("n_iters must be at least 1")
    # One row per run. Sorting inside each run (instead of pooling every
    # centroid and sorting globally) keeps column k equal to "the k-th lowest
    # centroid of that run", even when centroid ranges overlap between runs.
    runs = np.array([np.sort(cluster(series).cluster_centers_.flatten()) for _ in range(n_iters)])
    rank_means = runs.mean(axis = 0)
    return {str(rank): float(value) for rank, value in enumerate(rank_means, start = 1)}

# Points are classified by whichever centroid they are closest to,
# so each cutpoint is the midpoint between two adjacent centroids.
def return_cutpoints(series): 
    """Return the four cutpoints separating the five averaged centroids.

    The averaged centroids come from ``cluster_means(series)``; cutpoint
    ``"k"`` is the midpoint between centroid ``"k"`` and centroid ``"k+1"``.

    Returns
    -------
    pd.Series
        Indexed ``"1"`` .. ``"4"``.
    """
    centroid_means = cluster_means(series)
    midpoints = {}
    for lower in range(1, 5):
        low_key, high_key = str(lower), str(lower + 1)
        midpoints[low_key] = (centroid_means[low_key] + centroid_means[high_key])/2
    return pd.Series(midpoints)

def convert_to_stars(series, cuts = None): 
    """Convert raw measure values to star levels using cutpoints.

    Parameters
    ----------
    series : pd.Series
        Raw values. The sentinel ``-1`` is passed through as ``-1.0``.
    cuts : sequence or pd.Series of float, optional
        Cutpoints to bin against. When omitted they are derived from
        ``return_cutpoints(series)``, which re-runs the clustering. Pass
        precomputed cutpoints to reuse an existing result instead.

    Returns
    -------
    pd.Series
        Float stars with the same index and name as ``series``. A value
        below the lowest cutpoint gets 1.0, and each cutpoint that the value
        is greater than or equal to adds one star (so four cutpoints give
        1.0 .. 5.0). ``-1`` stays ``-1.0`` and NaN stays NaN.
    """
    if cuts is None:
        cuts = return_cutpoints(series)
    boundaries = np.sort(np.asarray(cuts, dtype = float))
    values = series.to_numpy(dtype = float)
    # side="right" counts the cutpoints that are <= value, so a value exactly
    # on a cutpoint is assigned to the higher star level.
    stars = np.searchsorted(boundaries, values, side = "right") + 1.0
    # Missing values must not fall through to the top star level.
    stars = np.where(np.isnan(values), np.nan, stars)
    stars = np.where(values == -1, -1.0, stars)
    return pd.Series(stars, index = series.index, name = series.name)


In [103]:
# Star prediction for every measure column; convert_to_stars is applied once per column.
predicted = data18_no_stars.apply(convert_to_stars, axis = "index")

In [116]:
# One column of cutpoints per measure; return_cutpoints is applied once per column.
cutpoints = data18_no_stars.apply(return_cutpoints, axis = "index")

In [117]:
# Display the cutpoint table: rows "1".."4" are the boundaries between adjacent star levels.
cutpoints

Unnamed: 0,c_adult_bmi_assessment,c_annual_flu_vaccine,c_beneficiary_access_and_performance_problems,c_breast_cancer_screening,c_call_center_foreign_language_interpreter_and_tty_availability,c_colorectal_cancer_screening,c_complaints_about_health_plan,c_controlling_blood_pressure,c_coordination,c_customer_service,c_diabetes_blood_sugar_controlled,c_diabetes_eye_exam,c_diabetes_kidney_disease_monitoring,c_getting_appointments_and_quickly,c_getting_needed_care,c_improving_bladder_control,c_improving_or_maintaining_mental_health,c_improving_or_maintaining_physical_health,c_medication_reconciliation_post_discharge,c_members_choosing_to_leave_plan,c_monitoring_physical_activity,c_older_adults_functional_status_assessment,c_older_adults_medication_review,c_older_adults_pain_assessment,c_osteoporosis_management_in_women_who_had_a_fracture,c_plan_all_cause_readmissions,c_plan_makes_timely_decisions_about_appeals,c_rating_of_health_plan,c_rating_of_health_quality,c_reducing_risk_of_falling,c_reviewing_appeals_decisions,c_rheumatoid_arthritis_management,c_special_needs_plan_management,d_appeals_autoforward,d_appeals_upheld,d_beneficiary_access_and_performance_problems,d_call_center_foreign_language_interpreter_and_tty_availability,d_complaints_about_drug_plan,d_getting_needed_prescription_drugs,d_medication_adherence_cholesterol,d_medication_adherence_diabetes_medications,d_medication_adherence_hypertension,d_members_choosing_to_leave_plan,d_mpf_price_accuracy,d_mtm_program_completion_rate_cmr,d_rating_of_drug_plan
1,66.533279,52.666781,20.623105,50.157603,45.285375,50.619014,0.13443,46.776124,82.712111,87.930483,30.468964,55.306654,92.599567,71.954943,75.179967,37.288773,75.446739,58.25463,22.180034,7.040999,43.72901,41.880682,44.124639,35.60925,23.546946,6.039088,69.810955,82.259931,82.901121,50.551493,58.923077,67.98154,32.878571,5.207095,60.243693,20.588235,48.957378,0.123046,87.398925,65.812118,72.480632,74.153123,6.04643,92.856686,32.073301,80.084319
2,84.542333,65.556883,49.999355,66.893845,72.411565,63.940049,0.322101,61.273127,85.079473,89.398271,55.092239,65.19066,94.618234,75.635105,79.904882,41.861731,81.356568,63.854528,40.191054,12.844023,48.520609,69.27167,76.935705,62.676278,37.115352,8.925791,81.979429,84.827038,84.877198,56.781508,76.211823,75.58912,53.578371,18.026568,74.091233,50.0,71.559585,0.298569,89.252953,74.214573,77.351432,78.981159,10.843918,97.356617,48.153042,82.723058
3,92.96075,72.191662,70.0,74.089987,86.760145,72.448839,0.59804,72.292632,86.910942,90.701506,69.483156,72.386809,96.200816,78.542712,82.823073,45.825982,84.436607,67.323596,55.636347,19.547286,53.017739,83.738959,87.834977,80.274494,51.923577,11.146325,90.368911,87.079762,86.552815,63.899838,85.528068,80.88263,70.049719,39.754207,83.330868,70.0,85.580787,0.555596,90.663988,79.093186,81.404709,82.836261,16.695306,98.499932,59.946334,85.047549
4,97.214172,78.33474,90.0,80.951131,94.541785,80.104453,1.014717,81.26014,88.965265,92.420062,80.609276,80.345711,97.670535,81.573134,85.374009,50.651456,87.171698,71.185585,71.181956,27.235395,58.371164,93.340072,94.392475,91.863233,67.764773,15.663828,96.570571,89.955975,88.379671,72.952044,93.694362,86.889301,85.854145,121.86214,91.476764,90.0,94.12901,0.985257,92.335473,83.23284,85.698753,86.198501,24.14784,99.5,74.302249,87.968879


In [120]:
# Per-measure accuracy of the predicted stars, keyed by the measure's column name.
# Columns are paired by POSITION: column i of `predicted` is scored against
# column i of `data18_stars`.
# NOTE(review): this assumes both frames list the measures in the same order,
# and rows holding the -1 sentinel are scored too. Confirm both are intended.
accuracies = {}
for position, measure in enumerate(predicted.columns): 
    accuracies[measure] = accuracy_score(
        data18_stars.iloc[:, position],  # y_true
        predicted.iloc[:, position],     # y_pred
    )


In [134]:
# Display the accuracy computed for each measure column.
accuracies

{'c_adult_bmi_assessment': 0.9683042789223455,
 'c_annual_flu_vaccine': 0.7741679873217115,
 'c_beneficiary_access_and_performance_problems': 1.0,
 'c_breast_cancer_screening': 0.7852614896988906,
 'c_call_center_foreign_language_interpreter_and_tty_availability': 0.9128367670364501,
 'c_colorectal_cancer_screening': 0.9508716323296355,
 'c_complaints_about_health_plan': 0.43106180665610144,
 'c_controlling_blood_pressure': 0.7638668779714739,
 'c_coordination': 0.8462757527733756,
 'c_customer_service': 0.8605388272583201,
 'c_diabetes_blood_sugar_controlled': 0.8985736925515055,
 'c_diabetes_eye_exam': 0.8446909667194928,
 'c_diabetes_kidney_disease_monitoring': 0.8074484944532488,
 'c_getting_appointments_and_quickly': 0.9128367670364501,
 'c_getting_needed_care': 0.7559429477020603,
 'c_improving_bladder_control': 0.9350237717908082,
 'c_improving_or_maintaining_mental_health': 0.9334389857369255,
 'c_improving_or_maintaining_physical_health': 0.7329635499207607,
 'c_medication_rec