 ### Loss/MSE Script
1. Overview: Script that takes predicted cutpoints and compares them to actual cutpoints (basically a way for us to track loss)
2. Input: CSV of predicted cutpoints, CSV of actual cutpoints (for a given year/term)
3. Output: Loss values for each metric
    1. Loss is MSE (for now, can change if we feel so)
    2. To compute the loss for a metric, for each star rating cutpoint C_i, and each predicted star rating cutpoint P_i, take the sum of all (P_i - C_i)**2

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples, mean_squared_error
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

In [4]:
data18 = pd.read_csv("summary_data/data18_c_d.csv")
data18 = data18.drop("Unnamed: 0", axis = 1)
data18_no_stars = data18[data18.columns[~data18.columns.str.contains("star")]].drop(["contract_name", "id", "name", "organization_type", "parent_org", "spring"], axis = 1)
data18_stars = data18[data18.columns[data18.columns.str.contains("star")]].drop(["c_health_plan_quality_improvement_star", "d_drug_plan_quality_improvement_star"], axis = 1)

#load cleaned cutpoints, but d has more rows/cutpoints; need to be taken care of. For now, will work with c only.
data18_cutpoints_c = pd.read_csv("summary_data/data18_cutpoints_c.csv", encoding = "cp1252")
#data18_cutpoints_d = pd.read_csv("summary_data/data18_cutpoints_d.csv", encoding = "cp1252")
#data18_cutpoints = pd.concat([data18_cutpoints_c, data18_cutpoints_d], axis = 1)



data18_cutpoints_c


Unnamed: 0,number_of_stars_displayed_on_plan_finder_tool,c_breast_cancer_screening,c_colorectal_cancer_screening,c_annual_flu_vaccine,c_improving_or_maintaining_physical_health,c_improving_or_maintaining_mental_health,c_monitoring_physical_activity,c_adult_bmi_assessment,c_special_needs_plan_management,c_older_adults_medication_review,c_older_adults_functional_status_assessment,c_older_adults_pain_assessment,c_osteoporosis_management_in_women_who_had_a_fracture,c_diabetes_eye_exam,c_diabetes_kidney_disease_monitoring,c_diabetes_blood_sugar_controlled,c_controlling_blood_pressure,c_rheumatoid_arthritis_management,c_reducing_risk_of_falling,c_improving_bladder_control,c_medication_reconciliation_post_discharge,c_plan_all_cause_readmissions,c_getting_needed_care,c_getting_appointments_and_quickly,c_customer_service,c_rating_of_health_quality,c_rating_of_health_plan,c_coordination,c_complaints_about_health_plan,c_members_choosing_to_leave_plan,c_beneficiary_access_and_performance_problems,c_health_plan_quality_improvement,c_plan_makes_timely_decisions_about_appeals,c_reviewing_appeals_decisions,c_call_center_foreign_language_interpreter_and_tty_availability
0,1 Star,56.0,54.0,64.0,63.0,75.0,46.0,72.0,35.0,59.0,46.0,40.0,24.0,47.0,92.0,40.0,55.0,65.0,52.0,39.0,19.0,18.0,80.0,74.0,88.0,83.0,82.0,83.0,0.86,28.0,20.0,-0.203,70.0,62.0,53.0
1,2 Stars,70.0,63.0,68.0,67.0,82.0,50.0,81.0,54.0,79.0,67.0,62.0,42.0,59.0,94.0,64.0,67.0,72.0,59.0,43.0,37.0,18.0,82.0,76.0,89.0,85.0,84.0,85.0,0.86,28.0,40.0,0.0,80.0,76.0,63.0
2,3 Stars,78.0,72.0,74.0,69.0,84.0,53.0,94.0,71.0,88.0,78.0,80.0,52.0,72.0,96.0,73.0,75.0,78.0,66.0,46.0,55.0,11.0,84.0,79.0,91.0,86.0,86.0,87.0,0.53,18.0,60.0,0.155,89.0,86.0,81.0
3,4 Stars,84.0,80.0,77.0,72.0,88.0,58.0,98.0,92.0,93.0,92.0,94.0,71.0,81.0,98.0,80.0,86.0,86.0,74.0,50.0,68.0,9.0,86.0,81.0,92.0,87.0,88.0,88.0,0.31,13.0,80.0,0.429,98.0,93.0,94.0


In [4]:
def cluster(column): 
    series = pd.Series(column)
    km = KMeans(n_clusters = 5)
    #should take out -1 otherwise the -1s will be their own cluster
    km.fit(np.array(pd.DataFrame(series.drop(series[series == -1].index))))
    return km

#kmeans starts with different centroids for each iteration, can yield different final centroids 
#repeat kmeans 1000 times and take the mean of the centroids for each cluster
def cluster_means(series, n_iters = 1000): 
    x = np.sort(np.array([cluster(series).cluster_centers_.flatten() for i in range(n_iters)]).flatten())
    one = x[0:n_iters]
    two = x[n_iters:(n_iters * 2)]
    three = x[(n_iters * 2):(n_iters * 3)]
    four = x[(n_iters * 3):(n_iters * 4)]
    five = x[(n_iters * 4):]
    df = pd.DataFrame({"1": one, "2": two, "3": three, "4": four, "5": five})
    return df.apply(lambda x: x.mean(), axis = 0).to_dict()

#points are classified based on which centroid they are closest to
#so cutpoints are the mean between the two closest centroids
def return_cutpoints(series): 
    cuts = cluster_means(series)
    cut1 = (cuts["1"] + cuts["2"])/2
    cut2 = (cuts["2"] + cuts["3"])/2
    cut3 = (cuts["3"] + cuts["4"])/2
    cut4 = (cuts["4"] + cuts["5"])/2
    return pd.Series({"1": cut1, "2": cut2, "3": cut3, "4": cut4})

def convert_to_stars(series): 
    cuts = cluster_means(series)
    cut1 = (cuts["1"] + cuts["2"])/2
    cut2 = (cuts["2"] + cuts["3"])/2
    cut3 = (cuts["3"] + cuts["4"])/2
    cut4 = (cuts["4"] + cuts["5"])/2
    return series.apply(lambda x: -1.0 if x == -1 else 
                                      1.0 if x < cut1 else 
                                      2.0 if (x >= cut1 and x < cut2) else
                                       3.0 if (x >= cut2 and x < cut3) else 
                                       4.0 if (x >= cut3 and x < cut4) else 
                                       5.0)

In [None]:
predicted = data18_no_stars.apply(convert_to_stars, axis = 0)
cutpoints = data18_no_stars.apply(return_cutpoints, axis = 0)
cutpoints

In [5]:
#data18.head()
#data18_no_stars.head(),
#data18_stars.head(),
data18_cutpoints.head(5)
         

NameError: name 'data18_cutpoints' is not defined

In [None]:
accuracies = {}
for i in range(predicted.shape[1]): 
    accuracies[predicted.columns[i]] = accuracy_score(predicted.iloc[:,i], data18_stars.iloc[:,i])
accuracies

In [18]:
def loss_function(df1, df2) :
    # df1 = predicted cutoffs, df2 = actual cutoffs
    colnames = list(set(df1.columns).intersection(df2)) #remove metrics that arent in both years
    

    mse_dict = {}
    for col in colnames: 
        mse_dict[col] = mean_squared_error(df1[col], df2[col])
    return mse_dict
 
#col_predicted = "c_adult_bmi_assessment	c_annual_flu_vaccine	c_beneficiary_access_and_performance_problems	c_breast_cancer_screening	c_call_center_foreign_language_interpreter_and_tty_availability	c_colorectal_cancer_screening	c_complaints_about_health_plan	c_controlling_blood_pressure	c_coordination	c_customer_service	c_diabetes_blood_sugar_controlled	c_diabetes_eye_exam	c_diabetes_kidney_disease_monitoring	c_getting_appointments_and_quickly	c_getting_needed_care	c_improving_bladder_control	c_improving_or_maintaining_mental_health	c_improving_or_maintaining_physical_health	c_medication_reconciliation_post_discharge	c_members_choosing_to_leave_plan	c_monitoring_physical_activity	c_older_adults_functional_status_assessment	c_older_adults_medication_review	c_older_adults_pain_assessment	c_osteoporosis_management_in_women_who_had_a_fracture	c_plan_all_cause_readmissions	c_plan_makes_timely_decisions_about_appeals	c_rating_of_health_plan	c_rating_of_health_quality	c_reducing_risk_of_falling	c_reviewing_appeals_decisions	c_rheumatoid_arthritis_management	c_special_needs_plan_management	d_appeals_autoforward	d_appeals_upheld	d_beneficiary_access_and_performance_problems	d_call_center_foreign_language_interpreter_and_tty_availability	d_complaints_about_drug_plan	d_getting_needed_prescription_drugs	d_medication_adherence_cholesterol	d_medication_adherence_diabetes_medications	d_medication_adherence_hypertension	d_members_choosing_to_leave_plan	d_mpf_price_accuracy	d_mtm_program_completion_rate_cmr	d_rating_of_drug_plan".split()

#col_functest = list(set(data18_cutpoints_c.columns).intersection(col_predicted)) #extract columns in both


# testing function with 2017's cutpoints
data17_cutpoints_c = pd.read_csv("summary_data/data17_cutpoints_c.csv", encoding = "cp1252")

#first column is not integer type(stars), drop
data17_cutpoints_c = data17_cutpoints_c.iloc[:, 1:]
data18_cutpoints_c = data18_cutpoints_c.iloc[:, 1:]

loss_function(data17_cutpoints_c, data18_cutpoints_c)

{'c_rheumatoid_arthritis_management': 35.25,
 'c_controlling_blood_pressure': 163.0,
 'c_diabetes_kidney_disease_monitoring': 0.0,
 'c_diabetes_blood_sugar_controlled': 27.5,
 'c_reducing_risk_of_falling': 3.75,
 'c_members_choosing_to_leave_plan': 193.5,
 'c_health_plan_quality_improvement': 0.002990749999999999,
 'c_call_center_foreign_language_interpreter_and_tty_availability': 105.5,
 'c_beneficiary_access_and_performance_problems': 0.0,
 'c_coordination': 1.0,
 'c_plan_makes_timely_decisions_about_appeals': 26.25,
 'c_customer_service': 8.25,
 'c_diabetes_eye_exam': 1.5,
 'c_osteoporosis_management_in_women_who_had_a_fracture': 18.75,
 'c_rating_of_health_plan': 0.75,
 'c_getting_appointments_and_quickly': 5.25,
 'c_reviewing_appeals_decisions': 21.5,
 'c_complaints_about_health_plan': 0.21565,
 'c_getting_needed_care': 0.5,
 'c_plan_all_cause_readmissions': 5.0,
 'c_rating_of_health_quality': 0.25}

In [38]:
data18_cutpoints_c

Unnamed: 0,number_of_stars_displayed_on_plan_finder_tool,c_breast_cancer_screening,c_colorectal_cancer_screening,c_annual_flu_vaccine,c_improving_or_maintaining_physical_health,c_improving_or_maintaining_mental_health,c_monitoring_physical_activity,c_adult_bmi_assessment,c_special_needs_plan_management,c_older_adults_medication_review,c_older_adults_functional_status_assessment,c_older_adults_pain_assessment,c_osteoporosis_management_in_women_who_had_a_fracture,c_diabetes_eye_exam,c_diabetes_kidney_disease_monitoring,c_diabetes_blood_sugar_controlled,c_controlling_blood_pressure,c_rheumatoid_arthritis_management,c_reducing_risk_of_falling,c_improving_bladder_control,c_medication_reconciliation_post_discharge,c_plan_all_cause_readmissions,c_getting_needed_care,c_getting_appointments_and_quickly,c_customer_service,c_rating_of_health_quality,c_rating_of_health_plan,c_coordination,c_complaints_about_health_plan,c_members_choosing_to_leave_plan,c_beneficiary_access_and_performance_problems,c_health_plan_quality_improvement,c_plan_makes_timely_decisions_about_appeals,c_reviewing_appeals_decisions,c_call_center_foreign_language_interpreter_and_tty_availability
0,1 Star,56.0,54.0,64.0,63.0,75.0,46.0,72.0,35.0,59.0,46.0,40.0,24.0,47.0,92.0,40.0,55.0,65.0,52.0,39.0,19.0,18.0,80.0,74.0,88.0,83.0,82.0,83.0,0.86,28.0,20.0,-0.203,70.0,62.0,53.0
1,2 Stars,70.0,63.0,68.0,67.0,82.0,50.0,81.0,54.0,79.0,67.0,62.0,42.0,59.0,94.0,64.0,67.0,72.0,59.0,43.0,37.0,18.0,82.0,76.0,89.0,85.0,84.0,85.0,0.86,28.0,40.0,0.0,80.0,76.0,63.0
2,3 Stars,78.0,72.0,74.0,69.0,84.0,53.0,94.0,71.0,88.0,78.0,80.0,52.0,72.0,96.0,73.0,75.0,78.0,66.0,46.0,55.0,11.0,84.0,79.0,91.0,86.0,86.0,87.0,0.53,18.0,60.0,0.155,89.0,86.0,81.0
3,4 Stars,84.0,80.0,77.0,72.0,88.0,58.0,98.0,92.0,93.0,92.0,94.0,71.0,81.0,98.0,80.0,86.0,86.0,74.0,50.0,68.0,9.0,86.0,81.0,92.0,87.0,88.0,88.0,0.31,13.0,80.0,0.429,98.0,93.0,94.0


In [15]:
type(data18[["contract_name"]])


((pd.Series([1,2,3]) - pd.Series([1,1,1]))**2).sum()

5