## Imports

In [35]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from datetime import date, timedelta 
from scipy import stats
from pandas import *
import re
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old aliases. Use whichever name this installation provides,
# falling back to the original name so a missing style still fails loudly.
plt.style.use([next(
    (style for style in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid') if style in plt.style.available),
    'seaborn-darkgrid',
)])

import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# deprecation and data-conversion warnings raised by the cells below.
warnings.filterwarnings('ignore')

## Processing

In [7]:
# Combining all the dataframes
# This will be useful when plotting
def make_pretty(csv_list, ylist):
    """Read one CSV per year and stack them into a single DataFrame.

    For each (path, year) pair the file is read, rows without a ``name`` are
    dropped, the ``Unnamed: 0`` column (a saved index) is removed when present,
    every -1 is replaced by NaN, ``name`` becomes the index and an integer
    ``year`` column is added.

    Parameters
    ----------
    csv_list : iterable of str
        Paths of the CSV files to read.
    ylist : iterable of int
        Year to tag each file with; paired with ``csv_list`` positionally
        (pairs are formed with ``zip``, so extra entries on either side are ignored).

    Returns
    -------
    pandas.DataFrame
        The per-file frames concatenated in input order.

    Raises
    ------
    ValueError
        From ``pd.concat`` when there are no files to combine.
    """
    df_list = []
    for csv, year in zip(csv_list, ylist):
        df = (
            pd.read_csv(csv)
            .dropna(subset=['name'])
            # Files written without an index column have no 'Unnamed: 0'; don't fail on them.
            .drop(columns=['Unnamed: 0'], errors='ignore')
            .replace(-1, np.nan)
            .set_index('name')
        )
        # A scalar broadcasts to every row; int() also normalises numpy integer years.
        df['year'] = int(year)
        df_list.append(df)
    return pd.concat(df_list)

# Build the combined frame: one CSV per year, 2015 through 2018
ylist = np.arange(2015, 2019)
csv_list = ['data%d_c_d.csv' % (year % 100) for year in ylist]
df = make_pretty(csv_list, ylist)

## LDA and QDA
* QDA tends to work better, so the functions are tuned to that model
* If you want to see LDA outputs just change the classifier (clf)

In [38]:
# Exclude the star-rating columns that have ratings but no underlying data,
# keeping every other column of df in its original order.
df_alt = df[df.columns[~df.columns.isin([
    'c_health_plan_quality_improvement_star',
    'd_drug_plan_quality_improvement_star',
])]]

def cutpoints(df, clf=None):
    """Estimate star-rating cut points for every metric in ``df``.

    A metric is any column ``<metric>`` that has a companion ``<metric>_star``
    column. For each one, a classifier is fitted to predict the star level from
    the metric value, the rows are regrouped by the *predicted* star level, and
    the cut point for a level is the midpoint between its mean metric value and
    the mean of the level below it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding metric columns and their ``*_star`` companions.
    clf : estimator with ``fit``/``predict``, optional
        Classifier to use. It is re-fitted for every metric. Defaults to a
        fresh ``QuadraticDiscriminantAnalysis()`` per metric.

    Returns
    -------
    dict
        Maps metric name to a one-column DataFrame indexed by predicted star
        level; the lowest level is NaN because it has no level below it.
        Metrics with no complete rows are left out and reported via ``print``.
    """
    suffix = '_star'
    # Match the suffix exactly: a substring test would also pick up unrelated
    # columns that merely contain "star" (e.g. "start_date").
    star_cols = [col for col in df.columns if isinstance(col, str) and col.endswith(suffix)]

    cp_dict, no_data = {}, []
    for met_s in star_cols:
        met = met_s[:-len(suffix)]
        met_df = df[[met, met_s]].dropna()
        if met_df.empty:
            no_data.append(met)
            continue
        model = QuadraticDiscriminantAnalysis() if clf is None else clf
        # Labels are passed as a 1-D Series, the shape classifiers expect for y.
        model.fit(met_df[[met]], met_df[met_s])
        relabelled = met_df.assign(**{met_s: model.predict(met_df[[met]])})
        class_means = relabelled.groupby(met_s).mean()
        # previous mean + half the gap to the current mean == midpoint of the two
        cp_dict[met] = class_means.shift() + class_means.diff() / 2
    if no_data:
        print('The '+', '.join(no_data)+' fields are empty')
    return cp_dict

In [41]:
# Cut points estimated from the 2018 rows only
cp_dict = cutpoints(df_alt.loc[df_alt['year'] == 2018])

The c_cardiovascular_cholesterol_screening, c_diabetes_cholesterol_controlled, c_diabetes_cholesterol_screening, d_diabetes_treatment, d_high_risk_medication fields are empty


In [43]:
# Example output
# Each row is the cut point separating that star level from the level below it
# (midpoint of the two levels' mean metric values); the lowest level is NaN.
# 1 star: up to 65, 2 star:  between 65 and 82.69,  etc.
cp_dict['c_adult_bmi_assessment']

Unnamed: 0_level_0,c_adult_bmi_assessment
c_adult_bmi_assessment_star,Unnamed: 1_level_1
1.0,
2.0,65.0
3.0,82.688462
4.0,92.796299
5.0,97.439571


In [39]:
# How good is our classifier?
def training_validation_acc(df, clf=None):
    """Print training and validation accuracy of a star-level classifier per metric.

    A metric is any column ``<metric>`` that has a companion ``<metric>_star``
    column. For each one, the complete rows are split 80/20 (shuffled, fixed
    seed), the classifier is fitted on the training part to predict the star
    level from the metric value, and one line is printed:
    ``<metric>: <training accuracy> <validation accuracy>``.
    Metrics with no complete rows are skipped silently.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding metric columns and their ``*_star`` companions.
    clf : estimator with ``fit``/``score``, optional
        Classifier to evaluate. It is re-fitted for every metric. Defaults to
        a fresh ``QuadraticDiscriminantAnalysis()`` per metric.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    suffix = '_star'
    # Match the suffix exactly: a substring test would also pick up unrelated
    # columns that merely contain "star" (e.g. "start_date").
    star_cols = [col for col in df.columns if isinstance(col, str) and col.endswith(suffix)]

    for met_s in star_cols:
        met = met_s[:-len(suffix)]
        met_df = df[[met, met_s]].dropna()
        if met_df.empty:
            continue
        # shuffle must be a boolean flag (the sklearn.utils.shuffle function was
        # being passed here and only worked because it is truthy); labels go in
        # as a 1-D Series.
        mdf_train, mdf_val, mdf_train_labels, mdf_val_labels = train_test_split(
            met_df[[met]], met_df[met_s], test_size=0.2, shuffle=True, random_state=100)

        model = QuadraticDiscriminantAnalysis() if clf is None else clf
        model.fit(mdf_train, mdf_train_labels)
        print(met+':', model.score(mdf_train, mdf_train_labels), model.score(mdf_val, mdf_val_labels))

In [44]:
# Same exclusions as before, plus the pain-assessment star column: its covariance
# is not well defined, and QDA cannot run with it present.
df_alt = df[df.columns[~df.columns.isin([
    'c_older_adults_pain_assessment_star',
    'c_health_plan_quality_improvement_star',
    'd_drug_plan_quality_improvement_star',
])]]

### QDA
* First score is training accuracy, second score is validation accuracy
* Underperforms on 5 fields: c_beneficiary_access_and_performance_problems, c_improving_or_maintaining_mental_health, c_plan_all_cause_readmissions, d_beneficiary_access_and_performance_problems, and d_mpf_price_accuracy

In [45]:
 training_validation_acc(df_alt.loc[df_alt['year'] == 2018])

c_adult_bmi_assessment: 1.0 1.0
c_annual_flu_vaccine: 0.952922077922078 0.948051948051948
c_beneficiary_access_and_performance_problems: 0.07173601147776183 0.05714285714285714
c_breast_cancer_screening: 1.0 1.0
c_call_center_foreign_language_interpreter_and_tty_availability: 0.9762962962962963 0.9644970414201184
c_colorectal_cancer_screening: 1.0 1.0
c_complaints_about_health_plan: 0.9919871794871795 0.9935897435897436
c_controlling_blood_pressure: 0.9951612903225806 0.9935897435897436
c_coordination: 0.9279869067103109 0.9477124183006536
c_customer_service: 0.9466666666666667 0.92
c_diabetes_blood_sugar_controlled: 0.9937106918238994 1.0
c_diabetes_eye_exam: 1.0 1.0
c_diabetes_kidney_disease_monitoring: 1.0 1.0
c_getting_appointments_and_quickly: 0.9545454545454546 0.948051948051948
c_getting_needed_care: 0.9218241042345277 0.948051948051948
c_improving_bladder_control: 0.9761467889908257 0.9781021897810219
c_improving_or_maintaining_mental_health: 0.007936507936507936 0.0
c_improvin

### LDA
* Perfect classifier for the five fields QDA does a horrible job with

In [34]:
# LDA
# training_validation_acc only prints and returns None, so its result is not
# assigned: binding it to cp_dict would overwrite the cut-point dictionary.
# NOTE(review): the scores recorded below are for LinearDiscriminantAnalysis;
# the function has to be fitting that classifier for this cell to reproduce them.
training_validation_acc(df_alt[df_alt['year'] == 2018])

c_adult_bmi_assessment: 0.9693548387096774 0.967948717948718
c_annual_flu_vaccine: 0.8538961038961039 0.8376623376623377
c_beneficiary_access_and_performance_problems: 1.0 1.0
c_breast_cancer_screening: 0.9561551433389545 0.9731543624161074
c_call_center_foreign_language_interpreter_and_tty_availability: 0.9451851851851852 0.9230769230769231
c_colorectal_cancer_screening: 0.9790322580645161 0.967948717948718
c_complaints_about_health_plan: 0.9615384615384616 0.9743589743589743
c_controlling_blood_pressure: 0.9338709677419355 0.9038461538461539
c_coordination: 0.8707037643207856 0.9019607843137255
c_customer_service: 0.9116666666666666 0.9666666666666667
c_diabetes_blood_sugar_controlled: 0.9512578616352201 0.96875
c_diabetes_eye_exam: 1.0 1.0
c_diabetes_kidney_disease_monitoring: 1.0 1.0
c_getting_appointments_and_quickly: 0.8636363636363636 0.8311688311688312
c_getting_needed_care: 0.8664495114006515 0.8896103896103896
c_improving_bladder_control: 0.9486238532110092 0.9562043795620438

In [49]:
# Strange distributions for these vars that lend themselves better to linear decision boundaries
# Per-column value counts for the 2018 rows; Series.value_counts replaces the
# deprecated top-level pd.value_counts and gives the same table.
df_alt.loc[
    df_alt['year'] == 2018,
    [
        'c_beneficiary_access_and_performance_problems',
        'c_improving_or_maintaining_mental_health',
        'c_plan_all_cause_readmissions',
        'd_beneficiary_access_and_performance_problems',
        'd_mpf_price_accuracy',
    ],
].apply(pd.Series.value_counts)

Unnamed: 0,c_beneficiary_access_and_performance_problems,c_improving_or_maintaining_mental_health,c_plan_all_cause_readmissions,d_beneficiary_access_and_performance_problems,d_mpf_price_accuracy
0.0,30.0,,4.0,32.0,
3.0,,,2.0,,
4.0,,,4.0,,
5.0,,,14.0,,
6.0,,,8.0,,
7.0,,,38.0,,
8.0,,,74.0,,
9.0,,,152.0,,
10.0,,,196.0,,
11.0,,,144.0,,
