In [1]:
# Importing Necessary dependencies
import pandas as pd,datetime
import snowflake.connector as sf
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import utils
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time
import os


In [31]:
# Snowflake credentials stored in environment variables

username = os.getenv('Snowflake_User')
password = os.getenv('Snowflake_password')
account = os.getenv('Snowflake_account')

# Define warehouse, if neccessary
warehouse = 'DEVELOPER_BASIC'

# Define Database, if not defined in SQL request
#database = 'VESTA_STAGING'

# Create connection object for Snowflake connection
conn = sf.connect(user=username, password=password, account=account, warehouse=warehouse)


# Execution function
def execute_query(connection, query):
    cursor = connection.cursor()
    cursor.execute(query)
    cursor.close


try:
    # If defining a database, uncomment code set and add database in connection parameter
    #sql = 'use {}'.format(database)
    #execute_query(conn,sql)1011

    # Define warehouse to use in Snowflake
    sql = 'use warehouse {}'.format(warehouse)
    execute_query(conn, sql)

    print('Successful Connection')

    # Query to Snowflake
    sql = '''WITH EDIP AS ( //This is sub table for a self join

    SELECT
        *
    FROM "VESTA_DEVELOPMENT"."CLAIMS_REPORTING"."CCA_MEM_PROFILE_IP_ER_SNF" //THIS NEEDS TO CHANGE BASED ON CLIENT
    WHERE MEASURE = 'ED' or MEASURE = 'IP'

    SELECT
    *
FROM
VESTA_DEVELOPMENT.CLAIMS_REPORTING.NM_MEMBERSHIP_MTH
WHERE CLNT = 'CCA'
and month >= '202201'
'''
    cursor = conn.cursor()
    cursor.execute(sql)

    # Dataframe creation
    df = pd.DataFrame.from_records(iter(cursor), columns = [x[0] for x in cursor.description])

    print('Successful DataFrame Created')



except Exception as e:
    print(e)

finally:
    conn.close



Successful Connection
Successful DataFrame Created


In [32]:
df.head()

Unnamed: 0,CLNT,MEMBER_ID,DASH_ID,HR_MIN_DATE,DISENROLL_DATE,PLATFORM_STATUS,TARGETTED_HISTORICALLY,ENROLLED,ENGAGED,DISENROLLED,...,NI_COST_OP,NI_COST_OTH,NI_COST_PCA_T1019,NI_COST_PCA_T1020,NI_COST_PR,NI_COST_PSYC,NI_COST_RX,NI_COST_RXD,NI_COST_UNC,TOTAL_NON_IMPACTABLE_COST
0,CCA,5365612939,3682,2021-03-19,2022-03-15,DISENROLLED,1.0,1.0,0.0,1.0,...,0.0,0.0,1390.0,46.2,1078.55,0.0,43.43,197.81,0.0,2814.98
1,CCA,5365612939,3682,2021-03-19,2022-03-15,DISENROLLED,1.0,1.0,0.0,1.0,...,0.0,0.0,1820.0,51.15,0.0,0.0,0.0,0.85,0.0,2549.07
2,CCA,5365612938,2193,,,DECLINED,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CCA,5365612938,2193,,,DECLINED,1.0,0.0,0.0,0.0,...,0.0,0.0,494.0,0.0,280.95,0.0,0.0,127.09,0.0,902.04
4,CCA,5365612938,2193,,,DECLINED,1.0,0.0,0.0,0.0,...,38.82,0.0,1817.92,49.5,391.98,0.0,23.82,188.88,0.0,2510.92


In [33]:
# features with missing values
sumdf = pd.DataFrame(df.isna().sum())

In [34]:
print(sumdf)

                               0
CLNT                           0
MEMBER_ID                      0
DASH_ID                      823
HR_MIN_DATE                14209
DISENROLL_DATE             21007
...                          ...
NI_COST_PSYC                   0
NI_COST_RX                     0
NI_COST_RXD                    0
NI_COST_UNC                    0
TOTAL_NON_IMPACTABLE_COST      0

[168 rows x 1 columns]


In [35]:
features_to_drop = []
for row in sumdf.iterrows():
    if row[1][0] != 0:
        features_to_drop.append(row[0])

# Review features with missing values
print('These are the features that initially had missing values within the data frame: \n',features_to_drop,'\n\n')

nulls_to_correct = ['DASH_ID', 'HR_MIN_DATE', 'DISENROLL_DATE', 'PLATFORM_STATUS', 'HR_LAST_4_WK', 'AVG_WKLY_HR', 'TARGETED_DATE', 'TARGETING_SOURCE', 'AGENCY', 'LANGUAGE_SPOKEN', 'CENTER_NAME', 'HR_COUNT', 'ALERT_COUNT', 'PREMIUM_MD', 'PREMIUM_MR_PART_C', 'PREMIUM_MR_PART_C_ACCRUAL', 'PREMIUM_MR_PART_C_PROJECTED', 'PREMIUM_MR_PART_D_LICS', 'PREMIUM_MR_PART_D_REINS', 'PREMIUM_MR_PART_D_PREMIUM', 'PREMIUM_MR_PART_D_TOTAL', 'PREMIUM_MR_TOTAL_PROJECTED', 'PREMIUM_MR_REVENUE_TOTAL', 'PREMIUM_TOTAL_PROJECTED', 'PREMIUM_TOTAL', 'INITIAL_TARGET_GROUP_M', 'CARECOORD_CNT', 'WELLNESS_CNT', 'PAT_CARE_PLAN_CNT', 'TCM_CNT', 'INIT_PAT_CARE_PLAN_CNT', 'CARECOORD_MIN', 'WELLNESS_MIN', 'PAT_CARE_PLAN_MIN', 'TCM_MIN', 'INIT_PAT_CARE_PLAN_MIN', 'VALID_URGENT_ALERTS', 'NP_ESC_ALERTS', 'CPT_CODING_DATES', 'VESTA_NP_VISIT', 'CG_AGENCY', 'CG_AIDE_UNKNOWN', 'CG_N_A', 'CG_PCA_FAMILY', 'CG_PCA_PROFESSIONAL', 'CG_UNPAID_CG', 'ONB_PRE_POST', 'TAR_PRE_POST', 'NLO_PRE_POST', 'NO_MTH', 'HR_TOTAL', 'AVG_HR_PER_MTH', 'LIFETIME_ZERO', 'HR_AVG_THIS_MONTH', 'PX_ACP_PRE_POST', 'PX_ASG_PRE_POST', 'PX_C24_PRE_POST', 'PX_CCM_PRE_POST', 'PX_CORE_PRE_POST', 'PX_DAA_PRE_POST', 'PX_LIFE_PRE_POST', 'PX_LITE_PRE_POST', 'PX_MCD_PRE_POST', 'PX_RPM_PRE_POST', 'ALL_CLAIMS', 'DENT', 'ED', 'HM', 'HMKR', 'HS', 'IP', 'IP_RHB', 'OP', 'OTH', 'PCA_T1019', 'PCA_T1020', 'PR', 'PSYC', 'RX', 'RXD', 'SNF', 'UNC', 'ALL_CLAIMS_N', 'DENT_N', 'ED_N', 'HM_N', 'HMKR_N', 'HS_N', 'IP_N', 'IP_RHB_N', 'OP_N', 'OTH_N', 'PCA_T1019_N', 'PCA_T1020_N', 'PR_N', 'PSYC_N', 'RX_N', 'RXD_N', 'SNF_N', 'UNC_N', 'ALL_CLAIMS_U', 'DENT_U', 'ED_U', 'HMKR_U', 'HM_U', 'HS_U', 'IP_RHB_U', 'IP_U', 'OP_U', 'OTH_U', 'PCA_T1019_U', 'PCA_T1020_U', 'PR_U', 'PSYC_U', 'SNF_U', 'UNC_U']

# Fill selected features with 0 value
for col in nulls_to_correct:
    df[col] = df[col].fillna(0)

# Check for missing values and drop columns with missing values
sumdf = pd.DataFrame(df.isna().sum())
print(sumdf)

features_to_drop = []
for row in sumdf.iterrows():
    if row[1][0] != 0:
        features_to_drop.append(row[0])

print('These columns were dropped after because missing values were not corrected :\n', features_to_drop,'\n\n')
df = df.drop(columns = features_to_drop)

# Convert object datatypes to dummy variables
object_list = []

for col in df.columns:
    if df[col].dtypes == 'object':
        object_list.append(col)


print('These are the features that were converted to dummy variables: \n',object_list,'\n\n')
df = pd.get_dummies(df, columns = object_list, drop_first = True)

These are the features that initially had missing values within the data frame: 
 ['DASH_ID', 'HR_MIN_DATE', 'DISENROLL_DATE', 'PLATFORM_STATUS', 'HR_LAST_4_WK', 'AVG_WKLY_HR', 'TARGETED_DATE', 'TARGETING_SOURCE', 'AGENCY', 'LANGUAGE_SPOKEN', 'CENTER_NAME', 'HR_COUNT', 'ALERT_COUNT', 'PREMIUM_MD', 'PREMIUM_MR_PART_C', 'PREMIUM_MR_PART_C_ACCRUAL', 'PREMIUM_MR_PART_C_PROJECTED', 'PREMIUM_MR_PART_D_LICS', 'PREMIUM_MR_PART_D_REINS', 'PREMIUM_MR_PART_D_PREMIUM', 'PREMIUM_MR_PART_D_TOTAL', 'PREMIUM_MR_TOTAL_PROJECTED', 'PREMIUM_MR_REVENUE_TOTAL', 'PREMIUM_TOTAL_PROJECTED', 'PREMIUM_TOTAL', 'INITIAL_TARGET_GROUP_M', 'CARECOORD_CNT', 'WELLNESS_CNT', 'PAT_CARE_PLAN_CNT', 'TCM_CNT', 'INIT_PAT_CARE_PLAN_CNT', 'CARECOORD_MIN', 'WELLNESS_MIN', 'PAT_CARE_PLAN_MIN', 'TCM_MIN', 'INIT_PAT_CARE_PLAN_MIN', 'VALID_URGENT_ALERTS', 'NP_ESC_ALERTS', 'CPT_CODING_DATES', 'VESTA_NP_VISIT', 'CG_AGENCY', 'CG_AIDE_UNKNOWN', 'CG_N_A', 'CG_PCA_FAMILY', 'CG_PCA_PROFESSIONAL', 'CG_UNPAID_CG', 'ONB_PRE_POST', 'TAR_PRE_

In [36]:
original_memory = df.memory_usage().sum()
print(f'Memory Usage of Dataframe: {df.memory_usage().sum()} bytes')

Memory Usage of Dataframe: 174686245 bytes


In [37]:
#convert data types
#changing all float64 to float32
df[df.select_dtypes(np.float64).columns] = df.select_dtypes(np.float64).astype(np.float32)

In [38]:
new_memory = df.memory_usage().sum()
print(f'Memory Usage of Original Dataframe: {original_memory} bytes')
print(f'Memory Usage of New Dataframe: {new_memory} bytes')
print(f'Memory usage reduced by:{round((original_memory-new_memory)/original_memory * 100,0)}%')

Memory Usage of Original Dataframe: 174686245 bytes
Memory Usage of New Dataframe: 162729693 bytes
Memory usage reduced by:7.0%


In [39]:
# Drop low variability columns
df_var = df.var()
df.columns.to_list()

features_to_drop = []

for i in range(len(df.columns.to_list())):
    #print(df.columns.to_list()[i],df_var[i])
    if df_var[i] == 0 and df.columns.to_list()[i] != 'ED':
        features_to_drop.append(df.columns.to_list()[i])


print('These are the features that were dropped because of low variability: \n',features_to_drop,'\n\n')

df = df.drop(columns = features_to_drop)
print(f'Memory Usage of Dataframe: {df.memory_usage().sum()} bytes')

These are the features that were dropped because of low variability: 
 ['CENTER_NAME', 'PREMIUM_MR_PART_D_LICS', 'PREMIUM_MR_PART_D_REINS', 'PREMIUM_MR_PART_D_PREMIUM', 'PREMIUM_MR_PART_D_TOTAL', 'PREMIUM_MR_TOTAL_PROJECTED', 'PREMIUM_MR_REVENUE_TOTAL', 'PX_MCD_PRE_POST', 'UNC', 'UNC_N', 'IP_U', 'UNC_U', 'NI_COST_UNC'] 


Memory Usage of Dataframe: 160498993 bytes


In [40]:
df.head()

Unnamed: 0,TARGETTED_HISTORICALLY,ENROLLED,ENGAGED,DISENROLLED,HR_LAST_4_WK,INSTITUTIONAL,AGE,ENROLLED_IN_DASH,DISENROLLED_IN_DASH,TARGETED_POST_V2,...,TEST_MONTH_202202,TEST_MONTH_202203,TEST_MONTH_202204,TEST_MONTH_202205,TEST_MONTH_202206,TEST_MONTH_202207,TEST_MONTH_202208,TEST_MONTH_202209,TEST_MONTH_202210,TEST_MONTH_202211
0,1.0,1.0,0.0,1.0,0.0,0.0,77.0,1.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,1.0,1.0,0.0,1.0,0.0,0.0,77.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,0.0,0.0,0.0,0.0,0.0,74.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,1
3,1.0,0.0,0.0,0.0,0.0,0.0,74.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,1,0
4,1.0,0.0,0.0,0.0,0.0,0.0,74.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,1,0,0


In [41]:
# Split the data set
X = df[[col for col in df.columns if col != 'ED']] #independent variables
y = df[[col for col in df.columns if col == 'ED']] #dependent variable
y = y.values.flatten()

#convert y values to categorical values
lab = preprocessing.LabelEncoder()
y_transformed = lab.fit_transform(y)

# Define MinMax Scaler
scaler = MinMaxScaler()

# Transform data
X = scaler.fit_transform(X)

# Split X and y into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X,y_transformed,test_size=0.30, random_state = 2)



# Smote for balancing the training data set
# smote = SMOTE(random_state = 2)
# X_train,y_train = smote.fit_resample(X_train, y_train)
print(f'Memory Usage of Dataframe: {df.memory_usage().sum()} bytes')

Memory Usage of Dataframe: 160498993 bytes


In [42]:
%%time
# Create a Random Forest Classifier
clf=RandomForestClassifier(n_estimators = 2000,min_samples_split = 2, min_samples_leaf = 1,
                           max_depth = 50, bootstrap = False, n_jobs = -1,random_state = 2)

# Train the model using the training sets
clf.fit(X_train,y_train)
print(f'Memory Usage of Dataframe: {df.memory_usage().sum()} bytes')

Memory Usage of Dataframe: 160498993 bytes
CPU times: total: 49min 2s
Wall time: 3min 9s


In [43]:
# Create probabilities from the model on test data
y_prob = clf.predict_proba(X_test)[:,1]

# Store probabilites in Dataframe for threshold analysis
threshold_list = [0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,.7,.75,.8,.85,.9,.95,.99]
accuracy_list = []
precision_list = []
recall_list = []
for threshold in threshold_list:
    y_pred = [1 if result >= threshold else 0 for result in y_prob]

    #Calulating Metrics
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)

    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)

    #print('Thereshold: ',threshold)
    #print("Accuracy: ",accuracy)
    #print("Precision: ",precision)
    #print("Recall: ",recall)

metric_df = pd.DataFrame()
metric_df['Threshold'] = threshold_list
metric_df['Accuracy'] = accuracy_list
metric_df['Precision'] = precision_list
metric_df['Recall'] = recall_list
metric_df['F1'] = (2 * metric_df['Precision'] * metric_df['Recall']) / (metric_df['Precision'] + metric_df['Recall'])
metric_df['Acc + Recall'] = metric_df['Accuracy'] + metric_df['Recall']

metric_df

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [44]:
#Find the max accuracy and recall from the Metric Table and corresponding Threshold
threshold = metric_df[metric_df['Acc + Recall'] == metric_df['Acc + Recall'].max()]['Threshold'].item()

y_pred = [1 if result >= threshold else 0 for result in y_prob]


# Creating confusion maxtrix
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)

%matplotlib inline
class_names=[0,1]
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# Create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")

# Axis labels
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

# Show the plot
plt.show()


NameError: name 'metric_df' is not defined