In [46]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import svm

# Directory prefix (note the trailing slash) that is string-concatenated with each CSV file name.
data_dir = 'HeritageHealth/HHP_release3/'

In [47]:
# Load the four source tables from data_dir, in the order they are named on the left.
claims, members, drug, lab = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in ('Claims.csv', 'Members.csv', 'DrugCount.csv', 'LabCount.csv')
)

In [48]:
# Preview the first rows of the raw claims table.
claims.head()

Unnamed: 0,MemberID,ProviderID,Vendor,PCP,Year,Specialty,PlaceSvc,PayDelay,LengthOfStay,DSFS,PrimaryConditionGroup,CharlsonIndex,ProcedureGroup,SupLOS
0,42286978,8013252.0,172193.0,37796.0,Y1,Surgery,Office,28,,8- 9 months,NEUMENT,0,MED,0
1,97903248,3316066.0,726296.0,5300.0,Y3,Internal,Office,50,,7- 8 months,NEUMENT,1-2,EM,0
2,2759427,2997752.0,140343.0,91972.0,Y3,Internal,Office,14,,0- 1 month,METAB3,0,EM,0
3,73570559,7053364.0,240043.0,70119.0,Y3,Laboratory,Independent Lab,24,,5- 6 months,METAB3,1-2,SCS,0
4,11837054,7557061.0,496247.0,68968.0,Y2,Surgery,Outpatient Hospital,27,,4- 5 months,FXDISLC,1-2,EM,0


In [49]:
# Keep only the members whose age bracket is recorded.
full_age = members.loc[members['AgeAtFirstClaim'].notna()]
full_age.head()

Unnamed: 0,MemberID,AgeAtFirstClaim,Sex
0,14723353,70-79,M
1,75706636,70-79,M
2,17320609,70-79,M
3,69690888,40-49,M
4,33004608,0-9,M


In [50]:
# Transform member's 'AgeAtFirstClaim' bracket label into a numeric value.
# Each label ('0-9', '70-79', '80+', ...) maps to its lower bound + 5
# (5, 15, ..., 85). The number is parsed from the label itself rather than
# taken from the label's rank among the brackets present in the data, so a
# bracket that happens to be absent cannot shift every later bracket down
# by ten years.
age_groups = full_age['AgeAtFirstClaim'].dropna().unique()
age_map = {
    label: int(label.rstrip('+').split('-')[0]) + 5
    for label in age_groups
    if isinstance(label, str)  # values that are already numeric are left alone
}
# .map with a fallback keeps the cell safe to re-run: unmapped (numeric)
# values pass through unchanged instead of becoming NaN.
full_age = full_age.assign(
    AgeAtFirstClaim=full_age['AgeAtFirstClaim'].map(lambda age: age_map.get(age, age))
)
full_age.head()

Unnamed: 0,MemberID,AgeAtFirstClaim,Sex
0,14723353,75,M
1,75706636,75,M
2,17320609,75,M
3,69690888,45,M
4,33004608,5,M


In [51]:
# Add a binary Sex_M column (1 = male, 0 = female).
# Rows with a missing Sex are dropped first: get_dummies encodes NaN as
# all-zero indicators, so once Sex_F is removed those members would be
# indistinguishable from females.
# dtype=int keeps the indicator as 0/1 (newer pandas versions default to bool).
members_ohe = (
    pd.get_dummies(full_age.dropna(subset=['Sex']), columns=['Sex'], dtype=int)
    .drop(columns='Sex_F')
)
members_ohe.head()

Unnamed: 0,MemberID,AgeAtFirstClaim,Sex_M
0,14723353,75,1
1,75706636,75,1
2,17320609,75,1
3,69690888,45,1
4,33004608,5,1


In [52]:
# Fix DrugCount type: the top bucket is the string '7+', so strip the trailing
# '+' and cast to int. rstrip is a literal operation, so it does not depend on
# str.replace's regex default (which differs between pandas versions), and
# astype(str) lets the cell be re-run after the column is already numeric.
drug['DrugCount'] = drug['DrugCount'].astype(str).str.rstrip('+').astype(int)
# Total drug count per member per year.
drug_by_yr = drug.groupby(['MemberID', 'Year'])[['DrugCount']].sum()
# Fix LabCount type the same way (top bucket is '10+').
lab['LabCount'] = lab['LabCount'].astype(str).str.rstrip('+').astype(int)
# Total lab count per member per year.
lc_by_year = lab.groupby(['MemberID', 'Year'])[['LabCount']].sum()

In [53]:
# Outer-join the per-member, per-year lab and drug totals; a member/year that
# appears in only one table gets 0 for the other count. MemberID and Year are
# then moved from the index back into ordinary columns.
lab_drug_by_year = (
    lc_by_year
    .merge(drug_by_yr, how='outer', on=['MemberID', 'Year'])
    .fillna(0)
    .reset_index()
)

In [55]:
# Build the modelling table from claims:
#   1. attach the summed LabCount / DrugCount for the claim's MemberID and Year,
#   2. attach the member's numeric age and Sex_M flag,
#   3. keep MemberID, ProcedureGroup, PlaceSvc, LabCount, DrugCount, AgeAtFirstClaim, Sex_M,
#   4. treat a claim with no matching lab/drug record as a count of 0,
#   5. drop rows that lack a ProcedureGroup, PlaceSvc, age or sex.
cleaned_claims = (
    claims
    .merge(lab_drug_by_year, how='left', on=['MemberID', 'Year'])
    .merge(members_ohe, how='left', on=['MemberID'])
    .loc[:, ['MemberID', 'ProcedureGroup', 'PlaceSvc', 'LabCount', 'DrugCount', 'AgeAtFirstClaim', 'Sex_M']]
    .fillna({'LabCount': 0, 'DrugCount': 0})
    .dropna(subset=['ProcedureGroup', 'PlaceSvc', 'AgeAtFirstClaim', 'Sex_M'])
)
cleaned_claims.head()

Unnamed: 0,MemberID,ProcedureGroup,PlaceSvc,LabCount,DrugCount,AgeAtFirstClaim,Sex_M
0,42286978,MED,Office,8.0,26.0,85.0,0.0
1,97903248,EM,Office,19.0,47.0,75.0,0.0
2,2759427,EM,Office,4.0,29.0,45.0,1.0
3,73570559,SCS,Independent Lab,31.0,0.0,55.0,0.0
4,11837054,EM,Outpatient Hospital,10.0,41.0,85.0,0.0


In [56]:
# Build a ProcedureGroup-name -> integer-code lookup, with codes assigned in
# value_counts() order (most frequent group gets 0).
procedures = cleaned_claims['ProcedureGroup'].value_counts().index
vals = np.arange(len(procedures))
proc_map = {group: code for group, code in zip(procedures, vals)}
# The lookup is built but not applied: the replace below is left disabled,
# so ProcedureGroup stays as its original string labels.
# cleaned_claims = cleaned_claims.replace({'ProcedureGroup': proc_map})
cleaned_claims.head()

Unnamed: 0,MemberID,ProcedureGroup,PlaceSvc,LabCount,DrugCount,AgeAtFirstClaim,Sex_M
0,42286978,MED,Office,8.0,26.0,85.0,0.0
1,97903248,EM,Office,19.0,47.0,75.0,0.0
2,2759427,EM,Office,4.0,29.0,45.0,1.0
3,73570559,SCS,Independent Lab,31.0,0.0,55.0,0.0
4,11837054,EM,Outpatient Hospital,10.0,41.0,85.0,0.0


In [57]:
# One-hot encode 'PlaceSvc': one indicator column per place-of-service value.
claims_ohe = pd.get_dummies(data=cleaned_claims, columns=['PlaceSvc'])
claims_ohe.head()

Unnamed: 0,MemberID,ProcedureGroup,LabCount,DrugCount,AgeAtFirstClaim,Sex_M,PlaceSvc_Ambulance,PlaceSvc_Home,PlaceSvc_Independent Lab,PlaceSvc_Inpatient Hospital,PlaceSvc_Office,PlaceSvc_Other,PlaceSvc_Outpatient Hospital,PlaceSvc_Urgent Care
0,42286978,MED,8.0,26.0,85.0,0.0,0,0,0,0,1,0,0,0
1,97903248,EM,19.0,47.0,75.0,0.0,0,0,0,0,1,0,0,0
2,2759427,EM,4.0,29.0,45.0,1.0,0,0,0,0,1,0,0,0
3,73570559,SCS,31.0,0.0,55.0,0.0,0,0,1,0,0,0,0,0
4,11837054,EM,10.0,41.0,85.0,0.0,0,0,0,0,0,0,1,0


In [58]:
# Count missing values per column of the encoded frame.
claims_ohe.isna().sum()

MemberID                        0
ProcedureGroup                  0
LabCount                        0
DrugCount                       0
AgeAtFirstClaim                 0
Sex_M                           0
PlaceSvc_Ambulance              0
PlaceSvc_Home                   0
PlaceSvc_Independent Lab        0
PlaceSvc_Inpatient Hospital     0
PlaceSvc_Office                 0
PlaceSvc_Other                  0
PlaceSvc_Outpatient Hospital    0
PlaceSvc_Urgent Care            0
dtype: int64

In [60]:
# (rows, columns) of the encoded claims frame.
claims_ohe.shape

(2407066, 14)

In [74]:
# Train/test split (67% / 33%, fixed seed for reproducibility).
# MemberID is a join key, not a predictor. Left in X it is by far the
# largest-magnitude column and would dominate the distance-based SVC kernel,
# so it is excluded from the features along with the target.
smaller_claims_ohe = claims_ohe
X = smaller_claims_ohe.drop(columns=['ProcedureGroup', 'MemberID'])
y = smaller_claims_ohe['ProcedureGroup']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [71]:
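# Initial LR model (disabled): multinomial logistic-regression baseline that was fit and scored on the full X, y, so its score is training accuracy only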
# clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X, y)
# y_pred = clf.predict(X)
# clf.score(X, y)

In [None]:
# SVM multiclass model (default SVC settings), fit on the training split.
# NOTE(review): scikit-learn documents SVC fit time as at least quadratic in
# the number of samples; with test_size=0.33 of the ~2.4M-row frame the
# training split is roughly 1.6M rows, so consider subsampling or a linear
# model if this cell does not finish.
svm_model = svm.SVC().fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
# Test accuracy from the predictions already in hand; calling
# svm_model.score(X_test, y_test) would run a second full prediction pass
# to return the same number.
float(np.mean(y_pred == y_test.to_numpy()))

In [None]:
import pickle

# Save the fitted classifier to disk. The context manager guarantees the
# file is flushed and closed even if pickling raises.
filename = 'svm_model_02_24_age_sex_full.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(svm_model, model_file)