### For faster processing, skip the data cleaning cells and read in the smaller cleaned dataset near the bottom of the notebook

In [13]:
import pandas as pd
import numpy as np
import os
import itertools
import matplotlib.pyplot as plt
from matplotlib import rcParams, gridspec
from pandas.api.types import CategoricalDtype

import sklearn
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedShuffleSplit

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the 2018 Treatment Episode Data Set file into a DataFrame
teds18_path = '/Users/kelsey.huntzberry/OneDrive - Personal Use/Intro_to_Machine_Learning/Data/tedsa_puf_2018.csv'
teds18 = pd.read_csv(teds18_path)

In [None]:
# Keep only the drug flag columns (every column whose name ends in "FLG")
flags = teds18.filter(regex='FLG$', axis=1)

# Row-wise total of the flags = number of substances recorded for each row
NUMSUBS = flags.sum(axis=1)

# Append the count to the 2018 data as a column named NUMSUBS
teds_wflgs = pd.concat([teds18, NUMSUBS.rename('NUMSUBS')], axis=1)

In [None]:
# Remove rows where the first substance was "None" (SUB1 == 1)
teds_sm_temp = teds_wflgs[teds_wflgs.SUB1 != 1]
# Remove rows where the number of prior treatments (NOPRIOR, the source of the
# target variable) is coded -9, then turn every remaining -9 into a missing value.
# replace() returns a new frame instead of mutating a slice of teds_sm_temp in place,
# and np.nan is used because the np.NaN alias was removed in NumPy 2.0.
teds_sm1 = teds_sm_temp[teds_sm_temp.NOPRIOR != -9].replace(-9, np.nan)

In [None]:
# Columns carried forward into feature engineering, in their output order
keep_columns = [
    'CASEID', 'ADMYR', 'AGE', 'GENDER', 'RACE', 'ETHNIC', 'EDUC', 'EMPLOY', 'VET', 'LIVARAG',
    'STFIPS', 'CBSA2010', 'DIVISION', 'REGION', 'SERVICES', 'PSOURCE', 'NOPRIOR', 'ARRESTS',
    'ROUTE1', 'FRSTUSE1', 'FREQ1',
    'ROUTE2', 'FRSTUSE2', 'FREQ2', 'ROUTE3', 'FRSTUSE3', 'FREQ3', 'NUMSUBS', 'METHUSE', 'ALCFLG', 'PSYPROB',
    'COKEFLG', 'MARFLG', 'HERFLG', 'METHFLG', 'OPSYNFLG', 'PCPFLG', 'HALLFLG', 'MTHAMFLG', 'AMPHFLG', 'STIMFLG',
    'BENZFLG', 'TRNQFLG', 'BARBFLG', 'SEDHPFLG', 'INHFLG', 'OTCFLG', 'OTHERFLG', 'MARSTAT',
]
teds_sm1 = teds_sm1.loc[:, keep_columns]

### Feature Engineering
Below I recode variables in preparation for machine learning. Notice that:
- I change ordered categorical variables to numeric factor variables
- I combine very small groups into larger groups
- For unordered categorical variables with 3+ classes, I keep the values as strings. This lets the OneHotEncoder create the dummy variables, and each dummy variable is named after the string value it represents.

In [None]:
# Recode age group variable
def age_groups(series):
    """Return the age-band label for an AGE code.

    Codes 1 through 12 map, in order, to the twelve labels below; any other
    value (including NaN) returns None.
    """
    age_labels = (
        '12_14_years', '15_17_years', '18_20_years', '21_24_years',
        '25_29_years', '30_34_years', '35_39_years', '40_44_years',
        '45_49_years', '50_54_years', '55_64_years', '65_plus_years',
    )
    for code, label in enumerate(age_labels, start=1):
        if series == code:
            return label
    return None
    
teds_sm1['age_group'] = teds_sm1['AGE'].map(age_groups)

# Example of how to change an ordered categorical variable into an ordered
# factor variable for machine learning modeling.

# Step 1: declare the category order explicitly so the factor is ordered
age_group_order = ['12_14_years', '15_17_years', '18_20_years',
                   '21_24_years', '25_29_years', '30_34_years',
                   '35_39_years', '40_44_years', '45_49_years',
                   '50_54_years', '55_64_years', '65_plus_years']
teds_sm1['age_group'] = pd.Categorical(teds_sm1['age_group'],
                                       categories=age_group_order,
                                       ordered=True)

# Step 2: replace the labels with their integer codes
labels, unique = pd.factorize(teds_sm1['age_group'], sort=True)
teds_sm1['age_group'] = labels

# Recode gender variable
def gen_rc(series):
    """Map GENDER code 1 to 0 and code 2 to 1; any other value returns None."""
    for code, flag in ((1, 0), (2, 1)):
        if series == code:
            return flag
    return None
    
# Element-wise recode of GENDER into the 'gender' column
teds_sm1['gender'] = teds_sm1['GENDER'].map(gen_rc)

# Recode methadone variable
def methadone_rc(series):
    """Map METHUSE code 1 to 1 and code 2 to 0; any other value returns None."""
    if series == 1:
        return 1
    return 0 if series == 2 else None
    
# Element-wise recode of METHUSE into the 'methadone_use' column
teds_sm1['methadone_use'] = teds_sm1['METHUSE'].map(methadone_rc)

def educ_rc(series):
    """Return the education label for an EDUC code of 1-5, or None otherwise."""
    educ_labels = (
        'No_Schooling',
        'Grades_9_to_11',
        'Grades_12_or_GED',
        'College_1_to_3_years',
        'College_4_or_more_years',
    )
    matches = (label for code, label in enumerate(educ_labels, start=1) if series == code)
    return next(matches, None)

teds_sm1['educ'] = teds_sm1['EDUC'].map(educ_rc)

# Declare the category order so the factor is ordered
educ_order = ['No_Schooling', 'Grades_9_to_11', 'Grades_12_or_GED',
              'College_1_to_3_years', 'College_4_or_more_years']
teds_sm1['educ'] = pd.Categorical(teds_sm1['educ'], categories=educ_order, ordered=True)

# Replace the labels with their integer codes
labels, unique = pd.factorize(teds_sm1['educ'], sort=True)
teds_sm1['educ'] = labels

# Recode race variable
# Hawaiian Pacific Islander and Hispanic variables are examples of where you
# should recode a variable to combine very small groups.
def race_rc(series):
    """Return the race label for a RACE code, or None for an unlisted value.

    Codes 3 and 9 are deliberately merged into 'Hawaiian_Pacific_Islander'.
    """
    race_pairs = (
        (1, 'Alaska_Native'),
        (2, 'American_Indian'),
        (3, 'Hawaiian_Pacific_Islander'),
        (9, 'Hawaiian_Pacific_Islander'),
        (4, 'Black'),
        (5, 'White'),
        (6, 'Asian'),
        (7, 'Other_race'),
        (8, 'Two_or_more_races'),
    )
    for code, label in race_pairs:
        if series == code:
            return label
    return None
    
# Element-wise recode of RACE into the string-valued 'race' column
teds_sm1['race'] = teds_sm1['RACE'].map(race_rc)

# Recode ethnicity variable
def ethnic_rc(series):
    """Collapse an ETHNIC code into the 0/1 flag stored in the 'hispanic' column.

    Codes 1-3 and 5 return 1, code 4 returns 0, and any other value
    (including NaN) returns None.

    The range test must use ``and``: the previous ``series >= 1 or series <= 3``
    was true for every number, so code 4 was flagged 1 and the ``== 4`` branch
    could never run.
    """
    if 1 <= series <= 3 or series == 5:
        return 1
    if series == 4:
        return 0
    return None
    
# Element-wise recode of ETHNIC into the 'hispanic' column
teds_sm1['hispanic'] = teds_sm1['ETHNIC'].map(ethnic_rc)

# Recode service setting variable
def servseta_rc(series):
    """Group a SERVICES code into 'Detox' (1-2), 'Rehab_Residential' (3-5)
    or 'Ambulatory' (6-8); any other value returns None."""
    if series == 1 or series == 2:
        setting = 'Detox'
    elif 3 <= series <= 5:
        setting = 'Rehab_Residential'
    elif 6 <= series <= 8:
        setting = 'Ambulatory'
    else:
        setting = None
    return setting

# Element-wise recode of SERVICES into the string-valued 'services' column
teds_sm1['services'] = teds_sm1['SERVICES'].map(servseta_rc)

# Recode marital status variable
def marstat_rc(series):
    """Return the marital-status label for a MARSTAT code of 1-4, or None otherwise."""
    status_labels = ('Never_Married', 'Married', 'Separated', 'Divorced_or_Widowed')
    for code, label in enumerate(status_labels, start=1):
        if series == code:
            return label
    return None

# Element-wise recode of MARSTAT into the string-valued 'marstat' column
teds_sm1['marstat'] = teds_sm1['MARSTAT'].map(marstat_rc)

# Recode employment status variable
def employ_rc(series):
    """Return the employment label for an EMPLOY code of 1-4, or None otherwise."""
    employ_pairs = (
        (1, 'Full_time'),
        (2, 'Part_time'),
        (3, 'Unemployed'),
        (4, 'Not_in_labor_force'),
    )
    return next((label for code, label in employ_pairs if series == code), None)
    
# Element-wise recode of EMPLOY into the string-valued 'employ' column
teds_sm1['employ'] = teds_sm1['EMPLOY'].map(employ_rc)

# Recode veteran variable
def vet_rc(series):
    """Map VET code 1 to 1 and code 2 to 0; any other value returns None."""
    code_to_flag = ((1, 1), (2, 0))
    return next((flag for code, flag in code_to_flag if series == code), None)
    
# Element-wise recode of VET into the 'vet' column
teds_sm1['vet'] = teds_sm1['VET'].map(vet_rc)

# Recode living arrangement variable
def livarag_rc(series):
    """Return the living-arrangement label for a LIVARAG code of 1-3, or None otherwise."""
    living_labels = ('Homeless', 'Dependent_Living', 'Independent_Living')
    for code, label in enumerate(living_labels, start=1):
        if series == code:
            return label
    return None

# Element-wise recode of LIVARAG into the string-valued 'livarag' column
teds_sm1['livarag'] = teds_sm1['LIVARAG'].map(livarag_rc)

# Recode arrests variable
def arrests_rc(series):
    """Return the arrests label for an ARRESTS code of 0-2, or None otherwise.

    Note that code 0 yields the string 'None', not the None object.
    """
    arrest_labels = ('None', 'Once', 'Two_or_more_times')
    for code, label in enumerate(arrest_labels):
        if series == code:
            return label
    return None
    
teds_sm1['arrests'] = teds_sm1['ARRESTS'].map(arrests_rc)

# Declare the category order so the factor is ordered
arrests_order = ['None', 'Once', 'Two_or_more_times']
teds_sm1['arrests'] = pd.Categorical(teds_sm1['arrests'], categories=arrests_order, ordered=True)

# Replace the labels with their integer codes
labels, unique = pd.factorize(teds_sm1['arrests'], sort=True)
teds_sm1['arrests'] = labels

# Recode division variable
def division_rc(series):
    """Return the division label for a DIVISION code of 0-9, or None otherwise."""
    division_labels = (
        'US_Territories',
        'New_England',
        'Mid_Atlantic',
        'East_North_Central',
        'West_North_Central',
        'South_Atlantic',
        'East_South_Central',
        'West_South_Central',
        'Mountain',
        'Pacific',
    )
    # Tuple position doubles as the DIVISION code (0-based)
    for code, label in enumerate(division_labels):
        if series == code:
            return label
    return None
    
# Element-wise recode of DIVISION into the string-valued 'division' column
teds_sm1['division'] = teds_sm1['DIVISION'].map(division_rc)

# Recode referral source variable
def psource_rc(series):
    """Return the referral-source label for a PSOURCE code of 1-7, or None otherwise."""
    referral_labels = (
        'Self_referral',
        'Alcohol_Drug_Care_Professional',
        'Other_Health_Care_Professional',
        'School_Referral',
        'Employer_Referral',
        'Community_Referral',
        'Court_Referral',
    )
    matches = (label for code, label in enumerate(referral_labels, start=1) if series == code)
    return next(matches, None)
    
# Element-wise recode of PSOURCE into the string-valued 'psource' column
teds_sm1['psource'] = teds_sm1['PSOURCE'].map(psource_rc)

# Coding of target variable below. Recodes number of prior treatment encounters.
def noprior_rc(series):
    """Binarize NOPRIOR: 0 stays 0, any value >= 1 becomes 1, anything else returns None."""
    if series == 0:
        return 0
    return 1 if series >= 1 else None
    
# Element-wise recode of NOPRIOR into the binary target column 'noprior'
teds_sm1['noprior'] = teds_sm1['NOPRIOR'].map(noprior_rc)

# Recode mental illness variable
def psyprob_rc(series):
    """Map PSYPROB code 1 to 1 and code 2 to 0; any other value returns None."""
    # Guard clause: anything other than the two known codes has no recode
    if not (series == 1 or series == 2):
        return None
    return 1 if series == 1 else 0
    
# Element-wise recode of PSYPROB into the 'psyprob' column
teds_sm1['psyprob'] = teds_sm1['PSYPROB'].map(psyprob_rc)

# Builds the frstuse value: the earliest age-at-first-use band reported across the three substance slots
def first_age_recode(column1, column2, column3):
    """Return the label of the lowest first-use code (1-7) found in any argument.

    Codes are checked from 1 upward, so the earliest band wins. Returns None
    when no argument equals a code from 1 to 7 (for example, all NaN).
    """
    band_labels = (
        '11_years_and_under',
        '12_14_years',
        '15_17_years',
        '18_20_years',
        '21_24_years',
        '25_29_years',
        '30_years_older',
    )
    for code, label in enumerate(band_labels, start=1):
        if column1 == code or column2 == code or column3 == code:
            return label
    return None

# Row-wise: combine the three FRSTUSE columns into a single earliest-use label
teds_sm1['frstuse'] = teds_sm1.apply(
    lambda row: first_age_recode(row['FRSTUSE1'], row['FRSTUSE2'], row['FRSTUSE3']),
    axis=1,
)

# Declare the category order so first use becomes an ordered factor
frstuse_order = ['11_years_and_under', '12_14_years', '15_17_years',
                 '18_20_years', '21_24_years',
                 '25_29_years', '30_years_older']
teds_sm1['frstuse'] = pd.Categorical(teds_sm1['frstuse'], categories=frstuse_order, ordered=True)

# Replace the labels with their integer codes
labels, unique = pd.factorize(teds_sm1['frstuse'], sort=True)
teds_sm1['frstuse'] = labels

# Builds the frequency-of-use value: the highest use frequency reported across the three substance slots
def freq_recode(column1, column2, column3):
    """Return a single frequency label from three FREQ codes.

    Code 3 in any slot gives 'Daily_Use'; otherwise code 2 in any slot gives
    'Some_Use'; otherwise code 1 in any slot gives 'No_Use_Past_Month'.
    Returns None when no slot holds 1, 2 or 3 (for example, all NaN).
    """
    slots = (column1, column2, column3)
    if all(freq == 1 for freq in slots):
        return 'No_Use_Past_Month'
    if any(freq == 3 for freq in slots):
        return 'Daily_Use'
    if any(freq == 2 for freq in slots):
        return 'Some_Use'
    if any(freq == 1 for freq in slots):
        return 'No_Use_Past_Month'
    return None

# Row-wise: combine the three FREQ columns into a single frequency label
teds_sm1['freq_use'] = teds_sm1.apply(lambda x: freq_recode(x.FREQ1, x.FREQ2, x.FREQ3), axis=1)

# Change freq_use into an ordered factor.
# The categories must be listed from least to most frequent so the integer codes
# are monotone in usage: No_Use_Past_Month=0, Some_Use=1, Daily_Use=2.
# The previous order placed Daily_Use before Some_Use, which made the "ordered"
# encoding non-monotone.
# NOTE(review): any cached cleaned CSV written with the old coding needs to be regenerated.
teds_sm1['freq_use'] = pd.Categorical(teds_sm1['freq_use'],
                                      categories=['No_Use_Past_Month', 'Some_Use', 'Daily_Use'],
                                      ordered=True)
# Convert freq_use to factor with numeric value
labels, unique = pd.factorize(teds_sm1['freq_use'], sort=True)
teds_sm1['freq_use'] = labels


In [None]:
# Below I create variables that flag the method of administration for all
# substances in the patients' systems at admission.

def oral_recode(column1, column2, column3):
    """Flag whether any of three route codes equals 1.

    Returns 1 if any argument equals 1, 0 if none does but at least one is
    greater than 1, and None otherwise (for example, all NaN).
    """
    routes = (column1, column2, column3)
    if any(route == 1 for route in routes):
        return 1
    if any(route > 1 for route in routes):
        return 0
    return None

# Row-wise flag built from the three ROUTE columns
teds_sm1['oral_drug_use'] = teds_sm1.apply(
    lambda row: oral_recode(row['ROUTE1'], row['ROUTE2'], row['ROUTE3']),
    axis=1,
)

def smoking_recode(column1, column2, column3):
    """Flag whether any of three route codes equals 2.

    Returns 1 if any argument equals 2, 0 if none does but at least one is
    greater than 0, and None otherwise (for example, all NaN).
    """
    routes = (column1, column2, column3)
    if any(route == 2 for route in routes):
        return 1
    if any(route > 0 for route in routes):
        return 0
    return None

# Row-wise flag built from the three ROUTE columns
teds_sm1['smoking_drug_use'] = teds_sm1.apply(
    lambda row: smoking_recode(row['ROUTE1'], row['ROUTE2'], row['ROUTE3']),
    axis=1,
)

def inhalation_recode(column1, column2, column3):
    """Flag whether any of three route codes equals 3.

    Returns 1 if any argument equals 3, 0 if none does but at least one is
    greater than 0, and None otherwise (for example, all NaN).
    """
    routes = (column1, column2, column3)
    if any(route == 3 for route in routes):
        return 1
    if any(route > 0 for route in routes):
        return 0
    return None
    
# Row-wise flag built from the three ROUTE columns
teds_sm1['inhale_drug_use'] = teds_sm1.apply(
    lambda row: inhalation_recode(row['ROUTE1'], row['ROUTE2'], row['ROUTE3']),
    axis=1,
)
    
def injection_recode(column1, column2, column3):
    """Flag whether any of three route codes equals 4.

    Returns 1 if any argument equals 4, 0 if none does but at least one is
    greater than 0, and None otherwise (for example, all NaN).
    """
    routes = (column1, column2, column3)
    if any(route == 4 for route in routes):
        return 1
    if any(route > 0 for route in routes):
        return 0
    return None
    
# Row-wise flag built from the three ROUTE columns
teds_sm1['injection_drug_use'] = teds_sm1.apply(
    lambda row: injection_recode(row['ROUTE1'], row['ROUTE2'], row['ROUTE3']),
    axis=1,
)

In [8]:
import pandas as pd
import numpy as np
import os
import itertools
import matplotlib.pyplot as plt
from matplotlib import rcParams, gridspec
from pandas.api.types import CategoricalDtype

import sklearn
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

#teds_clean.to_csv('teds_clean.csv',index=False)
#teds_clean = pd.read_csv('/Users/kelsey.huntzberry/OneDrive - Personal Use/Intro_to_Machine_Learning/Data/teds_clean.csv').replace(-1, None)

### NOTE: Read in the full data set or a smaller dataset for quick processing below. This will allow you to run the models without running all the data cleaning steps above.

In [3]:
# The two commented lines below record how the small sample file was produced
# (a 50,000-row random sample of teds_clean); they are kept for reference only.
#teds_sm = teds_clean.sample(n = 50000, random_state=16)
#teds_sm.to_csv('/Users/kelsey.huntzberry/OneDrive - Personal Use/Intro_to_Machine_Learning/Data/teds_clean_small.csv',index=False)

# Load the small cleaned sample for quick modeling
teds_clean_small_path = '/Users/kelsey.huntzberry/OneDrive - Personal Use/Intro_to_Machine_Learning/Data/teds_clean_small.csv'
teds_clean = pd.read_csv(teds_clean_small_path)

### Model Validation Set Up
Below I perform a ***stratified shuffle split*** to create a holdout dataset for final testing of the model. Because the grid search further down uses k-fold cross validation on the training data, I do not need a separate validation data set.

In [18]:
# Preview the first five rows of the cleaned data
teds_clean.head(n=5)

Unnamed: 0,NUMSUBS,ALCFLG,COKEFLG,MARFLG,HERFLG,OPSYNFLG,PCPFLG,HALLFLG,MTHAMFLG,AMPHFLG,...,division,psource,noprior,psyprob,frstuse,freq_use,oral_drug_use,smoking_drug_use,inhale_drug_use,injection_drug_use
0,1,1,0,0,0,0,0,0,0,0,...,Mid_Atlantic,Self_referral,1,0.0,3.0,1.0,1.0,0.0,0.0,0.0
1,1,1,0,0,0,0,0,0,0,0,...,Mountain,Community_Referral,1,1.0,2.0,2.0,1.0,0.0,0.0,0.0
2,2,1,0,0,1,0,0,0,0,0,...,Mountain,Other_Health_Care_Professional,1,1.0,4.0,1.0,1.0,0.0,0.0,1.0
3,0,0,0,0,0,0,0,0,0,0,...,Mountain,,1,0.0,,,,,,
4,1,0,0,1,0,0,0,0,0,0,...,Mid_Atlantic,Community_Referral,0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


In [14]:
# Separate the predictors (DataFrame) from the response column (Series)
data = teds_clean.drop(columns='noprior')
response = teds_clean['noprior']

# A single stratified 90/10 split. split() yields one (train_indices, test_indices)
# pair per split, so the pair has to be taken from the first yielded item.
# The previous code requested two splits and unpacked the generator itself, which
# bound `test_index` to the SECOND split's (train, test) pair; `test_index[0]` was
# therefore that split's 90% training indices, so the "holdout" set overlapped the
# training set almost entirely.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, train_size=0.9, random_state=0)
train_index, test_index = next(sss.split(data, response))

# Predictors stay as DataFrames so the pipeline can select columns by name
x_train = data.iloc[train_index, :]
x_test = data.iloc[test_index, :]

# Responses are converted to numpy arrays
y_train = np.array(response.iloc[train_index])
y_test = np.array(response.iloc[test_index])

### **Pipeline Benefits**

#### Below I create a modeling pipeline combining:
- All preprocessing steps separated out by variable type (ordered categorical variables and unordered categorical variables)
- A gradient boosted trees model evaluated using k-fold cross validation

#### Having all preprocessing steps and model in the same pipeline would allow you to deploy the model in production to get real-time or near real-time results.

### Fit vs. Transform
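- The ***fit*** method learns from the training data: preprocessing steps learn their parameters (e.g., the most frequent value to impute, or the mean and standard deviation used for scaling) and the model iteratively learns to make better predictions.
- The ***transform*** method applies an already-fitted preprocessing step to data (including unseen data) using the parameters learned in fit. To get predictions on unseen data from the fitted pipeline, use the ***predict*** method, which transforms the data with every preprocessing step and then calls the model.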

### Grid Search
Grid searches allow us to test multiple combinations of hyperparameters in the same model. This allows us to find the most accurate combination of hyperparameters for prediction.

The steps are:
- Specifying hyperparameters in a dictionary (called param_grid below)
- Place pipeline and parameters inside of GridSearchCV
- Specify n_jobs to fit the candidate models in parallel across multiple cores of your computer

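The grid search reports the mean cross-validated accuracy of the best hyperparameter combination (`best_score_`), the hyperparameters themselves (`best_params_`), and the pipeline refit on all of the training data with those hyperparameters (`best_estimator_`).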

In [19]:
# Both feature groups are imputed with the most frequent value of each column
imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_main = SimpleImputer(strategy='most_frequent')
# handle_unknown='ignore' encodes a category that was not seen during fit as all
# zeros instead of raising; without it a rare category missing from a CV training
# fold makes that fold fail. Dense output is requested on the ColumnTransformer
# below (sparse_threshold=0) rather than through the encoder's `sparse` argument,
# which newer scikit-learn releases no longer accept.
ohe = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()

# String-valued, unordered categorical columns -> one-hot encoded
ohe_vars = ['race','division','services','marstat','employ','livarag']
# 0/1 flags, counts and integer-coded ordered factors -> imputed and scaled
# NOTE(review): 'psource' is present in the data but in neither list, so the
# ColumnTransformer drops it (remainder defaults to 'drop') - confirm this is intended.
main_vars = ['ALCFLG', 'COKEFLG', 'MARFLG', 'HERFLG', 'OPSYNFLG', 'PCPFLG',
            'HALLFLG', 'MTHAMFLG', 'AMPHFLG', 'STIMFLG', 'BENZFLG', 'TRNQFLG',
            'BARBFLG', 'SEDHPFLG', 'INHFLG', 'OTCFLG', 'OTHERFLG',
            'NUMSUBS', 'age_group', 'gender', 'methadone_use', 'educ', 'hispanic', 'vet', 'arrests',
            'psyprob', 'frstuse', 'freq_use','oral_drug_use', 'smoking_drug_use', 'inhale_drug_use',
            'injection_drug_use']

main_transformer = Pipeline(
    steps=[("imputer", imputer_main), ("scaler", scaler)]
)

cat_transformer = Pipeline(
    steps=[("imputer", imputer_cat), ("ohe", ohe)]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("main", main_transformer, main_vars),
        ("cat", cat_transformer, ohe_vars),
    ],
    sparse_threshold=0,  # always return a dense array
)

# Keys use the '<step name>__<parameter>' form so they reach the 'gbt' pipeline step
param_grid = {
    'gbt__max_depth':[5, 9], 
    'gbt__learning_rate':[0.1, 0.3],
}

gbt = GradientBoostingClassifier(random_state=384)

# (Removed an unused GridSearchCV built directly on `gbt`: the 'gbt__'-prefixed
# parameter names are only valid for the pipeline, so it could not have been fitted.)
pp_pipe = Pipeline(steps=[('preprocessor', preprocessor), 
                          ('gbt', gbt)])

# Cross-validated search over every combination in param_grid, run on 6 workers
search = GridSearchCV(pp_pipe, param_grid, n_jobs=6)

search.fit(x_train, y_train)

# Pipeline refit on all of x_train with the best hyperparameters
best_model = search.best_estimator_

print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)


Best parameter (CV score=0.718):
{'gbt__learning_rate': 0.1, 'gbt__max_depth': 5}


Best parameter (CV score=0.718):
{'gbt__learning_rate': 0.1, 'gbt__max_depth': 5}

### Obtain predictions from test data

In [16]:
# Predicted classes for the rows in x_test
test_predictions = best_model.predict(x_test)
test_predictions

array([0, 1, 1, ..., 1, 0, 1])

### Saving Best Estimator
Below I save the best model to a pickle file so it can be reloaded later without retraining your model!

In [11]:
import joblib

# Persist the refit best pipeline so it can be reloaded without retraining
model_path = 'best_estimator.pkl'
joblib.dump(search.best_estimator_, model_path)

['best_estimator.pkl']