Analisando um dataset do Kaggle: tratamento de dependência química
Utilizando um modelo de regressão do scikit-learn e transformando variáveis categóricas.

Estudo baseado no vídeo de "Mario Filho": https://www.youtube.com/watch?v=tBUZ5xonmDc

Link dataset: https://www.kaggle.com/new-york-state/nys-chemical-dependence-treatment-prog-admissions

In [42]:
import numpy as np
import pandas as pd
from category_encoders.one_hot import OneHotEncoder # encodes categorical variables with one-hot encoding
from category_encoders.ordinal import OrdinalEncoder # encodes categorical variables with ordinal encoding
from category_encoders.target_encoder import TargetEncoder # encodes categorical variables with target encoding
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [11]:
#pip install category_encoders

Collecting category_encoders
  Downloading https://files.pythonhosted.org/packages/a0/52/c54191ad3782de633ea3d6ee3bb2837bda0cf3bc97644bb6375cf14150a0/category_encoders-2.1.0-py2.py3-none-any.whl (100kB)
Installing collected packages: category-encoders
Successfully installed category-encoders-2.1.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Load the chemical-dependence treatment admissions CSV and preview the first rows.
# The path uses a forward slash so it resolves on every OS and does not contain
# an invalid backslash escape sequence inside the string literal.
data = pd.read_csv('dataset/chemical-dependence-treatment-program-admissions-beginning-2007.csv')
data.head()

Unnamed: 0,Year,County of Program Location,Program Category,Service Type,Age Group,Primary Substance Group,Admissions
0,2018,Albany,Crisis,Medical Managed Detoxification,18 thru 24,Alcohol,32
1,2018,Albany,Crisis,Medical Managed Detoxification,18 thru 24,Heroin,60
2,2018,Albany,Crisis,Medical Managed Detoxification,18 thru 24,Other Opioids,14
3,2018,Albany,Crisis,Medical Managed Detoxification,18 thru 24,Cocaine incl Crack,1
4,2018,Albany,Crisis,Medical Managed Detoxification,18 thru 24,All Others,10


In [7]:
# Number of rows recorded for each year, listed in chronological order
rows_per_year = data['Year'].value_counts()
rows_per_year.sort_index()

2007    6549
2008    6519
2009    6540
2010    6655
2011    6623
2012    6700
2013    6619
2014    6507
2015    6506
2016    6428
2017    6819
2018    6946
Name: Year, dtype: int64

In [38]:
# Time-based hold-out: train on the past (years up to and including 2013)
# and validate on the future (years after 2013).
df_train = data.loc[data['Year'].le(2013)]
df_val = data.loc[data['Year'].gt(2013)]

# Categorical columns that the encoders will transform
list_var_categoric = [
    'County of Program Location',
    'Program Category',
    'Service Type',
    'Age Group',
    'Primary Substance Group',
]

# Usando o "One Hot Encoder"

In [39]:
# Build a one-hot encoder (category_encoders) for the categorical columns and
# fit it on the training rows only. With use_cat_names=True the generated
# columns are named "<column>_<category>".
enc = OneHotEncoder(use_cat_names=True, cols=list_var_categoric)
enc.fit(df_train)

OneHotEncoder(cols=['County of Program Location', 'Program Category',
                    'Service Type', 'Age Group', 'Primary Substance Group'],
              drop_invariant=False, handle_missing='value',
              handle_unknown='value', return_df=True, use_cat_names=True,
              verbose=0)

In [21]:
# One-hot encode the training rows with the fitted encoder and preview the result
df_train_ohe = enc.transform(df_train)
df_train_ohe.head(5)

Unnamed: 0,Year,County of Program Location_Albany,County of Program Location_Allegany,County of Program Location_Bronx,County of Program Location_Broome,County of Program Location_Cattaraugus,County of Program Location_Cayuga,County of Program Location_Chautauqua,County of Program Location_Chemung,County of Program Location_Chenango,...,Age Group_45 thru 54,Age Group_55 and Older,Age Group_Under 18,Primary Substance Group_Alcohol,Primary Substance Group_Heroin,Primary Substance Group_Other Opioids,Primary Substance Group_All Others,Primary Substance Group_Marijuana incl Hashish,Primary Substance Group_Cocaine incl Crack,Admissions
33206,2013,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,13
33207,2013,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,186
33208,2013,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,47
33209,2013,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,7
33210,2013,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,132


In [22]:
# Apply the same fitted encoder to the validation rows and preview the result
df_val_ohe = enc.transform(df_val)
df_val_ohe.head(5)

Unnamed: 0,Year,County of Program Location_Albany,County of Program Location_Allegany,County of Program Location_Bronx,County of Program Location_Broome,County of Program Location_Cattaraugus,County of Program Location_Cayuga,County of Program Location_Chautauqua,County of Program Location_Chemung,County of Program Location_Chenango,...,Age Group_45 thru 54,Age Group_55 and Older,Age Group_Under 18,Primary Substance Group_Alcohol,Primary Substance Group_Heroin,Primary Substance Group_Other Opioids,Primary Substance Group_All Others,Primary Substance Group_Marijuana incl Hashish,Primary Substance Group_Cocaine incl Crack,Admissions
0,2018,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,32
1,2018,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,60
2,2018,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,14
3,2018,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
4,2018,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,10


In [23]:
# Separate the features from the "Admissions" target for both splits
x_train = df_train_ohe.drop(columns='Admissions')
y_train = df_train_ohe.loc[:, 'Admissions']

x_val = df_val_ohe.drop(columns='Admissions')
y_val = df_val_ohe.loc[:, 'Admissions']

In [29]:
# Fit a random forest regressor that predicts "Admissions" from the
# one-hot encoded features (random_state is fixed at 22)
mdl = RandomForestRegressor(
    n_estimators=100,
    random_state=22,
    n_jobs=6,
)
mdl.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=6,
                      oob_score=False, random_state=22, verbose=0,
                      warm_start=False)

In [31]:
# Predictions of the one-hot model for the validation rows
p_ohe = mdl.predict(X=x_val)

In [32]:
# Peek at the first five predictions
p_ohe[0:5]

array([ 19.46, 162.83,  46.92,  12.72,   6.61])

In [35]:
# Mean absolute error of the one-hot model on the validation rows
mean_absolute_error(y_true=y_val, y_pred=p_ohe)

15.181879178461726

# Usando o "Ordinal Encoder"

In [50]:
# Ordinal encoding (category_encoders): fit on the training rows only, then
# apply the same fitted encoder to both splits
enc = OrdinalEncoder(cols=list_var_categoric).fit(df_train)

df_train_ord, df_val_ord = (enc.transform(split) for split in (df_train, df_val))

df_train_ord.head(5)

Unnamed: 0,Year,County of Program Location,Program Category,Service Type,Age Group,Primary Substance Group,Admissions
33206,2013,1,1,1,1,1,13
33207,2013,1,1,1,1,2,186
33208,2013,1,1,1,1,3,47
33209,2013,1,1,1,1,4,7
33210,2013,1,1,1,2,1,132


In [47]:
# Separate the features from the "Admissions" target for both splits
x_train = df_train_ord.drop(columns='Admissions')
y_train = df_train_ord.loc[:, 'Admissions']

x_val = df_val_ord.drop(columns='Admissions')
y_val = df_val_ord.loc[:, 'Admissions']

# Fit a random forest regressor that predicts "Admissions" from the
# ordinal-encoded features (random_state is fixed at 22)
mdl = RandomForestRegressor(
    n_estimators=100,
    random_state=22,
    n_jobs=6,
)
mdl.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=6,
                      oob_score=False, random_state=22, verbose=0,
                      warm_start=False)

In [49]:
# Predict the validation rows with the ordinal model and report its mean absolute error
p_ord = mdl.predict(X=x_val)

mean_absolute_error(y_true=y_val, y_pred=p_ord)

21.506693669818706

# Usando o "Target Encoding"

In [56]:
# Target encoding (category_encoders): the encoder is fit on the training rows
# with "Admissions" as the target, then applied to both splits
enc = TargetEncoder(min_samples_leaf=200, cols=list_var_categoric)
enc.fit(df_train, df_train.loc[:, 'Admissions'])

df_train_tar, df_val_tar = (enc.transform(split) for split in (df_train, df_val))

df_train_tar.head(5)

Unnamed: 0,Year,County of Program Location,Program Category,Service Type,Age Group,Primary Substance Group,Admissions
33206,2013,35.887704,68.736003,99.255446,32.816565,96.876022,13
33207,2013,35.887704,68.736003,99.255446,32.816565,49.108832,186
33208,2013,35.887704,68.736003,99.255446,32.816565,15.765966,47
33209,2013,35.887704,68.736003,99.255446,32.816565,8.872935,7
33210,2013,35.887704,68.736003,99.255446,50.308285,96.876022,132


In [57]:
# Separate the features from the "Admissions" target for both splits
x_train = df_train_tar.drop(columns='Admissions')
y_train = df_train_tar.loc[:, 'Admissions']

x_val = df_val_tar.drop(columns='Admissions')
y_val = df_val_tar.loc[:, 'Admissions']

# Fit a random forest regressor that predicts "Admissions" from the
# target-encoded features (random_state is fixed at 22)
mdl = RandomForestRegressor(
    n_estimators=100,
    random_state=22,
    n_jobs=6,
)
mdl.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=6,
                      oob_score=False, random_state=22, verbose=0,
                      warm_start=False)

In [58]:
# Predict the validation rows with the target-encoded model and report its mean absolute error
p_tar = mdl.predict(X=x_val)

mean_absolute_error(y_true=y_val, y_pred=p_tar)

15.35572077750655

In [59]:
# Display the validation predictions produced by the target-encoded model
p_tar

array([ 20.3       , 163.14      ,  43.17      , ...,   3.18478571,
         1.51417063,  10.84181019])