# Objetivo: Predecir si una persona es saludable o no

## Obtención del dataset y limpieza de datos

In [None]:
# Risk-factor survey dataset (about 400,000 respondents).
# https://www.kaggle.com/cdc/behavioral-risk-factor-surveillance-system

import numpy as np
import pandas as pd

# Alternative source. Judging by the local file name below it, this is
# presumably the full 2015 dataset -- TODO confirm.
#url = "https://seafile.innova.antel.net.uy/f/e7e8270c549f49a8b898/?dl=1"
#url = './data/2015_health.csv'

# Active source. Judging by the local file name below it, this is presumably
# the reduced ("lite") version of the dataset -- TODO confirm.
url = "https://seafile.innova.antel.net.uy/f/4737172cc5274f539367/?dl=1"
#url = './data/2015_health_lite.csv'

# Fetch and parse the CSV into a DataFrame.
df = pd.read_csv(url)

# Uncomment to work on a random sample of the dataset instead.
#df = df.sample(10000)

# Preview the first rows.
df.head()

In [None]:
# Dataset preparation. In production the same transformations must be applied
# to incoming data before predicting with the model.

# Keep numeric features only.
df = df.select_dtypes('number')

# Build the binary target from '_RFHLTH':
#   0 -> not a healthy person (stored as 2 in the raw data)
#   1 -> healthy person
# Rows holding any other code, or a missing value, are discarded.
df['_RFHLTH'] = df['_RFHLTH'].replace({2: 0})
df = df.loc[df['_RFHLTH'].isin([0, 1])].copy()
df = df.rename(columns={'_RFHLTH': 'label'})

# Show the class balance. Printed explicitly because a bare expression is only
# rendered when it is the last statement of a notebook cell.
print(df['label'].value_counts())

# Remove the other health-status columns from the features.
# NOTE(review): presumably dropped because they would leak the target into the
# features -- confirm against the survey codebook. Columns that are entirely
# empty are handled separately, after the train/test split.
df = df.drop(columns=['POORHLTH', 'PHYSHLTH', 'GENHLTH', 'PAINACT2',
                      'QLMENTL2', 'QLSTRES2', 'QLHLTH2', 'HLTHPLN1', 'MENTHLTH'])

In [None]:
# Split the dataset into train and test sets before fitting anything.

from sklearn.model_selection import train_test_split

# Pull the target out of the feature frame (pop also removes 'label' from df).
labels = np.array(df.pop('label'))

# Hold out 30% of the rows for testing. The seed is fixed so that the split,
# and therefore every accuracy reported afterwards, is reproducible across
# runs, consistent with the seeded models trained on it.
X_train, X_test, y_train, y_test = train_test_split(
    df, labels, test_size=0.30, random_state=0
)


In [None]:

# Boolean mask, indexed by column name: True where the column holds no value
# at all in the training split.
columns_null = X_train.isnull().all()

# Names of the all-null columns, in their original order. Keep this list: the
# final model must ignore the same columns at prediction time.
columns_drop = columns_null[columns_null].index.tolist()

# Remove them from both splits so train and test keep identical features.
X_train = X_train.drop(columns=columns_drop)
X_test = X_test.drop(columns=columns_drop)

In [None]:
# Fill empty/null values with per-column means computed on the training split
# only; the test split reuses those same means.
# In production, persist `mean` and apply it to incoming data the same way.
mean = X_train.mean()
X_train, X_test = X_train.fillna(mean), X_test.fillna(mean)

## Construcción de la solución

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree limited to depth 30 with a fixed seed; fit() returns
# the estimator itself, so construction and training are chained.
tree = DecisionTreeClassifier(max_depth=30, random_state=0).fit(X_train, y_train)

# Report the size of the fitted tree.
print(f'Decision tree has {tree.tree_.node_count} nodes with maximum depth {tree.tree_.max_depth}.')

In [None]:
# Score the decision tree on the training split and on the held-out test
# split; a large gap between the two suggests overfitting.
print("Train mean accuracy: ", tree.score(X_train, y_train))
print("Test mean accuracy: ", tree.score(X_test, y_test))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 10 trees with a fixed seed.
forest = RandomForestClassifier(n_estimators=10, random_state=1)

# Train on the same training split used for the single decision tree.
forest.fit(X_train, y_train)


In [None]:
# Collect the node count and the depth of every individual tree in the forest.
n_nodes = [member.tree_.node_count for member in forest.estimators_]
max_depths = [member.tree_.max_depth for member in forest.estimators_]

# Report the averages, truncated to whole numbers.
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

In [None]:
# Score the random forest on the training split and on the held-out test
# split, for comparison with the single decision tree.
print("Train mean accuracy: ", forest.score(X_train, y_train))
print("Test mean accuracy: ", forest.score(X_test, y_test))