# Fitness Trends Dataset: A dataset of fitness trends and how they change with exercise

## Parte 2: Selección de Modelos

[Fitness Trends Dataset: A dataset of fitness trends and how they change with exercise](https://www.kaggle.com/aroojanwarkhan/fitness-data-trends/)

Clases para el STAN de [Liricus SRL](http://www.liricus.com.ar)

Dictado durante 3 clases en el primer semestre del 2019 por personal del [IATE-OAC-CONICET](http://iate.oac.uncor.edu/)

## Librerías a usar

In [None]:
# object serialization (used below to load the pickled results and scaler)
import pickle

# conventional numerical computing
import numpy as np

# dataframes
import pandas as pd

# plotting (the magic renders figures inline in the notebook)
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import cm

# nicer dataframe-aware plotting
import seaborn as sns

# ML
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, roc_curve, auc

# fix NumPy's global random state
np.random.seed(42)

# silence all warnings
import warnings
warnings.filterwarnings("ignore")

from IPython.display import display, Markdown

In [None]:
import joblib
# CPU count as reported by joblib; printed for reference only.
cpu = joblib.cpu_count()
print(cpu)

In [None]:
# Name of the target column.
yc = "bool_of_active"

# Names of the feature columns, in the order the models receive them.
Xc = [
    "step_count",
    "mood",
    "calories_burned",
    "hours_of_sleep",
    "weight_kg",
]

### Recreamos los datos

In [None]:
# Load the pickled dataframe (presumably the scaled data, going by the file
# name -- confirm against the notebook that wrote it).
df = pd.read_pickle("out/scaled_df.pkl")

# Feature matrix and target vector as plain NumPy arrays.
X, y = df[Xc].values, df[yc].values

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
split = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split

### Recreamos los clasificadores

In [None]:
# Best hyper-parameters found earlier, stored as
# {classifier name: {score name: parameters}}.
with open("out/best_results.pkl", "rb") as results_file:
    bests = pickle.load(results_file)

# Render one markdown section per classifier, with one bullet per score.
md_lines = []
for clf_name, per_score in bests.items():
    md_lines.append(f"#### {clf_name}")
    md_lines.extend(
        f"- **{score}**: {params}" for score, params in per_score.items())
    md_lines.append("")

display(Markdown("\n".join(md_lines)))

# Rebuild the classifiers to compare from the stored parameters.
# SVC needs probability=True so that predict_proba is available later.
svc_params = bests["SVC"]["precision"]
rf_params = bests["RandomForestClassifier"]
knn_params = bests["KNeighborsClassifier"]["precision"]

clfs = {
    "svc": SVC(probability=True, **svc_params),
    "rf_p": RandomForestClassifier(**rf_params["precision"]),
    "rf_r": RandomForestClassifier(**rf_params["recall"]),
    "knn": KNeighborsClassifier(**knn_params)}

### Ejecutamos los clasificadores

In [None]:
# Fit every classifier on the training split and print its classification
# report on the held-out test split.
separator = "-" * 40
for name, clf in clfs.items():
    print(f"Model {name}")
    fitted = clf.fit(X_train, y_train)  # fit() returns the estimator itself
    y_true = y_test
    y_pred = fitted.predict(X_test)
    print(classification_report(y_true, y_pred))
    print(separator)

### Curva ROC

In [None]:
# Plot one ROC curve per fitted classifier on the test split, labelled with
# its AUC, plus the diagonal "chance" line for reference.
fig, ax = plt.subplots(figsize=(10, 8))

ax.set_title("ROC Curve")

# Sample one distinct viridis colour per classifier. Calling the colormap
# with a single integer would return ONE RGBA tuple (four floats), not a
# list of colours, so it is sampled at evenly spaced positions instead.
# The range stops at 0.9 to avoid the palest yellow.
cmap = plt.get_cmap("viridis")
colors = iter(cmap(np.linspace(0, 0.9, len(clfs))))

for name, clf in clfs.items():

    # Column 1 is used as the positive-class score
    # (assumes a binary target -- confirm against the data).
    y_probas = clf.predict_proba(X_test)
    preds = y_probas[:, 1]

    fpr, tpr, _ = roc_curve(y_test, preds)
    roc_auc = auc(fpr, tpr)

    # The colour must be passed by keyword: a third positional non-string
    # argument is treated as extra data and draws a spurious line.
    # Attaching the label here keeps every legend entry tied to its curve.
    ax.plot(fpr, tpr, color=next(colors),
            label=f'{name} - AUC={roc_auc:.2f}')

# Chance line; it has no label, so it stays out of the legend.
ax.plot([0, 1], [0, 1], 'r--')

ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
ax.legend(loc='lower right')

ax.set_xlim([0, 1.01])
ax.set_ylim([0, 1.01])


----

### ¿Y cómo se usa?

Supongamos que tenemos una observación nueva:

In [None]:
# A new, unscaled observation. Values follow the feature order:
# step_count, mood ("regular" per the original note), calories_burned,
# hours_of_sleep, weight (kg).
obs = [1000, 200, 100, 7.5, 68]

#### 1. Se escala la observación con los datos **DE ENTRENAMIENTO**

In [None]:
# Load the pickled scaler object from disk.
with open("out/scaler.pkl", "rb") as scaler_file:
    scaler = pickle.load(scaler_file)

In [None]:
# Wrap the observation in a list so it is passed as a batch of one row.
# NOTE: this rebinds `obs`, so the cell is meant to be run once per new
# observation.
single_row = [obs]
obs = scaler.transform(single_row)

#### 2. Se entrena con **TODOS LOS DATOS**

In [None]:
# Final model: a random forest with the stored "precision" parameters,
# trained on the whole dataset (X, y) rather than only the training split.
rf_precision_params = bests["RandomForestClassifier"]["precision"]
clf = RandomForestClassifier(**rf_precision_params)
clf.fit(X, y)

#### 3. Se predice

In [None]:
# Prediction of the final model for the scaled observation.
clf.predict(obs)

In [None]:
# Per-class probabilities of the final model for the scaled observation.
clf.predict_proba(obs)