# Fitness Trends Dataset A dataset of fitness trends and how they change with exercise

## Parte 2: Selección de Modelos

[Fitness Trends Dataset A dataset of fitness trends and how they change with exercise](https://www.kaggle.com/aroojanwarkhan/fitness-data-trends/)

Clases para el STAN de [Liricus SRL](http://www.liricus.com.ar)

Dictado durante 3 clases en el primer semestre del 2019 por personal del [IATE-OAC-CONICET](http://iate.oac.uncor.edu/)

## Librerías a usar

In [None]:
# contador
from collections import Counter

# computo numerico convencional
import numpy as np

# dataframes
import pandas as pd

# importamos plots
%matplotlib inline
import matplotlib.pyplot as plt

# mejor manejo de dataframes para plot
import seaborn as sns

# scikit learn
from sklearn import preprocessing

# fix the global NumPy random state
np.random.seed(42)

# silence every warning for the rest of the session
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Name of the target column.
yc = "bool_of_active"

# Names of the feature columns fed to the models.
Xc = [
    "step_count",
    "mood",
    "calories_burned",
    "hours_of_sleep",
    "weight_kg",
]

In [None]:
# Load the raw records from disk.
df = pd.read_csv("data/fitness-data-trends.csv")

# Collapse the activity flag to 0/1: any truthy value becomes 1.
df["bool_of_active"] = df["bool_of_active"].apply(lambda flag: 1 if flag else 0)

# Parse the date column and use it as the index.
df = df.assign(date=pd.to_datetime(df["date"])).set_index("date")

In [None]:
# Rescale the feature columns with a min-max scaler and write the result back
# into the DataFrame; the fitted scaler is kept in `min_max_scaler`.
# NOTE(review): the scaler is fit on every row, i.e. before the train/test
# split performed later in the notebook, so the held-out rows influence the
# scaling. Confirm that this is acceptable for the evaluation.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit(df[Xc].values).transform(df[Xc].values)
df[Xc] = x_scaled

In [None]:
# Display summary statistics of the prepared DataFrame.
df.describe()

In [None]:
# Pairwise plots of the feature columns, colored by the target column.
# The trailing semicolon keeps the notebook from echoing the returned object.
sns.pairplot(df, hue=yc, vars=Xc);

## Repasando Machine Learning

<b><a href="imgs/slides.pdf">![ddd](imgs/err1.png)</a></b>

### Hyper Parámetros

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import joblib

In [None]:
# CPU count reported by joblib; passed as n_jobs to the grid searches below.
cpu = joblib.cpu_count()
print(cpu)

In [None]:
# Best hyper-parameters found so far, keyed by estimator class name and then
# by the metric that was optimized.
bests = {}

# Feature matrix and target vector as plain NumPy arrays.
X, y = df[Xc].values, df[yc].values

# Hold out 30% of the rows for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

### Support Vector Machines

In [None]:
from sklearn.svm import SVC

In [None]:
%%time

# Hyper-parameter grid for the cross-validated search: an RBF kernel over
# gamma/C combinations plus a linear kernel over C only.
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# One independent search per metric, each scored with its macro average.
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score, end="\n\n")

    # 5-fold cross-validated search over the grid, run on `cpu` parallel jobs.
    clf = GridSearchCV(
        SVC(),
        tuned_parameters,
        scoring='%s_macro' % score,
        cv=5,
        n_jobs=cpu,
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:", end="\n\n")
    print(clf.best_params_, end="\n\n")
    print("Grid scores on development set:", end="\n\n")

    # One line per grid point: mean CV score, then two standard deviations.
    cv_results = clf.cv_results_
    means = cv_results['mean_test_score']
    stds = cv_results['std_test_score']
    for mean, std, params in zip(means, stds, cv_results['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    print("Detailed classification report:", end="\n\n")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.", end="\n\n")

    # Evaluate the selected model on the held-out rows.
    y_true = y_test
    y_pred = clf.predict(X_test)
    print(classification_report(y_true, y_pred), end="\n\n")

    # Record the winning parameters under the estimator's class name.
    clf_name = type(clf.estimator).__name__
    bests.setdefault(clf_name, {})[score] = clf.best_params_

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
%%time

# Hyper-parameter grid for the cross-validated search.
# 'auto' is intentionally not listed under max_features: for
# RandomForestClassifier it was only an alias of 'sqrt' (deprecated in
# scikit-learn 1.1 and removed in 1.3), so it either duplicated the 'sqrt'
# grid point or made every fit that used it fail on current releases.
# NOTE(review): each forest requests 10 workers while the grid search itself
# runs `cpu` jobs in parallel; this may oversubscribe the machine. Consider
# n_jobs=1 here if that turns out to be a problem.
tuned_parameters = [
    {'max_features': ['sqrt', "log2", None, 0.2, 0.5],
     "min_samples_split": [2, 5, 10],
     "n_estimators": [500],
     "criterion": ["entropy"],
     "n_jobs": [10]}]

# One independent search per metric, each scored with its macro average.
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # 5-fold cross-validated search over the grid, run on `cpu` parallel jobs.
    clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5, n_jobs=cpu,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    # One line per grid point: mean CV score, then two standard deviations.
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Evaluate the selected model on the held-out rows.
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

    # Record the winning parameters under the estimator's class name.
    clf_name = clf.estimator.__class__.__name__
    bests.setdefault(clf_name, {})[score] = clf.best_params_

### K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
%%time

# Hyper-parameter grid for the cross-validated search: neighborhood size,
# vote weighting, and the Minkowski power parameter.
tuned_parameters = [
    {
        'n_neighbors': range(3, 30),
        "weights": ["uniform", "distance"],
        "p": [1, 2],
    },
]

# One independent search per metric, each scored with its macro average.
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score, end="\n\n")

    # 5-fold cross-validated search over the grid, run on `cpu` parallel jobs.
    clf = GridSearchCV(
        KNeighborsClassifier(),
        tuned_parameters,
        scoring='%s_macro' % score,
        cv=5,
        n_jobs=cpu,
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:", end="\n\n")
    print(clf.best_params_, end="\n\n")
    print("Grid scores on development set:", end="\n\n")

    # One line per grid point: mean CV score, then two standard deviations.
    cv_results = clf.cv_results_
    means = cv_results['mean_test_score']
    stds = cv_results['std_test_score']
    for mean, std, params in zip(means, stds, cv_results['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    print("Detailed classification report:", end="\n\n")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.", end="\n\n")

    # Evaluate the selected model on the held-out rows.
    y_true = y_test
    y_pred = clf.predict(X_test)
    print(classification_report(y_true, y_pred), end="\n\n")

    # Record the winning parameters under the estimator's class name.
    clf_name = type(clf.estimator).__name__
    bests.setdefault(clf_name, {})[score] = clf.best_params_

### Comparando los mejores

In [None]:
from IPython.display import display, Markdown

In [None]:
# Build one Markdown section per estimator, with one bullet per tuned metric,
# and render it in the notebook.
out = []
for clf, results in bests.items():
    out.append(f"#### {clf}")
    out.extend(f"- **{score}**: {best}" for score, best in results.items())
    out.append("")

display(Markdown("\n".join(out)))

### Persistiendo

In [None]:
import pickle

In [None]:
import os

# Create the output directory first so the writes below do not fail with
# FileNotFoundError when "out/" does not exist yet.
os.makedirs("out", exist_ok=True)

# Fitted min-max scaler.
with open("out/scaler.pkl", "wb") as fp:
    pickle.dump(min_max_scaler, fp)

# DataFrame with the scaled feature columns.
df.to_pickle("out/scaled_df.pkl")

# Best hyper-parameters per estimator and metric.
with open("out/best_results.pkl", "wb") as fp:
    pickle.dump(bests, fp)