In [None]:
%load_ext memory_profiler
import sys; sys.path.append("../src")

In [2]:
import lib.data as dataset
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split

# About

This notebook looks at the exported pipeline that tpot generated in the model selection notebook. In this notebook that pipeline is modified to work with the existing dataset. That model is then exported in a serialized format to prevent having to retrain for invocations.

# Dataset

Source: [Heterogeneity Activity Recognition Data Set](https://archive.ics.uci.edu/ml/datasets/Heterogeneity+Activity+Recognition)

The dataset and accompanying research can befound at UCIs dataset repository. The copy used in this notebook was transformed from the original dataset in the `process_datasets.ipynb`

In [3]:
%time df = dataset.read_local_phones(); df.head(1)

CPU times: user 10.8 s, sys: 411 ms, total: 11.2 s
Wall time: 11.3 s


Unnamed: 0,arrival_time,target,user,x_accel,x_gyro,y_accel,y_gyro,z_accel,z_gyro
0,1424686733391,stand,g,-2.779668,0.015577,1.908179,-0.043371,8.927979,-0.014661


In [4]:
target_labels, target_uniques = pd.factorize(df.target)
user_labels, user_uniques = pd.factorize(df.user)

## Apply Changes and Select Subset
Factorizations are applied in this step, and a subset of the dataset is selected.

In testing Tpot took a very long time to complete a single generation of the full dataset. I opted to select 100,00 random samples instead. 

In [5]:
df = (
    df.assign(
        target=lambda _: target_labels,
        user=lambda _: user_labels
    )
    .drop(columns=["arrival_time"])
    .sample(n=125_000)
    .pipe(dataset.balance())
); df.head(1)

Unnamed: 0,target,user,x_accel,x_gyro,y_accel,y_gyro,z_accel,z_gyro
3844539,1,3,1.418747,-0.019638,-0.362427,0.006195,9.831162,-0.226715


In [6]:
df.target.value_counts()

5    16659
4    16659
3    16659
2    16659
1    16659
0    16659
Name: target, dtype: int64

In [7]:
len(df)

99954

# Split Test and Train Set

In [8]:
X_train, X_test, y_train, y_test = (
    train_test_split(df.drop(columns=["target"]), df.target)
)

In [9]:
print(f"Train set size: {len(X_train)}; test set size: {len(X_test)}")

Train set size: 74965; test set size: 24989


# Pipeline

This pipeline was taken from `model_01.py`. None of the hyperparameters have been adjusted from the output.

In [10]:
len(df)

99954

In [11]:
pipeline = ExtraTreesClassifier(
    bootstrap=False, 
    criterion="gini", 
    max_features=0.3, 
    min_samples_leaf=1, 
    min_samples_split=4, 
    n_estimators=100
)

# Train

In [12]:
%time pipeline.fit(X_train, y_train)

CPU times: user 3.85 s, sys: 125 ms, total: 3.97 s
Wall time: 3.98 s


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features=0.3,
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=4,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [13]:
%time results = pipeline.predict(X_test)

CPU times: user 555 ms, sys: 71 µs, total: 555 ms
Wall time: 555 ms


# Export Pipeline

In [14]:
pickle.dump(pipeline, open("../model/experiment_01_model.pkl", "wb"))

# Export Decoder

In [15]:
pickle.dump(target_uniques, open("../model/experiment_01_decoder.pkl", "wb"))

# Evaluate Exported Pipeline

In [16]:
%time rehydrated = pickle.load(open("../model/experiment_01_model.pkl", "rb"))

CPU times: user 96.2 ms, sys: 233 ms, total: 330 ms
Wall time: 330 ms


In [17]:
%time rehydrated.score(X_test, y_test)

CPU times: user 549 ms, sys: 895 µs, total: 550 ms
Wall time: 550 ms


0.8338468926327585

In [18]:
sample = X_test.sample(n=1)

In [19]:
%timeit target_uniques[rehydrated.predict(sample)]

6.57 ms ± 42.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
