In [1]:
import os
import pandas as pd
import pickle
import time
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

# About

This notebook looks at tpot, an automated machine learning system. Tpot uses an evolutionary algorithm to perform model selection and hyperparameter tuning.

# Dataset

Source: [Heterogeneity Activity Recognition Data Set](https://archive.ics.uci.edu/ml/datasets/Heterogeneity+Activity+Recognition)

The dataset and accompanying research can be found at UCI's dataset repository. The copy used in this notebook was transformed from the original dataset in the `process_datasets.ipynb` notebook.

In [2]:
# Root of the local dataset store, taken from the DATASET environment variable
# (a KeyError is raised here if it is not set).
dataset_root = os.environ["DATASET"]
data_directory = f"{dataset_root}/heterogeneity_activity_recognition"
# Zipped CSV read by the load cell.
data_path = data_directory + "/processed/phones.zip"

In [3]:
# Read the processed phone readings into `df` (timed), then preview the first row.
%time df = pd.read_csv(data_path); df.head(1)

CPU times: user 10.2 s, sys: 333 ms, total: 10.5 s
Wall time: 10.5 s


Unnamed: 0,arrival_time,target,user,x_accel,x_gyro,y_accel,y_gyro,z_accel,z_gyro
0,1424779162870,stand,f,-1.618774,0.009163,0.029892,-0.01741,10.02536,0.009163


# Balance

In [4]:
def balance(strategy="undersample", seed=0):
    """Build a DataFrame transformer that equalises the class sizes of `target`.

    Parameters
    ----------
    strategy : str
        Any string containing "oversample" draws every class *with*
        replacement up to the size of the largest class. Any other value
        (including the default "undersample") draws every class *without*
        replacement down to the size of the smallest class.
    seed : int
        Passed as `random_state` to each per-class `DataFrame.sample` call,
        so the same input always yields the same output.

    Returns
    -------
    callable
        `F(df) -> DataFrame`, suitable for `DataFrame.pipe`. The result holds
        the sampled rows of each class, grouped in the order the classes first
        appear in `df.target`, with their original index labels. A frame with
        no (non-null) targets yields an empty frame with the same columns.
    """
    oversample = "oversample" in strategy

    def F(df):
        class_sizes = df.target.value_counts()

        # Nothing to balance: return an empty frame that keeps the schema.
        if class_sizes.empty:
            return df.iloc[0:0].copy()

        count = int(class_sizes.max() if oversample else class_sizes.min())

        # Collect the per-class samples first and concatenate once; growing a
        # frame with concat inside the loop copies it on every iteration.
        samples = [
            df[df.target == action].sample(
                n=count, replace=oversample, random_state=seed
            )
            for action in df.target.unique()
        ]

        return pd.concat(samples, axis="rows")

    return F

# Factorize Categorical Data

Tpot expects only numeric data. `target` and `user` are represented as strings. Both of these columns are categorical data each containing 10-20 unique values. These categories are assigned a number based on the order they appear. 

The numerical representation can be decoded using the `target_uniques` and `user_uniques` variables. Each holds the distinct categories in the order they first appear; a category's numeric representation is its zero-based position in the corresponding `*_uniques` index.

```
Categorical data -> numerical representation  
["walk", "stand", "bike", "sit"] -> [0,1,2,3]
```

In [5]:
# Encode each string column as integer codes; the `*_uniques` index maps a
# code back to its original category.
target_labels, target_uniques = df.target.factorize()
user_labels, user_uniques = df.user.factorize()

## Apply Change and Select Subset
Factorizations are applied in this step, and a subset of the dataset is selected.

In testing, TPOT took a very long time to complete a single generation on the full dataset, so I opted to draw 10,500 random samples instead. Balancing the classes afterwards trims this slightly (10,140 rows in the run shown below).

In [6]:
# Drop the raw timestamp, replace the string columns with their integer codes,
# draw a random subset, then undersample so every class has the same size.
# NOTE: this rebinds `df`; re-running the cell without reloading the data fails
# because `arrival_time` has already been dropped.
df = (
    df
    .drop(columns=["arrival_time"])
    .assign(
        target=lambda _: target_labels,
        user=lambda _: user_labels
    )
    # Seeded so the subset, and every result derived from it, is reproducible.
    .sample(n=10_500, random_state=0)
    .pipe(balance())
)

In [7]:
# Rows per class after balancing; every class should show the same count.
df.target.value_counts()

5    1690
4    1690
3    1690
2    1690
1    1690
0    1690
Name: target, dtype: int64

In [8]:
# Total number of rows kept after sampling and balancing.
len(df)

10140

In [9]:
# Preview one row: `arrival_time` is gone and `target`/`user` are now integer codes.
df.head(1)

Unnamed: 0,target,user,x_accel,x_gyro,y_accel,y_gyro,z_accel,z_gyro
2022322,1,3,1.534027,0.002792,0.554764,-0.003418,10.212219,-0.002411


# Split Test And Train Sets

In [10]:
# Hold out part of the balanced subset for scoring. Features are every column
# except `target`; the split is seeded so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["target"]),
    df.target,
    random_state=0,
)

In [11]:
# Sanity check: training features no longer contain the `target` column.
X_train.head(1)

Unnamed: 0,user,x_accel,x_gyro,y_accel,y_gyro,z_accel,z_gyro
6662540,2,4.763264,0.349415,0.949301,-0.777631,5.820304,0.464258


In [12]:
# Sanity check: test features have the same columns as the training features.
X_test.head(1)

Unnamed: 0,user,x_accel,x_gyro,y_accel,y_gyro,z_accel,z_gyro
6955084,6,-1.503326,0.836275,-1.579239,0.019853,7.32721,0.637132


In [13]:
# Sanity check: training labels share their index with `X_train`.
y_train.head(1)

6662540    4
Name: target, dtype: int64

In [14]:
# Sanity check: test labels share their index with `X_test`.
y_test.head(1)

6955084    4
Name: target, dtype: int64

# Classify

## Train

In [15]:
%%time
model_finder = TPOTClassifier(generations=16, population_size=32, verbosity=2, n_jobs=1)
model_finder.fit(X_train, y_train)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=544.0, style=ProgressStyle(de…

Generation 1 - Current best internal CV score: 0.770940170940171
Generation 2 - Current best internal CV score: 0.770940170940171
Generation 3 - Current best internal CV score: 0.770940170940171
Generation 4 - Current best internal CV score: 0.770940170940171
Generation 5 - Current best internal CV score: 0.7789612097304406
Generation 6 - Current best internal CV score: 0.7817225509533202
Generation 7 - Current best internal CV score: 0.7817225509533202
Generation 8 - Current best internal CV score: 0.7847468770545694
Generation 9 - Current best internal CV score: 0.7847468770545694
Generation 10 - Current best internal CV score: 0.7847468770545694
Generation 11 - Current best internal CV score: 0.7847468770545694
Generation 12 - Current best internal CV score: 0.7847468770545694
Generation 13 - Current best internal CV score: 0.7847468770545694
Generation 14 - Current best internal CV score: 0.7851413543721236
Generation 15 - Current best internal CV score: 0.7851413543721236
Generati

TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=16,
               max_eval_time_mins=5, max_time_mins=None, memory=None,
               mutation_rate=0.9, n_jobs=1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=32,
               random_state=None, scoring=None, subsample=1.0, template=None,
               use_dask=False, verbosity=2, warm_start=False)

## Score

In [16]:
# Score the fitted TPOT model on the held-out test split (timed).
%time model_finder.score(X_test, y_test)

CPU times: user 51.8 ms, sys: 1 ms, total: 52.8 ms
Wall time: 51.7 ms


0.7676528599605522

# Export Model

In [17]:
# Timestamp shared by both artefacts so the exported pipeline and its label
# mapping can be matched up later.
time_stamp = int(time.time())

# Make sure the output directory exists, so a missing folder does not discard
# the result of the long training run.
os.makedirs("../model", exist_ok=True)

# Python source of the best pipeline found by TPOT.
model_finder.export(f"../model/experiment01_model_{time_stamp}.py")

# Category index needed to decode the integer `target` codes; the context
# manager guarantees the file is flushed and closed.
with open(f"../model/experiment01_targetset_{time_stamp}.pkl", "wb") as targetset_file:
    pickle.dump(target_uniques, targetset_file)