In [None]:
import numpy as np
import pandas as pd
import re
import sys
import os

from sklearn.impute import SimpleImputer 
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, Normalizer
from pycaret.classification import *

sys.path.append('..')
from src import config

# choose the project root for the current platform and make it the working
# directory, so relative paths used later in the notebook resolve against it
path = config.LINUX_PATH if sys.platform == 'linux' else config.OS_PATH
os.chdir(path)

# An ordinal such as "1st", "22nd", "3rd" or "10th" as a whole word.
# The suffix alternatives are grouped so that BOTH word boundaries apply to
# every alternative (an ungrouped `\b\d+th|\d+st|...` would only anchor the
# first and last alternatives and match inside words like "21stuff").
_ORDINAL_PATTERN = r"\b\d+(?:st|nd|rd|th)\b"


def race_heuristic(strava_caption, workout_type):
    """Label an activity as a race when its caption mentions a finishing place.

    Args:
        strava_caption: activity title; coerced to ``str`` so missing values
            (e.g. NaN) are handled without raising.
        workout_type: the activity's existing workout_type label (may be NaN).

    Returns:
        ``1`` (the race class) if the caption contains a standalone ordinal
        number such as "3rd"; otherwise ``workout_type`` unchanged.
    """
    caption = str(strava_caption)
    if re.search(_ORDINAL_PATTERN, caption):
        return 1
    return workout_type

# readable labels for the integer workout_type classes
# (1 is the race class, the same value race_heuristic assigns)
mapping = {
    0: 'easy_run',
    1: 'race',
    2: 'long_run',
    3: 'workout',
}

In [None]:
# load in preprocessed data
data_path = config.STRAVA_TRAIN_PATH
data = pd.read_csv(data_path, index_col=0)

# apply race heuristics: captions mentioning a finishing place are relabelled as class 1 (race)
data['workout_type'] = data.apply(lambda row: race_heuristic(row['name'], row['workout_type']), axis=1)
# races are set aside here and excluded from the classifier data below
data_race = data[data['workout_type'] == 1]
data = data[data['workout_type'] != 1]  # NaN != 1 is True, so unlabelled rows are kept

# train and test set for workout classifier
# Split labelled / unlabelled rows with a boolean mask instead of index-label
# membership: if the CSV index contains duplicate labels, `index.isin(...)`
# would also drag labelled rows that share a label with an unlabelled one
# into `data_missing`.
missing_mask = data['workout_type'].isnull()
data_missing_ix = data[missing_mask].index
data_full = data[~missing_mask]
data_missing = data[missing_mask]

# get columns that are useful for classifying workout_type
cols = ['workout_type', 'distance', 'moving_time', 'elapsed_time', 'average_speed', 'max_speed', 'average_heartrate', 'max_heartrate']
data_full = data_full[cols]
data_full = data_full.reset_index(drop=True)
data_missing = data_missing[cols]

# check class imbalance of workout_type
# print("Class imbalance: \n", data_full.workout_type.value_counts() / data_full.shape[0])
X, y = data_full.drop('workout_type', axis=1), data_full.workout_type

# fixed seed so the stratified split is identical on every Restart & Run All
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.25, random_state=RANDOM_STATE
)


# fill missing feature values using the 'mean' strategy, fitted on the training split only
cols_to_imp = [col for col in cols if col != 'workout_type']
impute_mean = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X_train)

# apply the train-fitted imputer to both splits
X_train, X_test = impute_mean.transform(X_train), impute_mean.transform(X_test)


# scale the data using normalizer or std scaler
def scale_num_data(X_train, X_test, normalizer=False):
    """Scale both splits with a transformer fitted on the training split only.

    Args:
        X_train: training features; the scaler is fitted on these.
        X_test: held-out features; transformed with the train-fitted scaler.
        normalizer: if True use ``Normalizer``, otherwise ``StandardScaler``.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)``.
    """
    scaler_cls = Normalizer if normalizer else StandardScaler
    fitted = scaler_cls().fit(X_train)
    return fitted.transform(X_train), fitted.transform(X_test)
# scale both splits with StandardScaler (normalizer=False), fitted on the training data
X_train, X_test = scale_num_data(X_train=X_train, X_test=X_test, normalizer=False)

In [None]:
# Pin the experiment seed: without session_id PyCaret draws a random one on
# each run, so the hold-out split, CV folds and model ranking would differ
# between kernel restarts.
clf = setup(data=data_full, target='workout_type', session_id=42)  # optional: fold_shuffle=True, imputation_type='iterative'
best = compare_models()

In [None]:
model = create_model('lr')
# plot_model(model, 'confusion_matrix')

# finalize_model returns a NEW model refit on the full dataset (including the
# hold-out set); it does not modify `model` in place, so the result must be
# captured and that object saved.
final_model = finalize_model(model)

# make sure the output directory exists before PyCaret writes the pickle
os.makedirs('models', exist_ok=True)
save_model(final_model, 'models/workoutImputer')
