In [60]:
import pickle
import pandas as pd
import numpy as np
import scipy.stats
import statsmodels.api as sm
import statsmodels.tsa as tsa
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import seaborn as sns
import math
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
import pdb

In [69]:
def read_pickle(path):
    
    input_file = open(path,'rb')
    variable = pickle.load(input_file)
    input_file.close()
    return(variable)

def downsample(df, downsample_step = 10):
    
    # Downsample
    df['downsample'] = df['time_period'] % downsample_step
    df = df[df['downsample'] == 0]
    
    return(df)

In [84]:
# Read in data
data = read_pickle('../data/watch/processed_data/train_df.pkl')

# Downsample, we don't need that many observations
data = downsample(data, downsample_step = 5)
len(data)

484378

In [85]:
x = data.drop('is_exercise', axis = 1)
y = data['is_exercise']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [86]:
#Plot ROC Curve
def plot_roc_curve(fpr, tpr, label = None):
    plt.plot(fpr, tpr, linewidth = 2, label = label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    
#Plot basic diagnostics
def classification_diagnostics(model, x, y, standardize = True, classifier = True):
    
    fig, axs = plt.subplots(nrows = 1, ncols=2, figsize = (15,4))
    
    if classifier == True:
        
        y_hat = model.predict_proba(x)[:, 1]
        residuals = y - y_hat
        std_error = math.sqrt(np.var(residuals))
        std_residuals = residuals/std_error
        
    else:
        
        y_hat = model.predict(x)
        residuals = y_hat - y
        std_error = math.sqrt(np.var(residuals))
        std_residuals = residuals/std_error
        
    #Residual vs Fitted Plot
    sns.regplot(y_hat, 
                std_residuals, 
                lowess=True,
                line_kws={'color': 'red', 'lw': 1, 'alpha': 1},
                ax = axs[0])
    
    #Histogram of residuals
    sns.distplot(std_residuals, 
                hist=True,
                ax = axs[1])
    
    #Labels
    axs[0].set(xlabel="Fitted Values", 
                ylabel="Pearson's Standardized Residuals", 
                title = 'Residual vs Fitted')
    
    axs[1].set(xlabel="Fitted Values", 
                ylabel="Frequency", 
                title = "Residuals Histogram")

In [87]:
class FeatureSelector(BaseEstimator, TransformerMixin):

    def __init__(self, select_features):
        self.select_features = select_features
    
    def fit(self, x_df, y=None):
        return self
    
    def transform(self, x_df, y=None):
        select_features = self.select_features
        x_df = x_df[select_features]
        return(x_df)

    
class ContinuousFeatureEngineering(BaseEstimator, TransformerMixin):
    
    # Initiate class
    def __init__(self, window):
        self.window = window
        return None
        
    # We don't need to fit anything, so leave this as is
    def fit(self, df, y=None):
        return self
    
    # Perform our feature transformations
    def transform(self, df, y=None):
        
        x_df = df.copy(deep = True)
        
        col_names = x_df.columns
        
        # Time-Series transformations
        for col in col_names:
            
            # Box-Cox estimation
            x_df[col] = x_df[col] - min(x_df[col]) + 0.0001
            y, fitted_lambda = scipy.stats.boxcox(x_df[col],lmbda = None)
        
            # First difference, make first element nan, loss from differencing
            y = np.append([np.nan], np.diff(y, n=1))
            x_df[col] = y
        
        # Add volatility columns
        for col in col_names:
            x_df[col + '_vol'] = x_df[col].rolling(self.window).std()
        
        # Fill nan values with mean
        x_df = x_df.fillna(x_df.mean())
        
        # Standardize data
        standard_scaler = preprocessing.StandardScaler()
        x_df = standard_scaler.fit_transform(x_df)
        
        return x_df

In [88]:
# ----Feature Pipeline ----#
num_attributes = ['accel_x', 'accel_y', 'accel_z', 'gyro_x', 'gyro_y', 'gyro_z']
window = 50

numerical_pipeline = Pipeline([

    ('FeatureSelector', FeatureSelector(num_attributes)),
    ('FeatureEngineering', ContinuousFeatureEngineering(window))
])

feature_pipeline = FeatureUnion([
        ('numerical_pipeline', numerical_pipeline)
    ])

## Test some algorithms

In [89]:
processed_x_train = feature_pipeline.fit_transform(x_train)
proccessed_x_test = feature_pipeline.fit_transform(x_test)

Logistic Regression

In [90]:
log_model = LogisticRegression()
log_model.fit(processed_x_train, y_train)
scores = cross_val_score(log_model, processed_x_train, y_train,
                        scoring = "roc_auc", cv = 10)
print('CV AUC Scores: ', scores)
print('Mean CV AUC Scores: ', np.mean(scores))

CV AUC Scores:  [0.66273677 0.65503826 0.65611071 0.6612758  0.66029784 0.66107315
 0.66935034 0.66424128 0.665273   0.66616448]
Mean CV AUC Scores:  0.6621561634993807


CART Classifier

In [91]:
cart_model = DecisionTreeClassifier(min_samples_leaf = 30) #blanket default, to avoid 1 observation per leaf
cart_model.fit(processed_x_train, y_train)

scores = cross_val_score(cart_model, processed_x_train, y_train,
                        scoring = "roc_auc", cv = 10)
print('CV AUC Scores: ', scores)
print('Mean CV AUC Scores: ', np.mean(scores))

CV AUC Scores:  [0.78643484 0.78126546 0.78251569 0.7876979  0.78648604 0.7872575
 0.78637583 0.7873258  0.78311762 0.78271116]
Mean CV AUC Scores:  0.7851187819993154
