Data can be found here:

https://www.kaggle.com/uciml/human-activity-recognition-with-smartphones/downloads/human-activity-recognition-with-smartphones.zip  

In [27]:
import numpy
import pandas
import seaborn
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from tqdm import tqdm

## 2 - Load and analyse the data

In [2]:
# Read the train and test CSV files from ./har_data into DataFrames.
train, test = map(pd.read_csv, ("./har_data/train.csv", "./har_data/test.csv"))

In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject,Activity
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,1,STANDING
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,1,STANDING
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,1,STANDING
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,1,STANDING
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,1,STANDING


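## FIXME: group the cross-validation folds by `subject` (e.g. with `GroupKFold`), so that no subject appears in both the training and the validation data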

In [4]:
# Remove the 'subject' column from both frames.
# A non-mutating drop with errors='ignore' keeps this cell idempotent:
# re-running it once the column is already gone is a no-op instead of a KeyError.
train = train.drop(columns='subject', errors='ignore')
test = test.drop(columns='subject', errors='ignore')

In [11]:
# Store the activity label as a pandas categorical in both frames.
for frame in (train, test):
    frame['Activity'] = frame.Activity.astype("category")

# Every column except the label (and the subject id, if present) is a feature.
# Built as an ordered list rather than a set difference: set iteration order of
# strings changes between interpreter sessions (hash randomisation), which would
# shuffle the feature columns from run to run and hurt reproducibility.
feature_cols = [col for col in train.columns if col not in ('Activity', 'subject')]

## 4 - Splitting the data into train and validation 

In [21]:
# Getting the split indexes: one stratified shuffle split, 30% held out for validation
split_data = StratifiedShuffleSplit(n_splits = 1, test_size = 0.3, random_state = 42)
train_idx, val_idx = next(split_data.split(train[feature_cols], train.Activity))

# Creating the dataframes.
# split() yields positional row indices, so rows are selected with .iloc;
# label-based .loc only gives the same rows while `train` has a default RangeIndex.
x_train = train[feature_cols].iloc[train_idx]
y_train = train['Activity'].iloc[train_idx]

x_val = train[feature_cols].iloc[val_idx]
y_val = train['Activity'].iloc[val_idx]

## 5 - Predictive Modelling

In [22]:
from sklearn.feature_selection import VarianceThreshold

# Drop low-variance features (threshold 0.2). The selector is fitted on the
# training split only and then applied unchanged to the validation split;
# pandas output is requested so the column names are kept.
v = VarianceThreshold(threshold=0.2).set_output(transform="pandas")
x_train = v.fit_transform(x_train)
x_val = v.transform(x_val)

# Show the resulting (rows, columns) of the validation features.
x_val.shape

(2206, 58)

In [23]:
# Establish max perf: cross-validated score of a random forest on the training split.
from sklearn.model_selection import cross_val_score
# Seed the forest so the baseline scores are reproducible across runs
# (same seed as the train/validation split).
rf = RandomForestClassifier(n_estimators = 100, random_state = 42)
cross_val_score(rf, x_train, y_train)


array([0.95048544, 0.94266278, 0.94266278, 0.95238095, 0.94655005])

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def scores_to_error(scores):
    """Turn scores into error percentages, computed as ``(1 - score) * 100``.

    Works element-wise on NumPy arrays as well as on plain floats.
    """
    return (1.0 - scores) * 100

def check_sparse_logreg(x_train, y_train, C=1.0, weight_minimum=0.5, cv=5):
    """Compare an L1-penalised logistic regression on all vs. pruned features.

    A ``StandardScaler`` + ``LogisticRegression(penalty='l1')`` pipeline is
    cross-validated on every column of ``x_train``. It is then fitted once on
    all of ``x_train``; features whose summed absolute coefficient is at most
    ``weight_minimum`` are discarded, and the pipeline is cross-validated again
    on the remaining columns.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Feature matrix; must be a DataFrame because columns are selected by name.
    y_train : array-like
        Target labels.
    C : float
        Inverse regularisation strength passed to ``LogisticRegression``.
    weight_minimum : float
        Features with summed ``|coef|`` <= this value are removed.
    cv : int or CV splitter
        Forwarded to ``cross_val_score``.

    Returns
    -------
    pandas.Series
        ``features_selected`` (number of kept columns), ``cv_scores_full`` and
        ``cv_scores_reduced`` (per-fold error percentages), ``C`` and
        ``weight_minimum``. If no feature survives the threshold,
        ``cv_scores_reduced`` is all-NaN instead of raising.
    """

    # Define model
    est = LogisticRegression(C=C, penalty='l1', solver='liblinear', max_iter=10000)
    pipe = make_pipeline(StandardScaler(), est)

    # Estimate perf with all features
    scores_full = scores_to_error(cross_val_score(pipe, x_train, y_train, cv=cv))

    # Figure out which features to select/remove
    # FIXME: use SelectFromModel?
    # NOTE(review): the selection below is fitted on all rows of x_train and the
    # reduced model is then cross-validated on those same rows, so the selection
    # step sits outside the CV loop and the reduced scores may be optimistic.
    pipe.fit(x_train, y_train)
    coef = pipe.named_steps['logisticregression'].coef_

    # Per-feature importance: absolute coefficients summed over the rows of coef_.
    feature_weights = np.sum(np.abs(coef), axis=0)
    features_removed = feature_weights <= weight_minimum

    # Estimate perf with subset features
    selected_columns = x_train.columns[~features_removed]
    if len(selected_columns) == 0:
        # Nothing survived the threshold: a zero-column frame cannot be fitted,
        # so report NaN for every fold rather than crashing the whole sweep.
        scores = np.full_like(scores_full, np.nan)
    else:
        x_train_cut = x_train[selected_columns]
        scores = scores_to_error(cross_val_score(pipe, x_train_cut, y_train, cv=cv))

    results = pandas.Series({
        'features_selected': len(selected_columns),
        'cv_scores_full': scores_full,
        'cv_scores_reduced': scores,
        'C': C,
        'weight_minimum': weight_minimum,
    })
    return results

# Sweep the regularisation parameter C over 20 log-spaced values between
# 1e-3 and 1e+3, collecting one result Series per value.
complexities = np.logspace(-3, +3, 20)
out = [check_sparse_logreg(x_train, y_train, C=C) for C in tqdm(complexities)]

# One row per C value.
results = pandas.DataFrame(out)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [40:25<00:00, 121.27s/it]


In [None]:
# Display the results table (one row per value of C).
results

In [None]:
# Persist the sweep results to disk, then drop the in-memory copy so that the
# following cells work purely from the saved file.
# index=False keeps the row index out of the CSV; otherwise it comes back as a
# spurious 'Unnamed: 0' column when the file is read again.
results.to_csv('logreg-l1-feature-selection-2.csv', index=False)
del results

In [None]:
# Load the saved sweep results back from the CSV file.
results = pd.read_csv('logreg-l1-feature-selection-2.csv')

def load_csv_embedded_array(s):
    """Parse a Series of stringified numeric arrays into lists of floats.

    Each element is expected to be text such as ``"[1.0 2.5 3.]"`` (the form a
    NumPy array takes when written to a CSV cell). Surrounding whitespace and
    square brackets are removed, and the remainder is split on any run of
    whitespace (spaces, tabs, newlines); commas are accepted as separators too,
    so ``"[1.0, 2.5]"`` parses as well.

    Parameters
    ----------
    s : pandas.Series
        Series of strings, one encoded array per element.

    Returns
    -------
    pandas.Series
        Series with the same index whose elements are ``list[float]``.

    Raises
    ------
    ValueError
        If a token cannot be converted to ``float``.
    """
    def load_one(vv):
        # Commas become spaces so that a single whitespace split handles both styles.
        tokens = vv.strip().strip('[]').replace(',', ' ').split()
        return [float(token) for token in tokens]
    return s.apply(load_one)
    
# Decode the two per-fold score columns from their text form back into lists of floats.
for column_name in ('cv_scores_full', 'cv_scores_reduced'):
    results[column_name] = load_csv_embedded_array(results[column_name])

results.head(n=5)

In [None]:
import scipy.stats

def mean_confidence_interval(data, confidence=0.95):
    """Return ``(mean, lower, upper)`` of a Student-t confidence interval.

    The half-width is the standard error of the mean (``scipy.stats.sem``)
    multiplied by the t quantile at ``(1 + confidence) / 2`` with ``n - 1``
    degrees of freedom, where ``n`` is the number of samples in ``data``.
    """
    samples = np.array(data) * 1.0
    degrees_of_freedom = len(samples) - 1

    mean = np.mean(samples)
    std_err = scipy.stats.sem(samples)

    upper_quantile = (1 + confidence) / 2.
    half_width = std_err * scipy.stats.t.ppf(upper_quantile, degrees_of_freedom)

    return mean, mean - half_width, mean + half_width

#def mean_confidence_err(data, **kwargs):
#    mean, lower, upper = mean_confidence_interval(data, **kwargs)
#    return 

# Collapse each per-fold score list to its mean, then drop the raw per-fold columns.
# TODO: also derive confidence intervals via mean_confidence_interval.
mean_columns = {
    'scores_full_mean': results['cv_scores_full'].apply(np.mean),
    'scores_reduced_mean': results['cv_scores_reduced'].apply(np.mean),
}
results = (
    results
    .assign(**mean_columns)
    .drop(columns=['cv_scores_full', 'cv_scores_reduced'])
)

results

In [None]:

import plotly
import plotly.express

# Scatter of mean cross-validated error (in percent) against the number of
# features kept by the L1 selection; one point per value of C.
# FIXME: plot both full and reduced, in separate colors
# FIXME: add error bars
fig = plotly.express.scatter(
    results,
    x="features_selected",
    y="scores_reduced_mean",
    # Extra columns shown on hover, so each point can be traced back to its settings.
    hover_data=["C", "weight_minimum", "scores_full_mean"],
    labels={
        "features_selected": "Number of selected features",
        "scores_reduced_mean": "Mean CV error, reduced features (%)",
        "scores_full_mean": "Mean CV error, all features (%)",
    },
    title="L1 logistic regression: CV error vs. number of selected features",
)
fig.update_layout(height=500, width=800,)
# Dashed reference line at 2% error.
# NOTE(review): presumably the target error level - confirm.
fig.add_hline(y=2.0, line_width=2, line_dash="dash", line_color="green")
fig.update_yaxes(range=[0.0, 10.0])
fig.show()
