I am using the Kaggle data which can be found here:

https://www.kaggle.com/uciml/human-activity-recognition-with-smartphones/downloads/human-activity-recognition-with-smartphones.zip  

## 1 - Importing thr Necessary Libraries

In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## 2 - Load and analyse the data

In [2]:
train = pd.read_csv("./har_data/train.csv")
test = pd.read_csv("./har_data/test.csv")

In [3]:
train.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject,Activity
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,1,STANDING
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,1,STANDING
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,1,STANDING
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,1,STANDING
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,1,STANDING


In [4]:
test.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject,Activity
0,0.257178,-0.023285,-0.014654,-0.938404,-0.920091,-0.667683,-0.952501,-0.925249,-0.674302,-0.894088,...,-0.705974,0.006462,0.16292,-0.825886,0.271151,-0.720009,0.276801,-0.057978,2,STANDING
1,0.286027,-0.013163,-0.119083,-0.975415,-0.967458,-0.944958,-0.986799,-0.968401,-0.945823,-0.894088,...,-0.594944,-0.083495,0.0175,-0.434375,0.920593,-0.698091,0.281343,-0.083898,2,STANDING
2,0.275485,-0.02605,-0.118152,-0.993819,-0.969926,-0.962748,-0.994403,-0.970735,-0.963483,-0.93926,...,-0.640736,-0.034956,0.202302,0.064103,0.145068,-0.702771,0.280083,-0.079346,2,STANDING
3,0.270298,-0.032614,-0.11752,-0.994743,-0.973268,-0.967091,-0.995274,-0.974471,-0.968897,-0.93861,...,-0.736124,-0.017067,0.154438,0.340134,0.296407,-0.698954,0.284114,-0.077108,2,STANDING
4,0.274833,-0.027848,-0.129527,-0.993852,-0.967445,-0.978295,-0.994111,-0.965953,-0.977346,-0.93861,...,-0.846595,-0.002223,-0.040046,0.736715,-0.118545,-0.692245,0.290722,-0.073857,2,STANDING


In [5]:
#Check for null values
train.isnull().values.any()

False

In [6]:
test.isnull().values.any()

False

There are no null values in either the test and the train datasets

The subject column is not going to be usefull here so i will drop it from both data sets

In [7]:
train.drop('subject', axis =1, inplace=True)
test.drop('subject', axis =1, inplace=True)

In [8]:
train.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.298676,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,STANDING
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.595051,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,STANDING
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.390748,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,STANDING
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.11729,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,STANDING
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.351471,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,STANDING


In [9]:
rem_cols2 = test.columns.tolist()

In [10]:
# We check the datatypes 
train.dtypes.value_counts()

float64    561
object       1
dtype: int64

In [11]:
test.dtypes.value_counts()

float64    561
object       1
dtype: int64

****Should we rescale the data? Scaling a dataset usually produces better dataset and more accurate predictions. First we check the range( the min and the max) for each of the datasets. Lets try using the .describe() method and lets exclude the activity column which is the last column. ****

In [12]:
train.describe()  #we see that the min = -1 and the max = +1. so no need for scaling

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
count,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,...,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0
mean,0.274488,-0.017695,-0.109141,-0.605438,-0.510938,-0.604754,-0.630512,-0.526907,-0.60615,-0.468604,...,0.125293,-0.307009,-0.625294,0.008684,0.002186,0.008726,-0.005981,-0.489547,0.058593,-0.056515
std,0.070261,0.040811,0.056635,0.448734,0.502645,0.418687,0.424073,0.485942,0.414122,0.544547,...,0.250994,0.321011,0.307584,0.336787,0.448306,0.608303,0.477975,0.511807,0.29748,0.279122
min,-1.0,-1.0,-1.0,-1.0,-0.999873,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-0.995357,-0.999765,-0.97658,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,0.262975,-0.024863,-0.120993,-0.992754,-0.978129,-0.980233,-0.993591,-0.978162,-0.980251,-0.936219,...,-0.023692,-0.542602,-0.845573,-0.121527,-0.289549,-0.482273,-0.376341,-0.812065,-0.017885,-0.143414
50%,0.277193,-0.017219,-0.108676,-0.946196,-0.851897,-0.859365,-0.950709,-0.857328,-0.857143,-0.881637,...,0.134,-0.343685,-0.711692,0.009509,0.008943,0.008735,-0.000368,-0.709417,0.182071,0.003181
75%,0.288461,-0.010783,-0.097794,-0.242813,-0.034231,-0.262415,-0.29268,-0.066701,-0.265671,-0.017129,...,0.289096,-0.126979,-0.503878,0.150865,0.292861,0.506187,0.359368,-0.509079,0.248353,0.107659
max,1.0,1.0,1.0,1.0,0.916238,1.0,1.0,0.967664,1.0,1.0,...,0.9467,0.989538,0.956845,1.0,1.0,0.998702,0.996078,1.0,0.478157,1.0


They have the same data types. That is, mostly floats and one object feature. Lets see what the object feature is abd extract it from the rest

As we can see, the only object data type in both train and the test dataset is the Activity feature. Lets take a closer look at it...

We need to encode the Activity column becasue sklearn won't accept sparse matrix as prediction columns . WEe will use LabelEncoder to encode the Activities 

In [33]:
for x in [train, test]:
    x['Activity'] = x.Activity.astype("category")

## 3- Finding the Correlation/ Relationships between the features

Correlation refers to the mutual relationship and association between quantities and it is generaly used to express one quantity in terns of its relationship with other quantities. The can either be Positive(variables change in the same direction), negative(variables change in opposite direction or neutral(No correlation).

Variable within a dataset can be related in lots of ways and for lost of reasons:
    - They could depend on values of other variable
    - They could be associated to each other
    - They could both depend on a thirf variable.
    
In this project, we will be using the pandas method .corr() for calculating correlation between dataframe columns

## 4 - Splitting the data into train and validation 

In [87]:
#Getting the split indexes

split_data = StratifiedShuffleSplit(n_splits = 1, test_size = 0.3, random_state = 42)
train_idx, val_idx = next(split_data.split(train[feature_cols], train.Activity))

#creating the dataframes

x_train = train.loc[train_idx, feature_cols]
y_train = train.loc[train_idx, 'Activity']

x_val = train.loc[val_idx, feature_cols]
y_val = train.loc[val_idx, 'Activity']

## 5 - Predictive Modelling

In [91]:
from sklearn.feature_selection import VarianceThreshold

v = VarianceThreshold(threshold=0.2)
x_train = v.fit_transform(x_train)
x_val = v.transform(x_val)
x_val.shape

(2206, 58)

In [78]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

lsvc = LinearSVC(C=0.01, penalty="l1", max_iter=10000, dual=False).fit(StandardScaler().fit_transform(x_train), y_train)
model = SelectFromModel(lsvc, prefit=True)
xx = model.transform(x_train)
xx.shape




(5146, 169)

In [64]:

splitter = StratifiedShuffleSplit(n_splits = 5, test_size = 0.3, random_state = 42)
from sklearn.feature_selection import SequentialFeatureSelector

s = SequentialFeatureSelector(estimator=RandomForestClassifier(n_estimators=10),
                              n_features_to_select=10,
                              direction='forward',
                              scoring='accuracy',
                              n_jobs=4,
                              cv=splitter)
s.fit(x_train, y_train)


#KNeighborsClassifier(n_neighbors=3), n_features_to_select=3


KeyboardInterrupt: 

In [92]:
from sklearn.model_selection import cross_val_score
rf = RandomForestClassifier(n_estimators = 30)

cross_val_score(rf, x_train, y_train)


array([0.93883495, 0.94266278, 0.94071914, 0.94557823, 0.94460641])

In [93]:
x_train.shape

(5146, 58)

In [137]:
from sklearn.svm import NuSVC, SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

svc = NuSVC(nu=0.10,
      kernel='linear',
      #gamma='scale',
      shrinking=True,
      tol=0.001,
      cache_size=200,
      verbose=False,
      max_iter=100000,
      decision_function_shape='ovr',
      break_ties=False,
      random_state=None)

pipe = make_pipeline(StandardScaler(), svc)

scores = cross_val_score(pipe, x_train, y_train)
print(scores)
pipe.fit(x_train, y_train)
print(np.sum(pipe.named_steps['nusvc'].n_support_))


[0.90679612 0.88921283 0.90962099 0.92517007 0.92614189]
1595


In [None]:
# Seems like NuSVC has a quite small range for the support vectors

In [136]:


from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

svc = SVC(C=1.0, max_iter=100000, kernel='linear')
pipe = make_pipeline(StandardScaler(), svc)
scores = cross_val_score(pipe, x_train, y_train)
print(scores)
pipe.fit(x_train, y_train)
print(pipe.named_steps)
print(np.sum(pipe.named_steps['svc'].n_support_))


[0.94174757 0.9436346  0.93488824 0.95238095 0.95140914]
{'standardscaler': StandardScaler(), 'svc': SVC(kernel='linear', max_iter=100000)}
990


In [60]:
pipe.named_steps['nusvc'].support_vectors_.shape

(4703, 561)

In [None]:
# LinearSVC has both a larger range for the support vectors, and a lower number for better performance