In [1]:
#numeric: pandas and numpy
import numpy as np
import pandas as pd
# graphics
%matplotlib inline 

import matplotlib.pyplot as plt 
import matplotlib.gridspec as gridspec 

from sklearn import linear_model, svm, preprocessing, tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix

In [2]:
df = pd.read_csv('data/sensoringData_feature_prepared_20_19.0_2.csv',header = 0)

# id is useless
df.drop('id',axis=1,inplace=True)
df.drop('user',axis=1,inplace=True)
df.drop('timestamp',axis=1,inplace=True)

feature_list = list(df.columns[:-2])
print(len(feature_list))
#print(df.head())

# print the number of missing 
#df.isnull().sum()

90


In [3]:
print(df['activity'].unique())

# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
# Encode labels in column 'species'.
df['activity']= label_encoder.fit_transform(df['activity'])
 
print(df['activity'].unique())

y=df.values[:,-1]
#Y = np.array(y).astype(int)
#print(y)

X=df.values[:,0:-2]
print(f"Features: {len(X[0])}")
print(f"Examples: {len(X)}")

['Walking' 'Inactive' 'Active' 'Driving']
[3 2 0 1]
Features: 90
Examples: 499276


In [4]:
labels = np.unique(y)
#print(labels)
quantity = [np.sum(y == label) for label in labels]
#print(quantity)
#print(sum(quantity))

# Creating plot 
fig = plt.figure(figsize =(20, 10)) 
#plt.pie(quantity, labels=labels) 
  
# show plot 
#plt.show()

<Figure size 1440x720 with 0 Axes>

In [5]:
# Separating the data 
dfW=df[df['activity'] == "Walking"]
dfA=df[df['activity'] == "Active"]
dfI=df[df['activity'] == "Inactive"]
dfD=df[df['activity'] == "Driving"]
"""
#Creating the window with 10 subplots.
plt.rcParams.update({'font.size': 8})
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(7,10))
axes = axes.ravel()
#Creating histograms with 50 bins
for idx,ax in enumerate(axes):
    ax.figure
    binwidth= (max(df[feature_list[idx]]) - min(df[feature_list[idx]]))/40
    ax.hist([dfW[feature_list[idx]],dfI[feature_list[idx]],dfA[feature_list[idx]],dfD[feature_list[idx]]], bins=np.arange(min(df[feature_list[idx]]), max(df[feature_list[idx]]) + binwidth, binwidth) , alpha=0.8,stacked=True, density= True, label=['W','I','A','D'],color=['b','g','orange','r'])
    ax.legend(loc='upper right')
    ax.set_title(feature_list[idx])
plt.tight_layout()
plt.show()"""

"\n#Creating the window with 10 subplots.\nplt.rcParams.update({'font.size': 8})\nfig, axes = plt.subplots(nrows=4, ncols=3, figsize=(7,10))\naxes = axes.ravel()\n#Creating histograms with 50 bins\nfor idx,ax in enumerate(axes):\n    ax.figure\n    binwidth= (max(df[feature_list[idx]]) - min(df[feature_list[idx]]))/40\n    ax.hist([dfW[feature_list[idx]],dfI[feature_list[idx]],dfA[feature_list[idx]],dfD[feature_list[idx]]], bins=np.arange(min(df[feature_list[idx]]), max(df[feature_list[idx]]) + binwidth, binwidth) , alpha=0.8,stacked=True, density= True, label=['W','I','A','D'],color=['b','g','orange','r'])\n    ax.legend(loc='upper right')\n    ax.set_title(feature_list[idx])\nplt.tight_layout()\nplt.show()"

In [6]:
# Anova selection

def get_best_x_features(X, y, num_features=50):
    #df = (df - np.min(df))/(np.max(df) - np.min(df))
    
    k_bestfeatures = SelectKBest(score_func = f_classif, k=num_features)
    k_bestfeatures.fit(X, y)
    
    # what are scores for the features
    #for i in range(len(rankings.scores_)):
        #print('Feature %d: %f' % (i, rankings.scores_[i]))
    
    # transform train input data
    X_best = k_bestfeatures.transform(X)
    return X_best
        

In [7]:
X_best = get_best_x_features(X,y,30)
print(f"Features: {len(X_best[0])}")
print(f"Examples: {len(X_best)}")

Features: 30
Examples: 499276


# Training

### Train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_50, y, test_size=0.3,random_state=109) # 70% training and 30% test

[1. 3. 0. ... 0. 3. 0.]


### StratifiedKFold

In [8]:
k_fold = StratifiedKFold(n_splits=4)

In [9]:
def run_kFold(model, X, y):
    acc_results = []
    for train_indices, test_indices in k_fold.split(X,y):
        print("Train: ",train_indices.shape[0])
        print("test: ",test_indices.shape[0])
        scaler = StandardScaler()
        
        X_train = X[train_indices]
        y_train = y[train_indices]
        Xs=scaler.fit_transform(X_train)

        X_test = scaler.transform(X[test_indices])
        y_test = y[test_indices]
        
        #Train the model
        model.fit(Xs,y[train_indices])
        #Predict the response for test dataset
        y_pred=model.predict(X_test)
        
        acc = accuracy_score(y_test, y_pred)
        acc_results.append(acc)
        
    return np.mean(acc_results)
    

## Models Functions

In [10]:
#Create a svm Classifier
def linear_svm():
    lin_clf = svm.SVC(kernel="linear", C=1, gamma=1) # one-vs-all
    
    return run_kFold(lin_clf, X_best, y)

In [11]:
# Logistic regression
def lr(X_train, y_train):
    regr = linear_model.LogisticRegression()
    
    return run_kFold(regr, X_best, y)

In [12]:
# K-Nearest Neighbors
def knn():
    k_n_n = KNeighborsClassifier(n_neighbors=3)

    return run_kFold(k_n_n, X_best, y)

In [13]:
# Decision Tree
def dtree():
    clf = tree.DecisionTreeClassifier()
    
    return run_kFold(clf, X_best, y)

In [14]:
# Random Forest
def rf():
    #Create a Gaussian Classifier
    r_forest = RandomForestClassifier(n_estimators=100)

    return run_kFold(r_forest, X_best, y)

# Decision Tree

In [15]:
dt_y_pred = dtree()
print("Accuracy:",dt_y_pred)

Train:  374457
test:  124819
Train:  374457
test:  124819
Train:  374457
test:  124819
Train:  374457
test:  124819
Accuracy: 0.8260901785785818


# Random Forest

In [None]:
rf_y_pred = rf()
print("Accuracy:",rf_y_pred)

# K-Nearest Neighbors

In [23]:
knn_y_pred = knn(X_train, y_train)
print("Accuracy:",knn_y_pred)

['Inactive' 'Driving' 'Walking' ... 'Walking' 'Driving' 'Inactive']
Accuracy: 0.97593852439863


# Linear SVM 

In [None]:
svm_y_pred = svm()
print("Accuracy:",svm_y_pred)

In [None]:
"""lr_y_pred = lr(X_train, y_train)
print("Accuracy:",accuracy_score(y_test, lr_y_pred))"""