# Packages

In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
import shutil

#Data Tools
import numpy as np
import pandas as pd
import math
import itertools

#Model Tools
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import svm, datasets
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
import pickle

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns


# Data Cleaning

In [2]:
# Read in Training Data from CSV
df_acc = pd.read_csv("acc_1.csv")

#Cleaning functions
def half_int(x):
    val=0.5 * math.ceil(2.0 * x)
    return val


def cleaning(df_acc):
    # Rename df_acc column, 'train' -> 'mode'
    df_acc.columns = ['TI', 'TS', 'x', 'y', 'z', 'mode', 'dataset']

    # Drop TS column and replace transportation mode into integer
    df_acc = df_acc.replace({'mode': {'Train': 1, 'Walking': 0}})


    # Convert TI value to integer
    df_acc['TI'] = df_acc['TI'].apply(lambda x: half_int(x))

    # Group by and get mean by TI as key
    ndf_acc = df_acc.groupby(['TI', 'dataset'], as_index=False)[['x', 'y', 'z', 'mode']].mean()

    # Clean filtered data
    ndf_acc.columns = ['TI', 'dataset', 'acc_x', 'acc_y', 'acc_z', 'mode']

    
    return ndf_acc

df=cleaning(df_acc)


#Magnitude dataframe
df['acc_magnitude']=(df['acc_x']**2+df['acc_y']**2+df['acc_z']**2)**(1/2.0)

#Overlapping
n=10
o=5
X=[df['acc_magnitude'].values[x:x+n] for x in range(0,len(df['acc_magnitude'])-n+1, o) if (df.iloc[x]['dataset'] == df.iloc[x+n-1]['dataset'] and df.iloc[x]['mode'] == df.iloc[x+n-1]['mode'])]
X=np.asarray(X)

Y=[df['mode'].values[x] for x in range(0,len(df['mode'])-n+1, o) if df.iloc[x]['dataset'] == df.iloc[x+n-1]['dataset'] and df.iloc[x]['mode'] == df.iloc[x+n-1]['mode']]
Y=np.asarray(Y)

# K - Cross Validation

In [6]:
#Search for an optimal value of K for KNN

# range of k we want to try
k_range = range(1, 31)
# empty list to store scores
k_scores = []

# 1. we will loop through reasonable values of k
for k in k_range:
    # 2. run KNeighborsClassifier with k neighbours
    knn = KNeighborsClassifier(n_neighbors=k)
    # 3. obtain cross_val_score for KNeighborsClassifier with k neighbours
    scores = cross_val_score(knn, X, Y, cv=10, scoring='accuracy')
    # 4. append mean of scores for k neighbors to k_scores list
    k_scores.append(scores.mean())


print(k_scores)
k_scores.index(max(k_scores))

[0.7889019140258654, 0.7696376669119833, 0.8065808195357169, 0.8071054375686014, 0.8116199394226543, 0.813048737661827, 0.8135759572855825, 0.8150041331162494, 0.8128984167631677, 0.8147785688656628, 0.8128990960873314, 0.8147039471554134, 0.8129742841424594, 0.8150805093283399, 0.8144784962249242, 0.8160582356411847, 0.8164336099979612, 0.8168856431854474, 0.8168095496771792, 0.8178631434545188, 0.8156065989036341, 0.8165847216664142, 0.8149299638343596, 0.8161333110578399, 0.8136511459371321, 0.8155316940635469, 0.8123719321011876, 0.8135009401478348, 0.8108676068690315, 0.8107923060902277]


19

# Model Training

In [3]:
# Instantiate learning model (k = 19)
knn = KNeighborsClassifier(n_neighbors=19)

# Fitting the model
knn.fit(X, Y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=19, p=2,
           weights='uniform')

# Pickling the Model

In [4]:
filename = 'train_walk_model.sav'
pickle.dump(knn, open(filename, 'wb'))