# Packages

In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
import shutil

#Data Tools
import numpy as np
import pandas as pd
import math
import itertools

#Model Tools
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import svm, datasets
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
import pickle

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns


# Data Cleaning

In [2]:
# Read in Training Data from CSV
df_acc = pd.read_csv("acc_4.csv")

#Cleaning functions
def half_int(x):
    val=0.5 * math.ceil(2.0 * x)
    return val


def cleaning(df_acc):
    # Rename df_acc column, 'train' -> 'mode'
    df_acc.columns = ['TI', 'TS', 'x', 'y', 'z', 'mode', 'dataset']

    # Drop TS column and replace transportation mode into integer
    df_acc = df_acc.replace({'mode': {'Bus':2, 'Train': 1, 'Walking': 0}})


    # Convert TI value to integer
    df_acc['TI'] = df_acc['TI'].apply(lambda x: half_int(x))

    # Group by and get mean by TI as key
    ndf_acc = df_acc.groupby(['TI', 'dataset'], as_index=False)[['x', 'y', 'z', 'mode']].mean()

    # Clean filtered data
    ndf_acc.columns = ['TI', 'dataset', 'acc_x', 'acc_y', 'acc_z', 'mode']

    
    return ndf_acc

df=cleaning(df_acc)


#Magnitude dataframe
df['acc_magnitude']=(df['acc_x']**2+df['acc_y']**2+df['acc_z']**2)**(1/2.0)

#Overlapping
n=10
o=5
X=[df['acc_magnitude'].values[x:x+n] for x in range(0,len(df['acc_magnitude'])-n+1, o) if (df.iloc[x]['dataset'] == df.iloc[x+n-1]['dataset'] and df.iloc[x]['mode'] == df.iloc[x+n-1]['mode'])]
X=np.asarray(X)

Y=[df['mode'].values[x] for x in range(0,len(df['mode'])-n+1, o) if df.iloc[x]['dataset'] == df.iloc[x+n-1]['dataset'] and df.iloc[x]['mode'] == df.iloc[x+n-1]['mode']]
Y=np.asarray(Y)

# K - Cross Validation

In [3]:
#Search for an optimal value of K for KNN

# range of k we want to try
k_range = range(1, 31)
# empty list to store scores
k_scores = []

# 1. we will loop through reasonable values of k
for k in k_range:
    # 2. run KNeighborsClassifier with k neighbours
    knn = KNeighborsClassifier(n_neighbors=k)
    # 3. obtain cross_val_score for KNeighborsClassifier with k neighbours
    scores = cross_val_score(knn, X, Y, cv=10, scoring='accuracy')
    # 4. append mean of scores for k neighbors to k_scores list
    k_scores.append(scores.mean())


print(k_scores)
k_scores.index(max(k_scores))

[0.7954658759365483, 0.7983348204674858, 0.8239523292992489, 0.8185439757521283, 0.8259421110649543, 0.8239555757995216, 0.8277487797238956, 0.8302703417313981, 0.830091131419217, 0.8291911980464951, 0.8273952236042508, 0.8273903521052779, 0.8273974874116223, 0.8255882215524627, 0.8273939319983972, 0.8273874424949795, 0.8248678283876408, 0.8232426303511241, 0.8221644623648668, 0.8217956704252707, 0.8208986536571027, 0.8181946383342742, 0.8192796187254127, 0.8180154315192208, 0.8174823456830593, 0.8158564948493602, 0.8162191306783854, 0.8147750873570857, 0.8145929639374787, 0.8135115529480764]


7

# Model Training

In [4]:
# Instantiate learning model (k = 19)
knn = KNeighborsClassifier(n_neighbors=7)

# Fitting the model
knn.fit(X, Y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=7, p=2,
           weights='uniform')

# Pickling the Model

In [5]:
filename = 'bus_train_walk_model.sav'
pickle.dump(knn, open(filename, 'wb'))