# Comp255 Assignment 1

In [1]:
#import packages

import numpy as np 
import pandas as pd 
from scipy import signal
import matplotlib.pyplot as plt 
import math
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, accuracy_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

## Data Visualisation - task 1 

In [None]:
# read and visualise the raw data 

def data_visulization():
    """Plot raw sensor traces for every activity recorded in 'dataset_1.txt'.

    For each activity id 1..13 (matched against column 24 of the file), rows
    500-1499 of that activity are plotted twice: columns 0-2 under the
    "Accelerometer" caption and columns 3-5 under the "Gyroscope" caption.
    Each caption is printed to the console right before its figure is shown.
    """
    # (console caption, first column, one-past-last column) per sensor group
    sensor_groups = (
        ("Accelerometer for activity", 0, 3),
        ("Gyroscope for activity", 3, 6),
    )

    recordings = pd.read_csv('dataset_1.txt', sep=',', header=None)

    for activity in range(1, 14):
        # Keep only the rows whose column 24 equals the current activity id.
        samples = recordings[recordings[24] == activity].values

        for caption, first, stop in sensor_groups:
            print(caption + str(activity))
            plt.plot(samples[500:1500, first:stop])
            plt.show()

In [None]:
# Run the raw-data visualisation: one accelerometer and one gyroscope
# figure is shown for each of the 13 activities in dataset_1.txt.
data_visulization()

In [None]:
# Apply low-pass filtering to remove sensor noise, then visualise the same data

def noise_removing():
    """Plot low-pass-filtered sensor traces for every activity in 'dataset_1.txt'.

    A 4th-order digital Butterworth low-pass filter (normalised cutoff 0.04)
    is applied with ``signal.lfilter`` to columns 0-2 and then to columns 3-5
    of each activity's rows. After each group of columns has been filtered,
    its caption is printed and rows 500-1499 of that group are plotted.
    """
    # (console caption, columns belonging to the sensor group)
    sensor_groups = (
        ("Accelerometer for activity", range(0, 3)),
        ("Gyroscope for activity", range(3, 6)),
    )

    # Filter coefficients are the same for every column, so build them once.
    numerator, denominator = signal.butter(4, 0.04, 'low', analog=False)
    recordings = pd.read_csv('dataset_1.txt', sep=',', header=None)

    for activity in range(1, 14):
        # Rows whose column 24 matches the current activity id.
        samples = recordings[recordings[24] == activity].values

        for caption, columns in sensor_groups:
            # Filter this group's columns in place before plotting them.
            for col in columns:
                samples[:, col] = signal.lfilter(numerator, denominator, samples[:, col])

            print(caption + str(activity))
            plt.plot(samples[500:1500, columns.start:columns.stop])
            plt.show()


In [None]:
# Run the filtered-data visualisation so the plots can be compared with
# the raw-data plots produced by data_visulization().
noise_removing()

## Feature Engineering - task 2

In [None]:
# Print out features for all 19 sets of data 
def create_features():
    """Print min / max / mean / median summaries for every dataset and activity.

    Reads 'dataset_1.txt' .. 'dataset_19.txt' from the working directory.
    For each activity id 1..13 (matched against column 24), all 24 sensor
    columns are low-pass filtered with a 4th-order Butterworth filter
    (normalised cutoff 0.04), then the four summary statistics of one column
    are printed. Activities with no rows in a dataset are skipped after their
    header line is printed.

    Returns:
        None. Results are written to stdout only.
    """
    b, a = signal.butter(4, 0.04, 'low', analog=False)  # Butterworth low-pass filter

    for i in range(19):  # read all 19 datasets
        df = pd.read_csv('dataset_' + str(i + 1) + '.txt', sep=',', header=None)
        print("")  # blank line between datasets
        print('deal with dataset ' + str(i + 1))

        for c in range(1, 14):  # separate each activity in the dataset
            print("Activity" + str(c))
            # Cast to float so the filtered values written back below are not
            # truncated if the file happened to parse as integers.
            Activity_data = df[df[24] == c].values.astype(float)

            # np.min / np.max raise ValueError on an empty array, so skip
            # activities that do not occur in this dataset.
            if len(Activity_data) == 0:
                continue

            # Clean every sensor column. The column index must be j: indexing
            # with the dataset index i filtered a single column 24 times and
            # left all the others raw.
            for j in range(24):
                Activity_data[:, j] = signal.lfilter(b, a, Activity_data[:, j])

            # NOTE(review): the feature column is selected by the activity id
            # c, as in the original code — confirm this is the intended sensor.
            column = Activity_data[:, c]
            minVal = np.min(column)
            maxVal = np.max(column)
            meanVal = np.mean(column)
            medianVal = np.median(column)

            print("minVal = " + str(minVal) + ", maxVal = " + str(maxVal) +  ", meanVal = " + str(meanVal)
                  +  ", medianVal = " + str(medianVal))

In [None]:
# Print the per-activity summary statistics for all 19 datasets.
create_features()

In [None]:
def _window_features(window):
    """Return [min, max, mean] for each of the 24 sensor columns plus the label.

    The label is taken from the last column of the window's first row, giving
    24 * 3 + 1 = 73 values. `window` must contain at least one row.
    """
    features = []
    for sensor in range(24):
        channel = window[:, sensor]
        features.append(np.min(channel))
        features.append(np.max(channel))
        features.append(np.mean(channel))
    features.append(window[0, -1])
    return features


def _segment_features(data, window_len=1000):
    """Split `data` into consecutive windows and return one feature row per window.

    Windows hold `window_len` rows; the final one holds whatever remains.
    Stepping by `window_len` never yields an empty window, so an input whose
    length is zero or an exact multiple of `window_len` is handled safely.
    """
    return [_window_features(data[start:start + window_len])
            for start in range(0, len(data), window_len)]


def feature_engineering():
    """Build the training and testing feature tables from all 19 datasets.

    Reads 'dataset_1.txt' .. 'dataset_19.txt' from the working directory.
    For every activity id 1..13 (matched against column 24) the 24 sensor
    columns are low-pass filtered (4th-order Butterworth, normalised cutoff
    0.04). The first 80% of the activity's rows go to training and the rest
    to testing. Each part is cut into windows of up to 1000 rows, and every
    window becomes one 73-value row: min, max and mean per sensor, then the
    activity label. Activities that do not occur in a dataset are skipped.

    Returns:
        None. Writes 'training_data.csv' and 'testing_data.csv' (no header,
        no index) to the working directory.
    """
    # Collect rows in lists and build each array once at the end; growing an
    # array with np.concatenate inside the loops is quadratic.
    training_rows = []
    testing_rows = []

    # Declare the Butterworth filter once; it is shared by every column.
    b, a = signal.butter(4, 0.04, 'low', analog=False)

    # Read the dataset of each of the 19 participants.
    for i in range(19):
        df = pd.read_csv('dataset_' + str(i + 1) + '.txt', sep=',', header=None)
        print('deal with dataset ' + str(i + 1))

        # Separate each activity in the dataset.
        for c in range(1, 14):
            print("Deal with activity" + str(c))
            # Cast to float so the filtered values written back below are not
            # truncated if the file happened to parse as integers.
            activity_data = df[df[24] == c].values.astype(float)

            # Nothing to filter or segment when the activity is absent.
            if len(activity_data) == 0:
                continue

            # Apply the Butterworth filter to each sensor column in place.
            for j in range(24):
                activity_data[:, j] = signal.lfilter(b, a, activity_data[:, j])

            # First 80% of the rows are used for training, the rest for testing.
            training_len = math.floor(len(activity_data) * 0.8)
            training_rows.extend(_segment_features(activity_data[:training_len, :]))
            testing_rows.extend(_segment_features(activity_data[training_len:, :]))

    if training_rows:
        training = np.array(training_rows, dtype=float)
    else:
        training = np.empty(shape=(0, 73))
    if testing_rows:
        testing = np.array(testing_rows, dtype=float)
    else:
        testing = np.empty(shape=(0, 73))

    pd.DataFrame(training).to_csv('training_data.csv', index=False, header=False)
    pd.DataFrame(testing).to_csv('testing_data.csv', index=False, header=False)
                

                

In [None]:
# Generate training_data.csv and testing_data.csv from the 19 raw datasets.
feature_engineering()