## Import packages

In [1]:
## import required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from statistics import *
import pickle
import statistics
from scipy.stats.stats import pearsonr
from sklearn.metrics import auc

## Import dataset

In [2]:
## load raw dataset
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# NOTE(review): the absolute path is machine-specific and will not resolve on
# another machine.
with open('/Users/a/Desktop/df_raw.pkl', 'rb') as handle:
    example = pickle.load(handle)
    
# Wrap the unpickled object in a DataFrame for the feature-engineering cells below.
df=pd.DataFrame(example)

In [3]:
# Move the current index into a regular column and replace it with a default
# 0..n-1 RangeIndex, so rows can be addressed by position below.
# NOTE(review): re-running this cell is not idempotent -- a second call would
# insert an additional `index` column.
df = df.reset_index()

In [4]:
# Show column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19937 entries, 0 to 19936
Data columns (total 12 columns):
bookingID         19937 non-null int64
Accuracy          19937 non-null object
Bearing           19937 non-null object
acceleration_x    19937 non-null object
acceleration_y    19937 non-null object
acceleration_z    19937 non-null object
gyro_x            19937 non-null object
gyro_y            19937 non-null object
gyro_z            19937 non-null object
second            19937 non-null object
Speed             19937 non-null object
label             19937 non-null int64
dtypes: int64(2), object(10)
memory usage: 1.8+ MB


## Feature Engineering 

- Add resultant_acceleration

In [5]:
# Resultant acceleration magnitude per sample: sqrt(x^2 + y^2 + z^2).
# Each cell of the three acceleration columns holds one trip's sequence of
# readings, so the result is one array per trip.
# Iterating the columns directly avoids materialising a full row Series
# (df.iloc[i, :]) three times per trip.
result_acc = [
    np.sqrt(np.square(acc_x) + np.square(acc_y) + np.square(acc_z))
    for acc_x, acc_y, acc_z in zip(
        df['acceleration_x'], df['acceleration_y'], df['acceleration_z']
    )
]

df['result_acceleration'] = result_acc

- Add resultant_gyro

In [6]:
# Resultant gyroscope magnitude per sample: sqrt(x^2 + y^2 + z^2).
# Each cell of the three gyro columns holds one trip's sequence of readings,
# so the result is one array per trip.
# Iterating the columns directly avoids materialising a full row Series
# (df.iloc[i, :]) three times per trip.
result_gyro = [
    np.sqrt(np.square(gyro_x) + np.square(gyro_y) + np.square(gyro_z))
    for gyro_x, gyro_y, gyro_z in zip(df['gyro_x'], df['gyro_y'], df['gyro_z'])
]

df['result_gyro'] = result_gyro

- Add rotation angle for x-axis and y-axis

In [7]:
# Rotation angle about the x-axis per sample:
#     arctan(acc_y / sqrt(acc_x^2 + acc_z^2))
# One array per trip. Iterating the columns directly avoids materialising a
# full row Series (df.iloc[i, :]) three times per trip.
rotate_x = [
    np.arctan(acc_y / np.sqrt(np.square(acc_x) + np.square(acc_z)))
    for acc_x, acc_y, acc_z in zip(
        df['acceleration_x'], df['acceleration_y'], df['acceleration_z']
    )
]

df['rotate_x'] = rotate_x

  


In [8]:
def flip_sign(list_):
    """Return a new list holding the negation of every element of `list_`."""
    negated = []
    for value in list_:
        negated.append(-value)
    return negated

# Rotation angle about the y-axis per sample: arctan(-acc_x / acc_z).
# One array per trip. Iterating the columns directly avoids materialising a
# full row Series (df.iloc[i, :]) twice per trip.
# NOTE(review): a sample with acc_z == 0 divides by zero (inf, or NaN for 0/0).
# This is presumably why rotate_y_dist ends up with fewer non-null rows than
# the other features -- confirm, and consider np.arctan2 if the wider output
# range is acceptable.
rotate_y = [
    np.arctan(np.array(flip_sign(acc_x)) / np.array(acc_z))
    for acc_x, acc_z in zip(df['acceleration_x'], df['acceleration_z'])
]

df['rotate_y'] = rotate_y

  if __name__ == '__main__':
  if __name__ == '__main__':


- Extract features from raw dataset

In [9]:
class feature_global():
    """Per-trip feature extractor over a DataFrame whose cells hold sequences.

    Every method takes a positional row ``index`` (and usually a column name
    ``field``) and reduces the sequence stored in that cell to one or more
    scalar features. The methods index those sequences with plain integers
    (``seq[i]``, ``seq[-1]``), so the cells are presumably lists or NumPy
    arrays -- confirm against the loader.
    """

    def __init__(self, df):
        """Keep a reference to ``df``; the frame is not copied."""
        self.df = df

    def _values(self, field, index):
        """Return the object stored in column ``field`` at row position ``index``."""
        # Column-first lookup avoids building a full row Series on every call.
        return self.df[field].iloc[index]

    def _longest_run_fraction(self, field, index, rising):
        """Longest strictly rising (or falling) run, as a fraction of the length.

        Counts consecutive steps in the requested direction, adds one so the
        figure is in samples rather than steps, and divides by the sequence
        length. Raises ZeroDivisionError for an empty sequence.
        """
        values = self._values(field, index)
        best = run = 0
        for prev, cur in zip(values, values[1:]):
            moved = cur > prev if rising else cur < prev
            if moved:
                run += 1
                best = max(best, run)
            else:
                # Ties and reversals both break the run.
                run = 0
        return (best + 1) / len(values)

    def sum_(self, field, index):
        """Sum of absolute values of the sequence."""
        return sum(np.abs(self._values(field, index)))

    def mean(self, field, index):
        """Arithmetic mean of the sequence (``np.mean``)."""
        return np.mean(self._values(field, index))

    def max_(self, field, index):
        """Largest element of the sequence (built-in ``max``)."""
        return max(self._values(field, index))

    def std(self, field, index):
        """Sample standard deviation (``statistics.stdev``); needs >= 2 points."""
        return statistics.stdev(self._values(field, index))

    def iqr(self, field, index):
        """Interquartile range: 75th percentile minus 25th percentile."""
        upper, lower = np.percentile(self._values(field, index), [75, 25])
        return upper - lower

    def integrate(self, field, index):
        """Area under ``field`` plotted against ``second`` (``sklearn.metrics.auc``)."""
        return auc(self._values('second', index), self._values(field, index))

    def max_consecutive_increase(self, field, index):
        """Longest strictly increasing run, as a fraction of the sequence length."""
        return self._longest_run_fraction(field, index, rising=True)

    def max_consecutive_decrease(self, field, index):
        """Longest strictly decreasing run, as a fraction of the sequence length."""
        return self._longest_run_fraction(field, index, rising=False)

    def change(self, field, index):
        """Return ``[mean, max]`` of the successive differences of the sequence.

        ``statistics.mean`` raises StatisticsError when the sequence has fewer
        than two elements (no differences to average).
        """
        values = self._values(field, index)
        steps = [cur - prev for prev, cur in zip(values, values[1:])]
        return [statistics.mean(steps), max(steps)]

    def avg_speed(self, index):
        """Integrated ``Speed`` divided by the last ``second`` value of the trip."""
        distance = self.integrate('Speed', index)
        return distance / self._values('second', index)[-1]

    def avg_gyro(self, index):
        """Integrated ``result_gyro`` divided by the last ``second`` value of the trip."""
        rad_dist = self.integrate('result_gyro', index)
        return rad_dist / self._values('second', index)[-1]

    def bearing(self, index):
        """Return ``[mean, max, total / distance]`` of signed bearing changes.

        Successive ``Bearing`` differences are taken with a wrap-around
        correction at the 0/360 boundary: a step from above 270 to below 90 is
        treated as a positive turn through 360, and a step from below 90 to
        above 270 as the matching negative turn. ``distance`` is the
        integrated ``Speed`` of the trip.
        """
        headings = self._values('Bearing', index)
        distance = self.integrate('Speed', index)
        diff_bear = []

        for prev, cur in zip(headings, headings[1:]):
            if cur < 90 and prev > 270:
                # e.g. 350 -> 10 is a +20 turn, not -340.
                diff_bear.append(cur + 360 - prev)
            elif cur > 270 and prev < 90:
                # e.g. 10 -> 350 is a -20 turn; keep the sign so it is
                # consistent with the other two branches.
                diff_bear.append(cur - 360 - prev)
            else:
                diff_bear.append(cur - prev)

        return [statistics.mean(diff_bear), max(diff_bear), sum(diff_bear) / distance]

    def gap(self, index):
        """Return ``[mean, max]`` of the gaps between successive ``second`` values.

        The first sample has no predecessor and contributes a gap of 0.
        """
        # Wrap in a Series so .diff() works whether the cell holds a list, a
        # NumPy array or already a Series.
        diff_sec = pd.Series(self._values('second', index)).diff().fillna(0)
        return [statistics.mean(diff_sec), max(diff_sec)]

In [10]:
# Per-trip accumulators for the per-axis maximum acceleration.
accx_max, accy_max, accz_max = [], [], []

In [11]:
# Per-trip accumulators for the resultant-acceleration features; each name
# gets its own independent empty list.
(acc_mean, acc_max, acc_std, acc_iqr,
 acc_increase, acc_decrease, acc_mean_diff, acc_max_diff) = ([] for _ in range(8))

In [12]:
# Per-trip accumulators for the rotation-angle features; each name gets its
# own independent empty list.
rotate_x_max, rotate_x_max_diff, rotate_x_dist = [], [], []
rotate_y_max, rotate_y_max_diff, rotate_y_dist = [], [], []
rotate_z_dist = list()

In [13]:
# Per-trip accumulators for the resultant-gyro features; each name gets its
# own independent empty list.
(gyro_mean, gyro_max, gyro_iqr,
 gyro_increase, gyro_decrease, avg_gyro) = ([] for _ in range(6))

In [14]:
# Per-trip accumulators for the speed features; each name gets its own
# independent empty list.
(speed_mean, speed_std, speed_max, speed_iqr,
 speed_increase, speed_decrease, distance, avg_speed) = ([] for _ in range(8))

In [15]:
# Per-trip accumulators for the bearing features; each name gets its own
# independent empty list.
(bear_increase, bear_decrease, bear_std,
 bear_mean_diff, bear_max_diff, bear_change_per_dist) = ([] for _ in range(6))

In [16]:
# Accumulator for the last `second` value of each trip (filled in the loop below).
trip_len = []

# Feature extractor bound to the engineered frame; it keeps a reference to df.
features = feature_global(df)

In [17]:
# Fill every per-trip accumulator, one row of df at a time.
# features.change() and features.bearing() return several values each; they
# are called once per trip and unpacked instead of being recomputed for every
# element that is needed.
n_trips = len(df)

for i in range(n_trips):

    # Per-axis peak acceleration.
    accx_max.append(features.max_('acceleration_x', i))
    accy_max.append(features.max_('acceleration_y', i))
    accz_max.append(features.max_('acceleration_z', i))

    # Resultant acceleration.
    acc_mean.append(features.mean('result_acceleration', i))
    acc_max.append(features.max_('result_acceleration', i))
    acc_std.append(features.std('result_acceleration', i))
    acc_iqr.append(features.iqr('result_acceleration', i))
    acc_increase.append(features.max_consecutive_increase('result_acceleration', i))
    acc_decrease.append(features.max_consecutive_decrease('result_acceleration', i))
    acc_change_mean, acc_change_max = features.change('result_acceleration', i)
    acc_mean_diff.append(acc_change_mean)
    acc_max_diff.append(acc_change_max)

    # Resultant gyro.
    gyro_mean.append(features.mean('result_gyro', i))
    gyro_max.append(features.max_('result_gyro', i))
    gyro_iqr.append(features.iqr('result_gyro', i))
    gyro_increase.append(features.max_consecutive_increase('result_gyro', i))
    gyro_decrease.append(features.max_consecutive_decrease('result_gyro', i))
    avg_gyro.append(features.avg_gyro(i))
    rotate_z_dist.append(features.integrate('result_gyro', i))

    # Bearing.
    bear_increase.append(features.max_consecutive_increase('Bearing', i))
    bear_decrease.append(features.max_consecutive_decrease('Bearing', i))
    bear_change_mean, bear_change_max, bear_per_dist = features.bearing(i)
    bear_mean_diff.append(bear_change_mean)
    bear_max_diff.append(bear_change_max)
    bear_std.append(features.std('Bearing', i))
    bear_change_per_dist.append(bear_per_dist)

    # Speed.
    speed_mean.append(features.mean('Speed', i))
    speed_max.append(features.max_('Speed', i))
    speed_iqr.append(features.iqr('Speed', i))
    speed_increase.append(features.max_consecutive_increase('Speed', i))
    speed_decrease.append(features.max_consecutive_decrease('Speed', i))
    speed_std.append(features.std('Speed', i))
    distance.append(features.integrate('Speed', i))
    avg_speed.append(features.avg_speed(i))

    # Rotation angles; only the max element of change() is used here.
    rotate_x_max.append(features.max_('rotate_x', i))
    rotate_x_max_diff.append(features.change('rotate_x', i)[1])
    rotate_x_dist.append(features.sum_('rotate_x', i))

    rotate_y_max.append(features.max_('rotate_y', i))
    rotate_y_max_diff.append(features.change('rotate_y', i)[1])
    rotate_y_dist.append(features.sum_('rotate_y', i))

    # Trip length is the last recorded `second` value.
    trip_len.append(df['second'].iloc[i][-1])

    # Coarse progress report every 5000 trips.
    if i % 5000 == 0:
        print(i,' out of ',n_trips,' is completed')

0  out of  19937  is completed
5000  out of  19937  is completed
10000  out of  19937  is completed
15000  out of  19937  is completed


In [18]:
# Sanity check: one entry per trip should have been appended by the loop above.
len(rotate_x_dist)

19937

## Combine features into a single dataframe

In [19]:
# Stack the per-trip feature lists side by side, one column per feature, in
# the same order as the column names assigned in the next cell.
# Stacking into a single ndarray coerces every column to one common dtype, so
# bookingID and label come out as floats rather than ints.
_feature_columns = (
    list(df.bookingID), list(df.label),
    accx_max, accy_max, accz_max,
    acc_mean, acc_max, acc_std, acc_iqr, acc_increase, acc_decrease,
    acc_mean_diff, acc_max_diff,
    rotate_x_max, rotate_x_max_diff, rotate_x_dist,
    rotate_y_max, rotate_y_max_diff, rotate_y_dist,
    rotate_z_dist,
    gyro_mean, gyro_max, gyro_iqr, gyro_increase, gyro_decrease, avg_gyro,
    speed_mean, speed_std, speed_max, speed_iqr, speed_increase, speed_decrease,
    distance, avg_speed,
    bear_increase, bear_decrease, bear_std, bear_mean_diff, bear_max_diff,
    bear_change_per_dist,
    trip_len,
)
# Indexing np.c_ with a tuple is the same call as writing the items inline.
df_feature = np.c_[_feature_columns]

In [20]:
# Turn the stacked feature matrix into a labelled DataFrame; the names are
# listed in the same order in which the columns were stacked.
df_feature = pd.DataFrame(
    df_feature,
    columns=[
        'bookingID', 'label',
        'accx_max', 'accy_max', 'accz_max',
        'acc_mean', 'acc_max', 'acc_std', 'acc_iqr',
        'acc_increase', 'acc_decrease', 'acc_mean_diff', 'acc_max_diff',
        'rotate_x_max', 'rotate_x_max_diff', 'rotate_x_dist',
        'rotate_y_max', 'rotate_y_max_diff', 'rotate_y_dist',
        'rotate_z_dist',
        'gyro_mean', 'gyro_max', 'gyro_iqr',
        'gyro_increase', 'gyro_decrease', 'avg_gyro',
        'speed_mean', 'speed_std', 'speed_max', 'speed_iqr',
        'speed_increase', 'speed_decrease', 'distance', 'avg_speed',
        'bear_increase', 'bear_decrease', 'bear_std',
        'bear_mean_diff', 'bear_max_diff', 'bear_change_per_dist',
        'trip_len',
    ],
)

In [21]:
# Summary statistics for the first ten columns of the feature table.
df_feature.describe().iloc[:,:10]

Unnamed: 0,bookingID,label,accx_max,accy_max,accz_max,acc_mean,acc_max,acc_std,acc_iqr,acc_increase
count,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0
mean,816267000000.0,0.249887,4.119455,8.440648,5.720122,9.901941,14.265175,0.755279,0.710751,0.012352
std,494506100000.0,0.432958,2.50875,8.307189,4.984594,1.015064,4.449886,0.474272,0.449172,0.008892
min,0.0,0.0,-7.12121,-9.108994,-8.326172,0.973356,1.039078,0.010257,0.0,0.001746
25%,386547100000.0,0.0,2.903094,5.860738,3.063285,9.800065,12.522641,0.556651,0.490622,0.00653
50%,798863900000.0,0.0,3.684006,11.986897,5.236456,9.876681,13.454601,0.681726,0.649776,0.009615
75%,1245541000000.0,0.0,4.687981,13.225974,7.66076,9.960951,14.732588,0.83574,0.835849,0.014925
max,1709397000000.0,1.0,66.873456,75.055885,78.055756,40.021,115.152244,9.836279,11.419731,0.088235


In [22]:
# Show non-null counts and dtypes of the feature table; a count below the row
# total flags a feature that contains NaN for some trips.
df_feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19937 entries, 0 to 19936
Data columns (total 41 columns):
bookingID               19937 non-null float64
label                   19937 non-null float64
accx_max                19937 non-null float64
accy_max                19937 non-null float64
accz_max                19937 non-null float64
acc_mean                19937 non-null float64
acc_max                 19937 non-null float64
acc_std                 19937 non-null float64
acc_iqr                 19937 non-null float64
acc_increase            19937 non-null float64
acc_decrease            19937 non-null float64
acc_mean_diff           19937 non-null float64
acc_max_diff            19937 non-null float64
rotate_x_max            19937 non-null float64
rotate_x_max_diff       19937 non-null float64
rotate_x_dist           19937 non-null float64
rotate_y_max            19937 non-null float64
rotate_y_max_diff       19937 non-null float64
rotate_y_dist           19915 non-null float6

In [23]:
# Persist the feature table as a pickle. `pickle` is already imported in the
# first cell, so it is not re-imported here.
# NOTE(review): the absolute path is machine-specific and will not resolve on
# another machine.
with open("/Users/a/Desktop/df_feature.pkl","wb") as handle:
    pickle.dump(df_feature,handle,protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
# Also export the feature table as CSV. With the default index=True the row
# index is written as an extra, unnamed first column.
# NOTE(review): the absolute path is machine-specific.
df_feature.to_csv('/Users/a/Desktop/df_feature.csv')