# Data Preprocessing - Speed Bump Detection


## 1.Read dataset and save it into a pandas DataFrame

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from scipy.stats import kurtosis

df = pd.read_csv("Road_surface_dataset Raw V0.2.csv", index_col=0)

df.head()

Unnamed: 0_level_0,ACC_X,ACC_Y,ACC_Z,GYRO_X,GYRO_Y,GYRO_Z,Latitude,Longitude,Altitude,Accuracy,Speed,Bearing,Road_Anomaly_type,Date
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
30419,-0.491625,8.789066,0.363765,0.0,0.0,0.0,35.710672,-0.602766,203.886207,28.136999,0.0,0.0,NONE,18-07-2021 11:19:27
30420,-1.616795,10.795905,0.492573,0.0,0.0,0.0,35.710672,-0.602766,203.886207,28.136999,0.0,0.0,NONE,18-07-2021 11:19:27
30421,1.363862,8.480052,-0.841514,0.25597,-1.71524,0.424865,35.710556,-0.602052,203.886207,2112.0,0.0,0.0,NONE,18-07-2021 11:19:28
30422,-1.312856,11.562758,2.396663,-0.17202,-0.091665,-0.005201,35.710556,-0.602052,203.886207,2112.0,0.0,0.0,NONE,18-07-2021 11:19:28
30423,0.47388,12.242002,2.707124,-0.090251,0.214676,0.03454,35.710556,-0.602052,203.886207,2112.0,0.0,0.0,NONE,18-07-2021 11:19:28


## 2.Value Counts for the target class "Road_Anomaly_type"

In [2]:
print(df["Road_Anomaly_type"].value_counts())

NONE          10915
Speed Bump       95
Name: Road_Anomaly_type, dtype: int64


## 3.Transform target class type from textual to number


In [3]:
road_anomaly_none = df["Road_Anomaly_type"] == "NONE"
road_anomaly_sb = df["Road_Anomaly_type"] == "Speed Bump"

df.loc[road_anomaly_none,"Road_Anomaly_type"] = 0
df.loc[road_anomaly_sb,"Road_Anomaly_type"] = 1

print(df["Road_Anomaly_type"].value_counts())

0    10915
1       95
Name: Road_Anomaly_type, dtype: int64


## 4.Prepare data for sampling by different seconds windows

In [4]:
df_sampled = df
df_sampled['seconds']=0
i=0
previous = 0
current_time = df_sampled['Date'].iloc[i]
while i < df_sampled['Road_Anomaly_type'].size:
    if current_time == df_sampled['Date'].iloc[i]:
        df_sampled['seconds'].iloc[i]= previous
        i=i+1
    else:
        df_sampled['seconds'].iloc[i]= previous+1
        previous = previous+1
        current_time = df_sampled['Date'].iloc[i]
        i=i+1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


## 4.1.Sample Data using (one second window)

Five statistical features (mean, variance, skewness, kurtosis, standard deviation, max) are generated for each sensors value (ACC_X	ACC_Y	ACC_Z	GYRO_X	GYRO_Y	GYRO_Z).

In [5]:
df_sampled_one_sec = df_sampled.groupby("seconds",as_index=False).agg(
    ACC_X_mean=pd.NamedAgg(column="ACC_X", aggfunc=np.mean),
    ACC_X_var=pd.NamedAgg(column="ACC_X", aggfunc=np.var),
    ACC_X_skew=pd.NamedAgg(column="ACC_X", aggfunc=skew),
    ACC_X_kurtosis=pd.NamedAgg(column="ACC_X", aggfunc=kurtosis),
    ACC_X_std=pd.NamedAgg(column="ACC_X", aggfunc=np.std),
    ACC_X_max=pd.NamedAgg(column="ACC_X", aggfunc="max"),
    ACC_X_DR=pd.NamedAgg(column="ACC_X", aggfunc=lambda x: x.max() - x.min()),
    ACC_Y_mean=pd.NamedAgg(column="ACC_Y", aggfunc=np.mean),
    ACC_Y_var=pd.NamedAgg(column="ACC_Y", aggfunc=np.var),
    ACC_Y_skew=pd.NamedAgg(column="ACC_Y", aggfunc=skew),
    ACC_Y_kurtosis=pd.NamedAgg(column="ACC_Y", aggfunc=kurtosis),
    ACC_Y_std=pd.NamedAgg(column="ACC_Y", aggfunc=np.std),
    ACC_Y_max=pd.NamedAgg(column="ACC_Y", aggfunc="max"),
    ACC_Y_DR=pd.NamedAgg(column="ACC_Y", aggfunc=lambda x: x.max() - x.min()),
    ACC_Z_mean=pd.NamedAgg(column="ACC_Z", aggfunc=np.mean),
    ACC_Z_var=pd.NamedAgg(column="ACC_Z", aggfunc=np.var),
    ACC_Z_skew=pd.NamedAgg(column="ACC_Z", aggfunc=skew),
    ACC_Z_kurtosis=pd.NamedAgg(column="ACC_Z", aggfunc=kurtosis),
    ACC_Z_std=pd.NamedAgg(column="ACC_Z", aggfunc=np.std),
    ACC_Z_max=pd.NamedAgg(column="ACC_Z", aggfunc="max"),
    ACC_Z_DR=pd.NamedAgg(column="ACC_Z", aggfunc=lambda x: x.max() - x.min()),
    GYRO_X_mean=pd.NamedAgg(column="GYRO_X", aggfunc=np.mean),
    GYRO_X_var=pd.NamedAgg(column="GYRO_X", aggfunc=np.var),
    GYRO_X_skew=pd.NamedAgg(column="GYRO_X", aggfunc=skew),
    GYRO_X_kurtosis=pd.NamedAgg(column="GYRO_X", aggfunc=kurtosis),
    GYRO_X_std=pd.NamedAgg(column="GYRO_X", aggfunc=np.std),
    GYRO_X_max=pd.NamedAgg(column="GYRO_X", aggfunc="max"),
    GYRO_X_DR=pd.NamedAgg(column="GYRO_X", aggfunc=lambda x: x.max() - x.min()),
    GYRO_Y_mean=pd.NamedAgg(column="GYRO_Y", aggfunc=np.mean),
    GYRO_Y_var=pd.NamedAgg(column="GYRO_Y", aggfunc=np.var),
    GYRO_Y_skew=pd.NamedAgg(column="GYRO_Y", aggfunc=skew),
    GYRO_Y_kurtosis=pd.NamedAgg(column="GYRO_Y", aggfunc=kurtosis),
    GYRO_Y_std=pd.NamedAgg(column="GYRO_Y", aggfunc=np.std),
    GYRO_Y_max=pd.NamedAgg(column="GYRO_Y", aggfunc="max"),
    GYRO_Y_DR=pd.NamedAgg(column="GYRO_Y", aggfunc=lambda x: x.max() - x.min()),
    GYRO_Z_mean=pd.NamedAgg(column="GYRO_Z", aggfunc=np.mean),
    GYRO_Z_var=pd.NamedAgg(column="GYRO_Z", aggfunc=np.var),
    GYRO_Z_skew=pd.NamedAgg(column="GYRO_Z", aggfunc=skew),
    GYRO_Z_kurtosis=pd.NamedAgg(column="GYRO_Z", aggfunc=kurtosis),
    GYRO_Z_std=pd.NamedAgg(column="GYRO_Z", aggfunc=np.std),
    GYRO_Z_max=pd.NamedAgg(column="GYRO_Z", aggfunc="max"),
    GYRO_Z_DR=pd.NamedAgg(column="GYRO_Z", aggfunc=lambda x: x.max() - x.min()),
    Lat=pd.NamedAgg(column="Latitude", aggfunc="max"),
    Lon=pd.NamedAgg(column="Longitude", aggfunc="max"),
    Alt=pd.NamedAgg(column="Altitude", aggfunc="max"),
    Speed=pd.NamedAgg(column="Speed", aggfunc=np.mean),
    Accuracy=pd.NamedAgg(column="Accuracy", aggfunc=np.mean),
    Bearing=pd.NamedAgg(column="Bearing", aggfunc="max"),
    Road_Anomaly_type=pd.NamedAgg(column="Road_Anomaly_type", aggfunc="max"),
    
)


In [6]:
print(df_sampled_one_sec["Road_Anomaly_type"].value_counts())
df_sampled_one_sec.shape

0    2062
1      95
Name: Road_Anomaly_type, dtype: int64


(2157, 50)

## 4.1.1.Drop Null Values

In [7]:
df_sampled_one_sec.isnull().sum()

seconds               0
ACC_X_mean            0
ACC_X_var            20
ACC_X_skew            0
ACC_X_kurtosis        0
ACC_X_std            20
ACC_X_max             0
ACC_X_DR              0
ACC_Y_mean            0
ACC_Y_var            20
ACC_Y_skew            0
ACC_Y_kurtosis        0
ACC_Y_std            20
ACC_Y_max             0
ACC_Y_DR              0
ACC_Z_mean            0
ACC_Z_var            20
ACC_Z_skew            0
ACC_Z_kurtosis        0
ACC_Z_std            20
ACC_Z_max             0
ACC_Z_DR              0
GYRO_X_mean           0
GYRO_X_var           20
GYRO_X_skew           0
GYRO_X_kurtosis       0
GYRO_X_std           20
GYRO_X_max            0
GYRO_X_DR             0
GYRO_Y_mean           0
GYRO_Y_var           20
GYRO_Y_skew           0
GYRO_Y_kurtosis       0
GYRO_Y_std           20
GYRO_Y_max            0
GYRO_Y_DR             0
GYRO_Z_mean           0
GYRO_Z_var           20
GYRO_Z_skew           0
GYRO_Z_kurtosis       0
GYRO_Z_std           20
GYRO_Z_max      

In [8]:
df_sampled_one_sec = df_sampled_one_sec.dropna()
df_sampled_one_sec = df_sampled_one_sec.reset_index(drop=True)
df_sampled_one_sec.shape

(2137, 50)

In [9]:
df_sampled_one_sec = df_sampled_one_sec.drop(['seconds'], axis=1)

df_sampled_one_sec.columns

Index(['ACC_X_mean', 'ACC_X_var', 'ACC_X_skew', 'ACC_X_kurtosis', 'ACC_X_std',
       'ACC_X_max', 'ACC_X_DR', 'ACC_Y_mean', 'ACC_Y_var', 'ACC_Y_skew',
       'ACC_Y_kurtosis', 'ACC_Y_std', 'ACC_Y_max', 'ACC_Y_DR', 'ACC_Z_mean',
       'ACC_Z_var', 'ACC_Z_skew', 'ACC_Z_kurtosis', 'ACC_Z_std', 'ACC_Z_max',
       'ACC_Z_DR', 'GYRO_X_mean', 'GYRO_X_var', 'GYRO_X_skew',
       'GYRO_X_kurtosis', 'GYRO_X_std', 'GYRO_X_max', 'GYRO_X_DR',
       'GYRO_Y_mean', 'GYRO_Y_var', 'GYRO_Y_skew', 'GYRO_Y_kurtosis',
       'GYRO_Y_std', 'GYRO_Y_max', 'GYRO_Y_DR', 'GYRO_Z_mean', 'GYRO_Z_var',
       'GYRO_Z_skew', 'GYRO_Z_kurtosis', 'GYRO_Z_std', 'GYRO_Z_max',
       'GYRO_Z_DR', 'Lat', 'Lon', 'Alt', 'Speed', 'Accuracy', 'Bearing',
       'Road_Anomaly_type'],
      dtype='object')

## 4.1.2.Save sampled (One sec Window) dataset into a csv file

In [10]:
df_sampled_one_sec.to_csv('Default sensors values and one second sampling window.csv')

## 4.1.3.Sample the data (Reducing None data)

In [11]:
print(df_sampled_one_sec["Road_Anomaly_type"].value_counts())

0    2042
1      95
Name: Road_Anomaly_type, dtype: int64


In [12]:
df_sampled_one_sec_sampled = df_sampled_one_sec
i=0
j=0
df_sampled_one_sec_sampled['remove']=False
for name, values in df_sampled_one_sec_sampled['Road_Anomaly_type'].iteritems():
    if(values == False):
        j=j+1
    else:
        if(j>6):
            df_sampled_one_sec_sampled.loc[i-j+3:i-4,'remove'] = True
            j=0
        else:
            j=0
    i=i+1

In [13]:
df_sampled_one_sec_sampled = df_sampled_one_sec_sampled[df_sampled_one_sec_sampled.remove != True]
print(df_sampled_one_sec_sampled["Road_Anomaly_type"].value_counts())

0    567
1     95
Name: Road_Anomaly_type, dtype: int64


In [14]:
df_sampled_one_sec_sampled = df_sampled_one_sec_sampled.drop(['remove'], axis=1)

df_sampled_one_sec_sampled.columns

Index(['ACC_X_mean', 'ACC_X_var', 'ACC_X_skew', 'ACC_X_kurtosis', 'ACC_X_std',
       'ACC_X_max', 'ACC_X_DR', 'ACC_Y_mean', 'ACC_Y_var', 'ACC_Y_skew',
       'ACC_Y_kurtosis', 'ACC_Y_std', 'ACC_Y_max', 'ACC_Y_DR', 'ACC_Z_mean',
       'ACC_Z_var', 'ACC_Z_skew', 'ACC_Z_kurtosis', 'ACC_Z_std', 'ACC_Z_max',
       'ACC_Z_DR', 'GYRO_X_mean', 'GYRO_X_var', 'GYRO_X_skew',
       'GYRO_X_kurtosis', 'GYRO_X_std', 'GYRO_X_max', 'GYRO_X_DR',
       'GYRO_Y_mean', 'GYRO_Y_var', 'GYRO_Y_skew', 'GYRO_Y_kurtosis',
       'GYRO_Y_std', 'GYRO_Y_max', 'GYRO_Y_DR', 'GYRO_Z_mean', 'GYRO_Z_var',
       'GYRO_Z_skew', 'GYRO_Z_kurtosis', 'GYRO_Z_std', 'GYRO_Z_max',
       'GYRO_Z_DR', 'Lat', 'Lon', 'Alt', 'Speed', 'Accuracy', 'Bearing',
       'Road_Anomaly_type'],
      dtype='object')

In [15]:
df_sampled_one_sec_sampled.to_csv('Default sensors values and one second sampling window SAMPLED.csv')

## 4.2.Sample Data using (two seconds window)

In [16]:
df_sampled_two_sec = df_sampled
df_sampled_two_sec['Two_Sec_window']=0
i=0
while i < df_sampled_two_sec['Road_Anomaly_type'].size:
    if (df_sampled_two_sec['seconds'].iloc[i] % 2) == 0:
        df_sampled_two_sec['Two_Sec_window'].iloc[i] = df_sampled_two_sec['seconds'].iloc[i]
    else:
        df_sampled_two_sec['Two_Sec_window'].iloc[i] = df_sampled_two_sec['seconds'].iloc[i]-1
    i=i+1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [17]:
df_sampled_two_sec = df_sampled_two_sec.groupby("Two_Sec_window",as_index=False).agg(
    ACC_X_mean=pd.NamedAgg(column="ACC_X", aggfunc=np.mean),
    ACC_X_var=pd.NamedAgg(column="ACC_X", aggfunc=np.var),
    ACC_X_skew=pd.NamedAgg(column="ACC_X", aggfunc=skew),
    ACC_X_kurtosis=pd.NamedAgg(column="ACC_X", aggfunc=kurtosis),
    ACC_X_std=pd.NamedAgg(column="ACC_X", aggfunc=np.std),
    ACC_X_max=pd.NamedAgg(column="ACC_X", aggfunc="max"),
    ACC_X_DR=pd.NamedAgg(column="ACC_X", aggfunc=lambda x: x.max() - x.min()),
    ACC_Y_mean=pd.NamedAgg(column="ACC_Y", aggfunc=np.mean),
    ACC_Y_var=pd.NamedAgg(column="ACC_Y", aggfunc=np.var),
    ACC_Y_skew=pd.NamedAgg(column="ACC_Y", aggfunc=skew),
    ACC_Y_kurtosis=pd.NamedAgg(column="ACC_Y", aggfunc=kurtosis),
    ACC_Y_std=pd.NamedAgg(column="ACC_Y", aggfunc=np.std),
    ACC_Y_max=pd.NamedAgg(column="ACC_Y", aggfunc="max"),
    ACC_Y_DR=pd.NamedAgg(column="ACC_Y", aggfunc=lambda x: x.max() - x.min()),
    ACC_Z_mean=pd.NamedAgg(column="ACC_Z", aggfunc=np.mean),
    ACC_Z_var=pd.NamedAgg(column="ACC_Z", aggfunc=np.var),
    ACC_Z_skew=pd.NamedAgg(column="ACC_Z", aggfunc=skew),
    ACC_Z_kurtosis=pd.NamedAgg(column="ACC_Z", aggfunc=kurtosis),
    ACC_Z_std=pd.NamedAgg(column="ACC_Z", aggfunc=np.std),
    ACC_Z_max=pd.NamedAgg(column="ACC_Z", aggfunc="max"),
    ACC_Z_DR=pd.NamedAgg(column="ACC_Z", aggfunc=lambda x: x.max() - x.min()),
    GYRO_X_mean=pd.NamedAgg(column="GYRO_X", aggfunc=np.mean),
    GYRO_X_var=pd.NamedAgg(column="GYRO_X", aggfunc=np.var),
    GYRO_X_skew=pd.NamedAgg(column="GYRO_X", aggfunc=skew),
    GYRO_X_kurtosis=pd.NamedAgg(column="GYRO_X", aggfunc=kurtosis),
    GYRO_X_std=pd.NamedAgg(column="GYRO_X", aggfunc=np.std),
    GYRO_X_max=pd.NamedAgg(column="GYRO_X", aggfunc="max"),
    GYRO_X_DR=pd.NamedAgg(column="GYRO_X", aggfunc=lambda x: x.max() - x.min()),
    GYRO_Y_mean=pd.NamedAgg(column="GYRO_Y", aggfunc=np.mean),
    GYRO_Y_var=pd.NamedAgg(column="GYRO_Y", aggfunc=np.var),
    GYRO_Y_skew=pd.NamedAgg(column="GYRO_Y", aggfunc=skew),
    GYRO_Y_kurtosis=pd.NamedAgg(column="GYRO_Y", aggfunc=kurtosis),
    GYRO_Y_std=pd.NamedAgg(column="GYRO_Y", aggfunc=np.std),
    GYRO_Y_max=pd.NamedAgg(column="GYRO_Y", aggfunc="max"),
    GYRO_Y_DR=pd.NamedAgg(column="GYRO_Y", aggfunc=lambda x: x.max() - x.min()),
    GYRO_Z_mean=pd.NamedAgg(column="GYRO_Z", aggfunc=np.mean),
    GYRO_Z_var=pd.NamedAgg(column="GYRO_Z", aggfunc=np.var),
    GYRO_Z_skew=pd.NamedAgg(column="GYRO_Z", aggfunc=skew),
    GYRO_Z_kurtosis=pd.NamedAgg(column="GYRO_Z", aggfunc=kurtosis),
    GYRO_Z_std=pd.NamedAgg(column="GYRO_Z", aggfunc=np.std),
    GYRO_Z_max=pd.NamedAgg(column="GYRO_Z", aggfunc="max"),
    GYRO_Z_DR=pd.NamedAgg(column="GYRO_Z", aggfunc=lambda x: x.max() - x.min()),
    Lat=pd.NamedAgg(column="Latitude", aggfunc="max"),
    Lon=pd.NamedAgg(column="Longitude", aggfunc="max"),
    Alt=pd.NamedAgg(column="Altitude", aggfunc="max"),
    Speed=pd.NamedAgg(column="Speed", aggfunc=np.mean),
    Accuracy=pd.NamedAgg(column="Accuracy", aggfunc=np.mean),
    Bearing=pd.NamedAgg(column="Bearing", aggfunc="max"),
    Road_Anomaly_type=pd.NamedAgg(column="Road_Anomaly_type", aggfunc="max"),
    )

In [18]:
print(df_sampled_two_sec["Road_Anomaly_type"].value_counts())

0    984
1     95
Name: Road_Anomaly_type, dtype: int64


In [19]:
df_sampled_two_sec = df_sampled_two_sec.reset_index(drop=True)
df_sampled_two_sec.isnull().sum()

Two_Sec_window       0
ACC_X_mean           0
ACC_X_var            0
ACC_X_skew           0
ACC_X_kurtosis       0
ACC_X_std            0
ACC_X_max            0
ACC_X_DR             0
ACC_Y_mean           0
ACC_Y_var            0
ACC_Y_skew           0
ACC_Y_kurtosis       0
ACC_Y_std            0
ACC_Y_max            0
ACC_Y_DR             0
ACC_Z_mean           0
ACC_Z_var            0
ACC_Z_skew           0
ACC_Z_kurtosis       0
ACC_Z_std            0
ACC_Z_max            0
ACC_Z_DR             0
GYRO_X_mean          0
GYRO_X_var           0
GYRO_X_skew          0
GYRO_X_kurtosis      0
GYRO_X_std           0
GYRO_X_max           0
GYRO_X_DR            0
GYRO_Y_mean          0
GYRO_Y_var           0
GYRO_Y_skew          0
GYRO_Y_kurtosis      0
GYRO_Y_std           0
GYRO_Y_max           0
GYRO_Y_DR            0
GYRO_Z_mean          0
GYRO_Z_var           0
GYRO_Z_skew          0
GYRO_Z_kurtosis      0
GYRO_Z_std           0
GYRO_Z_max           0
GYRO_Z_DR            0
Lat        

In [20]:
df_sampled_two_sec = df_sampled_two_sec.reset_index(drop=True)
df_sampled_two_sec.shape

(1079, 50)

## 4.2.1.Drop Columns

In [21]:
df_sampled_two_sec = df_sampled_two_sec.drop(['Two_Sec_window'], axis=1)

df_sampled_two_sec.columns

Index(['ACC_X_mean', 'ACC_X_var', 'ACC_X_skew', 'ACC_X_kurtosis', 'ACC_X_std',
       'ACC_X_max', 'ACC_X_DR', 'ACC_Y_mean', 'ACC_Y_var', 'ACC_Y_skew',
       'ACC_Y_kurtosis', 'ACC_Y_std', 'ACC_Y_max', 'ACC_Y_DR', 'ACC_Z_mean',
       'ACC_Z_var', 'ACC_Z_skew', 'ACC_Z_kurtosis', 'ACC_Z_std', 'ACC_Z_max',
       'ACC_Z_DR', 'GYRO_X_mean', 'GYRO_X_var', 'GYRO_X_skew',
       'GYRO_X_kurtosis', 'GYRO_X_std', 'GYRO_X_max', 'GYRO_X_DR',
       'GYRO_Y_mean', 'GYRO_Y_var', 'GYRO_Y_skew', 'GYRO_Y_kurtosis',
       'GYRO_Y_std', 'GYRO_Y_max', 'GYRO_Y_DR', 'GYRO_Z_mean', 'GYRO_Z_var',
       'GYRO_Z_skew', 'GYRO_Z_kurtosis', 'GYRO_Z_std', 'GYRO_Z_max',
       'GYRO_Z_DR', 'Lat', 'Lon', 'Alt', 'Speed', 'Accuracy', 'Bearing',
       'Road_Anomaly_type'],
      dtype='object')

## 4.2.2.Save sampled dataset into a csv file

In [22]:
df_sampled_two_sec.to_csv('Default sensors values and two seconds sampling window.csv')

## 4.2.3.Sample the data (Reducing None data)

In [23]:
print(df_sampled_two_sec["Road_Anomaly_type"].value_counts())

0    984
1     95
Name: Road_Anomaly_type, dtype: int64


In [24]:
df_sampled_two_sec_sampled = df_sampled_two_sec
i=0
j=0
df_sampled_two_sec_sampled['remove']=False
for name, values in df_sampled_two_sec_sampled['Road_Anomaly_type'].iteritems():
    if(values == False):
        j=j+1
    else:
        if(j>6):
            df_sampled_two_sec_sampled.loc[i-j+3:i-4,'remove'] = True
            j=0
        else:
            j=0
    i=i+1


In [25]:
df_sampled_two_sec_sampled = df_sampled_two_sec_sampled[df_sampled_two_sec_sampled.remove != True]
print(df_sampled_two_sec_sampled["Road_Anomaly_type"].value_counts())

0    512
1     95
Name: Road_Anomaly_type, dtype: int64


In [26]:
df_sampled_two_sec_sampled = df_sampled_two_sec_sampled.drop(['remove'], axis=1)

df_sampled_two_sec_sampled.columns

Index(['ACC_X_mean', 'ACC_X_var', 'ACC_X_skew', 'ACC_X_kurtosis', 'ACC_X_std',
       'ACC_X_max', 'ACC_X_DR', 'ACC_Y_mean', 'ACC_Y_var', 'ACC_Y_skew',
       'ACC_Y_kurtosis', 'ACC_Y_std', 'ACC_Y_max', 'ACC_Y_DR', 'ACC_Z_mean',
       'ACC_Z_var', 'ACC_Z_skew', 'ACC_Z_kurtosis', 'ACC_Z_std', 'ACC_Z_max',
       'ACC_Z_DR', 'GYRO_X_mean', 'GYRO_X_var', 'GYRO_X_skew',
       'GYRO_X_kurtosis', 'GYRO_X_std', 'GYRO_X_max', 'GYRO_X_DR',
       'GYRO_Y_mean', 'GYRO_Y_var', 'GYRO_Y_skew', 'GYRO_Y_kurtosis',
       'GYRO_Y_std', 'GYRO_Y_max', 'GYRO_Y_DR', 'GYRO_Z_mean', 'GYRO_Z_var',
       'GYRO_Z_skew', 'GYRO_Z_kurtosis', 'GYRO_Z_std', 'GYRO_Z_max',
       'GYRO_Z_DR', 'Lat', 'Lon', 'Alt', 'Speed', 'Accuracy', 'Bearing',
       'Road_Anomaly_type'],
      dtype='object')

In [27]:
df_sampled_two_sec_sampled.to_csv('Default sensors values and two seconds sampling window SAMPLED.csv')

## 4.3.Sample Data using (Three seconds window)

In [28]:
df_sampled_three_sec = df_sampled
df_sampled_three_sec['Three_Sec_window']=0
i=0
while i < df_sampled_three_sec['Road_Anomaly_type'].size:
    if (df_sampled_three_sec['seconds'].iloc[i] % 3) == 0:
        df_sampled_three_sec['Three_Sec_window'].iloc[i] = df_sampled_three_sec['seconds'].iloc[i]
    elif (df_sampled_three_sec['seconds'].iloc[i] % 3) == 1:
        df_sampled_three_sec['Three_Sec_window'].iloc[i] = df_sampled_three_sec['seconds'].iloc[i]-1
    elif (df_sampled_three_sec['seconds'].iloc[i] % 3) == 2:
        df_sampled_three_sec['Three_Sec_window'].iloc[i] = df_sampled_three_sec['seconds'].iloc[i]-2
    i=i+1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [29]:
df_sampled_three_sec = df_sampled_three_sec.groupby("Three_Sec_window",as_index=False).agg(
    ACC_X_mean=pd.NamedAgg(column="ACC_X", aggfunc=np.mean),
    ACC_X_var=pd.NamedAgg(column="ACC_X", aggfunc=np.var),
    ACC_X_skew=pd.NamedAgg(column="ACC_X", aggfunc=skew),
    ACC_X_kurtosis=pd.NamedAgg(column="ACC_X", aggfunc=kurtosis),
    ACC_X_std=pd.NamedAgg(column="ACC_X", aggfunc=np.std),
    ACC_X_max=pd.NamedAgg(column="ACC_X", aggfunc="max"),
    ACC_X_DR=pd.NamedAgg(column="ACC_X", aggfunc=lambda x: x.max() - x.min()),
    ACC_Y_mean=pd.NamedAgg(column="ACC_Y", aggfunc=np.mean),
    ACC_Y_var=pd.NamedAgg(column="ACC_Y", aggfunc=np.var),
    ACC_Y_skew=pd.NamedAgg(column="ACC_Y", aggfunc=skew),
    ACC_Y_kurtosis=pd.NamedAgg(column="ACC_Y", aggfunc=kurtosis),
    ACC_Y_std=pd.NamedAgg(column="ACC_Y", aggfunc=np.std),
    ACC_Y_max=pd.NamedAgg(column="ACC_Y", aggfunc="max"),
    ACC_Y_DR=pd.NamedAgg(column="ACC_Y", aggfunc=lambda x: x.max() - x.min()),
    ACC_Z_mean=pd.NamedAgg(column="ACC_Z", aggfunc=np.mean),
    ACC_Z_var=pd.NamedAgg(column="ACC_Z", aggfunc=np.var),
    ACC_Z_skew=pd.NamedAgg(column="ACC_Z", aggfunc=skew),
    ACC_Z_kurtosis=pd.NamedAgg(column="ACC_Z", aggfunc=kurtosis),
    ACC_Z_std=pd.NamedAgg(column="ACC_Z", aggfunc=np.std),
    ACC_Z_max=pd.NamedAgg(column="ACC_Z", aggfunc="max"),
    ACC_Z_DR=pd.NamedAgg(column="ACC_Z", aggfunc=lambda x: x.max() - x.min()),
    GYRO_X_mean=pd.NamedAgg(column="GYRO_X", aggfunc=np.mean),
    GYRO_X_var=pd.NamedAgg(column="GYRO_X", aggfunc=np.var),
    GYRO_X_skew=pd.NamedAgg(column="GYRO_X", aggfunc=skew),
    GYRO_X_kurtosis=pd.NamedAgg(column="GYRO_X", aggfunc=kurtosis),
    GYRO_X_std=pd.NamedAgg(column="GYRO_X", aggfunc=np.std),
    GYRO_X_max=pd.NamedAgg(column="GYRO_X", aggfunc="max"),
    GYRO_X_DR=pd.NamedAgg(column="GYRO_X", aggfunc=lambda x: x.max() - x.min()),
    GYRO_Y_mean=pd.NamedAgg(column="GYRO_Y", aggfunc=np.mean),
    GYRO_Y_var=pd.NamedAgg(column="GYRO_Y", aggfunc=np.var),
    GYRO_Y_skew=pd.NamedAgg(column="GYRO_Y", aggfunc=skew),
    GYRO_Y_kurtosis=pd.NamedAgg(column="GYRO_Y", aggfunc=kurtosis),
    GYRO_Y_std=pd.NamedAgg(column="GYRO_Y", aggfunc=np.std),
    GYRO_Y_max=pd.NamedAgg(column="GYRO_Y", aggfunc="max"),
    GYRO_Y_DR=pd.NamedAgg(column="GYRO_Y", aggfunc=lambda x: x.max() - x.min()),
    GYRO_Z_mean=pd.NamedAgg(column="GYRO_Z", aggfunc=np.mean),
    GYRO_Z_var=pd.NamedAgg(column="GYRO_Z", aggfunc=np.var),
    GYRO_Z_skew=pd.NamedAgg(column="GYRO_Z", aggfunc=skew),
    GYRO_Z_kurtosis=pd.NamedAgg(column="GYRO_Z", aggfunc=kurtosis),
    GYRO_Z_std=pd.NamedAgg(column="GYRO_Z", aggfunc=np.std),
    GYRO_Z_max=pd.NamedAgg(column="GYRO_Z", aggfunc="max"),
    GYRO_Z_DR=pd.NamedAgg(column="GYRO_Z", aggfunc=lambda x: x.max() - x.min()),
    Lat=pd.NamedAgg(column="Latitude", aggfunc="max"),
    Lon=pd.NamedAgg(column="Longitude", aggfunc="max"),
    Alt=pd.NamedAgg(column="Altitude", aggfunc="max"),
    Speed=pd.NamedAgg(column="Speed", aggfunc=np.mean),
    Accuracy=pd.NamedAgg(column="Accuracy", aggfunc=np.mean),
    Bearing=pd.NamedAgg(column="Bearing", aggfunc="max"),
    Road_Anomaly_type=pd.NamedAgg(column="Road_Anomaly_type", aggfunc="max"),
    )

In [30]:
print(df_sampled_three_sec["Road_Anomaly_type"].value_counts())

0    624
1     95
Name: Road_Anomaly_type, dtype: int64


In [31]:
df_sampled_three_sec = df_sampled_three_sec.reset_index(drop=True)
df_sampled_three_sec.isnull().sum()

Three_Sec_window     0
ACC_X_mean           0
ACC_X_var            0
ACC_X_skew           0
ACC_X_kurtosis       0
ACC_X_std            0
ACC_X_max            0
ACC_X_DR             0
ACC_Y_mean           0
ACC_Y_var            0
ACC_Y_skew           0
ACC_Y_kurtosis       0
ACC_Y_std            0
ACC_Y_max            0
ACC_Y_DR             0
ACC_Z_mean           0
ACC_Z_var            0
ACC_Z_skew           0
ACC_Z_kurtosis       0
ACC_Z_std            0
ACC_Z_max            0
ACC_Z_DR             0
GYRO_X_mean          0
GYRO_X_var           0
GYRO_X_skew          0
GYRO_X_kurtosis      0
GYRO_X_std           0
GYRO_X_max           0
GYRO_X_DR            0
GYRO_Y_mean          0
GYRO_Y_var           0
GYRO_Y_skew          0
GYRO_Y_kurtosis      0
GYRO_Y_std           0
GYRO_Y_max           0
GYRO_Y_DR            0
GYRO_Z_mean          0
GYRO_Z_var           0
GYRO_Z_skew          0
GYRO_Z_kurtosis      0
GYRO_Z_std           0
GYRO_Z_max           0
GYRO_Z_DR            0
Lat        

In [32]:
df_sampled_three_sec.shape

(719, 50)

## 4.3.1.Drop Columns

In [33]:
df_sampled_three_sec = df_sampled_three_sec.drop(['Three_Sec_window'], axis=1)

df_sampled_three_sec.columns

Index(['ACC_X_mean', 'ACC_X_var', 'ACC_X_skew', 'ACC_X_kurtosis', 'ACC_X_std',
       'ACC_X_max', 'ACC_X_DR', 'ACC_Y_mean', 'ACC_Y_var', 'ACC_Y_skew',
       'ACC_Y_kurtosis', 'ACC_Y_std', 'ACC_Y_max', 'ACC_Y_DR', 'ACC_Z_mean',
       'ACC_Z_var', 'ACC_Z_skew', 'ACC_Z_kurtosis', 'ACC_Z_std', 'ACC_Z_max',
       'ACC_Z_DR', 'GYRO_X_mean', 'GYRO_X_var', 'GYRO_X_skew',
       'GYRO_X_kurtosis', 'GYRO_X_std', 'GYRO_X_max', 'GYRO_X_DR',
       'GYRO_Y_mean', 'GYRO_Y_var', 'GYRO_Y_skew', 'GYRO_Y_kurtosis',
       'GYRO_Y_std', 'GYRO_Y_max', 'GYRO_Y_DR', 'GYRO_Z_mean', 'GYRO_Z_var',
       'GYRO_Z_skew', 'GYRO_Z_kurtosis', 'GYRO_Z_std', 'GYRO_Z_max',
       'GYRO_Z_DR', 'Lat', 'Lon', 'Alt', 'Speed', 'Accuracy', 'Bearing',
       'Road_Anomaly_type'],
      dtype='object')

## 4.3.2.Save sampled dataset into a csv file

In [34]:
df_sampled_three_sec.to_csv('Default sensors values and three seconds sampling window.csv')

## 4.3.3.Sample the data (Reducing None data)

In [35]:
print(df_sampled_three_sec["Road_Anomaly_type"].value_counts())

0    624
1     95
Name: Road_Anomaly_type, dtype: int64


In [36]:
df_sampled_three_sec_sampled = df_sampled_three_sec
i=0
j=0
df_sampled_three_sec_sampled['remove']=False
for name, values in df_sampled_three_sec_sampled['Road_Anomaly_type'].iteritems():
    if(values == False):
        j=j+1
    else:
        if(j>6):
            df_sampled_three_sec_sampled.loc[i-j+3:i-4,'remove'] = True
            j=0
        else:
            j=0
    i=i+1


In [37]:
df_sampled_three_sec_sampled = df_sampled_three_sec_sampled[df_sampled_three_sec_sampled.remove != True]
print(df_sampled_three_sec_sampled["Road_Anomaly_type"].value_counts())

0    432
1     95
Name: Road_Anomaly_type, dtype: int64


In [38]:
df_sampled_three_sec_sampled = df_sampled_three_sec_sampled.drop(['remove'], axis=1)

df_sampled_three_sec_sampled.columns

Index(['ACC_X_mean', 'ACC_X_var', 'ACC_X_skew', 'ACC_X_kurtosis', 'ACC_X_std',
       'ACC_X_max', 'ACC_X_DR', 'ACC_Y_mean', 'ACC_Y_var', 'ACC_Y_skew',
       'ACC_Y_kurtosis', 'ACC_Y_std', 'ACC_Y_max', 'ACC_Y_DR', 'ACC_Z_mean',
       'ACC_Z_var', 'ACC_Z_skew', 'ACC_Z_kurtosis', 'ACC_Z_std', 'ACC_Z_max',
       'ACC_Z_DR', 'GYRO_X_mean', 'GYRO_X_var', 'GYRO_X_skew',
       'GYRO_X_kurtosis', 'GYRO_X_std', 'GYRO_X_max', 'GYRO_X_DR',
       'GYRO_Y_mean', 'GYRO_Y_var', 'GYRO_Y_skew', 'GYRO_Y_kurtosis',
       'GYRO_Y_std', 'GYRO_Y_max', 'GYRO_Y_DR', 'GYRO_Z_mean', 'GYRO_Z_var',
       'GYRO_Z_skew', 'GYRO_Z_kurtosis', 'GYRO_Z_std', 'GYRO_Z_max',
       'GYRO_Z_DR', 'Lat', 'Lon', 'Alt', 'Speed', 'Accuracy', 'Bearing',
       'Road_Anomaly_type'],
      dtype='object')

In [39]:
df_sampled_three_sec_sampled.to_csv('Default sensors values and three seconds sampling window SAMPLED.csv')

## 4.4.Sample Data using (Four seconds window)

In [40]:
df_sampled_four_sec = df_sampled
df_sampled_four_sec['Four_Sec_window']=0
i=0
while i < df_sampled_four_sec['Road_Anomaly_type'].size:
    if (df_sampled_four_sec['seconds'].iloc[i] % 4) == 0:
        df_sampled_four_sec['Four_Sec_window'].iloc[i] = df_sampled_four_sec['seconds'].iloc[i]
    elif (df_sampled_four_sec['seconds'].iloc[i] % 4) == 1:
        df_sampled_four_sec['Four_Sec_window'].iloc[i] = df_sampled_four_sec['seconds'].iloc[i]-1
    elif (df_sampled_four_sec['seconds'].iloc[i] % 4) == 2:
        df_sampled_four_sec['Four_Sec_window'].iloc[i] = df_sampled_four_sec['seconds'].iloc[i]-2
    elif (df_sampled_four_sec['seconds'].iloc[i] % 4) == 3:
        df_sampled_four_sec['Four_Sec_window'].iloc[i] = df_sampled_four_sec['seconds'].iloc[i]-3
    i=i+1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [41]:
df_sampled_four_sec = df_sampled_four_sec.groupby("Four_Sec_window",as_index=False).agg(
    ACC_X_mean=pd.NamedAgg(column="ACC_X", aggfunc=np.mean),
    ACC_X_var=pd.NamedAgg(column="ACC_X", aggfunc=np.var),
    ACC_X_skew=pd.NamedAgg(column="ACC_X", aggfunc=skew),
    ACC_X_kurtosis=pd.NamedAgg(column="ACC_X", aggfunc=kurtosis),
    ACC_X_std=pd.NamedAgg(column="ACC_X", aggfunc=np.std),
    ACC_X_max=pd.NamedAgg(column="ACC_X", aggfunc="max"),
    ACC_X_DR=pd.NamedAgg(column="ACC_X", aggfunc=lambda x: x.max() - x.min()),
    ACC_Y_mean=pd.NamedAgg(column="ACC_Y", aggfunc=np.mean),
    ACC_Y_var=pd.NamedAgg(column="ACC_Y", aggfunc=np.var),
    ACC_Y_skew=pd.NamedAgg(column="ACC_Y", aggfunc=skew),
    ACC_Y_kurtosis=pd.NamedAgg(column="ACC_Y", aggfunc=kurtosis),
    ACC_Y_std=pd.NamedAgg(column="ACC_Y", aggfunc=np.std),
    ACC_Y_max=pd.NamedAgg(column="ACC_Y", aggfunc="max"),
    ACC_Y_DR=pd.NamedAgg(column="ACC_Y", aggfunc=lambda x: x.max() - x.min()),
    ACC_Z_mean=pd.NamedAgg(column="ACC_Z", aggfunc=np.mean),
    ACC_Z_var=pd.NamedAgg(column="ACC_Z", aggfunc=np.var),
    ACC_Z_skew=pd.NamedAgg(column="ACC_Z", aggfunc=skew),
    ACC_Z_kurtosis=pd.NamedAgg(column="ACC_Z", aggfunc=kurtosis),
    ACC_Z_std=pd.NamedAgg(column="ACC_Z", aggfunc=np.std),
    ACC_Z_max=pd.NamedAgg(column="ACC_Z", aggfunc="max"),
    ACC_Z_DR=pd.NamedAgg(column="ACC_Z", aggfunc=lambda x: x.max() - x.min()),
    GYRO_X_mean=pd.NamedAgg(column="GYRO_X", aggfunc=np.mean),
    GYRO_X_var=pd.NamedAgg(column="GYRO_X", aggfunc=np.var),
    GYRO_X_skew=pd.NamedAgg(column="GYRO_X", aggfunc=skew),
    GYRO_X_kurtosis=pd.NamedAgg(column="GYRO_X", aggfunc=kurtosis),
    GYRO_X_std=pd.NamedAgg(column="GYRO_X", aggfunc=np.std),
    GYRO_X_max=pd.NamedAgg(column="GYRO_X", aggfunc="max"),
    GYRO_X_DR=pd.NamedAgg(column="GYRO_X", aggfunc=lambda x: x.max() - x.min()),
    GYRO_Y_mean=pd.NamedAgg(column="GYRO_Y", aggfunc=np.mean),
    GYRO_Y_var=pd.NamedAgg(column="GYRO_Y", aggfunc=np.var),
    GYRO_Y_skew=pd.NamedAgg(column="GYRO_Y", aggfunc=skew),
    GYRO_Y_kurtosis=pd.NamedAgg(column="GYRO_Y", aggfunc=kurtosis),
    GYRO_Y_std=pd.NamedAgg(column="GYRO_Y", aggfunc=np.std),
    GYRO_Y_max=pd.NamedAgg(column="GYRO_Y", aggfunc="max"),
    GYRO_Y_DR=pd.NamedAgg(column="GYRO_Y", aggfunc=lambda x: x.max() - x.min()),
    GYRO_Z_mean=pd.NamedAgg(column="GYRO_Z", aggfunc=np.mean),
    GYRO_Z_var=pd.NamedAgg(column="GYRO_Z", aggfunc=np.var),
    GYRO_Z_skew=pd.NamedAgg(column="GYRO_Z", aggfunc=skew),
    GYRO_Z_kurtosis=pd.NamedAgg(column="GYRO_Z", aggfunc=kurtosis),
    GYRO_Z_std=pd.NamedAgg(column="GYRO_Z", aggfunc=np.std),
    GYRO_Z_max=pd.NamedAgg(column="GYRO_Z", aggfunc="max"),
    GYRO_Z_DR=pd.NamedAgg(column="GYRO_Z", aggfunc=lambda x: x.max() - x.min()),
    Lat=pd.NamedAgg(column="Latitude", aggfunc="max"),
    Lon=pd.NamedAgg(column="Longitude", aggfunc="max"),
    Alt=pd.NamedAgg(column="Altitude", aggfunc="max"),
    Speed=pd.NamedAgg(column="Speed", aggfunc=np.mean),
    Accuracy=pd.NamedAgg(column="Accuracy", aggfunc=np.mean),
    Bearing=pd.NamedAgg(column="Bearing", aggfunc="max"),
    Road_Anomaly_type=pd.NamedAgg(column="Road_Anomaly_type", aggfunc="max"),
    )

In [42]:
print(df_sampled_four_sec["Road_Anomaly_type"].value_counts())

0    445
1     95
Name: Road_Anomaly_type, dtype: int64


In [43]:
df_sampled_four_sec = df_sampled_four_sec.reset_index(drop=True)
df_sampled_four_sec.isnull().sum()

Four_Sec_window      0
ACC_X_mean           0
ACC_X_var            0
ACC_X_skew           0
ACC_X_kurtosis       0
ACC_X_std            0
ACC_X_max            0
ACC_X_DR             0
ACC_Y_mean           0
ACC_Y_var            0
ACC_Y_skew           0
ACC_Y_kurtosis       0
ACC_Y_std            0
ACC_Y_max            0
ACC_Y_DR             0
ACC_Z_mean           0
ACC_Z_var            0
ACC_Z_skew           0
ACC_Z_kurtosis       0
ACC_Z_std            0
ACC_Z_max            0
ACC_Z_DR             0
GYRO_X_mean          0
GYRO_X_var           0
GYRO_X_skew          0
GYRO_X_kurtosis      0
GYRO_X_std           0
GYRO_X_max           0
GYRO_X_DR            0
GYRO_Y_mean          0
GYRO_Y_var           0
GYRO_Y_skew          0
GYRO_Y_kurtosis      0
GYRO_Y_std           0
GYRO_Y_max           0
GYRO_Y_DR            0
GYRO_Z_mean          0
GYRO_Z_var           0
GYRO_Z_skew          0
GYRO_Z_kurtosis      0
GYRO_Z_std           0
GYRO_Z_max           0
GYRO_Z_DR            0
Lat        

In [44]:
df_sampled_four_sec.shape

(540, 50)

## 4.4.1.Drop Columns

In [45]:
df_sampled_four_sec = df_sampled_four_sec.drop(['Four_Sec_window'], axis=1)

df_sampled_four_sec.columns

Index(['ACC_X_mean', 'ACC_X_var', 'ACC_X_skew', 'ACC_X_kurtosis', 'ACC_X_std',
       'ACC_X_max', 'ACC_X_DR', 'ACC_Y_mean', 'ACC_Y_var', 'ACC_Y_skew',
       'ACC_Y_kurtosis', 'ACC_Y_std', 'ACC_Y_max', 'ACC_Y_DR', 'ACC_Z_mean',
       'ACC_Z_var', 'ACC_Z_skew', 'ACC_Z_kurtosis', 'ACC_Z_std', 'ACC_Z_max',
       'ACC_Z_DR', 'GYRO_X_mean', 'GYRO_X_var', 'GYRO_X_skew',
       'GYRO_X_kurtosis', 'GYRO_X_std', 'GYRO_X_max', 'GYRO_X_DR',
       'GYRO_Y_mean', 'GYRO_Y_var', 'GYRO_Y_skew', 'GYRO_Y_kurtosis',
       'GYRO_Y_std', 'GYRO_Y_max', 'GYRO_Y_DR', 'GYRO_Z_mean', 'GYRO_Z_var',
       'GYRO_Z_skew', 'GYRO_Z_kurtosis', 'GYRO_Z_std', 'GYRO_Z_max',
       'GYRO_Z_DR', 'Lat', 'Lon', 'Alt', 'Speed', 'Accuracy', 'Bearing',
       'Road_Anomaly_type'],
      dtype='object')

## 4.4.2.Save sampled dataset into a csv file

In [46]:
df_sampled_four_sec.to_csv('Default sensors values and four seconds sampling window.csv')

## 4.4.3.Sample the data (Reducing None data)

In [47]:
print(df_sampled_four_sec["Road_Anomaly_type"].value_counts())

0    445
1     95
Name: Road_Anomaly_type, dtype: int64


In [48]:
df_sampled_four_sec_sampled = df_sampled_four_sec
i=0
j=0
df_sampled_four_sec_sampled['remove']=False
for name, values in df_sampled_four_sec_sampled['Road_Anomaly_type'].iteritems():
    if(values == False):
        j=j+1
    else:
        if(j>6):
            df_sampled_four_sec_sampled.loc[i-j+3:i-4,'remove'] = True
            j=0
        else:
            j=0
    i=i+1


In [49]:
df_sampled_four_sec_sampled = df_sampled_four_sec_sampled[df_sampled_four_sec_sampled.remove != True]
print(df_sampled_four_sec_sampled["Road_Anomaly_type"].value_counts())

0    351
1     95
Name: Road_Anomaly_type, dtype: int64


In [50]:
df_sampled_four_sec_sampled = df_sampled_four_sec_sampled.drop(['remove'], axis=1)

df_sampled_four_sec_sampled.columns

Index(['ACC_X_mean', 'ACC_X_var', 'ACC_X_skew', 'ACC_X_kurtosis', 'ACC_X_std',
       'ACC_X_max', 'ACC_X_DR', 'ACC_Y_mean', 'ACC_Y_var', 'ACC_Y_skew',
       'ACC_Y_kurtosis', 'ACC_Y_std', 'ACC_Y_max', 'ACC_Y_DR', 'ACC_Z_mean',
       'ACC_Z_var', 'ACC_Z_skew', 'ACC_Z_kurtosis', 'ACC_Z_std', 'ACC_Z_max',
       'ACC_Z_DR', 'GYRO_X_mean', 'GYRO_X_var', 'GYRO_X_skew',
       'GYRO_X_kurtosis', 'GYRO_X_std', 'GYRO_X_max', 'GYRO_X_DR',
       'GYRO_Y_mean', 'GYRO_Y_var', 'GYRO_Y_skew', 'GYRO_Y_kurtosis',
       'GYRO_Y_std', 'GYRO_Y_max', 'GYRO_Y_DR', 'GYRO_Z_mean', 'GYRO_Z_var',
       'GYRO_Z_skew', 'GYRO_Z_kurtosis', 'GYRO_Z_std', 'GYRO_Z_max',
       'GYRO_Z_DR', 'Lat', 'Lon', 'Alt', 'Speed', 'Accuracy', 'Bearing',
       'Road_Anomaly_type'],
      dtype='object')

In [51]:
df_sampled_four_sec_sampled.to_csv('Default sensors values and four seconds sampling window SAMPLED.csv')

## 5.Sample Data using (Location)

Five statistical features (mean, variance, skewness, kurtosis, standard deviation, max) are generated for each sensors value (ACC_X	ACC_Y	ACC_Z	GYRO_X	GYRO_Y	GYRO_Z).

In [52]:
df_sampled_location = df

In [53]:
df_sampled_location['Location']=df_sampled_location['Latitude'].map(str)+"/"+df_sampled_location["Longitude"].map(str)


In [54]:
df_sampled_location = df_sampled_location.groupby("Location",as_index=False).agg(
    ACC_X_mean=pd.NamedAgg(column="ACC_X", aggfunc=np.mean),
    ACC_X_var=pd.NamedAgg(column="ACC_X", aggfunc=np.var),
    ACC_X_skew=pd.NamedAgg(column="ACC_X", aggfunc=skew),
    ACC_X_kurtosis=pd.NamedAgg(column="ACC_X", aggfunc=kurtosis),
    ACC_X_std=pd.NamedAgg(column="ACC_X", aggfunc=np.std),
    ACC_X_max=pd.NamedAgg(column="ACC_X", aggfunc="max"),
    ACC_Y_mean=pd.NamedAgg(column="ACC_Y", aggfunc=np.mean),
    ACC_Y_var=pd.NamedAgg(column="ACC_Y", aggfunc=np.var),
    ACC_Y_skew=pd.NamedAgg(column="ACC_Y", aggfunc=skew),
    ACC_Y_kurtosis=pd.NamedAgg(column="ACC_Y", aggfunc=kurtosis),
    ACC_Y_std=pd.NamedAgg(column="ACC_Y", aggfunc=np.std),
    ACC_Y_max=pd.NamedAgg(column="ACC_Y", aggfunc="max"),
    ACC_Z_mean=pd.NamedAgg(column="ACC_Z", aggfunc=np.mean),
    ACC_Z_var=pd.NamedAgg(column="ACC_Z", aggfunc=np.var),
    ACC_Z_skew=pd.NamedAgg(column="ACC_Z", aggfunc=skew),
    ACC_Z_kurtosis=pd.NamedAgg(column="ACC_Z", aggfunc=kurtosis),
    ACC_Z_std=pd.NamedAgg(column="ACC_Z", aggfunc=np.std),
    ACC_Z_max=pd.NamedAgg(column="ACC_Z", aggfunc="max"),
    GYRO_X_mean=pd.NamedAgg(column="GYRO_X", aggfunc=np.mean),
    GYRO_X_var=pd.NamedAgg(column="GYRO_X", aggfunc=np.var),
    GYRO_X_skew=pd.NamedAgg(column="GYRO_X", aggfunc=skew),
    GYRO_X_kurtosis=pd.NamedAgg(column="GYRO_X", aggfunc=kurtosis),
    GYRO_X_std=pd.NamedAgg(column="GYRO_X", aggfunc=np.std),
    GYRO_X_max=pd.NamedAgg(column="GYRO_X", aggfunc="max"),
    GYRO_Y_mean=pd.NamedAgg(column="GYRO_Y", aggfunc=np.mean),
    GYRO_Y_var=pd.NamedAgg(column="GYRO_Y", aggfunc=np.var),
    GYRO_Y_skew=pd.NamedAgg(column="GYRO_Y", aggfunc=skew),
    GYRO_Y_kurtosis=pd.NamedAgg(column="GYRO_Y", aggfunc=kurtosis),
    GYRO_Y_std=pd.NamedAgg(column="GYRO_Y", aggfunc=np.std),
    GYRO_Y_max=pd.NamedAgg(column="GYRO_Y", aggfunc="max"),
    GYRO_Z_mean=pd.NamedAgg(column="GYRO_Z", aggfunc=np.mean),
    GYRO_Z_var=pd.NamedAgg(column="GYRO_Z", aggfunc=np.var),
    GYRO_Z_skew=pd.NamedAgg(column="GYRO_Z", aggfunc=skew),
    GYRO_Z_kurtosis=pd.NamedAgg(column="GYRO_Z", aggfunc=kurtosis),
    GYRO_Z_std=pd.NamedAgg(column="GYRO_Z", aggfunc=np.std),
    GYRO_Z_max=pd.NamedAgg(column="GYRO_Z", aggfunc="max"),
    Lat=pd.NamedAgg(column="Latitude", aggfunc="max"),
    Lon=pd.NamedAgg(column="Longitude", aggfunc="max"),
    Alt=pd.NamedAgg(column="Altitude", aggfunc="max"),
    Speed=pd.NamedAgg(column="Speed", aggfunc=np.mean),
    Accuracy=pd.NamedAgg(column="Accuracy", aggfunc=np.mean),
    Bearing=pd.NamedAgg(column="Bearing", aggfunc="max"),
    Road_Anomaly_type=pd.NamedAgg(column="Road_Anomaly_type", aggfunc="max"),
    
)

## 5.1. Drop Null values

In [55]:
df_sampled_location.isnull().sum()

Location             0
ACC_X_mean           0
ACC_X_var            1
ACC_X_skew           0
ACC_X_kurtosis       0
ACC_X_std            1
ACC_X_max            0
ACC_Y_mean           0
ACC_Y_var            1
ACC_Y_skew           0
ACC_Y_kurtosis       0
ACC_Y_std            1
ACC_Y_max            0
ACC_Z_mean           0
ACC_Z_var            1
ACC_Z_skew           0
ACC_Z_kurtosis       0
ACC_Z_std            1
ACC_Z_max            0
GYRO_X_mean          0
GYRO_X_var           1
GYRO_X_skew          0
GYRO_X_kurtosis      0
GYRO_X_std           1
GYRO_X_max           0
GYRO_Y_mean          0
GYRO_Y_var           1
GYRO_Y_skew          0
GYRO_Y_kurtosis      0
GYRO_Y_std           1
GYRO_Y_max           0
GYRO_Z_mean          0
GYRO_Z_var           1
GYRO_Z_skew          0
GYRO_Z_kurtosis      0
GYRO_Z_std           1
GYRO_Z_max           0
Lat                  0
Lon                  0
Alt                  0
Speed                0
Accuracy             0
Bearing              0
Road_Anomal

In [56]:
df_sampled_location = df_sampled_location.dropna()
df_sampled_location = df_sampled_location.reset_index(drop=True)
df_sampled_location.shape

(261, 44)

In [57]:
df_sampled_location['Road_Anomaly_type'].value_counts()

0    177
1     84
Name: Road_Anomaly_type, dtype: int64

In [58]:
df_sampled_location = df_sampled_location.drop(['Location'], axis=1)

df_sampled_location.columns

Index(['ACC_X_mean', 'ACC_X_var', 'ACC_X_skew', 'ACC_X_kurtosis', 'ACC_X_std',
       'ACC_X_max', 'ACC_Y_mean', 'ACC_Y_var', 'ACC_Y_skew', 'ACC_Y_kurtosis',
       'ACC_Y_std', 'ACC_Y_max', 'ACC_Z_mean', 'ACC_Z_var', 'ACC_Z_skew',
       'ACC_Z_kurtosis', 'ACC_Z_std', 'ACC_Z_max', 'GYRO_X_mean', 'GYRO_X_var',
       'GYRO_X_skew', 'GYRO_X_kurtosis', 'GYRO_X_std', 'GYRO_X_max',
       'GYRO_Y_mean', 'GYRO_Y_var', 'GYRO_Y_skew', 'GYRO_Y_kurtosis',
       'GYRO_Y_std', 'GYRO_Y_max', 'GYRO_Z_mean', 'GYRO_Z_var', 'GYRO_Z_skew',
       'GYRO_Z_kurtosis', 'GYRO_Z_std', 'GYRO_Z_max', 'Lat', 'Lon', 'Alt',
       'Speed', 'Accuracy', 'Bearing', 'Road_Anomaly_type'],
      dtype='object')

## 5.2.Save sampled dataset into a csv file

In [59]:
df_sampled_location.to_csv('Default sensors values and sampling by location.csv')