## Read in combined cleaned dataset

- Created intermediate notebook just to avoid reading in the csv every time feature adjustments are made
- However, git cannot handle big data so it is currently saved into an external folder

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.utils import resample

In [2]:
# Load the combined, cleaned dataset from the pickle kept in the external data
# folder (outside this repository).
df = pd.read_pickle(
    "../grab-ai-safety-data/df_full.pickle"
)

## Feature Extraction

- TODO: verify that the Euclidean norm is a valid way to combine the x/y/z components of the acceleration and gyro readings

In [3]:
# Collapse each sensor's x/y/z components into one magnitude column
# (square root of the sum of squares), stored under the sensor's own name.
for sensor in ("acceleration", "gyro"):
    x_axis, y_axis, z_axis = (df[sensor + "_" + axis] for axis in ("x", "y", "z"))
    df[sensor] = np.sqrt(x_axis ** 2 + y_axis ** 2 + z_axis ** 2)

In [4]:
# Row-to-row differences within each booking: elapsed seconds, absolute change in
# bearing, distance covered (speed * elapsed) and bearing change per second.
#
# NOTE(review): diff()/shift() compare each row with the one directly above it, so
# this assumes df is ordered by bookingid and then by second -- TODO confirm upstream.
# NOTE(review): if bearing is a compass angle in degrees, a 359 -> 1 step is counted
# as 358 rather than 2; confirm units before relying on change_in_bearing.

# True on the first row of every run of identical bookingid values.
mask = df["bookingid"] != df["bookingid"].shift(1)
df["duration_of_entry"] = df["second"].diff().fillna(0)
df["change_in_bearing"] = np.abs(df["bearing"].diff().fillna(0))
# The first row of a booking has no predecessor in the same trip, so discard the
# difference that was taken against the previous booking's last row.
df.loc[mask, ["duration_of_entry", "change_in_bearing"]] = 0
df["distance_covered"] = df["speed"] * df["duration_of_entry"]

# A zero duration (first row of a booking, or a repeated timestamp) makes the rate
# undefined. Plain division yields NaN for 0/0 but +/-inf for a non-zero bearing
# change, and a single inf would become the booking's max and break its std.
# Mask every zero-duration row to NaN so the aggregations skip it.
elapsed = df["duration_of_entry"]
df["change_in_bearing_per_sec"] = (df["change_in_bearing"] / elapsed).where(elapsed != 0)

In [5]:
# Per-booking summary statistics.
# NOTE(review): pandas may route these NumPy callables to its own groupby
# reductions (e.g. sample std rather than np.std's population std), and the
# generated column suffixes follow the callables' names -- confirm for the
# installed pandas/NumPy versions.

# Reductions shared by the sensor-magnitude and speed columns; the lambda is the
# 50th percentile.
spread_stats = [np.max, np.mean, lambda values: np.percentile(values, q=50), np.std]

# Insertion order fixes the column order of the aggregated frame.
agg_spec = {"second": [np.max], "distance_covered": [np.sum]}
for signal in ("acceleration", "gyro", "speed"):
    agg_spec[signal] = list(spread_stats)
agg_spec["change_in_bearing_per_sec"] = [np.max, np.std]
agg_spec["label"] = [np.max]

df_grouped = df.groupby("bookingid").agg(agg_spec)

# Flatten the (column, statistic) MultiIndex to "column_statistic", except for the
# label column, which keeps its plain name.
flat_names = []
for column, stat in df_grouped.columns:
    flat_names.append(column if column == "label" else column + "_" + stat)
df_grouped.columns = flat_names

## Resampling

- Leaving this out for now as it may affect the results
- Trying random forest sample weights instead

In [6]:
# Balance the two classes by sampling the label == 1 rows with replacement until
# there are as many of them as label == 0 rows, then stack both parts back into
# df_grouped.
# NOTE(review): the section notes say resampling is left out for now, yet this cell
# runs and overwrites df_grouped before it is pickled -- confirm this is intended.
df_majority = df_grouped[df_grouped["label"] == 0]
df_minority = df_grouped[df_grouped["label"] == 1]

df_minority_upsampled = resample(
    df_minority,
    replace=True,
    # len() is the row count. count()[0] looked up the first column's non-null count
    # by position on a label-indexed Series, which is deprecated in recent pandas and
    # would undercount if that column ever held NaN.
    n_samples=len(df_majority),
    random_state=42
)

df_grouped = pd.concat(
    [df_majority, df_minority_upsampled]
)

In [7]:
# Persist the per-booking feature table for later use.
with open('outputs/df_grouped.pickle', 'wb') as pickle_file:
    pickle.dump(df_grouped, file=pickle_file)