## Only read in relevant X_train dataset

- Feature extraction only requires X_train; using it alone prevents any leakage
- Also, factor the processing steps into a few functions for reusability
- General process will be:
    - Read in raw feature data
    - Remove outliers
    - Extract features
    - Perform resampling if required
    - Export for model fitting

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.utils import resample

In [2]:
# Load the raw training features into the notebook-wide frame `df`.
# NOTE: unpickling can execute arbitrary code, so only load a trusted file.
x_train_path = "../../grab-ai-safety-data/X_train.pickle"
df = pd.read_pickle(x_train_path)

## Removing outliers
- Some initial analysis reveals some ridiculous outliers in the data
    - Some bookings have a max second (or duration) of 47 years
    - Also, use this step to truncate the readings of each booking
    - The reason: not every reading will be useful, so try trimming the ends of each booking


In [None]:
def create_window(feature_matrix):
    """Placeholder: not implemented yet.

    Accepts ``feature_matrix`` but does nothing with it and returns ``None``.
    TODO: presumably meant to hold the per-booking truncation described in
    this section -- confirm before relying on it.
    """

In [8]:
# Sanity check: aggregating "second" per booking (minimum and 90th
# percentile) should give one row per booking and two columns.
df.groupby("bookingid").agg(
    {"second": [np.min, lambda seconds: np.percentile(seconds, q=90)]}
).shape

(15985, 2)

In [None]:
# Per-booking window on "second": lower bound = minimum, upper bound = 90th
# percentile (drops the extreme tail of each booking).
#
# The aggregate has ONE ROW PER BOOKING (15985 rows in the shape check above),
# so its values cannot be assigned as two new columns of the per-reading
# frame: the lengths differ and pandas raises. Keep it as its own frame,
# `duration_bounds`, which the following cells flatten and left-merge back
# onto `df` by "bookingid".
duration_bounds = df.groupby("bookingid").agg(
    {
        "second": [
            np.min,
            lambda x: np.percentile(x, q=90),
        ]
    }
)

In [10]:
# Preview the frame (the notebook renders the cell's last expression).
df

Unnamed: 0,bookingid,accuracy,bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,speed,lowerbound,upperbound
12018093,1,4.000,236.000000,-0.840489,8.749026,-3.274607,0.037472,0.134862,-0.011083,1.0,4.937211,,
12018073,1,4.000,228.000000,-1.944456,8.989030,-2.978358,-0.024487,0.102224,-0.014870,2.0,5.544210,,
12017783,1,4.000,222.000000,-0.820675,9.282243,-2.126961,0.007522,0.089431,0.006004,3.0,6.564030,,
12017494,1,4.000,216.000000,-1.020284,9.382215,-2.618797,-0.019007,0.002409,-0.014382,4.0,7.358976,,
12017434,1,4.000,214.000000,-0.745468,9.504243,-1.587384,0.006283,0.055763,-0.006720,5.0,8.297936,,
12018061,1,4.000,212.000000,-0.485889,9.466089,-1.315164,0.001012,0.000541,0.010455,6.0,8.477888,,
12017901,1,4.000,212.000000,-1.118102,9.704149,-1.796915,-0.012619,-0.011938,-0.029374,7.0,7.977206,,
12017874,1,4.000,213.000000,-1.050183,9.387703,-1.199332,-0.035727,-0.013491,-0.022881,8.0,7.732447,,
12017442,1,4.000,211.000000,-1.055000,9.894306,-1.811855,-0.003787,0.002496,-0.024627,9.0,7.140307,,
12017715,1,4.000,210.000000,-0.526657,9.471271,-1.766509,0.008151,-0.006091,0.018448,10.0,6.565919,,


In [None]:
# Flatten tuple column labels into "<column>_<statistic>" strings; a "label"
# column, if present, keeps its bare name.
# NOTE(review): assumes `duration_bounds` carries two-level (column,
# statistic) labels from a groupby aggregation -- confirm.
duration_bounds.columns = [
    col[0] if col[0] == "label" else "_".join(col)
    for col in duration_bounds.columns
]

In [None]:
# Left-join the bounds onto the per-reading frame by "bookingid"; every
# existing row of `df` is kept and the bound columns are appended on the right.
df = df.merge(duration_bounds, how="left", on="bookingid")

In [None]:
# Keep only readings whose "second" lies inside the booking's
# [lower, upper] window, then drop the two helper bound columns again.
#
# The bounds are the two columns the merge appended last. Their flattened
# names depend on the pandas/numpy version (e.g. "second_amin" vs
# "second_min", "second_<lambda>" vs "second_<lambda_0>"), so take them by
# position instead of hard-coding one spelling.
lower_col, upper_col = df.columns[-2:]
if not all(str(name).startswith("second_") for name in (lower_col, upper_col)):
    # Fail loudly rather than filter on unrelated columns if the merge cell
    # has not been run.
    raise KeyError(
        "expected the last two columns to be the merged 'second_*' bounds, "
        "got {!r} and {!r}".format(lower_col, upper_col)
    )

within_window = (df["second"] >= df[lower_col]) & (df["second"] <= df[upper_col])

# `.loc` is label-based, so a positional slice such as `:-2` is rejected for
# string column labels; select every column label except the last two.
df = df.loc[within_window, df.columns[:-2]]

In [None]:
# Preview the frame after the window filter (the notebook renders the cell's last expression).
df

In [None]:
def extract_features(feature_matrix):
    """Placeholder: not implemented yet.

    Accepts ``feature_matrix`` but does nothing with it and returns ``None``.
    TODO: presumably meant to wrap the feature-engineering cells that follow
    -- confirm before relying on it.
    """

In [None]:
# Acceleration and gyro features: Euclidean magnitude of each 3-axis
# reading, sqrt(x^2 + y^2 + z^2), stored under the sensor's own name.

for sensor in ("acceleration", "gyro"):
    axis_x, axis_y, axis_z = (df[sensor + "_" + axis] for axis in "xyz")
    df[sensor] = np.sqrt((axis_x ** 2) + (axis_y ** 2) + (axis_z ** 2))

In [None]:
# Differenced series to get a measure of distance and a change in bearing.
# NOTE(review): assumes rows are ordered by "second" within each booking -- confirm.

readings_by_booking = df.groupby("bookingid", sort=False)

# Group-wise diff: the first reading of every booking has no predecessor
# (NaN -> 0), so no delta is ever taken across a booking boundary.
df["duration_of_entry"] = readings_by_booking["second"].diff().fillna(0)

# Bearing is circular: take the shorter way round so that 359 -> 1 counts as
# a 2-degree turn, not 358.
# NOTE(review): assumes bearing is in degrees (preview values are 210-236) -- confirm.
raw_turn = readings_by_booking["bearing"].diff().abs().fillna(0) % 360
df["change_in_bearing"] = np.minimum(raw_turn, 360 - raw_turn)

df["distance_covered"] = df["speed"] * df["duration_of_entry"]

# A turn rate is undefined when no time elapsed (first reading of a booking,
# repeated timestamps). Divide by NaN there so the result is NaN, which
# pandas aggregations skip, instead of inf, which would poison max/std.
elapsed = df["duration_of_entry"].replace(0, np.nan)
df["change_in_bearing_per_sec"] = df["change_in_bearing"] / elapsed

In [None]:
# Collapse the per-reading rows into one feature row per booking.
# NOTE(review): the X_train preview earlier in this notebook shows no "label"
# column -- confirm it exists before running this cell.

# max / mean / median / std, shared by the three signals below. The median
# stays a lambda so the generated column suffix is unchanged.
spread_stats = [np.max, np.mean, lambda values: np.percentile(values, q=50), np.std]

aggregations = {
    "second": [np.max],
    "distance_covered": [np.sum],
    "acceleration": spread_stats,
    "gyro": spread_stats,
    "speed": spread_stats,
    "change_in_bearing_per_sec": [np.max, np.std],
    "label": [np.max],
}
df_grouped = df.groupby("bookingid").agg(aggregations)

# Flatten the (column, statistic) header to "<column>_<statistic>"; the
# label column keeps its bare name.
df_grouped.columns = [
    col[0] if col[0] == "label" else "_".join(col) for col in df_grouped.columns
]

## Resampling

- Leaving resampling out for now, as it may affect the results
- Trying random forest sample weights instead

In [None]:
# Balance the classes by upsampling the minority class (label == 1), with
# replacement, to the size of the majority class (label == 0).
df_majority = df_grouped[df_grouped["label"] == 0]
df_minority = df_grouped[df_grouped["label"] == 1]

df_minority_upsampled = resample(
    df_minority,
    replace=True,
    # Row count of the majority class. `count()[0]` counted only the non-null
    # values of the first column and indexed a label-indexed Series by position.
    n_samples=len(df_majority),
    random_state=42,  # fixed seed so the resample is reproducible
)

df_grouped = pd.concat(
    [df_majority, df_minority_upsampled]
)

In [None]:
# Persist the per-booking feature table for the model-fitting step.
with open('outputs/df_grouped.pickle', 'wb') as sink:
    pickle.dump(df_grouped, sink)