## Only read in relevant X_train dataset

- Feature extraction only requires X_train, to prevent any leakage
- Also, extract the processing steps into a few functions for reusability
- General process will be:
    - Read in raw feature data
    - Remove outliers
    - Extract features
    - Perform resampling if required
    - Export for model fitting

In [1]:
import pandas as pd
import numpy as np
import pickle
import dill as pickle

from sklearn.utils import resample

In [2]:
# Load the raw X_train feature matrix from its pickle file.
df = pd.read_pickle("../grab-ai-safety-data/X_train.pickle")

## Standardizing column names
- Making all column names lowercase

In [3]:
def clean_columns(feature_matrix):
    ''' Function lower-cases every column name of the feature matrix

    Args:
    A pandas dataframe whose column labels are strings

    Returns:
    The same dataframe object that was passed in, with its column
    labels replaced in place by their lower-cased versions

    '''
    lowered_names = [name.lower() for name in feature_matrix.columns]
    feature_matrix.columns = lowered_names
    return feature_matrix

## Removing outliers
- Change column names to all lower case
- Some initial analysis reveals some ridiculous outliers in the data
    - Some bookings have a max second (or duration) of 47 years
    - Also, use this chance to perform some truncating of the data for each booking
    - Reason being, not every reading will be useful. Try getting rid of the ends.
- A pandas deprecation no longer allows multiple lambda functions in a single aggregate call, so the percentiles are defined as separate named functions
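- A 148 m/s speed reading is ridiculous: it amounts to ~530 km/h. Set threshold at 45 (note: the code below currently uses 60 in `max_speed`)
- Ignore accuracy for any entries above 20m? (note: the code below currently uses 50 in `max_accuracy`)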


In [4]:
def min_boundary(x):
    ''' Return the 1st percentile of the values in x '''
    lower_percentile = 1
    return np.percentile(x, lower_percentile)
def max_boundary(x):
    ''' Return the 95th percentile of the values in x '''
    upper_percentile = 95
    return np.percentile(x, upper_percentile)
def min_speed(x):
    ''' Return the lower speed bound, 0, whatever the values in x

    x is unused; it is accepted so the function can be passed as an
    aggregation function (see create_windows).
    '''
    return 0
def max_speed(x):
    ''' Return the upper speed bound, 60, whatever the values in x

    x is unused; it is accepted so the function can be passed as an
    aggregation function (see create_windows).
    NOTE(review): presumably in the same unit as the "speed" column
    (m/s?) - confirm.
    '''
    return 60
def min_accuracy(x):
    ''' Return the lower accuracy bound, 0, whatever the values in x

    x is unused; it is accepted so the function can be passed as an
    aggregation function (see create_windows).
    '''
    return 0
def max_accuracy(x):
    ''' Return the upper accuracy bound, 50, whatever the values in x

    x is unused; it is accepted so the function can be passed as an
    aggregation function (see create_windows).
    NOTE(review): presumably in the same unit as the "accuracy" column
    (metres?) - confirm.
    '''
    return 50

def create_windows(df):
    ''' Function takes in a 2D feature matrix and keeps only the readings
    that fall inside per-booking bounds on duration, speed and accuracy

    For every bookingid the bounds come from the module-level helper
    functions: min_boundary / max_boundary applied to "second",
    min_speed / max_speed applied to "speed" and min_accuracy /
    max_accuracy applied to "accuracy". All bounds are inclusive.

    Args:
    A 2D pandas dataframe containing (at least) the columns "bookingid",
    "second", "speed" and "accuracy"

    Returns:
    Filtered feature matrix with the same columns as the input, holding
    only the rows that are within all three pairs of bounds. The input
    dataframe is left untouched; because the rows go through a merge on
    "bookingid", the result does not keep the input's index

    '''

    # Only the column names are needed at the end, so remember those
    # rather than copying the whole dataframe.
    original_columns = list(df.columns)

    bounds_df = df.groupby("bookingid").agg(
        {
            "second": [
                min_boundary,
                max_boundary
            ],
            "speed": [
                min_speed,
                max_speed
            ],
            "accuracy": [
                min_accuracy,
                max_accuracy
            ]
        }
    )

    # Flatten the (column, function name) pairs into single names such
    # as "second_min_boundary"; the filter below relies on these names.
    bounds_df.columns = ["_".join(x) for x in bounds_df.columns]

    # Attach each booking's bounds to every one of its readings
    df_filtered = pd.merge(
        df,
        bounds_df,
        how="left",
        on="bookingid"
    )

    within_bounds = (
        (df_filtered["second"] >= df_filtered["second_min_boundary"])
        & (df_filtered["second"] <= df_filtered["second_max_boundary"])
        & (df_filtered["speed"] >= df_filtered["speed_min_speed"])
        & (df_filtered["speed"] <= df_filtered["speed_max_speed"])
        & (df_filtered["accuracy"] >= df_filtered["accuracy_min_accuracy"])
        & (df_filtered["accuracy"] <= df_filtered["accuracy_max_accuracy"])
    )

    # Keep the surviving rows and drop the helper bound columns again
    df_filtered = df_filtered.loc[within_bounds, original_columns]

    return df_filtered

## Extracting features from raw data
- Combined acceleration and gyro terms using simple Pythagorean formula
    - Both acceleration and gyro may not be relevant in space
    - Data is probably manually labeled
- Change in bearing per second
- Arbitrary score: Total bearing change / distance per trip as an indication of how winding the route is

In [5]:
def extract_features(df):
    ''' Function takes in a 2D pandas dataframe of raw readings and
    extracts per-booking features

    The row-to-row differences assume that the rows of each booking are
    contiguous and in time order; the function does not sort the data.
    The intermediate feature columns ("acceleration", "gyro",
    "change_in_*", "distance_covered", ...) are added to the dataframe
    that is passed in.

    Args:
    A 2D pandas dataframe containing the columns "bookingid", "second",
    "speed", "bearing", "acceleration_x/y/z" and "gyro_x/y/z"

    Returns:
    A grouped version of the feature matrix by booking id with extra feature
    columns deemed to be relevant for predictive modelling purposes.
    Column names are "<column>_<aggregation name>", plus
    "change_in_bearing_per_m". Bookings for which any feature is NaN or
    infinite (e.g. zero total distance covered) are dropped

    '''

    # Acceleration and gyro features: magnitude of the 3-axis readings

    df["acceleration"] = np.sqrt(
        (df["acceleration_x"] ** 2) + (df["acceleration_y"] ** 2) + (df["acceleration_z"] ** 2)
    )

    df["gyro"] = np.sqrt(
        (df["gyro_x"] ** 2) + (df["gyro_y"] ** 2) + (df["gyro_z"] ** 2)
    )

    # Differenced series to get a measure of distance and a change in bearing

    # True on the first row of every run of rows sharing a bookingid
    mask = df["bookingid"] != df["bookingid"].shift(1)
    df["duration_of_entry"] = df["second"].diff().fillna(1)
    df["change_in_bearing"] = np.abs(df["bearing"].diff().fillna(0))
    df["change_in_speed"] = df["speed"].diff().fillna(0)
    # NOTE(review): np.abs is applied before the diff here (unlike
    # change_in_bearing), so these two changes are signed - confirm
    # that this is intended.
    df["change_in_acceleration"] = np.abs(df["acceleration"]).diff().fillna(0)
    df["change_in_gyro"] = np.abs(df["gyro"]).diff().fillna(0)

    # The diff on a booking's first row compares it with the last row of
    # the previous booking, so every differenced column is reset there.
    # This includes the acceleration and gyro changes, which would
    # otherwise leak a cross-booking jump into the per-booking maximum.
    df.loc[
        mask,
        [
            "change_in_bearing",
            "change_in_speed",
            "change_in_acceleration",
            "change_in_gyro",
        ],
    ] = 0
    df.loc[mask, ["duration_of_entry"]] = 1

    # Defining some features that could explain recklessness

    df["distance_covered"] = df["speed"] * df["duration_of_entry"]
    df["change_in_bearing_per_sec"] = df["change_in_bearing"] / df["duration_of_entry"]
    df["turning_speed"] = df["change_in_bearing"] * df["change_in_speed"]
    df["turning_acceleration"] = df["change_in_bearing"] * df["change_in_acceleration"]

    # Aggregating booking IDs
    # (the commented-out entries are candidate features that are
    # currently switched off)

    df_grouped = df.groupby("bookingid").agg(
        {
            "second" : [np.max],
            "distance_covered" : [np.sum],
            "acceleration" : [np.max, np.std, lambda x: np.percentile(x, q=50)],
            "gyro" : [np.max, lambda x: np.percentile(x, q=50)],
            # "acceleration_x" : [np.mean, np.max, np.min],
            # "acceleration_y" : [np.mean, np.max, np.min],
            # "acceleration_z" : [np.mean, np.max, np.min],
            # "gyro_x" : [np.mean, np.max, np.min],
            # "gyro_y" : [np.mean, np.max, np.min],
            # "gyro_z" : [np.mean, np.max, np.min],
            "change_in_acceleration" : [np.max],
            "change_in_gyro" : [np.max],
            "speed" : [np.max, lambda x: np.percentile(x, q=50), np.std],
            "change_in_speed" : [np.max],
            "change_in_bearing_per_sec" : [np.max],
            "change_in_bearing" : [np.sum, np.std],
            # "turning_speed" : [np.max, np.sum, lambda x: np.percentile(x, q=50)],
            # "turning_acceleration" : [np.max, np.sum, lambda x: np.percentile(x, q=50)]
        }
    )

    # Flatten the (column, aggregation name) pairs into single names
    df_grouped.columns = ["_".join(x) for x in df_grouped.columns]

    # Total bearing change per unit of distance: how winding the trip is
    df_grouped["change_in_bearing_per_m"] = (
        df_grouped["change_in_bearing_sum"] / df_grouped["distance_covered_sum"]
    )

    # Divisions by zero give +/-inf; treat them as missing and drop
    # every booking that has a missing feature
    df_grouped = df_grouped.replace(
        [np.inf, -np.inf],
        np.nan
    ).dropna()

    return df_grouped

In [6]:
# Run the full preprocessing chain on the raw data: lower-case the column
# names, filter the readings to the per-booking windows, then build the
# per-booking features.
df_grouped = extract_features(create_windows(clean_columns(df)))

In [7]:
# Export the transformed training features for model fitting.
with open("../grab-ai-safety-data/X_train_transformed.pickle", 'wb') as f:
    pickle.dump(df_grouped, f)

# Export every preprocessing function to its own file for reuse.
# `pickle` is bound to dill by the imports at the top of the notebook.
function_exports = (
    ("outputs/functions/clean_columns.pickle", clean_columns),
    ("outputs/functions/create_windows.pickle", create_windows),
    ("outputs/functions/extract_features.pickle", extract_features),
    ("outputs/functions/min_boundary.pickle", min_boundary),
    ("outputs/functions/max_boundary.pickle", max_boundary),
    ("outputs/functions/min_accuracy.pickle", min_accuracy),
    ("outputs/functions/max_accuracy.pickle", max_accuracy),
    ("outputs/functions/min_speed.pickle", min_speed),
    ("outputs/functions/max_speed.pickle", max_speed),
)
for export_path, exported_function in function_exports:
    with open(export_path, "wb") as handle:
        pickle.dump(exported_function, handle, protocol=pickle.HIGHEST_PROTOCOL)