In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.utils import resample

In [3]:
df = pd.read_pickle("../../grab-ai-safety-data/df_full.pickle")

In [4]:
df

Unnamed: 0,bookingid,accuracy,bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,speed,label
12128415,0,12.0,143.298294,0.818112,-9.941461,-2.014999,-0.016245,-0.094040,0.070732,0.0,3.442991,0
12128501,0,8.0,143.298294,0.546405,-9.835590,-2.038925,-0.047092,-0.078874,0.043187,1.0,0.228454,0
12127972,0,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.015390,2.0,0.228454,0
12128337,0,8.0,143.298294,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,4.0,0.228454,0
12128677,0,8.0,143.298294,-0.608313,-9.539658,-1.794583,-0.007538,-0.023838,0.018068,5.0,0.228454,0
12128548,0,8.0,143.298294,-0.867758,-9.698615,-1.615439,0.022728,-0.012178,0.005982,6.0,0.228454,0
12128392,0,8.0,143.298294,-1.050790,-9.745270,-1.411771,0.027603,0.001841,0.000904,7.0,0.228454,0
12128160,0,8.0,143.298294,-0.721213,-9.960004,-1.202271,0.001864,-0.007702,0.014018,8.0,0.228454,0
12128140,0,8.0,143.298294,-0.346924,-9.532629,-1.204663,0.014962,-0.050033,0.025118,9.0,0.228454,0
12128636,0,8.0,143.298294,0.294586,-10.085315,-1.531100,0.004587,-0.077703,0.069096,10.0,0.228454,0


In [4]:
mask = df["bookingid"] != df["bookingid"].shift(1)

df["change_in_second"] = df["second"].diff().fillna(0)
df["change_in_bearing"] = np.abs(df["bearing"].diff().fillna(0))
df["change_in_speed"] = np.abs(df["speed"].diff().fillna(0))
df["change_in_acx"] = df["acceleration_x"].diff().fillna(0)
df["change_in_acy"] = df["acceleration_y"].diff().fillna(0)
df["change_in_acz"] = df["acceleration_z"].diff().fillna(0)
df["change_in_gyx"] = df["gyro_x"].diff().fillna(0)
df["change_in_gyy"] = df["gyro_y"].diff().fillna(0)
df["change_in_gyz"] = df["gyro_z"].diff().fillna(0)

df.loc[ mask, [
    "change_in_second", 
    "change_in_bearing",
    "change_in_speed",
    "change_in_acx",
    "change_in_acy",
    "change_in_acz",
    "change_in_gyx",
    "change_in_gyy",
    "change_in_gyz"
]] = 0

## Duration of trip
- Quite an obvious difference in trip durations after removing the outliers. Seems that shorter trips tend to be labelled safe.

In [5]:
df.groupby("bookingid").agg(
    {
        "second" : np.max,
        "label" : np.max
    }
).groupby("label").agg(
    {
    "second" : [lambda x: np.percentile(x, q=50), np.mean]
    }
)

Unnamed: 0_level_0,second,second
Unnamed: 0_level_1,<lambda>,mean
label,Unnamed: 1_level_2,Unnamed: 2_level_2
0,779.0,833.135876
1,1123.0,1173.834236


In [6]:
df_time = df.groupby("bookingid").agg(
    {
        "second" : np.max,
        "label" : np.max
    }
)

In [7]:
df_time.sort_values("second")

Unnamed: 0_level_0,second,label
bookingid,Unnamed: 1_level_1,Unnamed: 2_level_1
1185410973727,118.0,0
1228360646706,118.0,0
180388626478,118.0,0
34359738486,118.0,0
149,118.0,0
1374389534771,118.0,0
42949672982,118.0,0
146028888071,119.0,0
146028888115,119.0,0
575525617768,119.0,1


## Distance of trip
- There is a weird entry with a change in time at the last entry
- Looks like there are 5 trips with ridiculous trip durations

In [8]:
df_dist = df.copy()
df_dist["distance_covered"] = df_dist["change_in_second"] * df_dist["speed"]
df_dist.groupby("bookingid").agg(
    {
        "distance_covered" : np.sum,
        "label" : np.max
    }
).groupby("label").mean()

Unnamed: 0_level_0,distance_covered
label,Unnamed: 1_level_1
0,7083.79844
1,8931.852083


In [9]:
df_dist.loc[df_dist["bookingid"] == 1503238553722, :][list(df.columns)]["second"].max()

404.0

In [10]:
duration_check = df_dist.groupby("bookingid").agg(
    {
    "second" : np.max,
    "label" : np.max
    }
)

In [11]:
duration_check.loc[duration_check["label"] == 0, :] \
              .groupby("second") \
              .count().sort_values("second", ascending=False).reset_index()["second"][1]

3614.0

## Acceleration
- Let's try euclidean first to see if that works
    - Mean of bookings, mean of label: no significant difference ~1% higher only
    - Max of the bookings, mean of label: ~12% higher for dangerous trips

In [12]:
df_acc = df.copy()
df_acc["acceleration"] = np.sqrt(
    (df_acc["acceleration_x"] ** 2) + (df_acc["acceleration_y"] ** 2) + (df_acc["acceleration_z"] ** 2)
)
test = df_acc.groupby("bookingid").agg(
    {
        "acceleration" : [np.max, np.mean],
        "label" : [np.max]
    }
)
test.columns = test.columns.get_level_values(0)
test.groupby("label").agg("mean")

Unnamed: 0_level_0,acceleration,acceleration
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,13.80016,9.872149
1,15.523679,9.904714


## Gyro
- Same technique with gyro to see
    - Max of bookings, mean of label: ~47% higher for dangerous trips
    - Mean of bookings, mean of label: ~36% higher for dangerous trips

In [13]:
df_gyro = df.copy()
df_gyro["gyro"] = np.sqrt(
    (df_gyro["gyro_x"] ** 2) + (df_gyro["gyro_y"] ** 2) + (df_gyro["gyro_z"] ** 2)
)
test = df_gyro.groupby("bookingid").agg(
    {
        "gyro" : [np.max, np.mean],
        "label" : [np.max]
    }
)
test.columns = test.columns.get_level_values(0)
test.groupby("label").agg("mean")

Unnamed: 0_level_0,gyro,gyro
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.38042,0.118911
1,2.04559,0.158299


## Speed
- More variations for speed
    - Max of bookings, mean of label: 5% higher for dangerous trips
    - Mean of bookings, mean of label: 10% lower for dangerous trips

In [14]:
df_spd = df.copy()
test = df_spd.groupby("bookingid").agg(
    {
        "speed" : [np.max, np.mean],
        "label" : [np.max]
    }
)
test.columns = test.columns.get_level_values(0)
test.groupby("label").agg("mean")

Unnamed: 0_level_0,speed,speed
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,20.853004,8.606909
1,21.226834,7.882894


## Bearing
- Change in bearing?