In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import GroupShuffleSplit
from sklearn.utils import resample

In [2]:
# Load the full dataset from its pickle file. Later cells rely on it having a
# "bookingID" column and treat its last column as the target.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
df = pd.read_pickle("../../grab-ai-safety-data/df_full.pickle")

## Manual grouped data split
- It is not logical to use `train_test_split` on the raw feature matrix: the rows of each booking form a sequence with fairly consistent timings, and `train_test_split` would just randomly remove time stamps from each booking, leaving parts of the same booking in both sets.
    - A more manual approach is taken to counter this effect
    - Use GroupShuffleSplit to split data on booking ID
    - Then check that the class proportions are the same between the training and testing sets
- The split is done first to minimize data leakage, so that the resulting models are unbiased and usable in production.

In [8]:
# Split row positions into train/test so that every row of a given bookingID
# lands on the same side (20% of the bookings go to the test set).
# Only the first split is consumed via next(), so a single split is requested;
# with a fixed random_state the first split does not depend on n_splits.
train_idx, test_idx = next(
    GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42).split(
        df, groups=df["bookingID"]
    )
)

# Sanity check: no bookingID may appear in both splits, otherwise rows of the
# same booking would leak between training and testing.
assert np.intersect1d(
    df["bookingID"].iloc[train_idx].unique(),
    df["bookingID"].iloc[test_idx].unique(),
).size == 0, "bookingID leakage between train and test splits"

In [9]:
# Positional slicing: every column except the last is a feature, and the last
# column is the target. train_idx / test_idx are positional row indices.
X_train, y_train = df.iloc[train_idx, :-1], df.iloc[train_idx, -1]
X_test, y_test = df.iloc[test_idx, :-1], df.iloc[test_idx, -1]

In [11]:
# Number of training bookings per label: reduce to one row per bookingID
# (column-wise max), then count the bookings in each label group.
(
    pd.concat([X_train, y_train], axis="columns")
    .groupby("bookingID")
    .max()
    .groupby("label")
    .count()
)

Unnamed: 0_level_0,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,12004,12004,12004,12004,12004,12004,12004,12004,12004,12004
1,3981,3981,3981,3981,3981,3981,3981,3981,3981,3981


In [14]:
# Share of training bookings whose per-booking label is 1, computed from the
# split itself instead of from counts copied by hand out of a previous output,
# so the figure stays correct if the data or the split seed changes.
# NOTE(review): this equals n_label_1 / n_bookings only when "label" is binary
# 0/1 (the per-label count table shows exactly those two values) — confirm.
pd.concat([X_train, y_train], axis=1).groupby("bookingID")["label"].max().mean()

0.2490459806068189

In [15]:
# Number of test bookings per label: reduce to one row per bookingID
# (column-wise max), then count the bookings in each label group.
(
    pd.concat([X_test, y_test], axis="columns")
    .groupby("bookingID")
    .max()
    .groupby("label")
    .count()
)

Unnamed: 0_level_0,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2995,2995,2995,2995,2995,2995,2995,2995,2995,2995
1,1002,1002,1002,1002,1002,1002,1002,1002,1002,1002


In [16]:
# Share of test bookings whose per-booking label is 1, computed from the
# split itself instead of from counts copied by hand out of a previous output,
# so the figure stays correct if the data or the split seed changes.
# NOTE(review): this equals n_label_1 / n_bookings only when "label" is binary
# 0/1 (the per-label count table shows exactly those two values) — confirm.
pd.concat([X_test, y_test], axis=1).groupby("bookingID")["label"].max().mean()

0.250688016012009

In [17]:
# Persist the four split objects, one pickle file each, in the same order as
# before: X_train, y_train, X_test, y_test.
for pickle_path, split_obj in (
    ('../../grab-ai-safety-data/X_train.pickle', X_train),
    ('../../grab-ai-safety-data/y_train.pickle', y_train),
    ('../../grab-ai-safety-data/X_test.pickle', X_test),
    ('../../grab-ai-safety-data/y_test.pickle', y_test),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(split_obj, f)