# References
1. https://www.kaggle.com/rohanrao/tutorial-on-reading-large-datasets
2. https://www.kaggle.com/asobod11138/gsdc-neuralnet-keras (multi-threading)

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from glob import glob
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
import plotly.express as px
from multiprocessing import Pool
import multiprocessing as multi

from utils.gsdc_parser import ground_truth_file_open, derived_file_open, gnsslog_file_open, gnss_log_to_dataframes

# Separator used to cut the file name out of the paths returned by glob.
# Taken from the running platform rather than hard-coding the Windows
# backslash, so the split also works where paths use "/".
split_charater = os.sep

# Set Path and Load Dataset

In [2]:
# Root folder of the competition data; both baseline tables are read from it.
PATH = Path("../input/google-smartphone-decimeter-challenge")

train_csv = PATH.joinpath("baseline_locations_train.csv")
test_csv = PATH.joinpath("baseline_locations_test.csv")

train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

In [3]:
# Print (rows, columns), then show the first five rows as the cell output.
print(train_df.shape)
train_df.head(n=5)

(131342, 7)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423575,-122.094091,-34.06,2020-05-14-US-MTV-1_Pixel4
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423578,-122.094101,-33.29,2020-05-14-US-MTV-1_Pixel4
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423573,-122.094111,-30.99,2020-05-14-US-MTV-1_Pixel4
3,2020-05-14-US-MTV-1,Pixel4,1273529466442,37.423583,-122.094121,-32.83,2020-05-14-US-MTV-1_Pixel4
4,2020-05-14-US-MTV-1,Pixel4,1273529467442,37.423579,-122.094114,-34.49,2020-05-14-US-MTV-1_Pixel4


In [4]:
# Print (rows, columns), then show the first five rows as the cell output.
print(test_df.shape)
test_df.head(n=5)

(91486, 7)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone
0,2020-05-15-US-MTV-1,Pixel4,1273608785432,37.416628,-122.082053,-30.69,2020-05-15-US-MTV-1_Pixel4
1,2020-05-15-US-MTV-1,Pixel4,1273608786432,37.416646,-122.08204,-31.76,2020-05-15-US-MTV-1_Pixel4
2,2020-05-15-US-MTV-1,Pixel4,1273608787432,37.416653,-122.082039,-31.65,2020-05-15-US-MTV-1_Pixel4
3,2020-05-15-US-MTV-1,Pixel4,1273608788432,37.416607,-122.082063,-31.52,2020-05-15-US-MTV-1_Pixel4
4,2020-05-15-US-MTV-1,Pixel4,1273608789432,37.416609,-122.082073,-28.95,2020-05-15-US-MTV-1_Pixel4


# Load All Data Function

## Multiprocessing Code (Temporarily Dead Code)

In [5]:
# Every entry three levels below train/ (the "*/*/*" pattern). glob gives no
# ordering guarantee, so the list is sorted to make runs reproducible.
path_list = sorted(glob(str(PATH / "train" / "*/*/*")))


def _file_name_contains(token):
    """Return a predicate that is True when the last path component contains `token`.

    os.path.basename is used instead of splitting on a fixed separator so the
    check looks at the file name only, on every platform.
    """
    return lambda path: token in os.path.basename(path)


# Predicates selecting the three kinds of input file by file name.
get_ground_truth = _file_name_contains("ground_truth")
get_derived = _file_name_contains("derived.csv")
get_gnsslog = _file_name_contains("GnssLog.txt")

ground_truth_path_list = [path for path in path_list if get_ground_truth(path)]
derived_path_list = [path for path in path_list if get_derived(path)]
gnsslog_path_list = [path for path in path_list if get_gnsslog(path)]

In [6]:
# Section labels looked up in each parsed GNSS log.
gnss_section_names = {
    'Raw', 'UncalAccel', 'UncalGyro', 'UncalMag',
    'Fix', 'Status', 'OrientationDeg',
}
# The same labels plus the two non-log sources.
section_names = {
    'GroundTruth', 'Derived',
    'Raw', 'UncalAccel', 'UncalGyro', 'UncalMag',
    'Fix', 'Status', 'OrientationDeg',
}

# Not referenced by the other cells visible in this notebook.
_columns = ['latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM']
start_path = "train"

# One worker per CPU reported by the OS.
thread_num = multi.cpu_count()

# One distinct, empty DataFrame per section name.
output = {section: pd.DataFrame() for section in section_names}

In [7]:
# Parse all ground-truth files in a worker pool; the results are collected
# into a list while tqdm reports progress.
with Pool(processes=thread_num) as workers:
    ground_truth = list(
        tqdm(
            workers.imap(ground_truth_file_open, ground_truth_path_list),
            total=len(ground_truth_path_list),
            desc="load ground truth",
        )
    )

load ground truth:   0%|          | 0/73 [00:00<?, ?it/s]

In [8]:
# Parse all derived files in a worker pool; the results are collected into a
# list while tqdm reports progress.
with Pool(processes=thread_num) as workers:
    derived = list(
        tqdm(
            workers.imap(derived_file_open, derived_path_list),
            total=len(derived_path_list),
            desc="load derived data",
        )
    )

load derived data:   0%|          | 0/73 [00:00<?, ?it/s]

In [None]:
# Parse all GNSS log files in a worker pool; the results are collected into a
# list while tqdm reports progress.
with Pool(processes=thread_num) as workers:
    gnss_logs = list(
        tqdm(
            workers.imap(gnsslog_file_open, gnsslog_path_list),
            total=len(gnsslog_path_list),
            desc="load gnss log",
        )
    )

load gnss log:   0%|          | 0/73 [00:00<?, ?it/s]

## Single-Process Parser

In [None]:
# Stack the per-file ground-truth frames row-wise, then report size and preview.
df_truth = pd.concat(ground_truth, axis=0)
print(df_truth.shape)
df_truth.head(n=5)

In [None]:
# Stack the per-file derived frames row-wise, then report size and preview.
df_derived = pd.concat(derived, axis=0)
print(df_derived.shape)
df_derived.head(n=5)

In [None]:
# Milliseconds between the Unix epoch (1970-01-01) and the GPS epoch (1980-01-06).
# NOTE(review): this offset does not account for the leap-second difference
# between GPS time and UTC — confirm whether the log timestamps need it.
GPS_EPOCH_OFFSET_MILLIS = 315964800000

gnss_dict = {}
# sorted(): iterating the set directly yields an order that can change between
# kernel sessions, which would change the order of the downstream merges (and
# therefore which columns receive a suffix).
for key in sorted(gnss_section_names):
    # Collect this section from every parsed log and stack the rows once.
    section_df = pd.concat([gnss_log[key] for gnss_log in gnss_logs])

    # "Status" and "Fix" name their timestamp column differently; align it
    # with the other sections (rename returns a new frame, nothing in place).
    if key in ("Status", "Fix"):
        section_df = section_df.rename(columns={'UnixTimeMillis': 'utcTimeMillis'})

    # A section that is empty and has no timestamp column has nothing to
    # convert; it is stored as-is (merge_data skips zero-row sections).
    if not (section_df.empty and "utcTimeMillis" not in section_df.columns):
        # Explicit int64: plain `int` can resolve to a 32-bit integer on some
        # platforms, which cannot hold millisecond timestamps.
        section_df["millisSinceGpsEpoch"] = (
            section_df["utcTimeMillis"].astype("int64") - GPS_EPOCH_OFFSET_MILLIS
        )

    gnss_dict[key] = section_df
    print(section_df.shape)
    print(section_df.head())

In [None]:
def merge_data(df_, truth, derived, gnss, tolerance=100000):
    """Attach ground-truth, derived and GNSS-log columns to a baseline table.

    Every source is joined with ``pd.merge_asof``: for each row of ``df_`` the
    source row with the nearest ``millisSinceGpsEpoch`` is taken, restricted
    to the same ``collectionName``/``phoneName`` pair and to timestamps at
    most ``tolerance`` away. Rows without such a partner get NaN in the
    joined columns.

    Parameters
    ----------
    df_ : pd.DataFrame
        Left table; must contain ``collectionName``, ``phoneName`` and
        ``millisSinceGpsEpoch``. It is not modified.
    truth, derived : pd.DataFrame
        Tables with the same three key columns. Columns whose names clash
        with the left table get pandas' default ``_x``/``_y`` suffixes.
    gnss : dict[str, pd.DataFrame]
        Section name -> table. Sections with zero rows are skipped; for the
        others, clashing right-hand columns get the suffix ``_<section>``.
        Sections are merged in the dict's iteration order.
    tolerance : int, default 100000
        Largest accepted timestamp distance, in the unit of
        ``millisSinceGpsEpoch``.

    Returns
    -------
    pd.DataFrame
        The joined table, ordered by ``millisSinceGpsEpoch``.
    """
    time_col = "millisSinceGpsEpoch"
    group_cols = ["collectionName", "phoneName"]

    def _nearest_join(left, right, suffixes=("_x", "_y")):
        # Single place for the as-of join settings shared by every source.
        return pd.merge_asof(
            left,
            right.sort_values(time_col),
            on=time_col,
            by=group_cols,
            direction='nearest',
            tolerance=tolerance,
            suffixes=suffixes,
        )

    # merge_asof needs the left side ordered by the key and returns rows in
    # that same order, so one stable sort up front is enough for all joins.
    # sort_values returns a new frame, leaving the caller's table untouched.
    merged = df_.sort_values(time_col, kind="mergesort")

    merged = _nearest_join(merged, truth)
    merged = _nearest_join(merged, derived)

    for key, value in gnss.items():
        if len(value) == 0:
            # Nothing to join for a section without rows.
            continue
        merged = _nearest_join(merged, value, suffixes=("", "_" + key))

    return merged

# Save To Pickle File

In [None]:
# Join every parsed source onto the training baseline and persist the result.
# NOTE(review): pandas infers pickle compression from the file suffix, and
# ".gzip" does not appear to be one of the recognised suffixes, so the file is
# probably written uncompressed — confirm before relying on the name.
train_pickle_path = str(PATH / "gsdc_train.pkl.gzip")
output = merge_data(train_df, df_truth, df_derived, gnss_dict)
output.to_pickle(train_pickle_path)

In [None]:
# Join the parsed sources onto the test baseline and persist the result.
# NOTE(review): df_truth, df_derived and gnss_dict are built from files found
# under the "train" folder, so test collections presumably find no matching
# rows and the joined columns end up NaN — confirm this is intended.
test_pickle_path = str(PATH / "gsdc_test.pkl.gzip")
output = merge_data(test_df, df_truth, df_derived, gnss_dict)
output.to_pickle(test_pickle_path)

In [None]:
# NOTE(review): %clear presumably only clears the terminal/output area and
# leaves kernel variables in place — use %reset if a clean namespace was intended.
%clear

# Load Pickle File

In [None]:
import numpy as np
import pandas as pd
from glob import glob
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
import plotly.express as px

In [None]:
# Dataset root, identical to the value assigned near the top of the notebook;
# presumably repeated so this section can be run from a fresh kernel.
PATH = Path("../input/google-smartphone-decimeter-challenge")

In [None]:
# Reload the merged training table written by the "Save To Pickle File" section.
df_train = pd.read_pickle(str(PATH / "gsdc_train.pkl.gzip"))

In [None]:
# Print (rows, columns), then show the first five rows as the cell output.
print(df_train.shape)
df_train.head(n=5)

In [None]:
# Reload the merged test table written by the "Save To Pickle File" section.
df_test = pd.read_pickle(str(PATH / "gsdc_test.pkl.gzip"))

In [None]:
# Print (rows, columns), then show the first five rows as the cell output.
print(df_test.shape)
df_test.head(n=5)