# Reference
1. https://www.kaggle.com/rohanrao/tutorial-on-reading-large-datasets
1. https://www.kaggle.com/asobod11138/gsdc-neuralnet-keras (multi-threading)

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from glob import glob
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
import plotly.express as px
from multiprocessing import Pool
import multiprocessing as multi

from utils.gsdc_parser import ground_truth_file_open, derived_file_open, gnsslog_file_open, gnss_log_to_dataframes

split_charater = "\\"

# Set Path and Load Dataset

In [2]:
PATH = Path("../input/google-smartphone-decimeter-challenge")
train_df = pd.read_csv(PATH / "baseline_locations_train.csv")
test_df = pd.read_csv(PATH / "baseline_locations_test.csv")

In [3]:
print(train_df.shape)
train_df.head()

(131342, 7)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423575,-122.094091,-34.06,2020-05-14-US-MTV-1_Pixel4
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423578,-122.094101,-33.29,2020-05-14-US-MTV-1_Pixel4
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423573,-122.094111,-30.99,2020-05-14-US-MTV-1_Pixel4
3,2020-05-14-US-MTV-1,Pixel4,1273529466442,37.423583,-122.094121,-32.83,2020-05-14-US-MTV-1_Pixel4
4,2020-05-14-US-MTV-1,Pixel4,1273529467442,37.423579,-122.094114,-34.49,2020-05-14-US-MTV-1_Pixel4


In [4]:
print(test_df.shape)
test_df.head()

(91486, 7)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone
0,2020-05-15-US-MTV-1,Pixel4,1273608785432,37.416628,-122.082053,-30.69,2020-05-15-US-MTV-1_Pixel4
1,2020-05-15-US-MTV-1,Pixel4,1273608786432,37.416646,-122.08204,-31.76,2020-05-15-US-MTV-1_Pixel4
2,2020-05-15-US-MTV-1,Pixel4,1273608787432,37.416653,-122.082039,-31.65,2020-05-15-US-MTV-1_Pixel4
3,2020-05-15-US-MTV-1,Pixel4,1273608788432,37.416607,-122.082063,-31.52,2020-05-15-US-MTV-1_Pixel4
4,2020-05-15-US-MTV-1,Pixel4,1273608789432,37.416609,-122.082073,-28.95,2020-05-15-US-MTV-1_Pixel4


# Load All Data Function

## For Multi Processing Code (Temporary Dead Code)

In [5]:
path_list = list(glob(str(PATH / "train" / "*/*/*")))

get_ground_truth = lambda x:x.split(split_charater)[-1].find("ground_truth")>= 0
get_derived = lambda x:x.split(split_charater)[-1].find("derived.csv")>= 0
get_gnsslog = lambda x:x.split(split_charater)[-1].find("GnssLog.txt")>= 0

ground_truth_path_list = list(filter(get_ground_truth, path_list))
derived_path_list = list(filter(get_derived, path_list))
gnsslog_path_list = list(filter(get_gnsslog, path_list))

In [6]:
gnss_section_names = {'Raw','UncalAccel', 'UncalGyro', 'UncalMag', 'Fix', 'Status', 'OrientationDeg'}
section_names = {'GroundTruth', 'Derived', 'Raw','UncalAccel', 'UncalGyro', 'UncalMag', 'Fix', 'Status', 'OrientationDeg'}
_columns = ['latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM']

output = dict()
for section in section_names:
    output[section] = pd.DataFrame()

start_path = "train"

thread_num = multi.cpu_count()

In [7]:
with Pool(thread_num) as pool:
    imap = pool.imap(ground_truth_file_open, ground_truth_path_list)
    ground_truth = list(tqdm(imap, total=len(ground_truth_path_list), desc="load ground truth"))

load ground truth:   0%|          | 0/73 [00:00<?, ?it/s]

In [8]:
with Pool(thread_num) as pool:
    imap = pool.imap(derived_file_open, derived_path_list)
    derived = list(tqdm(imap, total=len(derived_path_list), desc="load derived data"))

load derived data:   0%|          | 0/73 [00:00<?, ?it/s]

In [9]:
with Pool(thread_num) as pool:
    imap = pool.imap(gnsslog_file_open, gnsslog_path_list)
    gnss_logs = list(tqdm(imap, total=len(gnsslog_path_list), desc="load gnss log"))

load gnss log:   0%|          | 0/73 [00:00<?, ?it/s]

## One Process Parser

In [10]:
df_truth = pd.concat(ground_truth)
print(df_truth.shape)
df_truth.head()

(131342, 11)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,timeSinceFirstFixSeconds,hDop,vDop,speedMps,courseDegree,t_latDeg,t_lngDeg,t_heightAboveWgs84EllipsoidM
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,551.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.21
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,552.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.21
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,553.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.21
3,2020-05-14-US-MTV-1,Pixel4,1273529466442,554.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.2
4,2020-05-14-US-MTV-1,Pixel4,1273529467442,555.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.2


In [11]:
df_derived = pd.concat(derived)
print(df_derived.shape)
df_derived.head()

(3834542, 20)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,constellationType,svid,signalType,receivedSvTimeInGpsNanos,xSatPosM,ySatPosM,zSatPosM,xSatVelMps,ySatVelMps,zSatVelMps,satClkBiasM,satClkDriftMps,rawPrM,rawPrUncM,isrbM,ionoDelayM,tropoDelayM
0,2020-05-14-US-MTV-1,Pixel4,1273529464442,3,24,GLO_G1,1273529463363061857,-25399010.0,-692512.2,-2280430.0,-325.826,156.04,3559.757,-468.084,0.001,23794980.0,11.992,1134.758,10.866,16.647
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,6,13,GAL_E1,1273529463363970742,-5199894.0,-17419270.0,23361280.0,2239.305,700.815,1022.014,120171.076,0.0,23522510.0,1.799,-222.675,3.946,2.717
2,2020-05-14-US-MTV-1,Pixel4,1273529464442,1,5,GPS_L1,1273529463365539137,-2179863.0,-26154880.0,-3437694.0,325.541,-419.725,3129.012,-3793.067,-0.001,23052310.0,4.197,0.0,7.554,5.704
3,2020-05-14-US-MTV-1,Pixel4,1273529464442,6,15,GAL_E1,1273529463352380595,16354690.0,-10478330.0,22344600.0,2172.409,561.971,-1326.001,259937.286,-0.0,26997150.0,8.094,-222.675,7.968,9.151
4,2020-05-14-US-MTV-1,Pixel4,1273529464442,6,21,GAL_E1,1273529463364075083,-14840420.0,-21109600.0,14496320.0,-426.094,-1395.675,-2467.793,-179555.991,-0.001,23491230.0,2.698,-222.675,3.834,2.544


In [12]:
gnss_dict = {}
for key in gnss_section_names:
    list_temp = []
    for gnss_log in gnss_logs:
        list_temp.append(gnss_log[key])
    df_temp = pd.concat(list_temp)
    gnss_dict[key] = df_temp
    if (key == "Status") or (key == "Fix"):  
        gnss_dict[key].rename(columns = {'UnixTimeMillis':'utcTimeMillis'}, inplace = True)
    gnss_dict[key]["millisSinceGpsEpoch"] = gnss_dict[key]["utcTimeMillis"].astype(int) - 315964800000
    print(gnss_dict[key].shape)
    print(gnss_dict[key].head())

(0, 14)
Empty DataFrame
Columns: [Provider, LatitudeDegrees, LongitudeDegrees, AltitudeMeters, SpeedMps, AccuracyMeters, BearingDegrees, utcTimeMillis, SpeedAccuracyMps, BearingAccuracyDegrees, collectionName, phoneName, elapsedRealtimeNanos, millisSinceGpsEpoch]
Index: []
(20534266, 11)
   utcTimeMillis  elapsedRealtimeNanos  UncalGyroXRadPerSec  \
0  1589494244844        19637939874074            -0.003550   
1  1589494244847        19637942316782            -0.003370   
2  1589494244849        19637944759491            -0.004533   
3  1589494244852        19637947202199            -0.008254   
4  1589494244855        19637949644907            -0.008272   

   UncalGyroYRadPerSec  UncalGyroZRadPerSec       collectionName phoneName  \
0             0.023597            -0.082773  2020-05-14-US-MTV-1    Pixel4   
1             0.030998            -0.080377  2020-05-14-US-MTV-1    Pixel4   
2             0.028622            -0.074239  2020-05-14-US-MTV-1    Pixel4   
3             0.0176

In [14]:
def merge_data(df_, truth, derived, gnss):
    df = df_.copy()

    df = pd.merge_asof(df.sort_values('millisSinceGpsEpoch'), 
          truth.sort_values('millisSinceGpsEpoch'), 
          on="millisSinceGpsEpoch", by=["collectionName", "phoneName"], 
          direction='nearest',tolerance=100000)

    df = pd.merge_asof(df.sort_values('millisSinceGpsEpoch'), 
          derived.sort_values('millisSinceGpsEpoch'), 
          on="millisSinceGpsEpoch", by=["collectionName", "phoneName"], 
          direction='nearest',tolerance=100000)


    for key, value in gnss.items():
        if value.shape[0] == 0:
            continue
        df = pd.merge_asof(df.sort_values('millisSinceGpsEpoch'), 
              value.sort_values('millisSinceGpsEpoch'), 
              on="millisSinceGpsEpoch", by=["collectionName", "phoneName"], 
              direction='nearest',tolerance=100000, suffixes = ['', "_" + key])
        
    return df

# Save To Pickle File

In [15]:
output = merge_data(train_df, df_truth, df_derived, gnss_dict)
output.to_pickle(str(PATH / "gsdc_train.pkl.gzip"))

In [16]:
output = merge_data(test_df, df_truth, df_derived, gnss_dict)
output.to_pickle(str(PATH / "gsdc_test.pkl.gzip"))

In [17]:
%clear




# Load Pickle File

In [18]:
import numpy as np
import pandas as pd
from glob import glob
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
import plotly.express as px

In [19]:
PATH = Path("../input/google-smartphone-decimeter-challenge")

In [20]:
df_train = pd.read_pickle(str(PATH / "gsdc_train.pkl.gzip"))

In [21]:
print(df_train.shape)
df_train.head()

(131342, 110)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,timeSinceFirstFixSeconds,hDop,vDop,...,HasEphemerisData,BasebandCn0DbHz_y,utcTimeMillis_y,elapsedRealtimeNanos_y,UncalMagXMicroT,UncalMagYMicroT,UncalMagZMicroT,BiasXMicroT,BiasYMicroT,BiasZMicroT
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423575,-122.094091,-34.06,2020-05-14-US-MTV-1_Pixel4,551.44,1.1,0.0,...,,,,,,,,,,
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423578,-122.094101,-33.29,2020-05-14-US-MTV-1_Pixel4,552.44,1.1,0.0,...,,,,,,,,,,
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423573,-122.094111,-30.99,2020-05-14-US-MTV-1_Pixel4,553.44,1.1,0.0,...,,,,,,,,,,
3,2020-05-14-US-MTV-1,Pixel4,1273529466442,37.423583,-122.094121,-32.83,2020-05-14-US-MTV-1_Pixel4,554.44,1.1,0.0,...,,,,,,,,,,
4,2020-05-14-US-MTV-1,Pixel4XLModded,1273529466449,37.423574,-122.094137,-33.2,2020-05-14-US-MTV-1_Pixel4XLModded,554.45,1.2,0.0,...,,,,,,,,,,


In [22]:
df_test = pd.read_pickle(str(PATH / "gsdc_test.pkl.gzip"))

In [23]:
print(df_test.shape)
df_test.head()

(91486, 110)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,timeSinceFirstFixSeconds,hDop,vDop,...,HasEphemerisData,BasebandCn0DbHz_y,utcTimeMillis_y,elapsedRealtimeNanos_y,UncalMagXMicroT,UncalMagYMicroT,UncalMagZMicroT,BiasXMicroT,BiasYMicroT,BiasZMicroT
0,2020-05-15-US-MTV-1,Pixel4XL,1273608752446,37.416623,-122.082055,-24.21,2020-05-15-US-MTV-1_Pixel4XL,,,,...,,,,,,,,,,
1,2020-05-15-US-MTV-1,Pixel4XL,1273608753446,37.41659,-122.082073,-29.97,2020-05-15-US-MTV-1_Pixel4XL,,,,...,,,,,,,,,,
2,2020-05-15-US-MTV-1,Pixel4XL,1273608754446,37.416593,-122.082084,-29.85,2020-05-15-US-MTV-1_Pixel4XL,,,,...,,,,,,,,,,
3,2020-05-15-US-MTV-1,Pixel4XL,1273608755446,37.41658,-122.082067,-28.53,2020-05-15-US-MTV-1_Pixel4XL,,,,...,,,,,,,,,,
4,2020-05-15-US-MTV-1,Pixel4XL,1273608756446,37.416594,-122.082065,-27.7,2020-05-15-US-MTV-1_Pixel4XL,,,,...,,,,,,,,,,
