# Reference
1. https://www.kaggle.com/rohanrao/tutorial-on-reading-large-datasets
1. https://www.kaggle.com/asobod11138/gsdc-neuralnet-keras (multi-threading)

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from glob import glob
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
import plotly.express as px
from multiprocessing import Pool
import multiprocessing as multi

from utils.gsdc_parser import ground_truth_file_open, derived_file_open, gnsslog_file_open, gnss_log_to_dataframes

split_charater = "\\"

# Set Path and Load Dataset

In [2]:
PATH = Path("../input/google-smartphone-decimeter-challenge")
train_df = pd.read_csv(PATH / "baseline_locations_train.csv")
test_df = pd.read_csv(PATH / "baseline_locations_test.csv")

In [3]:
print(train_df.shape)
train_df.head()

(131342, 7)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423575,-122.094091,-34.06,2020-05-14-US-MTV-1_Pixel4
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423578,-122.094101,-33.29,2020-05-14-US-MTV-1_Pixel4
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423573,-122.094111,-30.99,2020-05-14-US-MTV-1_Pixel4
3,2020-05-14-US-MTV-1,Pixel4,1273529466442,37.423583,-122.094121,-32.83,2020-05-14-US-MTV-1_Pixel4
4,2020-05-14-US-MTV-1,Pixel4,1273529467442,37.423579,-122.094114,-34.49,2020-05-14-US-MTV-1_Pixel4


In [4]:
print(test_df.shape)
test_df.head()

(91486, 7)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone
0,2020-05-15-US-MTV-1,Pixel4,1273608785432,37.416628,-122.082053,-30.69,2020-05-15-US-MTV-1_Pixel4
1,2020-05-15-US-MTV-1,Pixel4,1273608786432,37.416646,-122.08204,-31.76,2020-05-15-US-MTV-1_Pixel4
2,2020-05-15-US-MTV-1,Pixel4,1273608787432,37.416653,-122.082039,-31.65,2020-05-15-US-MTV-1_Pixel4
3,2020-05-15-US-MTV-1,Pixel4,1273608788432,37.416607,-122.082063,-31.52,2020-05-15-US-MTV-1_Pixel4
4,2020-05-15-US-MTV-1,Pixel4,1273608789432,37.416609,-122.082073,-28.95,2020-05-15-US-MTV-1_Pixel4


# Load All Data Function

In [5]:
gnss_section_names = {'Raw','UncalAccel', 'UncalGyro', 'UncalMag', 'Fix', 'Status', 'OrientationDeg'}
section_names = {'GroundTruth', 'Derived', 'Raw','UncalAccel', 'UncalGyro', 'UncalMag', 'Fix', 'Status', 'OrientationDeg'}
_columns = ['latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM']

In [6]:
thread_num = 8

In [7]:
def merge_data(df_, truth, derived, gnss):
    df = df_.copy()
    
    if truth is not None:
        df = pd.merge_asof(df.sort_values('millisSinceGpsEpoch'), 
              truth.sort_values('millisSinceGpsEpoch'), 
              on="millisSinceGpsEpoch", by=["collectionName", "phoneName"], 
              direction='nearest',tolerance=100000)

    df = pd.merge_asof(df.sort_values('millisSinceGpsEpoch'), 
          derived.sort_values('millisSinceGpsEpoch'), 
          on="millisSinceGpsEpoch", by=["collectionName", "phoneName"], 
          direction='nearest',tolerance=100000)


    for key, value in gnss.items():
        if value.shape[0] == 0:
            continue
        df = pd.merge_asof(df.sort_values('millisSinceGpsEpoch'), 
              value.sort_values('millisSinceGpsEpoch'), 
              on="millisSinceGpsEpoch", by=["collectionName", "phoneName"], 
              direction='nearest',tolerance=100000, suffixes = ['', "_" + key])
        
    return df

## Train
### For Multi Processing Code

In [8]:
path_list = list(glob(str(PATH / "train" / "*/*/*")))

get_ground_truth = lambda x:x.split(split_charater)[-1].find("ground_truth")>= 0
get_derived = lambda x:x.split(split_charater)[-1].find("derived.csv")>= 0
get_gnsslog = lambda x:x.split(split_charater)[-1].find("GnssLog.txt")>= 0

ground_truth_path_list = list(filter(get_ground_truth, path_list))
derived_path_list = list(filter(get_derived, path_list))
gnsslog_path_list = list(filter(get_gnsslog, path_list))

In [9]:
output = dict()
for section in section_names:
    output[section] = pd.DataFrame()

In [10]:
with Pool(thread_num) as pool:
    imap = pool.imap(gnsslog_file_open, gnsslog_path_list)
    gnss_logs = list(tqdm(imap, total=len(gnsslog_path_list), desc="load gnss log"))

load gnss log:   0%|          | 0/73 [00:00<?, ?it/s]

In [11]:
with Pool(thread_num) as pool:
    imap = pool.imap(ground_truth_file_open, ground_truth_path_list)
    ground_truth = list(tqdm(imap, total=len(ground_truth_path_list), desc="load ground truth"))

load ground truth:   0%|          | 0/73 [00:00<?, ?it/s]

In [12]:
with Pool(thread_num) as pool:
    imap = pool.imap(derived_file_open, derived_path_list)
    derived = list(tqdm(imap, total=len(derived_path_list), desc="load derived data"))

load derived data:   0%|          | 0/73 [00:00<?, ?it/s]

### One Process Parser

In [13]:
df_truth = pd.concat(ground_truth)
print(df_truth.shape)
df_truth.head()

(131342, 11)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,timeSinceFirstFixSeconds,hDop,vDop,speedMps,courseDegree,t_latDeg,t_lngDeg,t_heightAboveWgs84EllipsoidM
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,551.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.21
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,552.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.21
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,553.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.21
3,2020-05-14-US-MTV-1,Pixel4,1273529466442,554.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.2
4,2020-05-14-US-MTV-1,Pixel4,1273529467442,555.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.2


In [14]:
df_derived = pd.concat(derived)
print(df_derived.shape)
df_derived.head()

(3834542, 20)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,constellationType,svid,signalType,receivedSvTimeInGpsNanos,xSatPosM,ySatPosM,zSatPosM,xSatVelMps,ySatVelMps,zSatVelMps,satClkBiasM,satClkDriftMps,rawPrM,rawPrUncM,isrbM,ionoDelayM,tropoDelayM
0,2020-05-14-US-MTV-1,Pixel4,1273529464442,3,24,GLO_G1,1273529463363061857,-25399010.0,-692512.2,-2280430.0,-325.826,156.04,3559.757,-468.084,0.001,23794980.0,11.992,1134.758,10.866,16.647
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,6,13,GAL_E1,1273529463363970742,-5199894.0,-17419270.0,23361280.0,2239.305,700.815,1022.014,120171.076,0.0,23522510.0,1.799,-222.675,3.946,2.717
2,2020-05-14-US-MTV-1,Pixel4,1273529464442,1,5,GPS_L1,1273529463365539137,-2179863.0,-26154880.0,-3437694.0,325.541,-419.725,3129.012,-3793.067,-0.001,23052310.0,4.197,0.0,7.554,5.704
3,2020-05-14-US-MTV-1,Pixel4,1273529464442,6,15,GAL_E1,1273529463352380595,16354690.0,-10478330.0,22344600.0,2172.409,561.971,-1326.001,259937.286,-0.0,26997150.0,8.094,-222.675,7.968,9.151
4,2020-05-14-US-MTV-1,Pixel4,1273529464442,6,21,GAL_E1,1273529463364075083,-14840420.0,-21109600.0,14496320.0,-426.094,-1395.675,-2467.793,-179555.991,-0.001,23491230.0,2.698,-222.675,3.834,2.544


In [15]:
gnss_dict = {}
for key in gnss_section_names:
    list_temp = []
    for gnss_log in gnss_logs:
        list_temp.append(gnss_log[key])
    df_temp = pd.concat(list_temp)
    gnss_dict[key] = df_temp
    if (key == "Status") or (key == "Fix"):  
        gnss_dict[key].rename(columns = {'UnixTimeMillis':'utcTimeMillis'}, inplace = True)
    gnss_dict[key]["millisSinceGpsEpoch"] = gnss_dict[key]["utcTimeMillis"].astype('int64') - 315964800000
    print(gnss_dict[key].shape)
    print(gnss_dict[key].head())

(4386205, 39)
   utcTimeMillis       TimeNanos  LeapSecond  TimeUncertaintyNanos  \
0  1589494245442  21091250000000         NaN                   NaN   
1  1589494245442  21091250000000         NaN                   NaN   
2  1589494245442  21091250000000         NaN                   NaN   
3  1589494245442  21091250000000         NaN                   NaN   
4  1589494245442  21091250000000         NaN                   NaN   

         FullBiasNanos  BiasNanos  BiasUncertaintyNanos  DriftNanosPerSecond  \
0 -1273508372192433379  -0.173279             25.084462            -0.129289   
1 -1273508372192433379  -0.173279             25.084462            -0.129289   
2 -1273508372192433379  -0.173279             25.084462            -0.129289   
3 -1273508372192433379  -0.173279             25.084462            -0.129289   
4 -1273508372192433379  -0.173279             25.084462            -0.129289   

   DriftUncertaintyNanosPerSecond  HardwareClockDiscontinuityCount  ...  \
0        

(2825564, 8)
        collectionName phoneName  utcTimeMillis  elapsedRealtimeNanos  yawDeg  \
0  2021-03-10-US-SVL-1  Pixel4XL   1.615418e+12          1.035191e+13    61.0   
1  2021-03-10-US-SVL-1  Pixel4XL   1.615418e+12          1.035193e+13    61.0   
2  2021-03-10-US-SVL-1  Pixel4XL   1.615418e+12          1.035195e+13    61.0   
3  2021-03-10-US-SVL-1  Pixel4XL   1.615418e+12          1.035196e+13    61.0   
4  2021-03-10-US-SVL-1  Pixel4XL   1.615418e+12          1.035198e+13    61.0   

   rollDeg  pitchDeg  millisSinceGpsEpoch  
0   -165.0     -83.0        1299453148737  
1   -165.0     -83.0        1299453148753  
2   -165.0     -83.0        1299453148769  
3   -165.0     -83.0        1299453148785  
4   -165.0     -83.0        1299453148801  


### Save To Pickle File

In [16]:
output = merge_data(train_df, df_truth, df_derived, gnss_dict)
output.to_pickle(str(PATH / "gsdc_train.pkl.gzip"))
print(output.shape)
output.head()

(131342, 110)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,timeSinceFirstFixSeconds,hDop,vDop,...,UncalMagYMicroT,UncalMagZMicroT,BiasXMicroT,BiasYMicroT,BiasZMicroT,utcTimeMillis_OrientationDeg,elapsedRealtimeNanos_OrientationDeg,yawDeg,rollDeg,pitchDeg
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423575,-122.094091,-34.06,2020-05-14-US-MTV-1_Pixel4,551.44,1.1,0.0,...,-48.00559,-13.144198,,,,,,,,
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423578,-122.094101,-33.29,2020-05-14-US-MTV-1_Pixel4,552.44,1.1,0.0,...,-48.438065,-13.865798,,,,,,,,
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423573,-122.094111,-30.99,2020-05-14-US-MTV-1_Pixel4,553.44,1.1,0.0,...,-48.41085,-12.949378,,,,,,,,
3,2020-05-14-US-MTV-1,Pixel4,1273529466442,37.423583,-122.094121,-32.83,2020-05-14-US-MTV-1_Pixel4,554.44,1.1,0.0,...,-47.79542,-12.749175,,,,,,,,
4,2020-05-14-US-MTV-1,Pixel4XLModded,1273529466449,37.423574,-122.094137,-33.2,2020-05-14-US-MTV-1_Pixel4XLModded,554.45,1.2,0.0,...,-1.932192,-24.935806,,,,,,,,


In [17]:
output.info(verbose = True, memory_usage= True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 131342 entries, 0 to 131341
Data columns (total 110 columns):
 #   Column                                     Non-Null Count   Dtype  
---  ------                                     --------------   -----  
 0   collectionName                             131342 non-null  object 
 1   phoneName                                  131342 non-null  object 
 2   millisSinceGpsEpoch                        131342 non-null  int64  
 3   latDeg                                     131342 non-null  float64
 4   lngDeg                                     131342 non-null  float64
 5   heightAboveWgs84EllipsoidM                 131342 non-null  float64
 6   phone                                      131342 non-null  object 
 7   timeSinceFirstFixSeconds                   131342 non-null  float64
 8   hDop                                       131342 non-null  float64
 9   vDop                                       131342 non-null  float64
 10  speedMp

In [18]:
del df_truth, df_derived, gnss_dict

## Test
### For Multi Processing Code

In [19]:
path_list = list(glob(str(PATH / "test" / "*/*/*")))

get_ground_truth = lambda x:x.split(split_charater)[-1].find("ground_truth")>= 0
get_derived = lambda x:x.split(split_charater)[-1].find("derived.csv")>= 0
get_gnsslog = lambda x:x.split(split_charater)[-1].find("GnssLog.txt")>= 0

ground_truth_path_list = list(filter(get_ground_truth, path_list))
derived_path_list = list(filter(get_derived, path_list))
gnsslog_path_list = list(filter(get_gnsslog, path_list))

In [20]:
output = dict()
for section in section_names:
    output[section] = pd.DataFrame()

In [21]:
with Pool(thread_num) as pool:
    imap = pool.imap(gnsslog_file_open, gnsslog_path_list)
    gnss_logs = list(tqdm(imap, total=len(gnsslog_path_list), desc="load gnss log"))

load gnss log:   0%|          | 0/48 [00:00<?, ?it/s]

In [22]:
with Pool(thread_num) as pool:
    imap = pool.imap(ground_truth_file_open, ground_truth_path_list)
    ground_truth = list(tqdm(imap, total=len(ground_truth_path_list), desc="load ground truth"))

load ground truth: 0it [00:00, ?it/s]

In [23]:
with Pool(thread_num) as pool:
    imap = pool.imap(derived_file_open, derived_path_list)
    derived = list(tqdm(imap, total=len(derived_path_list), desc="load derived data"))

load derived data:   0%|          | 0/48 [00:00<?, ?it/s]

### One Process Parser

In [33]:
df_derived = pd.concat(derived)
print(df_derived.shape)
df_derived.head()

(2523199, 20)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,constellationType,svid,signalType,receivedSvTimeInGpsNanos,xSatPosM,ySatPosM,zSatPosM,xSatVelMps,ySatVelMps,zSatVelMps,satClkBiasM,satClkDriftMps,rawPrM,rawPrUncM,isrbM,ionoDelayM,tropoDelayM
0,2020-05-15-US-MTV-1,Pixel4,1273608786431,1,2,GPS_L1,1273608785359142537,-13558080.0,-22854820.0,1544018.0,299.739,-52.442,3161.066,-136834.371,-0.002,21764220.0,4.197,0.0,4.792,3.446
1,2020-05-15-US-MTV-1,Pixel4,1273608786431,6,11,GAL_E1,1273608785345898271,12751710.0,-10230810.0,24669670.0,1759.975,1745.212,-184.065,855531.658,0.068,25734750.0,10.493,-214.13,6.801,6.527
2,2020-05-15-US-MTV-1,Pixel4,1273608786431,1,6,GPS_L1,1273608785362136210,-5156902.0,-23814800.0,10564690.0,820.205,1068.285,2827.14,-81575.023,-0.002,20866740.0,3.897,0.0,3.945,2.845
3,2020-05-15-US-MTV-1,Pixel4,1273608786431,1,12,GPS_L1,1273608785356867050,-17452190.0,1887561.0,19645400.0,1365.684,-2152.439,1418.596,35245.362,-0.002,22446400.0,6.296,0.0,5.739,4.806
4,2020-05-15-US-MTV-1,Pixel4,1273608786431,6,2,GAL_E5A,1273608785354070569,-8979749.0,-19146110.0,20711280.0,494.969,-2020.888,-1652.319,40044.541,0.001,23284760.0,1.799,-2344.573,6.341,2.547


In [34]:
gnss_dict = {}
for key in gnss_section_names:
    list_temp = []
    for gnss_log in gnss_logs:
        list_temp.append(gnss_log[key])
    df_temp = pd.concat(list_temp)
    gnss_dict[key] = df_temp
    if (key == "Status") or (key == "Fix"):  
        gnss_dict[key].rename(columns = {'UnixTimeMillis':'utcTimeMillis'}, inplace = True)
    gnss_dict[key]["millisSinceGpsEpoch"] = gnss_dict[key]["utcTimeMillis"].astype('int64') - 315964800000
    print(gnss_dict[key].shape)
    print(gnss_dict[key].head())

(2956220, 39)
   utcTimeMillis      TimeNanos  LeapSecond  TimeUncertaintyNanos  \
0  1589573567431  8881264000000         NaN                   NaN   
1  1589573567431  8881264000000         NaN                   NaN   
2  1589573567431  8881264000000         NaN                   NaN   
3  1589573567431  8881264000000         NaN                   NaN   
4  1589573567431  8881264000000         NaN                   NaN   

         FullBiasNanos  BiasNanos  BiasUncertaintyNanos  DriftNanosPerSecond  \
0 -1273599904167740172   0.358889             32.053304             2.640615   
1 -1273599904167740172   0.358889             32.053304             2.640615   
2 -1273599904167740172   0.358889             32.053304             2.640615   
3 -1273599904167740172   0.358889             32.053304             2.640615   
4 -1273599904167740172   0.358889             32.053304             2.640615   

   DriftUncertaintyNanosPerSecond  HardwareClockDiscontinuityCount  ...  \
0              

(3583787, 8)
        collectionName     phoneName  utcTimeMillis  elapsedRealtimeNanos  \
0  2021-03-16-US-MTV-2  Pixel4Modded   1.615921e+12          7.160950e+12   
1  2021-03-16-US-MTV-2  Pixel4Modded   1.615921e+12          7.160966e+12   
2  2021-03-16-US-MTV-2  Pixel4Modded   1.615921e+12          7.160982e+12   
3  2021-03-16-US-MTV-2  Pixel4Modded   1.615921e+12          7.160998e+12   
4  2021-03-16-US-MTV-2  Pixel4Modded   1.615921e+12          7.161014e+12   

   yawDeg  rollDeg  pitchDeg  millisSinceGpsEpoch  
0    55.0    137.0     -82.0        1299956498428  
1    57.0    139.0     -82.0        1299956498444  
2    58.0    139.0     -82.0        1299956498460  
3    57.0    139.0     -82.0        1299956498476  
4    56.0    138.0     -82.0        1299956498492  


### Save To Pickle File

In [35]:
output = merge_data(test_df, None, df_derived, gnss_dict)
output.to_pickle(str(PATH / "gsdc_test.pkl.gzip"))
print(output.shape)
output.head()

(91486, 102)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,constellationType,svid,signalType,...,UncalMagYMicroT,UncalMagZMicroT,BiasXMicroT,BiasYMicroT,BiasZMicroT,utcTimeMillis_OrientationDeg,elapsedRealtimeNanos_OrientationDeg,yawDeg,rollDeg,pitchDeg
0,2020-05-15-US-MTV-1,Pixel4XL,1273608752446,37.416623,-122.082055,-24.21,2020-05-15-US-MTV-1_Pixel4XL,1,2,GPS_L1,...,-37.437897,-1.664517,,,,,,,,
1,2020-05-15-US-MTV-1,Pixel4XL,1273608753446,37.41659,-122.082073,-29.97,2020-05-15-US-MTV-1_Pixel4XL,6,36,GAL_E5A,...,-37.738075,-2.746611,,,,,,,,
2,2020-05-15-US-MTV-1,Pixel4XL,1273608754446,37.416593,-122.082084,-29.85,2020-05-15-US-MTV-1_Pixel4XL,6,36,GAL_E5A,...,-37.783287,-2.426145,,,,,,,,
3,2020-05-15-US-MTV-1,Pixel4XL,1273608755446,37.41658,-122.082067,-28.53,2020-05-15-US-MTV-1_Pixel4XL,1,19,GPS_L1,...,-37.108665,-2.881307,,,,,,,,
4,2020-05-15-US-MTV-1,Pixel4XL,1273608756446,37.416594,-122.082065,-27.7,2020-05-15-US-MTV-1_Pixel4XL,1,6,GPS_L1,...,-37.616684,-1.940271,,,,,,,,


In [36]:
output.info(verbose = True, memory_usage= True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91486 entries, 0 to 91485
Data columns (total 102 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   collectionName                             91486 non-null  object 
 1   phoneName                                  91486 non-null  object 
 2   millisSinceGpsEpoch                        91486 non-null  int64  
 3   latDeg                                     91486 non-null  float64
 4   lngDeg                                     91486 non-null  float64
 5   heightAboveWgs84EllipsoidM                 91486 non-null  float64
 6   phone                                      91486 non-null  object 
 7   constellationType                          91486 non-null  int64  
 8   svid                                       91486 non-null  int64  
 9   signalType                                 91486 non-null  object 
 10  receivedSvTimeInGpsNa

In [38]:
del df_derived, gnss_dict

In [39]:
%clear




# Load Pickle File

In [40]:
import numpy as np
import pandas as pd
from glob import glob
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
import plotly.express as px

In [41]:
PATH = Path("../input/google-smartphone-decimeter-challenge")

In [42]:
df_train = pd.read_pickle(str(PATH / "gsdc_train.pkl.gzip"))

In [43]:
print(df_train.shape)
df_train.head()

(131342, 110)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,timeSinceFirstFixSeconds,hDop,vDop,...,UncalMagYMicroT,UncalMagZMicroT,BiasXMicroT,BiasYMicroT,BiasZMicroT,utcTimeMillis_OrientationDeg,elapsedRealtimeNanos_OrientationDeg,yawDeg,rollDeg,pitchDeg
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423575,-122.094091,-34.06,2020-05-14-US-MTV-1_Pixel4,551.44,1.1,0.0,...,-48.00559,-13.144198,,,,,,,,
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423578,-122.094101,-33.29,2020-05-14-US-MTV-1_Pixel4,552.44,1.1,0.0,...,-48.438065,-13.865798,,,,,,,,
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423573,-122.094111,-30.99,2020-05-14-US-MTV-1_Pixel4,553.44,1.1,0.0,...,-48.41085,-12.949378,,,,,,,,
3,2020-05-14-US-MTV-1,Pixel4,1273529466442,37.423583,-122.094121,-32.83,2020-05-14-US-MTV-1_Pixel4,554.44,1.1,0.0,...,-47.79542,-12.749175,,,,,,,,
4,2020-05-14-US-MTV-1,Pixel4XLModded,1273529466449,37.423574,-122.094137,-33.2,2020-05-14-US-MTV-1_Pixel4XLModded,554.45,1.2,0.0,...,-1.932192,-24.935806,,,,,,,,


In [44]:
df_train.info(verbose = True, memory_usage= True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 131342 entries, 0 to 131341
Data columns (total 110 columns):
 #   Column                                     Non-Null Count   Dtype  
---  ------                                     --------------   -----  
 0   collectionName                             131342 non-null  object 
 1   phoneName                                  131342 non-null  object 
 2   millisSinceGpsEpoch                        131342 non-null  int64  
 3   latDeg                                     131342 non-null  float64
 4   lngDeg                                     131342 non-null  float64
 5   heightAboveWgs84EllipsoidM                 131342 non-null  float64
 6   phone                                      131342 non-null  object 
 7   timeSinceFirstFixSeconds                   131342 non-null  float64
 8   hDop                                       131342 non-null  float64
 9   vDop                                       131342 non-null  float64
 10  speedMp

In [45]:
df_test = pd.read_pickle(str(PATH / "gsdc_test.pkl.gzip"))

In [46]:
print(df_test.shape)
df_test.head()

(91486, 102)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,constellationType,svid,signalType,...,UncalMagYMicroT,UncalMagZMicroT,BiasXMicroT,BiasYMicroT,BiasZMicroT,utcTimeMillis_OrientationDeg,elapsedRealtimeNanos_OrientationDeg,yawDeg,rollDeg,pitchDeg
0,2020-05-15-US-MTV-1,Pixel4XL,1273608752446,37.416623,-122.082055,-24.21,2020-05-15-US-MTV-1_Pixel4XL,1,2,GPS_L1,...,-37.437897,-1.664517,,,,,,,,
1,2020-05-15-US-MTV-1,Pixel4XL,1273608753446,37.41659,-122.082073,-29.97,2020-05-15-US-MTV-1_Pixel4XL,6,36,GAL_E5A,...,-37.738075,-2.746611,,,,,,,,
2,2020-05-15-US-MTV-1,Pixel4XL,1273608754446,37.416593,-122.082084,-29.85,2020-05-15-US-MTV-1_Pixel4XL,6,36,GAL_E5A,...,-37.783287,-2.426145,,,,,,,,
3,2020-05-15-US-MTV-1,Pixel4XL,1273608755446,37.41658,-122.082067,-28.53,2020-05-15-US-MTV-1_Pixel4XL,1,19,GPS_L1,...,-37.108665,-2.881307,,,,,,,,
4,2020-05-15-US-MTV-1,Pixel4XL,1273608756446,37.416594,-122.082065,-27.7,2020-05-15-US-MTV-1_Pixel4XL,1,6,GPS_L1,...,-37.616684,-1.940271,,,,,,,,


In [47]:
df_test.info(verbose = True, memory_usage= True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91486 entries, 0 to 91485
Data columns (total 102 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   collectionName                             91486 non-null  object 
 1   phoneName                                  91486 non-null  object 
 2   millisSinceGpsEpoch                        91486 non-null  int64  
 3   latDeg                                     91486 non-null  float64
 4   lngDeg                                     91486 non-null  float64
 5   heightAboveWgs84EllipsoidM                 91486 non-null  float64
 6   phone                                      91486 non-null  object 
 7   constellationType                          91486 non-null  int64  
 8   svid                                       91486 non-null  int64  
 9   signalType                                 91486 non-null  object 
 10  receivedSvTimeInGpsNa