# Reference
1. https://www.kaggle.com/rohanrao/tutorial-on-reading-large-datasets
1. https://www.kaggle.com/asobod11138/gsdc-neuralnet-keras (multi-threading)

# Import Libraries

In [5]:
import numpy as np
import pandas as pd
from glob import glob
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
import plotly.express as px
from multiprocessing import Pool
import multiprocessing as multi

from utils.gsdc_parser import ground_truth_file_open, derived_file_open, gnsslog_file_open

In [6]:
split_charater = "\\"

# Set Path and Load Dataset

In [7]:
PATH = Path("../input/google-smartphone-decimeter-challenge")
train_df = pd.read_csv(PATH / "baseline_locations_train.csv")
test_df = pd.read_csv(PATH / "baseline_locations_test.csv")

In [8]:
print(train_df.shape)
train_df.head()

(131342, 7)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423575,-122.094091,-34.06,2020-05-14-US-MTV-1_Pixel4
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423578,-122.094101,-33.29,2020-05-14-US-MTV-1_Pixel4
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423573,-122.094111,-30.99,2020-05-14-US-MTV-1_Pixel4
3,2020-05-14-US-MTV-1,Pixel4,1273529466442,37.423583,-122.094121,-32.83,2020-05-14-US-MTV-1_Pixel4
4,2020-05-14-US-MTV-1,Pixel4,1273529467442,37.423579,-122.094114,-34.49,2020-05-14-US-MTV-1_Pixel4


In [9]:
print(test_df.shape)
test_df.head()

(91486, 7)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone
0,2020-05-15-US-MTV-1,Pixel4,1273608785432,37.416628,-122.082053,-30.69,2020-05-15-US-MTV-1_Pixel4
1,2020-05-15-US-MTV-1,Pixel4,1273608786432,37.416646,-122.08204,-31.76,2020-05-15-US-MTV-1_Pixel4
2,2020-05-15-US-MTV-1,Pixel4,1273608787432,37.416653,-122.082039,-31.65,2020-05-15-US-MTV-1_Pixel4
3,2020-05-15-US-MTV-1,Pixel4,1273608788432,37.416607,-122.082063,-31.52,2020-05-15-US-MTV-1_Pixel4
4,2020-05-15-US-MTV-1,Pixel4,1273608789432,37.416609,-122.082073,-28.95,2020-05-15-US-MTV-1_Pixel4


# Define Loading GnssLog.txt file Function

# Load All Data Function

## For Multi Processing Code (Temporary Dead Code)

In [11]:
path_list = list(glob(str(PATH / "train" / "*/*/*")))

get_ground_truth = lambda x:x.split(split_charater)[-1].find("ground_truth")>= 0
get_derived = lambda x:x.split(split_charater)[-1].find("derived.csv")>= 0
get_gnsslog = lambda x:x.split(split_charater)[-1].find("GnssLog.txt")>= 0

ground_truth_path_list = list(filter(get_ground_truth, path_list))
derived_path_list = list(filter(get_derived, path_list))
gnsslog_path_list = list(filter(get_gnsslog, path_list))

In [13]:

gnss_section_names = {'Raw','UncalAccel', 'UncalGyro', 'UncalMag', 'Fix', 'Status', 'OrientationDeg'}
section_names = {'GroundTruth', 'Derived', 'Raw','UncalAccel', 'UncalGyro', 'UncalMag', 'Fix', 'Status', 'OrientationDeg'}
_columns = ['latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM']

output = dict()
for section in section_names:
    output[section] = pd.DataFrame()

start_path = "train"

thread_num = multi.cpu_count()

In [14]:
with Pool(thread_num) as pool:
    imap = pool.imap(ground_truth_file_open, ground_truth_path_list)
    ground_truth = list(tqdm(imap, total=len(ground_truth_path_list), desc="load ground truth"))

HBox(children=(HTML(value='load ground truth'), FloatProgress(value=0.0, max=73.0), HTML(value='')))




In [15]:
with Pool(thread_num) as pool:
    imap = pool.imap(derived_file_open, derived_path_list)
    derived = list(tqdm(imap, total=len(derived_path_list), desc="load derived data"))

HBox(children=(HTML(value='load derived data'), FloatProgress(value=0.0, max=73.0), HTML(value='')))




In [17]:
with Pool(thread_num) as pool:
    imap = pool.imap(gnsslog_file_open, gnsslog_path_list)
    gnss_logs = list(tqdm(imap, total=len(gnsslog_path_list), desc="load gnss log"))

HBox(children=(HTML(value='load gnss log'), FloatProgress(value=0.0, max=73.0), HTML(value='')))




## One Process Parser

In [10]:
def get_addtional_data(df : pd.DataFrame, PATH: Path, train = True):
    gnss_section_names = {'Raw','UncalAccel', 'UncalGyro', 'UncalMag', 'Fix', 'Status', 'OrientationDeg'}
    section_names = {'GroundTruth', 'Derived', 'Raw','UncalAccel', 'UncalGyro', 'UncalMag', 'Fix', 'Status', 'OrientationDeg'}
    _columns = ['latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM']

    output = dict()
    for section in section_names:
        output[section] = pd.DataFrame()

    if train:
        start_path = "train"
    else:
        start_path = "test"
        
    
    for path in tqdm(glob(str(PATH / start_path / "*/*/*"))):
        print(path)
        (collectionName, phoneName) = path.split(split_charater)[-3:-1]
        
        file_name = path.split(split_charater)[-1]
        
        if(file_name.find('ground_truth') >= 0): # get ground truth data
            _df = pd.read_csv(path)    
            _df[['t_'+col for col in _columns]] = _df[_columns]
            _df = _df.drop(columns=_columns)
            output['GroundTruth'] = pd.concat([output['GroundTruth'], _df])
            
        elif(file_name.find('derived.csv') >= 0): # get derived data
            _df = pd.read_csv(path)
            output['Derived'] = pd.concat([output['Derived'], _df])
            
        elif(file_name.find('GnssLog.txt') >= 0): # get gnss log data (it is dict)
            _dict = gnss_log_to_dataframes(path)
            for key, value in _dict.items():
                if value.shape[0] == 0: # empty log bypass
                    continue
                    
                # Addtional meta data for merging original data frame
                value['collectionName'] = collectionName 
                value['phoneName'] = phoneName
                if (key == "Status") or (key == "Fix"):  
                    value.rename(columns = {'UnixTimeMillis':'utcTimeMillis'}, inplace = True)
                value["millisSinceGpsEpoch"] = value["utcTimeMillis"] - 315964800000
                
                output[key] = pd.concat([output[key], value])

    for key, value in output.items():
        if value.shape[0] == 0:
            continue
        df = pd.merge_asof(df.sort_values('millisSinceGpsEpoch'), 
              value.sort_values('millisSinceGpsEpoch'), 
              on="millisSinceGpsEpoch", by=["collectionName", "phoneName"], 
              direction='nearest',tolerance=100000)
        
    return df
    
                
    
    

# Save To Pickle File

In [11]:
output = get_addtional_data(train_df, PATH, train = True)

output.to_pickle(str(PATH / "gsdc_train.pkl.gzip"))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=292.0), HTML(value='')))

..\input\google-smartphone-decimeter-challenge\train\2020-05-14-US-MTV-1\Pixel4\ground_truth.csv
..\input\google-smartphone-decimeter-challenge\train\2020-05-14-US-MTV-1\Pixel4\Pixel4_derived.csv
..\input\google-smartphone-decimeter-challenge\train\2020-05-14-US-MTV-1\Pixel4\Pixel4_GnssLog.txt
..\input\google-smartphone-decimeter-challenge\train\2020-05-14-US-MTV-1\Pixel4\supplemental
..\input\google-smartphone-decimeter-challenge\train\2020-05-14-US-MTV-1\Pixel4XLModded\ground_truth.csv
..\input\google-smartphone-decimeter-challenge\train\2020-05-14-US-MTV-1\Pixel4XLModded\Pixel4XLModded_derived.csv
..\input\google-smartphone-decimeter-challenge\train\2020-05-14-US-MTV-1\Pixel4XLModded\Pixel4XLModded_GnssLog.txt
..\input\google-smartphone-decimeter-challenge\train\2020-05-14-US-MTV-1\Pixel4XLModded\supplemental
..\input\google-smartphone-decimeter-challenge\train\2020-05-14-US-MTV-2\Pixel4\ground_truth.csv
..\input\google-smartphone-decimeter-challenge\train\2020-05-14-US-MTV-2\Pixel4

KeyboardInterrupt: 

In [None]:
output = get_addtional_data(test_df, PATH, train = False)

output.to_pickle(str(PATH / "gsdc_test.pkl.gzip"))

In [None]:
%clear

# Load Pickle File

In [None]:
import numpy as np
import pandas as pd
from glob import glob
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
import plotly.express as px

In [None]:
PATH = Path("../input/google-smartphone-decimeter-challenge")

In [None]:
df_train = pd.read_pickle(str(PATH / "gsdc_train.pkl.gzip"))

In [None]:
print(df_train.shape)
df_train.head()

In [None]:
df_test = pd.read_pickle(str(PATH / "gsdc_test.pkl.gzip"))

In [None]:
print(df_test.shape)
df_test.head()