# Load Libraries

In [1]:
import numpy as np
import pandas as pd
from glob import glob
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
import plotly.express as px

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Set Path

In [2]:
# Folder this notebook reads the cleaned pickles from and writes the
# extracted pickles back into (see the load and output cells).
data_dir = Path("../input/google-smartphone-decimeter-challenge")

# Load Data

In [3]:
# Cleaned training frame. read_pickle can execute arbitrary code embedded in
# the file, so only load pickles from a trusted source.
train_pickle = data_dir / "gsdc_cleaned_train.pkl.gzip"
df_train = pd.read_pickle(str(train_pickle))

In [4]:
# Cleaned test frame. read_pickle can execute arbitrary code embedded in
# the file, so only load pickles from a trusted source.
test_pickle = data_dir / "gsdc_cleaned_test.pkl.gzip"
df_test = pd.read_pickle(str(test_pickle))

In [5]:
# List every available column, one name per line, to help pick features below.
for column_name in list(df_train):
    print(column_name)

collectionName
phoneName
millisSinceGpsEpoch
latDeg
lngDeg
heightAboveWgs84EllipsoidM
phone
timeSinceFirstFixSeconds
hDop
vDop
speedMps
courseDegree
t_latDeg
t_lngDeg
t_heightAboveWgs84EllipsoidM
constellationType
svid
signalType
receivedSvTimeInGpsNanos
xSatPosM
ySatPosM
zSatPosM
xSatVelMps
ySatVelMps
zSatVelMps
satClkBiasM
satClkDriftMps
rawPrM
rawPrUncM
isrbM
ionoDelayM
tropoDelayM
utcTimeMillis
TimeNanos
LeapSecond
FullBiasNanos
BiasNanos
BiasUncertaintyNanos
DriftNanosPerSecond
DriftUncertaintyNanosPerSecond
HardwareClockDiscontinuityCount
Svid
TimeOffsetNanos
State
ReceivedSvTimeNanos
ReceivedSvTimeUncertaintyNanos
Cn0DbHz
PseudorangeRateMetersPerSecond
PseudorangeRateUncertaintyMetersPerSecond
AccumulatedDeltaRangeState
AccumulatedDeltaRangeMeters
AccumulatedDeltaRangeUncertaintyMeters
CarrierFrequencyHz
MultipathIndicator
ConstellationType
AgcDb
BasebandCn0DbHz
FullInterSignalBiasNanos
FullInterSignalBiasUncertaintyNanos
SatelliteInterSignalBiasNanos
SatelliteInterSignalBiasUnc

# Extract Data

## Get dlatDeg, dlngDeg, dheight (offsets from each phone's first row)

In [6]:
def getDeltaPositions(df:pd.DataFrame)->pd.DataFrame:
    """Add per-phone position offsets relative to each phone's first row.

    For every distinct value in the ``phone`` column, the first row of that
    phone (in the frame's current row order) is taken as the reference and
    subtracted from all rows of the same phone.

    Args:
        df: Frame containing ``phone``, ``latDeg``, ``lngDeg`` and
            ``heightAboveWgs84EllipsoidM`` columns. It is not modified.

    Returns:
        A copy of ``df`` with three extra float columns: ``dlatDeg``,
        ``dlngDeg`` and ``dheight``. Rows whose ``phone`` is missing keep
        an offset of ``0.0``.

    Raises:
        KeyError: If any of the required columns is absent.
    """
    output = df.copy()
    column_pairs = (
        ('latDeg', 'dlatDeg'),
        ('lngDeg', 'dlngDeg'),
        ('heightAboveWgs84EllipsoidM', 'dheight'),
    )

    # Initialise with float zeros: an integer 0 would create int64 columns and
    # writing fractional deltas into them relies on implicit upcasting, which
    # recent pandas versions deprecate.
    for _, delta_col in column_pairs:
        output[delta_col] = 0.0

    # dropna(): a missing phone never matches an equality mask, so the group
    # would be empty and .iloc[0] below would raise IndexError.
    for phone in output['phone'].dropna().unique():
        # Build the row mask once per phone and reuse it for all three columns.
        mask = output['phone'] == phone
        for source_col, delta_col in column_pairs:
            values = output.loc[mask, source_col]
            output.loc[mask, delta_col] = values - values.iloc[0]

    return output

In [7]:
# Add the per-phone offset columns to both splits.
df_train, df_test = map(getDeltaPositions, (df_train, df_test))

## Scaler

In [8]:
class Scaler():
    """Standardise selected columns, writing results to ``<name>_Scaled`` columns.

    One ``StandardScaler`` is kept per target column, so every column is
    fitted and transformed independently of the others.

    Attributes:
        target_columns: Names of the columns to scale.
        renames_columns: Output column names (``<name>_Scaled``), in the same
            order as ``target_columns``.
        scaler: Dict mapping each target column name to its ``StandardScaler``.
    """

    def __init__(self, target_columns):
        """Create one unfitted ``StandardScaler`` per name in ``target_columns``."""
        self.target_columns = target_columns
        self.renames_columns = [name + "_Scaled" for name in target_columns]
        self.scaler = {name: StandardScaler() for name in target_columns}

    def _require_columns(self, df: pd.DataFrame) -> None:
        """Raise ``KeyError`` listing every target column missing from ``df``."""
        missing = [col for col in self.target_columns if col not in df.columns]
        if missing:
            raise KeyError(f"DataFrame is missing columns required for scaling: {missing}")

    def fit(self, df: pd.DataFrame):
        """Fit each column's scaler on the matching column of ``df``.

        Args:
            df: Frame containing every column in ``target_columns``.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            KeyError: If a target column is missing from ``df``.
        """
        self._require_columns(df)
        for col in self.target_columns:
            # The per-column scaler is fed a 2-D (n_rows, 1) array.
            self.scaler[col].fit(df[col].values.reshape(-1, 1))
        return self

    def transform(self, df: pd.DataFrame):
        """Return a copy of ``df`` with a ``<name>_Scaled`` column per target.

        Missing values in a scaled column are replaced with ``0`` after
        scaling. ``df`` itself is left untouched.

        Args:
            df: Frame containing every column in ``target_columns``.

        Returns:
            A new DataFrame: ``df`` plus the scaled columns.

        Raises:
            KeyError: If a target column is missing from ``df``.
        """
        self._require_columns(df)
        output = df.copy()
        for col, recol in zip(self.target_columns, self.renames_columns):
            scaled = self.scaler[col].transform(df[col].values.reshape(-1, 1))
            # Flatten (n_rows, 1) -> (n_rows,) and fill NaN through an explicit
            # assignment. Calling fillna(inplace=True) on `output[recol]` is a
            # chained in-place update that pandas no longer guarantees will
            # reach `output`.
            flat = np.asarray(scaled).reshape(-1)
            output[recol] = pd.Series(flat, index=output.index).fillna(0)
        return output
        
    

In [9]:
# Columns to standardise, grouped by where the signal comes from.
# NOTE(review): the gyro/accel/mag/orientation names below do not appear in the
# column listing printed earlier in this notebook — confirm that listing was
# produced from the same version of the cleaned data.
target_columns = (
    # Per-phone position offsets
    ['dlatDeg', 'dlngDeg', 'dheight']
    # Satellite position and velocity
    + ['xSatPosM', 'ySatPosM', 'zSatPosM']
    + ['xSatVelMps', 'ySatVelMps', 'zSatVelMps']
    # Gyroscope readings and drift
    + ['UncalGyroXRadPerSec', 'UncalGyroYRadPerSec', 'UncalGyroZRadPerSec']
    + ['DriftXRadPerSec', 'DriftYRadPerSec', 'DriftZRadPerSec']
    # Accelerometer readings and bias
    + ['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2']
    + ['BiasXMps2', 'BiasYMps2', 'BiasZMps2']
    # Magnetometer readings and bias
    + ['UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT']
    + ['BiasXMicroT', 'BiasYMicroT', 'BiasZMicroT']
    # Orientation angles
    + ['yawDeg', 'rollDeg', 'pitchDeg']
)

In [10]:
# Fit on the training split only, then apply the same scaling to both splits
# so the test split is transformed with training statistics.
scaler = Scaler(target_columns)
scaler.fit(df_train)

df_train, df_test = (scaler.transform(frame) for frame in (df_train, df_test))

In [11]:
# Sanity-check the scaled training columns.
# NOTE(review): a std below 1 on some columns is presumably caused by missing
# values being replaced with 0 after scaling — confirm.
scaled_view = df_train.loc[:, scaler.renames_columns]
scaled_view.describe()

Unnamed: 0,dlatDeg_Scaled,dlngDeg_Scaled,dheight_Scaled,xSatPosM_Scaled,ySatPosM_Scaled,zSatPosM_Scaled,xSatVelMps_Scaled,ySatVelMps_Scaled,zSatVelMps_Scaled,UncalGyroXRadPerSec_Scaled,...,BiasZMps2_Scaled,UncalMagXMicroT_Scaled,UncalMagYMicroT_Scaled,UncalMagZMicroT_Scaled,BiasXMicroT_Scaled,BiasYMicroT_Scaled,BiasZMicroT_Scaled,yawDeg_Scaled,rollDeg_Scaled,pitchDeg_Scaled
count,131342.0,131342.0,131342.0,131342.0,131342.0,131342.0,131342.0,131342.0,131342.0,131342.0,...,131342.0,131342.0,131342.0,131342.0,131342.0,131342.0,131342.0,131342.0,131342.0,131342.0
mean,5.390932e-17,-7.898406e-18,5.193472000000001e-17,-3.786907e-17,-8.926280000000001e-17,-1.2767290000000002e-17,7.487256e-17,1.2442690000000001e-17,1.8907480000000003e-17,-4.091212e-18,...,-8.461032000000001e-17,1.76524e-16,-8.222998e-18,1.493123e-17,-8.1148e-18,-6.491839999999999e-19,3.949203e-18,-4.7444530000000004e-17,-1.666239e-17,2.1855860000000002e-17
std,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,0.9308231,...,0.7542804,0.9307495,0.9307495,0.9306759,0.7541188,0.7541188,0.754028,0.527807,0.5276843,0.5276843
min,-3.367212,-2.837517,-84.85201,-1.824201,-1.533469,-2.896326,-2.966502,-2.135619,-1.592272,-21.47876,...,-2.065078,-3.683841,-3.661762,-3.285377,-3.146779,-1.970667,-2.886818,-1.577129,-1.465147,-0.7969673
25%,-0.1426406,-0.2570525,-0.3623789,-0.7641287,-0.7586592,-0.7307094,-0.6466295,-0.7540929,-1.008162,-0.2254101,...,0.0,-0.4797574,-0.05900799,-0.3370902,-0.2461699,0.0,-0.3088827,0.0,0.0,0.0
50%,0.06784625,0.08844239,-0.2357607,-0.2086152,-0.1548048,0.3174404,-0.1229942,0.05439754,-0.001034868,0.0,...,0.0,0.0,0.1489335,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.1870814,0.203367,0.2792265,0.7479192,0.5701537,0.8245646,0.7530651,0.6654773,0.9952601,0.2263081,...,0.4842431,0.407092,0.4331973,0.1884584,0.08105131,0.1906092,0.0,0.0,0.0,0.0
max,2.689582,2.871625,188.2394,2.505145,3.481311,2.111477,2.18388,2.622603,1.575673,20.81124,...,0.4842431,3.249819,4.818309,3.778347,2.810664,3.914161,2.720061,2.41676,0.8816943,2.381081


#  Output

In [12]:
# Persist both frames, now carrying the offset and *_Scaled columns, into data_dir.
# NOTE(review): pandas infers pickle compression from the file suffix, and
# ".gzip" does not appear to be a recognised suffix (".gz" is), so these files
# are presumably written uncompressed — confirm before relying on the extension.
extract_outputs = (
    (df_train, "gsdc_extract_train.pkl.gzip"),
    (df_test, "gsdc_extract_test.pkl.gzip"),
)
for frame, file_name in extract_outputs:
    frame.to_pickle(str(data_dir / file_name))