# Load Libraries

In [1]:
import numpy as np
import pandas as pd
from glob import glob
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
import plotly.express as px

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Set Path

In [2]:
# Directory that every later cell reads its input pickles from and writes the
# cleaned pickles to; the location is relative to the notebook's working directory.
data_dir = Path("../input/google-smartphone-decimeter-challenge")

# Load Data

In [3]:
# Load the training table from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
train_pickle_path = data_dir / "gsdc_train.pkl.gzip"
df_train = pd.read_pickle(str(train_pickle_path))

In [4]:
# Load the test table from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
test_pickle_path = data_dir / "gsdc_test.pkl.gzip"
df_test = pd.read_pickle(str(test_pickle_path))

# Check Dataset

In [5]:
# Print the training frame's (rows, columns) shape, then render its first rows
# as the cell's output.
print(df_train.shape)
df_train.head()

(131342, 110)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,timeSinceFirstFixSeconds,hDop,vDop,...,UncalMagYMicroT,UncalMagZMicroT,BiasXMicroT,BiasYMicroT,BiasZMicroT,utcTimeMillis_OrientationDeg,elapsedRealtimeNanos_OrientationDeg,yawDeg,rollDeg,pitchDeg
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423575,-122.094091,-34.06,2020-05-14-US-MTV-1_Pixel4,551.44,1.1,0.0,...,-48.00559,-13.144198,,,,,,,,
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423578,-122.094101,-33.29,2020-05-14-US-MTV-1_Pixel4,552.44,1.1,0.0,...,-48.438065,-13.865798,,,,,,,,
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423573,-122.094111,-30.99,2020-05-14-US-MTV-1_Pixel4,553.44,1.1,0.0,...,-48.41085,-12.949378,,,,,,,,
3,2020-05-14-US-MTV-1,Pixel4,1273529466442,37.423583,-122.094121,-32.83,2020-05-14-US-MTV-1_Pixel4,554.44,1.1,0.0,...,-47.79542,-12.749175,,,,,,,,
4,2020-05-14-US-MTV-1,Pixel4XLModded,1273529466449,37.423574,-122.094137,-33.2,2020-05-14-US-MTV-1_Pixel4XLModded,554.45,1.2,0.0,...,-1.932192,-24.935806,,,,,,,,


In [6]:
# Print the test frame's (rows, columns) shape, then render its first rows as
# the cell's output.
print(df_test.shape)
df_test.head()

(91486, 102)


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,constellationType,svid,signalType,...,UncalMagYMicroT,UncalMagZMicroT,BiasXMicroT,BiasYMicroT,BiasZMicroT,utcTimeMillis_OrientationDeg,elapsedRealtimeNanos_OrientationDeg,yawDeg,rollDeg,pitchDeg
0,2020-05-15-US-MTV-1,Pixel4XL,1273608752446,37.416623,-122.082055,-24.21,2020-05-15-US-MTV-1_Pixel4XL,1,2,GPS_L1,...,-37.437897,-1.664517,,,,,,,,
1,2020-05-15-US-MTV-1,Pixel4XL,1273608753446,37.41659,-122.082073,-29.97,2020-05-15-US-MTV-1_Pixel4XL,6,36,GAL_E5A,...,-37.738075,-2.746611,,,,,,,,
2,2020-05-15-US-MTV-1,Pixel4XL,1273608754446,37.416593,-122.082084,-29.85,2020-05-15-US-MTV-1_Pixel4XL,6,36,GAL_E5A,...,-37.783287,-2.426145,,,,,,,,
3,2020-05-15-US-MTV-1,Pixel4XL,1273608755446,37.41658,-122.082067,-28.53,2020-05-15-US-MTV-1_Pixel4XL,1,19,GPS_L1,...,-37.108665,-2.881307,,,,,,,,
4,2020-05-15-US-MTV-1,Pixel4XL,1273608756446,37.416594,-122.082065,-27.7,2020-05-15-US-MTV-1_Pixel4XL,1,6,GPS_L1,...,-37.616684,-1.940271,,,,,,,,


In [7]:
# Full per-column summary of the training frame (non-null counts, dtypes, memory).
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated in 1.2
# and removed in 2.0.
df_train.info(verbose=True, memory_usage=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 131342 entries, 0 to 131341
Data columns (total 110 columns):
 #   Column                                     Non-Null Count   Dtype  
---  ------                                     --------------   -----  
 0   collectionName                             131342 non-null  object 
 1   phoneName                                  131342 non-null  object 
 2   millisSinceGpsEpoch                        131342 non-null  int64  
 3   latDeg                                     131342 non-null  float64
 4   lngDeg                                     131342 non-null  float64
 5   heightAboveWgs84EllipsoidM                 131342 non-null  float64
 6   phone                                      131342 non-null  object 
 7   timeSinceFirstFixSeconds                   131342 non-null  float64
 8   hDop                                       131342 non-null  float64
 9   vDop                                       131342 non-null  float64
 10  speedMp

In [8]:
# Full per-column summary of the test frame (non-null counts, dtypes, memory).
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated in 1.2
# and removed in 2.0.
df_test.info(verbose=True, memory_usage=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91486 entries, 0 to 91485
Data columns (total 102 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   collectionName                             91486 non-null  object 
 1   phoneName                                  91486 non-null  object 
 2   millisSinceGpsEpoch                        91486 non-null  int64  
 3   latDeg                                     91486 non-null  float64
 4   lngDeg                                     91486 non-null  float64
 5   heightAboveWgs84EllipsoidM                 91486 non-null  float64
 6   phone                                      91486 non-null  object 
 7   constellationType                          91486 non-null  int64  
 8   svid                                       91486 non-null  int64  
 9   signalType                                 91486 non-null  object 
 10  receivedSvTimeInGpsNa

In [9]:
# For every training column, report its dtype and null fraction in the training
# frame and, when the column also exists in the test frame, in the test frame.
# Columns that exist only in the test frame are not visited by this loop.
for col in df_train.columns:
    report_lines = [
        f"KEY {col}",
        f"train dtype: {df_train[col].dtype}, null: {df_train[col].isna().mean()}",
    ]
    if col in df_test.columns:
        report_lines.append(
            f"test  dtype: {df_test[col].dtype}, null: {df_test[col].isna().mean()}"
        )
    # The trailing empty entry yields the blank separator line between columns.
    report_lines.append("")
    print("\n".join(report_lines))

KEY collectionName
train dtype: object, null: 0.0
test  dtype: object, null: 0.0

KEY phoneName
train dtype: object, null: 0.0
test  dtype: object, null: 0.0

KEY millisSinceGpsEpoch
train dtype: int64, null: 0.0
test  dtype: int64, null: 0.0

KEY latDeg
train dtype: float64, null: 0.0
test  dtype: float64, null: 0.0

KEY lngDeg
train dtype: float64, null: 0.0
test  dtype: float64, null: 0.0

KEY heightAboveWgs84EllipsoidM
train dtype: float64, null: 0.0
test  dtype: float64, null: 0.0

KEY phone
train dtype: object, null: 0.0
test  dtype: object, null: 0.0

KEY timeSinceFirstFixSeconds
train dtype: float64, null: 0.0

KEY hDop
train dtype: float64, null: 0.0

KEY vDop
train dtype: float64, null: 0.0

KEY speedMps
train dtype: float64, null: 0.0

KEY courseDegree
train dtype: float64, null: 0.0

KEY t_latDeg
train dtype: float64, null: 0.0

KEY t_lngDeg
train dtype: float64, null: 0.0

KEY t_heightAboveWgs84EllipsoidM
train dtype: float64, null: 0.0

KEY constellationType
train dtype: 

# Data Assessment and Cleaning
## Remove empty columns

In [10]:
# Columns that exist in both frames and hold no data at all (every value
# missing) in at least one of them. Columns found in only one frame are left
# alone, as before.
drop_cols = [
    col
    for col in df_train.columns
    if col in df_test.columns
    and (df_train[col].isna().all() or df_test[col].isna().all())
]

# Rebind to new frames rather than mutating in place, so the cell can be
# re-run safely and no earlier reference to the frames is silently altered.
df_train = df_train.drop(columns=drop_cols)
df_test = df_test.drop(columns=drop_cols)

## Change data type

In [11]:
# Check whether any column shared by both frames has a different dtype in
# train and test; print the column name and both dtypes for each mismatch.
for col in df_train.columns:
    if col not in df_test.columns:
        continue
    train_dtype = df_train[col].dtype
    test_dtype = df_test[col].dtype
    if train_dtype == test_dtype:
        continue
    print(f"KEY {col}")
    print(train_dtype, test_dtype)
    print("")

KEY DriftXRadPerSec
float64 object



In [12]:
col = 'DriftXRadPerSec'


def _parse_drift_value(value):
    """Convert one raw ``DriftXRadPerSec`` cell to a number.

    Non-string values (Python/NumPy floats, including NaN, and other numeric
    types) are returned unchanged. Strings are parsed with ``float``; blank
    strings become NaN instead of raising.
    """
    if not isinstance(value, str):
        return value
    text = value.strip()
    if not text:
        return float("nan")
    if text[-1] == 'E':
        # The value ends in a bare exponent marker; complete it with "-4", the
        # same repair this cell has always applied.
        # NOTE(review): the "-4" exponent is an assumption carried over from the
        # original code; confirm it against the raw data.
        text += "-4"
    return float(text)


# Replace the object-typed column with a float64 one on a new frame; the cell is
# idempotent because already-numeric values pass through untouched.
df_test = df_test.assign(
    **{col: df_test[col].apply(_parse_drift_value).astype("float64")}
)

In [13]:
# Check again whether any column shared by both frames has a different dtype in
# train and test; print the column name and both dtypes for each mismatch.
for col in df_train.columns:
    if col not in df_test.columns:
        continue
    train_dtype = df_train[col].dtype
    test_dtype = df_test[col].dtype
    if train_dtype == test_dtype:
        continue
    print(f"KEY {col}")
    print(train_dtype, test_dtype)
    print("")

# Output

In [14]:
# Write both cleaned frames into the same directory the inputs were read from.
# NOTE(review): pandas infers compression from the file suffix and ".gzip" does
# not appear among the suffixes it documents, so these files are presumably
# written uncompressed; confirm before relying on the name.
# NOTE(review): data_dir is the input directory; verify it is writable in the
# environment where this notebook runs.
cleaned_outputs = (
    (df_train, "gsdc_cleaned_train.pkl.gzip"),
    (df_test, "gsdc_cleaned_test.pkl.gzip"),
)
for cleaned_frame, out_name in cleaned_outputs:
    cleaned_frame.to_pickle(str(data_dir / out_name))