Much of the code incorporated into this notebook was provied by: https://www.kaggle.com/code/jimitshah777/rnn-bi-lstm-using-keras-for-gsdc-22-starter


In [None]:
!pip install tensorflow
!pip install nb_black
!pip install nb_black > /dev/null
%load_ext lab_black

#### Check if GPU is available or not

In [92]:
import tensorflow as tf

tf.test.is_gpu_available()

False

#### Imports

In [93]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import plotly.express as px

pd.set_option("display.max_columns", 500)

# Convert raw data to gps

In [94]:
import glob
from dataclasses import dataclass
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from scipy.interpolate import InterpolatedUnivariateSpline

INPUT_PATH = "smartphone-decimeter-2022"

WGS84_SEMI_MAJOR_AXIS = 6378137.0  # semi-major axis of the earth's ellipsoid
WGS84_SEMI_MINOR_AXIS = 6356752.314245  # semi-minor axis of the earth's ellipsoid
WGS84_SQUARED_FIRST_ECCENTRICITY = (
    6.69437999013e-3  # measures how much the ellipsoid deviates from a perfect sphere
)
WGS84_SQUARED_SECOND_ECCENTRICITY = (
    6.73949674226e-3  # squared second eccentricity of the Earth's ellipsoid
)

HAVERSINE_RADIUS = 6_371_000  # Radius of the earth


# Earth center earth fixed coordinates
@dataclass
class ECEF:
    # x, y, z coordinates measured from earths center
    x: np.array
    y: np.array
    z: np.array

    def to_numpy(self):
        return np.stack(
            [self.x, self.y, self.z], axis=0
        )  # stack x, y, z on top of one another

    @staticmethod
    def from_numpy(pos):
        x, y, z = [np.squeeze(w) for w in np.split(pos, 3, axis=-1)]
        return ECEF(x=x, y=y, z=z)


# geodetic coordinates
@dataclass
class BLH:
    lat: np.array  # latitude
    lng: np.array  # longitude
    hgt: np.array  # geodetic height


# function converts ECEF to BLF
def ECEF_to_BLH(ecef):
    a = WGS84_SEMI_MAJOR_AXIS
    b = WGS84_SEMI_MINOR_AXIS
    e2 = WGS84_SQUARED_FIRST_ECCENTRICITY
    e2_ = WGS84_SQUARED_SECOND_ECCENTRICITY

    # cartesian coordinates
    x = ecef.x
    y = ecef.y
    z = ecef.z

    # Convert the cartesian coordinates to deggrees
    r = np.sqrt(x**2 + y**2)
    t = np.arctan2(z * (a / b), r)
    B = np.arctan2(z + (e2_ * b) * np.sin(t) ** 3, r - (e2 * a) * np.cos(t) ** 3)
    L = np.arctan2(y, x)
    n = a / np.sqrt(1 - e2 * np.sin(B) ** 2)
    H = (r / np.cos(B)) - n

    # Initialize the BLH data class and return
    return BLH(lat=B, lng=L, hgt=H)


# haversine distance is the angular sitance between two points on the surface of a sphare
def haversine_distance(blh_1, blh_2):
    dlat = blh_2.lat - blh_1.lat
    dlng = blh_2.lng - blh_1.lng
    a = (
        np.sin(dlat / 2) ** 2
        + np.cos(blh_1.lat) * np.cos(blh_2.lat) * np.sin(dlng / 2) ** 2
    )
    dist = 2 * HAVERSINE_RADIUS * np.arcsin(np.sqrt(a))
    return dist


# computer the haversine distances between two dataframe
def pandas_haversine_distance(df1, df2):
    # convert the degree measurement of latitude and longitude to radians
    blh1 = BLH(
        lat=np.deg2rad(df1["LatitudeDegrees"].to_numpy()),
        lng=np.deg2rad(df1["LongitudeDegrees"].to_numpy()),
        hgt=0,
    )
    blh2 = BLH(
        lat=np.deg2rad(df2["LatitudeDegrees"].to_numpy()),
        lng=np.deg2rad(df2["LongitudeDegrees"].to_numpy()),
        hgt=0,
    )
    return haversine_distance(
        blh1, blh2
    )  # computer haversine distance between coordinate sets


# take the gnss raw data and time scale and trpID and generate a dataframe with latitude and longitude values
def ecef_to_lat_lng(tripID, gnss_df, UnixTimeMillis):

    ecef_columns = [
        "WlsPositionXEcefMeters",
        "WlsPositionYEcefMeters",
        "WlsPositionZEcefMeters",
    ]

    columns = ["utcTimeMillis"] + ecef_columns

    ecef_df = (
        gnss_df.drop_duplicates(subset="utcTimeMillis")[
            columns
        ]  # drop duplicat rows with same time and take only specified columns
        .dropna()  # drop empty values
        .reset_index(drop=True)  # reset the idexes amd drop the old set of indexes
    )

    ecef = ECEF.from_numpy(ecef_df[ecef_columns].to_numpy())
    blh = ECEF_to_BLH(ecef)

    TIME = ecef_df[
        "utcTimeMillis"
    ].to_numpy()  # these are the time values originating from the gnss dataframe

    # Create interpolation functions
    lat = InterpolatedUnivariateSpline(TIME, blh.lat, ext=3)(UnixTimeMillis)
    lng = InterpolatedUnivariateSpline(TIME, blh.lng, ext=3)(UnixTimeMillis)

    # construct and return new latitude longitude dataframe
    return pd.DataFrame(
        {
            "tripId": tripID,  # represents both the location and phone
            "UnixTimeMillis": UnixTimeMillis,  # the internal clock time in milliseconds
            "LatitudeDegrees": np.degrees(lat),  # interpolated latitude
            "LongitudeDegrees": np.degrees(lng),  # interpolated longitude
        }
    )


# generte a distribution based on the haversine distance and then compute the average between the 50 and 95 percentile to find the
# score. (This is the metric that the competition uses)
def calc_score(tripID, pred_df, gt_df):
    d = pandas_haversine_distance(pred_df, gt_df)
    score = np.mean([np.quantile(d, 0.50), np.quantile(d, 0.95)])
    return score

In [95]:
!pip install pykalman




[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [156]:
import glob
from pykalman import KalmanFilter

INPUT_PATH = "smartphone-decimeter-2022"

sample_df = pd.read_csv(f"{INPUT_PATH}/sample_submission.csv")
pred_dfs = []
for dirname in tqdm(
    sorted(glob.glob(f"{INPUT_PATH}/test/*/*"))
):  # iterate through each directory each phone in said directory
    drive, phone = dirname.split("\\")[-2:]

    tripID = f"{drive}/{phone}"  # generate the tripID for submission
    print(f"{tripID}")

    gnss_df = pd.read_csv(
        f"{dirname}/device_gnss.csv"
    )  # read the raw gnss data into a dataframe

    # retrieve he UnixTimeMillis from the sample_df where tripID matches
    UnixTimeMillis = sample_df[sample_df["tripId"] == tripID][
        "UnixTimeMillis"
    ].to_numpy()

    # Define Kalman Filter
    # kf = KalmanFilter()

    # apply kalman smoother to the interprolated latitude and longitude
    tmp = ecef_to_lat_lng(tripID, gnss_df, UnixTimeMillis)
    # smoothed_latitute_means, _ = kf.smooth(tmp["LatitudeDegrees"])
    # tmp["LatitudeDegrees"] = smoothed_latitute_means

    # smoothed_longitude_means, _ = kf.smooth(tmp["LongitudeDegrees"])
    # tmp["LongitudeDegrees"] = smoothed_longitude_means

    # add interpolated latitude and longitudes to the prediction_df
    pred_dfs.append(tmp)
sub_df = pd.concat(pred_dfs)


baselines = []
gts = []
for dirname in tqdm(sorted(glob.glob(f"{INPUT_PATH}/train/*/*"))):
    drive, phone = dirname.split("\\")[-2:]

    tripID = f"{drive}/{phone}"
    print(f"{tripID}")

    gnss_df = pd.read_csv(f"{dirname}/device_gnss.csv", low_memory=False)
    gt_df = pd.read_csv(f"{dirname}/ground_truth.csv", low_memory=False)

    # generate and append interpolatoed values using time values from ground truth
    baseline_df = ecef_to_lat_lng(tripID, gnss_df, gt_df["UnixTimeMillis"].to_numpy())
    baselines.append(baseline_df)

    # add a tripId column to the ground truth dataframe and append the dataframe to the ground truths
    gt_df["tripId"] = tripID
    gts.append(gt_df)


baselines = pd.concat(baselines)
gts = pd.concat(gts)

  0%|          | 0/36 [00:00<?, ?it/s]

2021-04-28-US-MTV-2/SamsungGalaxyS20Ultra
2021-06-22-US-MTV-1/XiaomiMi8
2021-08-12-US-MTV-1/GooglePixel4
2021-08-17-US-MTV-1/GooglePixel5
2021-08-24-US-SVL-2/GooglePixel5
2021-09-07-US-MTV-1/SamsungGalaxyS20Ultra
2021-09-14-US-MTV-1/GooglePixel5
2021-09-20-US-MTV-1/XiaomiMi8
2021-09-20-US-MTV-2/GooglePixel4
2021-09-28-US-MTV-1/GooglePixel5
2021-11-05-US-MTV-1/XiaomiMi8
2021-11-30-US-MTV-1/GooglePixel5
2022-01-04-US-MTV-1/SamsungGalaxyS20Ultra
2022-01-11-US-MTV-1/GooglePixel6Pro
2022-01-18-US-SJC-2/GooglePixel5
2022-01-26-US-MTV-1/XiaomiMi8
2022-02-01-US-SJC-1/XiaomiMi8
2022-02-08-US-SJC-1/XiaomiMi8
2022-02-15-US-SJC-1/GooglePixel5
2022-02-23-US-LAX-1/GooglePixel5
2022-02-23-US-LAX-3/XiaomiMi8
2022-02-23-US-LAX-5/XiaomiMi8
2022-02-24-US-LAX-1/SamsungGalaxyS20Ultra
2022-02-24-US-LAX-3/XiaomiMi8
2022-02-24-US-LAX-5/SamsungGalaxyS20Ultra
2022-03-14-US-MTV-1/GooglePixel5
2022-03-17-US-SJC-1/GooglePixel5
2022-03-22-US-MTV-1/SamsungGalaxyS20Ultra
2022-03-31-US-LAX-1/GooglePixel5
2022-03-31-US

  0%|          | 0/170 [00:00<?, ?it/s]

2020-05-15-US-MTV-1/GooglePixel4XL
2020-05-21-US-MTV-1/GooglePixel4
2020-05-21-US-MTV-1/GooglePixel4XL
2020-05-21-US-MTV-2/GooglePixel4
2020-05-21-US-MTV-2/GooglePixel4XL
2020-05-28-US-MTV-2/GooglePixel4
2020-05-28-US-MTV-2/GooglePixel4XL
2020-05-29-US-MTV-1/GooglePixel4
2020-05-29-US-MTV-1/GooglePixel4XL
2020-05-29-US-MTV-2/GooglePixel4
2020-05-29-US-MTV-2/GooglePixel4XL
2020-06-04-US-MTV-1/GooglePixel4
2020-06-04-US-MTV-1/GooglePixel4XL
2020-06-04-US-MTV-2/GooglePixel4
2020-06-04-US-MTV-2/GooglePixel4XL
2020-06-05-US-MTV-1/GooglePixel4
2020-06-05-US-MTV-1/GooglePixel4XL
2020-06-05-US-MTV-2/GooglePixel4
2020-06-05-US-MTV-2/GooglePixel4XL
2020-06-10-US-MTV-1/GooglePixel4
2020-06-10-US-MTV-1/GooglePixel4XL
2020-06-10-US-MTV-2/GooglePixel4
2020-06-10-US-MTV-2/GooglePixel4XL
2020-06-11-US-MTV-1/GooglePixel4
2020-06-11-US-MTV-1/GooglePixel4XL
2020-06-18-US-MTV-1/GooglePixel4
2020-06-18-US-MTV-1/GooglePixel4XL
2020-06-24-US-MTV-1/GooglePixel4
2020-06-24-US-MTV-1/GooglePixel4XL
2020-06-24-US

In [97]:
ss = pd.read_csv("smartphone-decimeter-2022/sample_submission.csv")

In [98]:
baselines["group"] = "train_baseline"
sub_df["group"] = "submission_baseline"
gts["group"] = "train_ground_truth"
combined = pd.concat([baselines, sub_df]).reset_index(drop=True).copy()

In [99]:
sub_df.head()

Unnamed: 0,tripId,UnixTimeMillis,LatitudeDegrees,LongitudeDegrees,group
0,2021-04-28-US-MTV-2/SamsungGalaxyS20Ultra,1619650832999,23.111897,-75.463789,submission_baseline
1,2021-04-28-US-MTV-2/SamsungGalaxyS20Ultra,1619650833999,31.939865,-104.288399,submission_baseline
2,2021-04-28-US-MTV-2/SamsungGalaxyS20Ultra,1619650834999,35.311846,-115.298422,submission_baseline
3,2021-04-28-US-MTV-2/SamsungGalaxyS20Ultra,1619650835999,36.599837,-119.503866,submission_baseline
4,2021-04-28-US-MTV-2/SamsungGalaxyS20Ultra,1619650836999,37.091818,-121.110216,submission_baseline


In [100]:
gts.head()

Unnamed: 0,MessageType,Provider,LatitudeDegrees,LongitudeDegrees,AltitudeMeters,SpeedMps,AccuracyMeters,BearingDegrees,UnixTimeMillis,tripId,group
0,Fix,GT,37.416619,-122.082065,,0.002044,0.1,92.96875,1589573679445,2020-05-15-US-MTV-1/GooglePixel4XL,train_ground_truth
1,Fix,GT,37.416619,-122.082065,,0.002198,0.1,92.969666,1589573680445,2020-05-15-US-MTV-1/GooglePixel4XL,train_ground_truth
2,Fix,GT,37.416619,-122.082065,,0.001414,0.1,92.96985,1589573681445,2020-05-15-US-MTV-1/GooglePixel4XL,train_ground_truth
3,Fix,GT,37.416619,-122.082065,,0.001414,0.1,92.96985,1589573682445,2020-05-15-US-MTV-1/GooglePixel4XL,train_ground_truth
4,Fix,GT,37.416619,-122.082065,,0.001414,0.1,92.96991,1589573683445,2020-05-15-US-MTV-1/GooglePixel4XL,train_ground_truth


In [101]:
baselines.head()

Unnamed: 0,tripId,UnixTimeMillis,LatitudeDegrees,LongitudeDegrees,group
0,2020-05-15-US-MTV-1/GooglePixel4XL,1589573679445,37.416664,-122.082013,train_baseline
1,2020-05-15-US-MTV-1/GooglePixel4XL,1589573680445,37.416576,-122.082059,train_baseline
2,2020-05-15-US-MTV-1/GooglePixel4XL,1589573681445,37.416519,-122.082083,train_baseline
3,2020-05-15-US-MTV-1/GooglePixel4XL,1589573682445,37.416542,-122.082077,train_baseline
4,2020-05-15-US-MTV-1/GooglePixel4XL,1589573683445,37.416538,-122.082078,train_baseline


In [102]:
Lat = combined["LatitudeDegrees"].to_numpy()
Long = combined["LongitudeDegrees"].to_numpy()

In [103]:
Lat_gt = gts["LatitudeDegrees"].to_numpy()
Long_gt = gts["LongitudeDegrees"].to_numpy()

# Normalization
Bringing Data to standard form so that is makes sense in the model

In [104]:
from pandas import Series
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from math import sqrt

In [105]:
# Latitude

series_Lat_train = Series(Lat.flatten())

series_Lat_gt = Series(Lat_gt.flatten())

values_train = series_Lat_train.values
values_train = values_train.reshape((len(values_train), 1))

print(values_train.shape)

values_gt = series_Lat_gt.values
values_gt = values_gt.reshape((len(values_gt), 1))

# train the normalization
scaler_lat = MinMaxScaler()
scaler_lat = scaler_lat.fit(values_train)

# normalize the dataset and print
standardized_lat_train = scaler_lat.transform(values_train)

standardized_lat_gt = scaler_lat.transform(values_gt)

# inverse transform and print
inversed_lat_train = scaler_lat.inverse_transform(standardized_lat_train)
inversed_lat_gt = scaler_lat.inverse_transform(standardized_lat_gt)

(361730, 1)


In [106]:
sub_df.shape

(66097, 5)

In [107]:
standardized_lat_train = standardized_lat_train.reshape(361730)
standardized_lat_train_only = standardized_lat_train[0:295633]
baselines["standardized_lat"] = standardized_lat_train_only

standardized_lat_sub = standardized_lat_train[295633:]

ss["standardized_lat"] = standardized_lat_sub
gts["standardized_lat"] = standardized_lat_gt.reshape(295633)

In [108]:
# Long

series_Long_train = Series(Long.flatten())

series_Long_gt = Series(Long_gt.flatten())

values_train_long = series_Long_train.values
values_train_long = values_train_long.reshape((len(values_train_long), 1))

print(values_train_long.shape)

values_gt_long = series_Long_gt.values
values_gt_long = values_gt_long.reshape((len(values_gt_long), 1))

# train the normalization
scaler_long = MinMaxScaler()
scaler_long = scaler_long.fit(values_train_long)

# normalize the dataset
standardized_long_train = scaler_long.transform(values_train_long)

standardized_long_gt = scaler_long.transform(values_gt_long)

# inverse transform
inversed_long_train = scaler_long.inverse_transform(standardized_long_train)

inversed_long_gt = scaler_long.inverse_transform(standardized_long_gt)

(361730, 1)


In [109]:
standardized_long_train = standardized_long_train.reshape(361730)
standardized_long_train_only = standardized_long_train[0:295633]
baselines["standardized_long"] = standardized_long_train_only

standardized_long_sub = standardized_long_train[295633:]
ss["standardized_long"] = standardized_long_sub

gts["standardized_long"] = standardized_long_gt.reshape(295633)

In [110]:
gts.tail()

Unnamed: 0,MessageType,Provider,LatitudeDegrees,LongitudeDegrees,AltitudeMeters,SpeedMps,AccuracyMeters,BearingDegrees,UnixTimeMillis,tripId,group,standardized_lat,standardized_long
1609,Fix,GT,37.41605,-122.08094,-26.379,0.006399,0.1,184.6556,1640724240000,2021-12-28-US-MTV-1/XiaomiMi8,train_ground_truth,0.976042,0.007121
1610,Fix,GT,37.41605,-122.08094,-26.379001,0.00583,0.1,184.6559,1640724241000,2021-12-28-US-MTV-1/XiaomiMi8,train_ground_truth,0.976042,0.007121
1611,Fix,GT,37.41605,-122.08094,-26.379,0.004472,0.1,184.65613,1640724242000,2021-12-28-US-MTV-1/XiaomiMi8,train_ground_truth,0.976042,0.007121
1612,Fix,GT,37.41605,-122.08094,-26.379001,0.003609,0.1,184.65622,1640724243000,2021-12-28-US-MTV-1/XiaomiMi8,train_ground_truth,0.976042,0.007121
1613,Fix,GT,37.41605,-122.08094,-26.379,0.004469,0.1,184.65636,1640724244000,2021-12-28-US-MTV-1/XiaomiMi8,train_ground_truth,0.976042,0.007121


In [111]:
baselines.tail()

Unnamed: 0,tripId,UnixTimeMillis,LatitudeDegrees,LongitudeDegrees,group,standardized_lat,standardized_long
1609,2021-12-28-US-MTV-1/XiaomiMi8,1640724240000,37.416021,-122.080936,train_baseline,0.97604,0.007121
1610,2021-12-28-US-MTV-1/XiaomiMi8,1640724241000,37.416016,-122.080935,train_baseline,0.97604,0.007121
1611,2021-12-28-US-MTV-1/XiaomiMi8,1640724242000,37.416027,-122.08094,train_baseline,0.97604,0.007121
1612,2021-12-28-US-MTV-1/XiaomiMi8,1640724243000,37.416036,-122.080944,train_baseline,0.976041,0.007121
1613,2021-12-28-US-MTV-1/XiaomiMi8,1640724244000,37.416031,-122.080947,train_baseline,0.97604,0.00712


In [112]:
ss.tail()

Unnamed: 0,tripId,UnixTimeMillis,LatitudeDegrees,LongitudeDegrees,standardized_lat,standardized_long
66092,2022-04-25-US-OAK-2/GooglePixel4,1650927742650,37.904611,-86.481078,0.990254,0.010799
66093,2022-04-25-US-OAK-2/GooglePixel4,1650927743642,37.904611,-86.481078,0.990256,0.010797
66094,2022-04-25-US-OAK-2/GooglePixel4,1650927744651,37.904611,-86.481078,0.990254,0.010799
66095,2022-04-25-US-OAK-2/GooglePixel4,1650927745640,37.904611,-86.481078,0.990255,0.0108
66096,2022-04-25-US-OAK-2/GooglePixel4,1650927746632,37.904611,-86.481078,0.990251,0.010803


# Reshaping the Data and Padding

In [113]:
max_length = np.amax(baselines["tripId"].value_counts())
desired_rows = max_length
desired_cols = 2
count = 0
for trip in baselines.tripId.unique():
    if count == 0:
        oneTrip = baselines.loc[
            (baselines["tripId"] == trip), ["standardized_lat", "standardized_long"]
        ]

        oneTrip = oneTrip.to_numpy()
        trainingPadded0 = np.pad(
            oneTrip,
            (
                (0, desired_rows - oneTrip.shape[0]),
                (0, desired_cols - oneTrip.shape[1]),
            ),
            "constant",
            constant_values=0,
        )
        count = 1
    elif count == 1:
        oneTrip = baselines.loc[
            (baselines["tripId"] == trip), ["standardized_lat", "standardized_long"]
        ]
        oneTrip = oneTrip.to_numpy()
        trainingPadded1 = np.pad(
            oneTrip,
            (
                (0, desired_rows - oneTrip.shape[0]),
                (0, desired_cols - oneTrip.shape[1]),
            ),
            "constant",
            constant_values=0,
        )
        trainingPadded = np.stack((trainingPadded0, trainingPadded1))
        count = 2
    else:
        oneTrip = baselines.loc[
            (baselines["tripId"] == trip), ["standardized_lat", "standardized_long"]
        ]
        oneTrip = oneTrip.to_numpy()
        oneTripPadded = np.pad(
            oneTrip,
            (
                (0, desired_rows - oneTrip.shape[0]),
                (0, desired_cols - oneTrip.shape[1]),
            ),
            "constant",
            constant_values=0,
        )

        trainingPadded = np.append(trainingPadded, [oneTripPadded], axis=0)

print(trainingPadded.shape)

(170, 3362, 2)


In [114]:
max_length = np.amax(gts["tripId"].value_counts())
desired_rows = max_length
desired_cols = 2
count = 0
for trip in gts.tripId.unique():
    if count == 0:
        oneTrip = gts.loc[
            (gts["tripId"] == trip), ["standardized_lat", "standardized_long"]
        ]

        oneTrip = oneTrip.to_numpy()
        gtsPadded0 = np.pad(
            oneTrip,
            (
                (0, desired_rows - oneTrip.shape[0]),
                (0, desired_cols - oneTrip.shape[1]),
            ),
            "constant",
            constant_values=0,
        )
        count = 1
    elif count == 1:
        oneTrip = gts.loc[
            (gts["tripId"] == trip), ["standardized_lat", "standardized_long"]
        ]
        oneTrip = oneTrip.to_numpy()
        gtsPadded1 = np.pad(
            oneTrip,
            (
                (0, desired_rows - oneTrip.shape[0]),
                (0, desired_cols - oneTrip.shape[1]),
            ),
            "constant",
            constant_values=0,
        )
        gtsPadded = np.stack((gtsPadded0, gtsPadded1))
        count = 2
    else:
        oneTrip = gts.loc[
            (gts["tripId"] == trip), ["standardized_lat", "standardized_long"]
        ]
        oneTrip = oneTrip.to_numpy()
        oneTripPadded = np.pad(
            oneTrip,
            (
                (0, desired_rows - oneTrip.shape[0]),
                (0, desired_cols - oneTrip.shape[1]),
            ),
            "constant",
            constant_values=0,
        )

        # print(oneTrip.shape)
        # print(oneTripPadded.shape)
        gtsPadded = np.append(gtsPadded, [oneTripPadded], axis=0)

print(gtsPadded.shape)

(170, 3362, 2)


In [115]:
max_length = np.amax(ss["tripId"].value_counts())
desired_rows = max_length
desired_cols = 2
count = 0
for trip in ss.tripId.unique():
    if count == 0:
        oneTrip = ss.loc[
            (ss["tripId"] == trip), ["standardized_lat", "standardized_long"]
        ]

        oneTrip = oneTrip.to_numpy()
        ssPadded0 = np.pad(
            oneTrip,
            (
                (0, desired_rows - oneTrip.shape[0]),
                (0, desired_cols - oneTrip.shape[1]),
            ),
            "constant",
            constant_values=0,
        )
        count = 1
    elif count == 1:
        oneTrip = ss.loc[
            (ss["tripId"] == trip), ["standardized_lat", "standardized_long"]
        ]
        oneTrip = oneTrip.to_numpy()
        ssPadded1 = np.pad(
            oneTrip,
            (
                (0, desired_rows - oneTrip.shape[0]),
                (0, desired_cols - oneTrip.shape[1]),
            ),
            "constant",
            constant_values=0,
        )
        ssPadded = np.stack((ssPadded0, ssPadded1))
        count = 2
    else:
        oneTrip = ss.loc[
            (ss["tripId"] == trip), ["standardized_lat", "standardized_long"]
        ]
        oneTrip = oneTrip.to_numpy()
        oneTripPadded = np.pad(
            oneTrip,
            (
                (0, desired_rows - oneTrip.shape[0]),
                (0, desired_cols - oneTrip.shape[1]),
            ),
            "constant",
            constant_values=0,
        )

        ssPadded = np.append(ssPadded, [oneTripPadded], axis=0)

print(ssPadded.shape)

(36, 4514, 2)


# Training

In [116]:
from keras.models import Sequential
from keras.models import Model

from keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import RMSprop, Adam
from keras.metrics import MeanSquaredError
from keras.layers import Dense, Dropout
from keras.layers import LSTM, Masking, Bidirectional

from tensorflow.keras.layers import Input

In [169]:
number_of_features = 2  # just feeding the gnss longitude and latitude
batch_size = 340
time_steps = 3362

input = Input(shape=(None, number_of_features))

masking = Masking(mask_value=0.0)(input)

# Bidrectional layer
Bidirectional_1 = Bidirectional(
    LSTM(number_of_features, return_sequences=True), merge_mode="sum"
)(masking)

# Droupout Layer
Dropout_1 = Dropout(0.2)(Bidirectional_1)
out = Dense(number_of_features, activation="sigmoid")(Dropout_1)

model = Model(inputs=input, outputs=out)
model.compile(
    loss="MeanSquaredError",
    optimizer=Adam(learning_rate=0.0005, decay=1e-3),
    metrics=["MeanSquaredError"],
)



In [170]:
# fit the model to the padded training data
model.fit(
    trainingPadded,
    gtsPadded,
    epochs=20,
    batch_size=17,
    verbose=1,
    shuffle=True,
    validation_split=0.1,
)

Epoch 1/20


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 977ms/step - MeanSquaredError: 0.2893 - loss: 0.3250 - val_MeanSquaredError: 0.2565 - val_loss: 0.2669
Epoch 2/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 992ms/step - MeanSquaredError: 0.2855 - loss: 0.3174 - val_MeanSquaredError: 0.2538 - val_loss: 0.2599
Epoch 3/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1s/step - MeanSquaredError: 0.2818 - loss: 0.3086 - val_MeanSquaredError: 0.2512 - val_loss: 0.2530
Epoch 4/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1s/step - MeanSquaredError: 0.2772 - loss: 0.3027 - val_MeanSquaredError: 0.2486 - val_loss: 0.2463
Epoch 5/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1s/step - MeanSquaredError: 0.2743 - loss: 0.2959 - val_MeanSquaredError: 0.2461 - val_loss: 0.2398
Epoch 6/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1s/step - MeanSquaredError: 0.2701 - loss: 0.2871 - val_MeanSquar

<keras.src.callbacks.history.History at 0x2ae6220d570>

#### Prediction

In [171]:
result = model.predict(trainingPadded)
print(result.shape)

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 616ms/step
(170, 3362, 2)


In [172]:
result_sub = model.predict(ssPadded)
print(ssPadded.shape)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 677ms/step
(36, 4514, 2)


In [173]:
result_sub

array([[[0.61520404, 0.45021436],
        [0.5960958 , 0.46831927],
        [0.57934266, 0.48372155],
        ...,
        [0.521031  , 0.4780509 ],
        [0.521031  , 0.4780509 ],
        [0.521031  , 0.4780509 ]],

       [[0.6153367 , 0.45025736],
        [0.5961171 , 0.4684258 ],
        [0.5792877 , 0.48387495],
        ...,
        [0.521031  , 0.4780509 ],
        [0.521031  , 0.4780509 ],
        [0.521031  , 0.4780509 ]],

       [[0.61517155, 0.45019984],
        [0.59608895, 0.468287  ],
        [0.57935065, 0.48367563],
        ...,
        [0.521031  , 0.4780509 ],
        [0.521031  , 0.4780509 ],
        [0.521031  , 0.4780509 ]],

       ...,

       [[0.6154397 , 0.4502573 ],
        [0.5961209 , 0.46845597],
        [0.57919896, 0.4839241 ],
        ...,
        [0.521031  , 0.4780509 ],
        [0.521031  , 0.4780509 ],
        [0.521031  , 0.4780509 ]],

       [[0.61581695, 0.45039058],
        [0.5961787 , 0.46877784],
        [0.57904917, 0.4843849 ],
        .

In [174]:
count = 0
predicted_lat = []
predicted_long = []
for trip in baselines.tripId.unique():
    timestamps = (baselines["tripId"] == trip).sum()
    for x in range(timestamps):
        predicted_lat.append(result[count][x][0])
        predicted_long.append(result[count][x][1])
    count = count + 1

In [175]:
np_predicted = np.array(predicted_lat)
series_np_predicted = Series(np_predicted.flatten())
values_np_predicted = series_np_predicted.values
values_np_predicted = values_np_predicted.reshape((len(values_np_predicted), 1))
inversed_np_predicted = scaler_lat.inverse_transform(values_np_predicted)
baselines["predicted_lat"] = inversed_np_predicted

np_predicted = np.array(predicted_long)
series_np_predicted = Series(np_predicted.flatten())
values_np_predicted = series_np_predicted.values
values_np_predicted = values_np_predicted.reshape((len(values_np_predicted), 1))
inversed_np_predicted = scaler_long.inverse_transform(values_np_predicted)
baselines["predicted_long"] = inversed_np_predicted

In [176]:
predicted_baseline = baselines[["tripId", "predicted_lat", "predicted_long"]].copy()
predicted_baseline.rename(columns={"predicted_lat": "LatitudeDegrees"}, inplace=True)
predicted_baseline.rename(columns={"predicted_long": "LongitudeDegrees"}, inplace=True)

scores = []
for tripID in predicted_baseline["tripId"].unique():
    score = calc_score(tripID, predicted_baseline, gts)
    scores.append(score)

mean_score = np.mean(scores)
print(f"mean_score = {mean_score:.3f}")

mean_score = 2400645.585


In [177]:
count = 0
predicted_lat = []
predicted_long = []
for trip in ss.tripId.unique():
    timestamps = (ss["tripId"] == trip).sum()
    for x in range(timestamps):
        predicted_lat.append(result_sub[count][x][0])
        predicted_long.append(result_sub[count][x][1])
    count = count + 1

In [178]:
np_predicted = np.array(predicted_lat)
series_np_predicted = Series(np_predicted.flatten())
values_np_predicted = series_np_predicted.values
values_np_predicted = values_np_predicted.reshape((len(values_np_predicted), 1))
inversed_np_predicted = scaler_lat.inverse_transform(values_np_predicted)
ss["LatitudeDegrees"] = inversed_np_predicted

np_predicted = np.array(predicted_long)
series_np_predicted = Series(np_predicted.flatten())
values_np_predicted = series_np_predicted.values
values_np_predicted = values_np_predicted.reshape((len(values_np_predicted), 1))
inversed_np_predicted = scaler_long.inverse_transform(values_np_predicted)
ss["LongitudeDegrees"] = inversed_np_predicted

In [179]:
ss.head()

Unnamed: 0,tripId,UnixTimeMillis,LatitudeDegrees,LongitudeDegrees
0,2021-04-28-US-MTV-2/SamsungGalaxyS20Ultra,1619650832999,31.299097,-100.161339
1,2021-04-28-US-MTV-2/SamsungGalaxyS20Ultra,1619650833999,30.97517,-99.265694
2,2021-04-28-US-MTV-2/SamsungGalaxyS20Ultra,1619650834999,30.69117,-98.503754
3,2021-04-28-US-MTV-2/SamsungGalaxyS20Ultra,1619650835999,30.542473,-98.006615
4,2021-04-28-US-MTV-2/SamsungGalaxyS20Ultra,1619650836999,30.48926,-97.725548


In [180]:
ss.drop(["standardized_lat", "standardized_long"], axis=1, inplace=True)

KeyError: "['standardized_lat', 'standardized_long'] not found in axis"

In [None]:
ss.head()  # updated with our predictions

Unnamed: 0,tripId,UnixTimeMillis,LatitudeDegrees,LongitudeDegrees
0,2021-04-28-US-MTV-2/SamsungGalaxyS20Ultra,1619650832999,30.581749,-103.071983
1,2021-04-28-US-MTV-2/SamsungGalaxyS20Ultra,1619650833999,30.406307,-102.302666
2,2021-04-28-US-MTV-2/SamsungGalaxyS20Ultra,1619650834999,30.217392,-101.774338
3,2021-04-28-US-MTV-2/SamsungGalaxyS20Ultra,1619650835999,30.069365,-101.545792
4,2021-04-28-US-MTV-2/SamsungGalaxyS20Ultra,1619650836999,29.959482,-101.483612


In [None]:
ss.reset_index(drop=True)[ss.columns].to_csv("submission.csv", index=False)