# Unzip training data on Google Drive

Import Google Drive module

In [1]:
from google.colab import drive

Mount the Google Drive that contains the training data

In [2]:
# Mount Google Drive at /gdrive so the later cells can reach the
# training data under /gdrive/MyDrive.
drive.mount("/gdrive")

Mounted at /gdrive


Create a symbolic link (shortcut) to the IndoorNav data from Kaggle

In [3]:
# -f/-n replace an existing /content/data link instead of failing or nesting a
# second link inside the Drive folder when this cell is re-run.
!ln -sfn "/gdrive/MyDrive/IndoorNav" "/content/data"

Unzip the archived training data

In [None]:
# -q keeps the per-file listing out of the notebook output; -n skips files that
# already exist so a re-run resumes instead of stopping to prompt about
# overwriting. NOTE(review): a file left truncated by an interrupted run is also
# skipped by -n; delete it by hand (or use -o) if that happens.
!unzip -q -n data/train.zip -d data/train

Flush pending writes and unmount the drive so that Colab finishes syncing the extracted files to Google Drive

In [5]:
# Flush and unmount the Drive mount once extraction is done, so the files
# unzipped above are synced to Drive.
drive.flush_and_unmount()

# Connect to Google Drive

Import Google Drive module

In [1]:
from google.colab import drive

Mount the Google Drive that contains the training data

In [2]:
# Mount Google Drive at /gdrive so the cells below can reach the
# training data under /gdrive/MyDrive.
drive.mount("/gdrive")

Mounted at /gdrive


Create a symbolic link (shortcut) to the IndoorNav data from Kaggle

In [3]:
# -f/-n replace an existing /content/data link instead of failing or nesting a
# second link inside the Drive folder when this cell is re-run.
!ln -sfn "/gdrive/MyDrive/IndoorNav" "/content/data"

# BuildObs Function

In [1]:
import pandas as pd
import re

def _parseFloor(floorID: str) -> str:
    """Translate a floor folder name into a floor index string.

    "F1" / "1F" -> "0", "F2" -> "1", "B1" / "1B" -> "-1".  Names without a
    "B" or "F" marker keep the default "0".  When both markers are present
    the "F" rule wins, because it is applied last.

    NOTE(review): names that carry a marker but no digits (e.g. "B") also
    fall back to "0" instead of raising; confirm that is the intended floor.
    """
    floor = "0"
    digits = re.findall(r"\d+", floorID)
    if not digits:
        return floor

    level = int(digits[0])
    if "B" in floorID:
        floor = str(level * -1)
    if "F" in floorID:
        floor = str(level - 1)
    return floor


def buildObs(inFName: str, floorID: str) -> pd.DataFrame:
    """Parse one raw trace file into a DataFrame of timestamped observations.

    Each TYPE_ACCELEROMETER, TYPE_WIFI and TYPE_BEACON line produces one row.
    Every row carries the floor derived from ``floorID`` and the most recent
    TYPE_WAYPOINT position seen so far (0.0, 0.0 before the first waypoint).
    Fields a row does not provide are left missing.

    Args:
        inFName: Path of the whitespace-separated trace file to read.
        floorID: Floor folder name such as "F1", "1F" or "B1".

    Returns:
        A DataFrame whose first columns are the fixed list in ``names``;
        any further fields (rot_1..rot_3) follow in order of appearance.
        Columns that never received a value have object dtype.

    Raises:
        IndexError / ValueError: if a recognised line has fewer fields than
            expected or a numeric field cannot be parsed.
    """
    names = ["ts",
             "floor",
             "waypoint_x",
             "waypoint_y",
             "accel_1",
             "accel_2",
             "accel_3",
             "magnet_1",
             "magnet_2",
             "magnet_3",
             "gyro_1",
             "gyro_2",
             "gyro_3",
             "wifi_1",
             "wifi_2",
             "wifi_3",
             "wifi_4",
             "wifi_5",
             "beacon_1",
             "beacon_2",
             "beacon_3",
             "beacon_4",
             "beacon_5",
             "beacon_6",
             "beacon_7"
             ]

    # Set the floor where 1F or F1 = 0, F2 = 1, and 1B or B1 = -1
    floor = _parseFloor(floorID)

    # Read everything up front and close the file straight away.
    with open(inFName, "r") as inFile:
        inFileLines = inFile.readlines()

    waypoint_x = 0.0
    waypoint_y = 0.0

    # Rows are collected as plain dicts and turned into a DataFrame once at
    # the end; growing a DataFrame row by row is quadratic, and
    # DataFrame.append no longer exists in pandas 2.x.
    records = []

    # Create rows of data for each observation by timestamp
    for i, line in enumerate(inFileLines):

        splitLine = line.split()

        # Blank lines carry no record type.
        if len(splitLine) < 2:
            continue

        lineType = splitLine[1]

        if lineType == "TYPE_WAYPOINT":

            waypoint_x = float(splitLine[2])
            waypoint_y = float(splitLine[3])

        elif lineType == "TYPE_ACCELEROMETER":

            # The three lines after an accelerometer reading are read
            # positionally as magnetometer, gyroscope and rotation values.
            # NOTE(review): their record types are not checked - confirm
            # the trace format guarantees this order.
            if i + 3 >= len(inFileLines):
                # Sensor group cut off by the end of the file.
                continue

            magnetLine = inFileLines[i + 1].split()
            gyroLine = inFileLines[i + 2].split()
            rotLine = inFileLines[i + 3].split()

            # The companion lines are visited again by the loop; they are
            # ignored there as long as their type matches none of the
            # branches handled here.
            records.append({
                # Timestamp is taken from the last line of the group.
                "ts": float(rotLine[0]),
                "floor": floor,
                "waypoint_x": waypoint_x,
                "waypoint_y": waypoint_y,
                "accel_1": float(splitLine[2]),
                "accel_2": float(splitLine[3]),
                "accel_3": float(splitLine[4]),
                "magnet_1": float(magnetLine[2]),
                "magnet_2": float(magnetLine[3]),
                "magnet_3": float(magnetLine[4]),
                "gyro_1": float(gyroLine[2]),
                "gyro_2": float(gyroLine[3]),
                "gyro_3": float(gyroLine[4]),
                "rot_1": float(rotLine[2]),
                "rot_2": float(rotLine[3]),
                "rot_3": float(rotLine[4])})

        elif lineType == "TYPE_WIFI":

            records.append({
                "ts": float(splitLine[0]),
                "floor": floor,
                "waypoint_x": waypoint_x,
                "waypoint_y": waypoint_y,
                "wifi_1": str(splitLine[2]),
                "wifi_2": str(splitLine[3]),
                "wifi_3": float(splitLine[4]),
                "wifi_4": float(splitLine[5]),
                "wifi_5": float(splitLine[6])})

        elif lineType == "TYPE_BEACON":

            records.append({
                "ts": float(splitLine[0]),
                "floor": floor,
                "waypoint_x": waypoint_x,
                "waypoint_y": waypoint_y,
                "beacon_1": str(splitLine[2]),
                "beacon_2": str(splitLine[3]),
                "beacon_3": str(splitLine[4]),
                "beacon_4": float(splitLine[5]),
                "beacon_5": float(splitLine[6]),
                "beacon_6": float(splitLine[7]),
                "beacon_7": str(splitLine[8])})

    dFrame = pd.DataFrame(records)

    # Fixed columns first, then any extra fields (rot_*) in order of appearance.
    extraCols = [col for col in dFrame.columns if col not in names]
    unusedCols = [col for col in names if col not in dFrame.columns]
    dFrame = dFrame.reindex(columns=names + extraCols)

    # Columns that never received a value keep object dtype, as they would in
    # an empty pd.DataFrame(columns=names).
    if unusedCols:
        dFrame = dFrame.astype({col: object for col in unusedCols})

    return dFrame

# CleanObs Function

In [2]:
import pandas as pd
import numpy as np

def cleanObs(dFrame: pd.DataFrame) -> pd.DataFrame:
    """Fill the missing values of an observation frame.

    Numeric columns are linearly interpolated in both directions.  Any value
    still missing afterwards is replaced by the column mean for float64
    columns and by the string "NULL" for every other column.  A float64
    column with no values at all has no mean and therefore stays empty.

    Args:
        dFrame: Observation frame; it is not modified.

    Returns:
        A new DataFrame with the same columns and the gaps filled.
    """
    cleaned = dFrame.copy()

    # Only numeric columns can be interpolated; calling interpolate on the
    # whole frame trips over the string (object) columns.
    numericCols = set(cleaned.select_dtypes(include="number").columns)

    for column in cleaned.columns:

        series = cleaned[column]

        if column in numericCols:

            # perform bidirectional linear interpolation to fill in missing values
            series = series.interpolate(limit_direction="both")

        if series.dtype == "float64":

            # if there are any leftover missing float values replace them with the mean
            series = series.fillna(series.mean())

        else:

            # if there are any missing string values replace them with NULL
            series = series.fillna("NULL")

        cleaned[column] = series

    return cleaned

# Main

In [None]:
import pandas as pd
import random as rnd

from os import listdir
from os import remove
from os.path import isfile

from os import makedirs
from os.path import isdir

path = "data/train"
processedDir = path + "/_processed"

# One site folder name per line; a listed site has already been processed.
obsProgFile = "data/obsProg.txt"

# Number of trace files sampled from each floor folder.
sampleSize = 5

# Load the progress log, treating a missing file as "nothing processed yet".
try:
    with open(obsProgFile, "r") as progFile:
        obsProgDone = set(progFile.read().splitlines())
except FileNotFoundError:
    obsProgDone = set()

makedirs(processedDir, exist_ok=True)

for site in listdir(path):

    sitePath = path + "/" + site

    # The output folder lives inside `path`; skip it and any stray files.
    if site == "_processed" or not isdir(sitePath):
        continue

    if site in obsProgDone:
        continue

    for floor in listdir(sitePath):

        floorPath = sitePath + "/" + floor

        if not isdir(floorPath):
            continue

        traceFiles = [name for name in listdir(floorPath) if name.endswith(".txt")]

        # take a random sampling of (up to) five trace files from each folder
        for fileName in rnd.sample(traceFiles, min(sampleSize, len(traceFiles))):

            fullName = floorPath + "/" + fileName
            # Swap only the extension, not every "txt" in the path.
            newName = processedDir + "/" + fileName[:-len(".txt")] + ".csv"

            print("Processing " + fullName + ":")
            print("Building observations...")
            dFrame = buildObs(fullName, floor)

            print("Cleaning observations...")
            dFrame = cleanObs(dFrame)

            print("Writing " + newName + "...")
            dFrame.to_csv(newName, index=False)

            print("Complete.\n")

    # Record the site only after all of its floors succeeded, so a run that
    # is interrupted half-way retries the site instead of skipping it.
    with open(obsProgFile, "a") as progFile:
        progFile.write(site + "\n")


Processing data/train/5a0546857ecc773753327266/B1/5e15bf8ef4c3420006d5233f.txt:
Building observations...
Cleaning observations...
Writing data/train/_processed/5e15bf8ef4c3420006d5233f.csv...
Complete.

Processing data/train/5a0546857ecc773753327266/B1/5e1581d51506f2000638fc80.txt:
Building observations...
Cleaning observations...
Writing data/train/_processed/5e1581d51506f2000638fc80.csv...
Complete.

Processing data/train/5a0546857ecc773753327266/B1/5e1580b41506f2000638fc5c.txt:
Building observations...
Cleaning observations...
Writing data/train/_processed/5e1580b41506f2000638fc5c.csv...
Complete.

Processing data/train/5a0546857ecc773753327266/B1/5e1581cbf4c3420006d52101.txt:
Building observations...
Cleaning observations...
Writing data/train/_processed/5e1581cbf4c3420006d52101.csv...
Complete.

Processing data/train/5a0546857ecc773753327266/B1/5e158ef61506f2000638fd1f.txt:
Building observations...
Cleaning observations...
Writing data/train/_processed/5e158ef61506f2000638fd1f.csv