# Prepocessing of files for the model

In [26]:
%pip install -U kaleido
%pip list | grep kaleido
%pip install pydot

Note: you may need to restart the kernel to use updated packages.
kaleido                   0.2.1
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [27]:
# Utility functions and definitions

import os
import plotly.graph_objects as go
import plotly.io as pio
from PIL import Image
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import json


train_dir = "/Users/lina/Desktop/ba-implementation/train/5d27075f03f801723c2e360f/F1/"
output_dir = "/Users/lina/Desktop/ba-implementation/train/5d27075f03f801723c2e360f/F1_filtered/"
filtered_dir = "/Users/lina/Desktop/ba-implementation/train/5d27075f03f801723c2e360f/F1_filtered/"

pio.kaleido.scope.default_format = "pdf"

def get_file(file_name):
    return output_dir + os.path.splitext(os.path.basename(file_name))[0]

if not os.path.exists(output_dir + "images"):
    os.mkdir(output_dir + "images")


# Modified version of: https://github.com/location-competition/indoor-location-competition-20/blob/master/visualize_f.py
def visualize_trajectory(trajectory, floor_plan_filename, width_meter, height_meter, title, filename, mode='lines + markers + text', show=False):
    fig = go.Figure()

    size_list = [6] * trajectory.shape[0]
    size_list[0] = 10
    size_list[-1] = 10

    color_list = ['rgba(4, 174, 4, 0.5)'] * trajectory.shape[0]
    color_list[0] = 'rgba(12, 5, 235, 1)'
    color_list[-1] = 'rgba(235, 5, 5, 1)'

    position_count = {}
    text_list = []
    for i in range(trajectory.shape[0]):
        if str(trajectory[i]) in position_count:
            position_count[str(trajectory[i])] += 1
        else:
            position_count[str(trajectory[i])] = 0
        text_list.append('        ' * position_count[str(trajectory[i])] + f'{i}')

    fig.add_trace(
        go.Scattergl(
            x=trajectory[:, 0],
            y=trajectory[:, 1],
            mode=mode,
            marker=dict(size=size_list, color=color_list),
            line=dict(shape='linear', color='rgb(100, 10, 100)', width=2, dash='dot'),
            text=[],
            textposition="top center",
            name='trajectory',
        ))

    floor_plan = Image.open(floor_plan_filename)
    fig.update_layout(images=[
        go.layout.Image(
            source=floor_plan,
            xref="x",
            yref="y",
            x=0,
            y=height_meter,
            sizex=width_meter,
            sizey=height_meter,
            sizing="contain",
            opacity=1,
            layer="below",
        )
    ])

    fig.update_xaxes(title="x in meters", autorange=False, range=[0, width_meter])
    fig.update_yaxes(title="y in meters", autorange=False, range=[0, height_meter], scaleanchor="x", scaleratio=1)
    fig.update_layout(
        title=go.layout.Title(
            text=title,
            
            xref="paper",
            x=0,
        ),
        autosize=True,
        width=900,
        height=200 + 900 * height_meter / width_meter,
        template="plotly_white",
    )
    
    fig.write_image(output_dir + "images/" + filename + ".pdf")

    if show:
        fig.show()

    return fig


In [29]:
# mainly copied from https://github.com/ttvand/Indoor-Location-Navigation-Public/blob/master/src/utils.py
class ReadData:
  acce: np.ndarray
  wifi: np.ndarray
  waypoint: np.ndarray

def read_data_file(file_path):
    acce = []
    wifi = []
    waypoint = []

    with open(file_path, 'r') as f:
        lines = f.readlines()

    added_break_lines = []
    for line_txt in lines:
        line_data = line_txt.strip()

        if not line_data or line_data[0] == "#":
            continue
        
        type_count = line_data.count("TYPE_")
        if type_count == 1:
            added_break_lines.append(line_txt)
        elif type_count == 0:
            raise ValueError("This should not happen")
        else:
            type_positions = [m.start() for m in re.finditer("TYPE_", line_data)]
            start_pos = 0
            for p_id, p in enumerate(type_positions[1:]):
                end_pos = p - 14
                added_break_lines.append(line_data[start_pos:end_pos])
                start_pos = p - 14

            added_break_lines.append(line_data[start_pos:])

    lines = added_break_lines

    unique_times = []
    for line_id, line_data in enumerate(lines):
        line_data = line_data.strip()

        if not line_data or line_data[0] == "#":
            continue

        line_data = line_data.split("\t")

        new_time = not unique_times or unique_times[-1] != line_data[0]
        if new_time:
            unique_times.append(line_data[0])
        
        if line_data[1] == "TYPE_ACCELEROMETER":
            acce.append([
                int(line_data[0]),
                float(line_data[2]),
                float(line_data[3]),
                float(line_data[4]),
            ])
            continue

        if line_data[1] == "TYPE_WIFI":
            wifi.append([int(line_data[0]),
                         line_data[2],
                         line_data[3],
                         int(line_data[4]),
            ])
            continue

        if line_data[1] == "TYPE_WAYPOINT":
            waypoint.append(
                [int(line_data[0]),
                float(line_data[2]),
                float(line_data[3])])
    
    acce = np.array(acce)
    wifi = np.array(wifi)
    waypoint = np.array(waypoint)

    file_name = get_file(file)

    np.save(file_name  + '_acce.npy', acce)
    np.save(file_name + '_wifi.npy', wifi)
    np.save(file_name + '_waypoint.npy', waypoint)


for file in os.listdir(train_dir):
    file = os.path.join(train_dir, file)
    if not os.path.isdir(file) and not file.endswith(".DS_Store") and file.startswith("aggregated.txt"):
        read_data_file(file)

In [31]:
# use linear interpolation the waypoint x and y values and acceleration x, y and z to get the corresponding values for each WiFi timestamp

def interpolate_waypoints(npy_waypoint_file, npy_wifi_file, npy_acce_file):
    waypoints = np.load(npy_waypoint_file)
    wifi_data = np.load(npy_wifi_file)
    acce_data = np.load(npy_acce_file)

    timestamps = waypoints[:, 0].astype(int)
    x_coordinates = waypoints[:, 1]
    y_coordinates = waypoints[:, 2]
    timestamps_acce = acce_data[:, 0].astype(int)
    acce_x = acce_data[:, 1]
    acce_y = acce_data[:, 2]
    acce_z = acce_data[:, 3]
    wifi_timestamps = wifi_data[:, 0].astype(int)

    x_coordinates_interpolated = np.interp(wifi_timestamps, timestamps, x_coordinates)
    y_coordinates_interpolated = np.interp(wifi_timestamps, timestamps, y_coordinates)
    acce_x_interpolated = np.interp(wifi_timestamps, timestamps_acce, acce_x)
    acce_y_interpolated = np.interp(wifi_timestamps, timestamps_acce, acce_y)
    acce_z_interpolated = np.interp(wifi_timestamps, timestamps_acce, acce_z)

    interpolated_waypoints = np.stack((wifi_timestamps, x_coordinates_interpolated, y_coordinates_interpolated, acce_x_interpolated, acce_y_interpolated, acce_z_interpolated), axis=-1)

    return interpolated_waypoints

for file_name in os.listdir(filtered_dir):
    if not file_name.startswith('floor'):
        if file_name.endswith('_waypoint.npy'):
            waypoint_file_name = filtered_dir + file_name
            file_prefix = os.path.basename(file_name)[0:24]
            npy_wifi_file_name = filtered_dir + file_prefix + '_wifi.npy'
            npy_acce_file_name = filtered_dir + file_prefix + '_acce.npy'
            interpolated_waypoints = interpolate_waypoints(waypoint_file_name, npy_wifi_file_name, npy_acce_file_name)

            np.save(filtered_dir + file_prefix + '_interpolated_waypoints_acce.npy', interpolated_waypoints)        
            df = pd.DataFrame(interpolated_waypoints, columns=['timestamp', 'x', 'y', 'acce_x', 'acce_y', 'acce_z'])
            df.to_csv(filtered_dir + file_prefix + '_interpolated_waypoints_acce.csv', index=False)



In [32]:
def concat_all_files_wo_interpolation(dir):
    waypoints = []
    
    for file in os.listdir(dir):
        file = os.path.join(dir, file)
        if not file.startswith("floor_"):
            if file.endswith("_waypoint.npy"):
                data = np.load(file)
                waypoints.append(data)
    waypoints = np.concatenate(waypoints, axis=0)
    
    # Remove duplicates based on timestamp (first column)
    _, unique_indices = np.unique(waypoints[:, 0], return_index=True)
    waypoints = waypoints[unique_indices]
    waypoints = waypoints[waypoints[:, 0].argsort()]

    np.save(filtered_dir + 'floor_waypoints.npy', waypoints)
    df = pd.DataFrame(waypoints, columns=['timestamp', 'x', 'y'])
    df.to_csv(dir + 'floor_waypoints.csv', index=False)

def concat_all_files_interpolation(dir):
    waypoints = []
    
    for file in os.listdir(dir):
        file = os.path.join(dir, file)
        if not file.startswith("floor"):
            if file.endswith("_interpolated_waypoints.npy"):
                waypoints.append(np.load(file))
    waypoints = np.concatenate(waypoints, axis=0)
    
    # Remove duplicates based on timestamp (first column)
    _, unique_indices = np.unique(waypoints[:, 0], return_index=True)
    waypoints = waypoints[unique_indices]
    waypoints = waypoints[waypoints[:, 0].argsort()]

    np.save(filtered_dir + 'floor_interpolated_waypoints.npy', waypoints)
    df = pd.DataFrame(waypoints, columns=['timestamp', 'x', 'y'])
    df.to_csv(dir + 'floor_interpolated_waypoints.csv', index=False)


def concat_all_files_interpolation_acce(dir):
    waypoints = []
    
    for file in os.listdir(dir):
        file = os.path.join(dir, file)
        if not file.startswith("floor"):
            if file.endswith("_interpolated_waypoints_acce.npy"):
                data = np.load(file)
                waypoints.append(data)
    waypoints = np.concatenate(waypoints, axis=0)
    
    # Remove duplicates based on timestamp (first column)
    _, unique_indices = np.unique(waypoints[:, 0], return_index=True)
    waypoints = waypoints[unique_indices]
    waypoints = waypoints[waypoints[:, 0].argsort()]

    np.save(filtered_dir + 'floor_interpolated_waypoints_acce.npy', waypoints)
    df = pd.DataFrame(waypoints, columns=['timestamp', 'x', 'y', 'x_acce', 'y_acce', 'z_acce'])
    df.to_csv(dir + 'floor_interpolated_waypoints_acce.csv', index=False)


concat_all_files_wo_interpolation(filtered_dir)
concat_all_files_interpolation(filtered_dir)
concat_all_files_interpolation_acce(filtered_dir)

In [33]:
site = '/Users/lina/Desktop/ba-implementation/metadata/5d27075f03f801723c2e360f/F1/'
file_image = site + 'floor_image.png'
json_plan_filename = site + 'floor_info.json'

with open(json_plan_filename) as json_file:
    json_data = json.load(json_file)
    
width_meter = json_data["map_info"]["width"]
height_meter = json_data["map_info"]["height"]
for file in os.listdir(filtered_dir):
    
    if file.endswith("floor_interpolated_waypoints.npy"):
        interpolated_data = np.load(filtered_dir + file)
        interpolated_data = interpolated_data[:, 1:]
        file = os.path.join(filtered_dir, file)
        filename = "whole_floor_visualization_interpolated"
        floor_plan_filename = site + 'floor_image.png'

        visualize_trajectory(trajectory=interpolated_data, 
                            floor_plan_filename=floor_plan_filename, 
                            width_meter=width_meter, 
                            height_meter=height_meter,
                            show=True,
                            filename=filename,
                            title="Visualization of waypoints with linear interpolation of floor F1 of shopping mall <br>in Yintai City (Chengxi Branch)")
    
    if file.endswith("floor_waypoints.npy"):
        interpolated_data = np.load(filtered_dir + file)
        interpolated_data = interpolated_data[:, 1:]
        file = os.path.join(filtered_dir, file)
        filename = "whole_floor_visualization_wo_interpolated"
        floor_plan_filename = site + 'floor_image.png'

        visualize_trajectory(trajectory=interpolated_data, 
                            floor_plan_filename=floor_plan_filename, 
                            width_meter=width_meter, 
                            height_meter=height_meter,
                            show=True,
                            filename=filename,
                            title="Visualization of floor F1 of shopping mall in Yintai City (Chengxi Branch)")

In [35]:
def split_metric_interpolated_waypoints_acce(dir):
    interpolated_waypoints = np.load(dir + 'floor_interpolated_waypoints_acce.npy')
    x_coordinates = interpolated_waypoints[:, 1]
    y_coordinates = interpolated_waypoints[:, 2]
    x_acce = interpolated_waypoints[:, 3]
    y_acce = interpolated_waypoints[:, 4]
    z_acce = interpolated_waypoints[:, 5]

    # Calculate the distance between consecutive coordinates
    distance = np.sqrt(np.diff(x_coordinates) ** 2 + np.diff(y_coordinates) ** 2)

    # Create a list of tuples (x1, y1, x2, y2, distance)
    distance_list = list(zip(x_coordinates[:-1], y_coordinates[:-1], x_coordinates[1:], y_coordinates[1:], distance))
    print(distance_list)

    # Split the interpolated waypoints into different files based on the distance
    interpolated_waypoints_list = np.split(interpolated_waypoints, np.where(distance > 5.57)[0] + 1)

    # Save the interpolated waypoints in different files
    for i, interpolated_waypoints in enumerate(interpolated_waypoints_list):
        np.save(dir + f'floor_metric_interpolated_waypoints_acce_{i}.npy', interpolated_waypoints)

        df = pd.DataFrame(interpolated_waypoints, columns=['timestamp', 'x', 'y', 'x_acce', 'y_acce', 'z_acce'])
        df.to_csv(dir + f'floor_metric_interpolated_waypoints_acce_{i}.csv', index=False)

split_metric_interpolated_waypoints_acce(filtered_dir)

[(7.33606733966127, 81.96129604876232, 4.756239157174763, 81.86744759352317, 2.5815346179554535), (4.756239157174763, 81.86744759352317, 3.073908960856031, 82.23180489105059, 1.7213341133281788), (3.073908960856031, 82.23180489105059, 3.092590106536965, 83.6219701225681, 1.3902907451767403), (3.092590106536965, 83.6219701225681, 3.111194454085603, 85.00642038910506, 1.384575264209919), (3.111194454085603, 85.00642038910506, 4.517345472126873, 85.39704328046926, 1.4593995096601309), (4.517345472126873, 85.39704328046926, 6.586268048229416, 85.27752990441017, 2.072371605905616), (6.586268048229416, 85.27752990441017, 8.668997602509233, 85.15721895448621, 2.0862016012222337), (8.668997602509233, 85.15721895448621, 10.732609802389744, 85.0380123376059, 2.0670523769376845), (10.732609802389744, 85.0380123376059, 12.825960109113621, 84.91708786932435, 2.0968400591580965), (12.825960109113621, 84.91708786932435, 15.57133371620227, 85.09983980908153, 2.7514495296087023), (15.57133371620227, 85

In [37]:
for file in os.listdir(filtered_dir):
    
    if file.startswith("floor_time") and file.endswith(".npy"):
        interpolated_data = np.load(filtered_dir + file)
        interpolated_data = interpolated_data[:, 1:]
        file = os.path.join(filtered_dir, file)
        title = os.path.splitext(os.path.basename(file))[0]
        save_path = os.path.join(filtered_dir + title)
        floor_plan_filename = site + 'floor_image.png'

        visualize_trajectory(trajectory=interpolated_data, 
                            floor_plan_filename=floor_plan_filename, 
                            width_meter=width_meter, 
                            height_meter=height_meter,
                            show=False, 
                            filename=title,
                            title=title)

In [38]:
for file in os.listdir(filtered_dir):
    
    if file.startswith("floor_metric_") and file.endswith(".npy"):
        interpolated_data = np.load(filtered_dir + file)
        interpolated_data = interpolated_data[:, 1:]
        file = os.path.join(filtered_dir, file)
        title = os.path.splitext(os.path.basename(file))[0]
        save_path = os.path.join(filtered_dir + title)
        floor_plan_filename = site + 'floor_image.png'

        visualize_trajectory(trajectory=interpolated_data, 
                            floor_plan_filename=floor_plan_filename, 
                            width_meter=width_meter, 
                            height_meter=height_meter,
                            show=False,
                            filename=title, 
                            title=title)

## Generate one combined RSSI table
### Create one combined table for RSSIs and bssid with timestamps for specific floor

# bssid should be the column name, timestamp the index and rssi the value
# Store the rssi value in the dictionary with bssid as the key

In [39]:
import csv



main_wifi_dict = {}

for file in os.listdir(filtered_dir):
    if file.endswith("_wifi.npy"):
        wifi = np.load(filtered_dir + file)

        for line in wifi:
            timestamp = int(line[0])
            bssid = line[2]
            rssi = int(line[3])
            
            if timestamp not in main_wifi_dict:
                main_wifi_dict[timestamp] = {}

            main_wifi_dict[timestamp][bssid] = rssi

all_bssids = set()
for timestamp in main_wifi_dict:
    all_bssids.update(main_wifi_dict[timestamp].keys())

sorted_timestamps = sorted(main_wifi_dict.keys())

wifi_table = []
header_row = ["timestamp"] + list(all_bssids)
wifi_table.append(header_row)

for timestamp in sorted_timestamps:
    row = [timestamp]
    for bssid in all_bssids:
        rssi = main_wifi_dict[timestamp].get(bssid, -999)
        row.append(rssi)
    wifi_table.append(row)

name = filtered_dir + 'combined_table.csv'
with open(name, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(wifi_table)


In [42]:
import os
import numpy as np

combined_table = pd.read_csv(filtered_dir + 'combined_table.csv')

N = 0
for file in os.listdir(filtered_dir):
    if file.startswith("floor_metric_interpolated_waypoints_acce") and file.endswith(".npy"):
        N += 1

for i in range(0, N): 

    file_name = f'floor_metric_interpolated_waypoints_acce_{i}'
    extension = '.npy'
    floor_metric = np.load(filtered_dir + file_name + extension)
    
    floor_metric[:, 0] = floor_metric[:, 0].astype(int)
    floor_metric_df = pd.DataFrame(floor_metric, columns=['timestamp', 'x', 'y', 'x_acce', 'y_acce', 'z_acce'])

    # Perform an inner join on the two dataframes based on the 'timestamp' column
    merged_df = pd.merge(floor_metric_df, combined_table, on='timestamp')
    print(merged_df)

    merged_df.to_csv(filtered_dir + file_name + '_merged_data_acce.csv', index=False)

        timestamp          x          y    x_acce    y_acce     z_acce   
0    1.571208e+12   7.336067  81.961296  0.292752 -1.601031   1.191943  \
1    1.571208e+12   4.756239  81.867448 -1.187312 -1.981351   7.177215   
2    1.571208e+12   3.073909  82.231805 -0.002992  0.030051   7.745654   
3    1.571208e+12   3.092590  83.621970 -0.867060 -3.606515   8.481958   
4    1.571208e+12   3.111194  85.006420 -0.313273 -1.039706   7.787724   
..            ...        ...        ...       ...       ...        ...   
113  1.571208e+12  45.947840  63.501948 -0.565257  0.341053  10.299487   
114  1.571208e+12  45.608631  61.400071 -0.536535 -7.975437  17.642127   
115  1.571208e+12  45.276554  59.342391 -0.593460 -1.306093   6.567165   
116  1.571208e+12  45.110996  58.316525 -0.119110 -0.644966  10.003982   
117  1.571208e+12  45.110996  58.316525 -1.188006  1.489886   9.764352   

     12691b0c6527cc0979bbea61f89a0ea0eb7d09c4   
0                                        -999  \
1            

In [44]:
# check if the merged data has more than window_size lines

window_size = 3

for file in os.listdir(filtered_dir):
    if file.startswith("floor_metric_interpolated_waypoints_") and file.endswith("_merged_data.csv"):
        # first row in data frame is column names so exclude it
        merged_df = pd.read_csv(filtered_dir + file)
        if len(merged_df) <= window_size:
            print(f'{file} has less than {window_size} lines')

floor_metric_interpolated_waypoints_42_merged_data.csv has less than 3 lines
floor_metric_interpolated_waypoints_106_merged_data.csv has less than 3 lines
