# KNN Experiments

In [1]:
import os
import sys

# Repository root, relative to the directory this notebook runs from.
PROJECT_HOME = "./../.."
# Add utilities library for Microsoft Indoor Localization 2.0 Dataset
CODE_DIR = os.path.join(PROJECT_HOME, "code")
print("The code directory is located at", CODE_DIR)
# Guarded so that re-running this cell does not pile duplicate entries onto sys.path.
if CODE_DIR not in sys.path:
  sys.path.append(CODE_DIR)

The code directory is located at ./../../code


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from py_indoor_loc.floor_map import read_floor_data, extract_floor_map_geometries
from py_indoor_loc.extract.wifi_fingerprint import read_wifi_location_df_list, extract_fingerprint_df
from py_indoor_loc.plot import plot_n_unique_bssids_by_rssi, plot_floor_map
from py_indoor_loc.knn.preprocessing import extract_train_test
from py_indoor_loc.knn.experiment import run_knn_regression_experiments
from tqdm import tqdm

In [6]:
%matplotlib inline

In [7]:
# Fix the seed of NumPy's global random generator for run-to-run repeatability.
np.random.seed(2023)

In [8]:
# Gate for the exploratory visualization cells: they are wrapped in
# `if not EXEC_FLAG:`, so they only run when this is set to False.
EXEC_FLAG = True

## Utilities

In [9]:
def get_track_filepaths(assignment_df: pd.DataFrame, base_data_dir: str) -> list[str]:
  """Collect the Wi-Fi fingerprint CSV paths of the tracks listed in `assignment_df`.

  Each row is mapped to
  `<base_data_dir>/<site_id>/<floor_id>/<track_id>_wifi_fingerprint.csv`.
  Paths that do not exist on disk are reported on stdout and left out of the
  result.

  Args:
    assignment_df: Frame providing `site_id`, `floor_id` and `track_id` columns.
    base_data_dir: Root directory of the fingerprint files. A trailing path
      separator is optional.

  Returns:
    The existing file paths, in the row order of `assignment_df`.
  """
  result = []

  for _, row in assignment_df.iterrows():
    # os.path.join only inserts a separator where one is missing, so the path
    # is correct whether or not base_data_dir ends with one. str() keeps
    # non-string ids (e.g. numeric floor ids) working as they did with the
    # previous f-string formatting.
    file_path = os.path.join(
      base_data_dir,
      str(row["site_id"]),
      str(row["floor_id"]),
      f"{row['track_id']}_wifi_fingerprint.csv",
    )
    if not os.path.exists(file_path):
      print(f"File {file_path} does not exist.")
      continue
    result.append(file_path)

  return result

## Configuration Constants

In [10]:
# Root folder of the per-track Wi-Fi fingerprint CSVs; get_track_filepaths looks for
# <site_id>/<floor_id>/<track_id>_wifi_fingerprint.csv underneath it.
BASE_DATA_DIR = "../../data/wifi_fingerprint_manual/train/"
# Supervision level to evaluate: matched against the `supervision_pct` column of the
# semi-supervision setup and used (as a percentage) to name the output directory.
SUPERVISION_PCT = 0.3

## Read Train/Test Assignment

In [11]:
# Semi-supervision setup table. The first CSV column is used as the index and
# the site / floor / dataset labels are stored as categoricals.
supervision_df = pd.read_csv(
  "../../data/output/20230903_semi_supervision_setup.csv",
  index_col=0,
).astype({
  "site_id": "category",
  "floor_id": "category",
  "dataset": "category",
})


In [12]:
# Train/test assignment table, with the site / floor / dataset labels stored
# as categoricals.
train_test_assignment = pd.read_csv(
  "../../data/output/20230903_train_test_assignment.csv"
).astype({
  "site_id": "category",
  "floor_id": "category",
  "dataset": "category",
})

In [13]:
# Keep the rows at the configured supervision level that belong to the
# "unlabeled" dataset.
has_target_pct = supervision_df["supervision_pct"] == SUPERVISION_PCT
is_unlabeled = supervision_df["dataset"] == "unlabeled"
supervision_floor_df = supervision_df.loc[has_target_pct & is_unlabeled]

# Distinct (site_id, floor_id) pairs present in that subset, as a 2-column array.
floor_keys = supervision_floor_df[["site_id", "floor_id"]]
unique_floors = floor_keys.drop_duplicates().values

print("The number of floors:", unique_floors.shape[0])
print("The number of tracks:", supervision_floor_df.shape[0])

The number of floors: 137
The number of tracks: 5618


## Extract WiFi Location Data

In [14]:
# Zero-padded whole percentage used to tag the output directory (0.3 -> "30").
# The product is rounded rather than truncated: a value such as 0.29 * 100
# evaluates to 28.999999999999996, which int() alone would turn into 28.
suffix = str(int(round(SUPERVISION_PCT * 100))).zfill(2)
output_data_dir = f"../../data/output/knn_ground_truth_{suffix}"
# Create the directory (and any missing parents); a pre-existing one is fine.
os.makedirs(output_data_dir, exist_ok=True)

In [15]:
# Pick one floor (entry 100 of `unique_floors`) for the exploratory cells below.
site_id, floor_id = unique_floors[100]

# Training tracks: rows of the supervision subset that belong to this floor.
train_floor_mask = (
  (supervision_floor_df["site_id"] == site_id) &
  (supervision_floor_df["floor_id"] == floor_id)
)
train_tracks = supervision_floor_df.loc[train_floor_mask]

# Test tracks: rows of the train/test assignment for this floor marked "test".
test_floor_mask = (
  (train_test_assignment["site_id"] == site_id) &
  (train_test_assignment["floor_id"] == floor_id) &
  (train_test_assignment["dataset"] == "test")
)
test_tracks = train_test_assignment.loc[test_floor_mask]

# Resolve the per-track fingerprint files, then load one frame per file.
train_files = get_track_filepaths(train_tracks, BASE_DATA_DIR)
train_wifi_location_df_list = read_wifi_location_df_list(train_files)
train_wifi_location_df = pd.concat(train_wifi_location_df_list)

test_files = get_track_filepaths(test_tracks, BASE_DATA_DIR)
test_wifi_location_df_list = read_wifi_location_df_list(test_files)
test_wifi_location_df = pd.concat(test_wifi_location_df_list)

print(f"Train: n_files={len(train_files)}, n_records={train_wifi_location_df.shape[0]}")
print(f"Test: n_files={len(test_files)}, n_records={test_wifi_location_df.shape[0]}")

Train: n_files=43, n_records=280622
Test: n_files=48, n_records=105355


### Visualization: BSSIDs vs. RSS

In [16]:
if not EXEC_FLAG:
  # Plot over the train and test records of the selected floor together.
  all_wifi_location_df = pd.concat([train_wifi_location_df, test_wifi_location_df])

  fig, ax = plt.subplots(1, 1, figsize=(10, 6))
  plot_n_unique_bssids_by_rssi(
    wifi_location_df=all_wifi_location_df,
    ax=ax,
    label="num_unique_bssids[train+test]",
    plot_kwargs={"color": "black"},
  )
  plt.show()

### Visualization: Floormap + Train/Test Locations

In [17]:
if not EXEC_FLAG:
  floor_info, floor_map = read_floor_data(f"../../data/metadata/{site_id}/{floor_id}")

  def transform_func(coords):
    """Identity transform: coordinates are passed through unchanged."""
    return coords

  # The helper returns eight values: the floor/store polygons, the bounding
  # box corners and the floor size in meters (as named below).
  floor_geometries = extract_floor_map_geometries(floor_map, floor_info, transform=transform_func)
  (floor_polygons, store_polygons,
   x_min, y_min, x_max, y_max,
   width_meter, height_meter) = floor_geometries

In [18]:
if not EXEC_FLAG:
  fig, ax = plt.subplots(1, 1, figsize=(10, 6))

  # Floor map first, fingerprint locations drawn on top of it.
  plot_floor_map(floor_polygons, store_polygons, ax=ax)

  train_fingerprint_location = transform_func(train_wifi_location_df[["x", "y"]].values)
  test_fingerprint_location = transform_func(test_wifi_location_df[["x", "y"]].values)

  # (locations, marker, color, legend label) for each point cloud.
  scatter_specs = (
    (train_fingerprint_location, "o", "red", "train"),
    (test_fingerprint_location, "^", "blue", "test"),
  )
  for locations, marker, color, label in scatter_specs:
    ax.scatter(locations[:, 0],
               locations[:, 1],
               s=0.5,
               marker=marker,
               color=color,
               label=label,
               alpha=0.5)
  ax.legend()

  # Show plain tick values rather than an offset notation on the axes.
  plt.ticklabel_format(useOffset=False)
  plt.show()

## KNN Regression

In [18]:
# (site_id, floor_id) pairs whose experiment raised an exception.
failed = []

# NOTE: this loop rebinds site_id, floor_id and the train_*/test_* names that
# the single-floor cells above also assign.
for site_id, floor_id in tqdm(unique_floors, desc="Progress on [site/floor]"):
  try:
    print(f"Start processing for {site_id}/{floor_id}")

    # Training tracks: supervision-subset rows of this floor.
    train_floor_mask = (
      (supervision_floor_df["site_id"] == site_id) &
      (supervision_floor_df["floor_id"] == floor_id)
    )
    train_tracks = supervision_floor_df.loc[train_floor_mask]

    # Test tracks: train/test-assignment rows of this floor marked "test".
    test_floor_mask = (
      (train_test_assignment["site_id"] == site_id) &
      (train_test_assignment["floor_id"] == floor_id) &
      (train_test_assignment["dataset"] == "test")
    )
    test_tracks = train_test_assignment.loc[test_floor_mask]

    # Resolve the fingerprint files and load one frame per file.
    train_files = get_track_filepaths(train_tracks, BASE_DATA_DIR)
    test_files = get_track_filepaths(test_tracks, BASE_DATA_DIR)
    train_wifi_location_df_list = read_wifi_location_df_list(train_files)
    test_wifi_location_df_list = read_wifi_location_df_list(test_files)

    train_wifi_location_df = pd.concat(train_wifi_location_df_list)
    test_wifi_location_df = pd.concat(test_wifi_location_df_list)

    print(f"Train: n_files={len(train_files)}, n_records={train_wifi_location_df.shape[0]}")
    print(f"Test: n_files={len(test_files)}, n_records={test_wifi_location_df.shape[0]}")

    # Build the fingerprint tables and the train/test matrices from them.
    train_fingerprint_df, train_bssid = extract_fingerprint_df(train_wifi_location_df_list)
    test_fingerprint_df, test_bssid = extract_fingerprint_df(test_wifi_location_df_list)
    X_train, y_train, X_test, y_test, bssid_vector = extract_train_test(
      train_fingerprint_df, train_bssid,
      test_fingerprint_df, test_bssid,
    )
    # Features and targets must line up one-to-one.
    assert len(X_train) == len(y_train)
    assert len(X_test) == len(y_test)

    print(f"The number of train samples: {X_train.shape[0]}")
    print(f"The number of test samples: {X_test.shape[0]}")

    # Option grids handed to the experiment runner.
    options_min_rss = range(-100, -70, 5)
    options_metric = ["l1", "l2", "cosine"]
    options_n_neighbors = [1, 2, 4, 8]

    results = run_knn_regression_experiments(
      X_train,
      y_train,
      X_test,
      y_test,
      options_metric=options_metric,
      options_n_neighbors=options_n_neighbors,
      options_min_rss=options_min_rss,
    )

    # One result file per floor, named <site_id>_<floor_id>.csv.
    result_df = pd.DataFrame(results)
    result_path = os.path.join(output_data_dir, f"{site_id}_{floor_id}.csv")
    result_df.to_csv(result_path, index=False)

  except Exception as exc:
    # Best effort: report the failing floor, remember it, move on to the next.
    print(f"Failed to run kNN experiment for {site_id}/{floor_id}, caused by {type(exc)}: {str(exc)}")
    failed.append((site_id, floor_id))


Progress on [site/floor]:   0%|          | 0/137 [00:00<?, ?it/s]

Start processing for 5a0546857ecc773753327266/B1
Train: n_files=47, n_records=186710
Test: n_files=52, n_records=66133
The number of train samples: 1067
The number of test samples: 444


100%|██████████| 72/72 [00:17<00:00,  4.02it/s]
Progress on [site/floor]:   1%|          | 1/137 [00:21<48:31, 21.41s/it]

Start processing for 5a0546857ecc773753327266/F1
Train: n_files=58, n_records=359509
Test: n_files=62, n_records=154442
The number of train samples: 914
The number of test samples: 381


