<a href="https://colab.research.google.com/github/kyochanpy/Kaggle_Indoor_Location_Navigation/blob/main/note_books/lstm_wifi_beacon_rssi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorflow_addons

Collecting tensorflow_addons
[?25l  Downloading https://files.pythonhosted.org/packages/74/e3/56d2fe76f0bb7c88ed9b2a6a557e25e83e252aec08f13de34369cd850a0b/tensorflow_addons-0.12.1-cp37-cp37m-manylinux2010_x86_64.whl (703kB)
[K     |████████████████████████████████| 706kB 3.0MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.12.1


In [2]:
import glob
import os
import pickle
import random
import shutil
from pathlib import Path

import numpy as np
import pandas as pd
import scipy.stats as stats

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder

import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow.keras.backend as K
import tensorflow_addons as tfa
from tensorflow_addons.layers import WeightNormalization
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

In [3]:
# Mount Google Drive at /content/drive; every input and output path below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# options

N_SPLITS = 10  # number of cross-validation folds

SEED = 2021  # seed handed to set_seed() and to the fold splitter

NUM_WIFI_FEATS = 45  # wifi slots used per row; 100 are available, but we don't need all of them
NUM_BEACON_FEATS = 10  # beacon slots used per row


base_path = '/content/drive/MyDrive'  # root folder on the mounted Google Drive

In [5]:
def set_seed(seed=42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and TensorFlow, exports ``PYTHONHASHSEED``,
    and installs a TF1-compat Keras session limited to one intra-op and one
    inter-op thread.

    Args:
        seed: integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

    single_thread_conf = tf.compat.v1.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )
    tf.compat.v1.keras.backend.set_session(
        tf.compat.v1.Session(
            graph=tf.compat.v1.get_default_graph(),
            config=single_thread_conf,
        )
    )
    
def comp_metric(xhat, yhat, fhat, x, y, f):
    """Mean position error: Euclidean (x, y) distance plus 15 per floor of floor error.

    Args:
        xhat, yhat, fhat: predicted x, y and floor arrays.
        x, y, f: ground-truth x, y and floor arrays of the same length.

    Returns:
        The per-sample errors summed and divided by ``xhat.shape[0]``.
    """
    dx = xhat - x
    dy = yhat - y
    planar_error = np.sqrt(np.power(dx, 2) + np.power(dy, 2))
    floor_penalty = 15 * np.abs(fhat - f)
    per_sample = planar_error + floor_penalty
    return per_sample.sum() / xhat.shape[0]

In [6]:
# Locate the per-site feature CSVs and load the sample submission,
# using its first column as the index.
feature_dir = f"{base_path}/wifi_100_beacon_50"

train_pattern = os.path.join(feature_dir, '*_train.csv')
test_pattern = os.path.join(feature_dir, '*_test.csv')
train_files = sorted(glob.glob(train_pattern))
test_files = sorted(glob.glob(test_pattern))

subm = pd.read_csv(f'{base_path}/sample_submission.csv', index_col=0)

In [7]:
# Load the combined train / test feature tables.
# NOTE: pickle.load can execute arbitrary code, so only open pickles you produced yourself.
with open(f'{feature_dir}/train_all.pkl', 'rb') as train_fh:
    data = pickle.load(train_fh)

with open(f'{feature_dir}/test_all.pkl', 'rb') as test_fh:
    test_data = pickle.load(test_fh)

In [8]:
# Column names of the features fed to the model: one bssid / rssi / timegap triple
# per wifi slot and one macaddress / rssi / timegap triple per beacon slot.
_wifi_slots = range(NUM_WIFI_FEATS)
_beacon_slots = range(NUM_BEACON_FEATS)

WIFI_BSSID_FEATS = [f'wifi_bssid_{slot}' for slot in _wifi_slots]
WIFI_RSSI_FEATS = [f'wifi_rssi_{slot}' for slot in _wifi_slots]
WIFI_TIMEGAP_FEATS = [f'wifi_timegap_{slot}' for slot in _wifi_slots]

BEACON_MACADDRESS_FEATS = [f'beacon_macaddress_{slot}' for slot in _beacon_slots]
BEACON_RSSI_FEATS = [f'beacon_rssi_{slot}' for slot in _beacon_slots]
BEACON_TIMEGAP_FEATS = [f'beacon_timegap_{slot}' for slot in _beacon_slots]

In [9]:
# Positional (iloc) indices of every third column: 6, 9, ..., 306 for the train table
# and 7, 10, ..., 310 for the test table.
# NOTE(review): presumably these are the wifi bssid columns of each pickle - confirm
# against the actual column layout.
wifi_bssids_column = list(range(6, 307, 3))
wifi_bssids_column_test = list(range(7, 311, 3))

In [10]:
# Collect every distinct bssid seen in train and test so it can be label-encoded
# and fed to an embedding layer.

wifi_bssids_train = set()
for col_pos in wifi_bssids_column:
    wifi_bssids_train.update(data.iloc[:, col_pos].values.tolist())
print(f'BSSID TYPES: {len(wifi_bssids_train)}')

wifi_bssids_test = set()
for col_pos in wifi_bssids_column_test:
    wifi_bssids_test.update(test_data.iloc[:, col_pos].values.tolist())
wifi_bssids_test = list(wifi_bssids_test)
print(f'BSSID TYPES: {len(wifi_bssids_test)}')

# Union of both sets: a bssid that occurs in train AND test is counted once.
wifi_bssids = list(wifi_bssids_train.union(wifi_bssids_test))

# Embedding vocabulary size. The encoded ids are shifted by +1 before they reach the
# model, so the largest id equals the number of distinct bssids and the embedding needs
# one more row than that. (Previously the train and test counts were simply added, which
# only happened to be large enough when the two sets overlapped.)
wifi_bssids_size = len(wifi_bssids) + 1

BSSID TYPES: 64925
BSSID TYPES: 31894


In [11]:
# Positional (iloc) indices of every third column: 306, 309, ..., 333 for the train table
# and 307, 310, ..., 334 for the test table.
# NOTE(review): presumably the beacon macaddress columns; position 306 is also the last
# entry of wifi_bssids_column - confirm against the actual column layout.
beacon_macaddress_column = list(range(306, 334, 3))
beacon_macaddress_column_test = list(range(307, 335, 3))

In [12]:
# Collect every distinct beacon mac address seen in train and test so it can be
# label-encoded and fed to an embedding layer.
beacon_macaddress_train = set()
for col_pos in beacon_macaddress_column:
    beacon_macaddress_train.update(data.iloc[:, col_pos].values.tolist())
print(f'MACADRESS TYPES: {len(beacon_macaddress_train)}')

beacon_macaddress_test = set()
for col_pos in beacon_macaddress_column_test:
    beacon_macaddress_test.update(test_data.iloc[:, col_pos].values.tolist())
beacon_macaddress_test = list(beacon_macaddress_test)
print(f'MACADRESS TYPES: {len(beacon_macaddress_test)}')

# Union of both sets: an address that occurs in train AND test is counted once.
beacon_macaddress = list(beacon_macaddress_train.union(beacon_macaddress_test))

# Embedding vocabulary size. The encoded ids are shifted by +1 before they reach the
# model, so the largest id equals the number of distinct addresses and the embedding
# needs one more row than that. (Previously the train and test counts were simply added,
# which only happened to be large enough when the two sets overlapped.)
beacon_macaddress_size = len(beacon_macaddress) + 1

MACADRESS TYPES: 14161
MACADRESS TYPES: 5733


In [13]:
# Integer encoders for the categorical inputs. The wifi and beacon encoders are fitted on
# ids collected from both train and test; the site encoder is fitted on the train sites.
le_wifi = LabelEncoder().fit(wifi_bssids)
le_beacon = LabelEncoder().fit(beacon_macaddress)
le_site = LabelEncoder().fit(data['site'])

# Standardisation statistics for the numeric inputs come from the training rows only.
numeric_cols = WIFI_RSSI_FEATS + WIFI_TIMEGAP_FEATS + BEACON_RSSI_FEATS + BEACON_TIMEGAP_FEATS + ['floor']
ss = StandardScaler()
ss.fit(data.loc[:, numeric_cols])

StandardScaler(copy=True, with_mean=True, with_std=True)

In [14]:
data.loc[:,WIFI_RSSI_FEATS+WIFI_TIMEGAP_FEATS+BEACON_RSSI_FEATS+BEACON_TIMEGAP_FEATS+['floor']] = ss.transform(data.loc[:,WIFI_RSSI_FEATS+WIFI_TIMEGAP_FEATS+BEACON_RSSI_FEATS+BEACON_TIMEGAP_FEATS+['floor']])

for i in WIFI_BSSID_FEATS:
    data.loc[:,i] = le_wifi.transform(data.loc[:,i])
    data.loc[:,i] = data.loc[:,i] + 1
for i in BEACON_MACADDRESS_FEATS:
    data.loc[:,i] = le_beacon.transform(data.loc[:,i])
    data.loc[:,i] = data.loc[:,i] + 1
    
data.loc[:, 'site'] = le_site.transform(data.loc[:, 'site'])

data.loc[:,WIFI_RSSI_FEATS+WIFI_TIMEGAP_FEATS+BEACON_RSSI_FEATS+BEACON_TIMEGAP_FEATS+['floor']] = ss.transform(data.loc[:,WIFI_RSSI_FEATS+WIFI_TIMEGAP_FEATS+BEACON_RSSI_FEATS+BEACON_TIMEGAP_FEATS+['floor']])

Add floor predictions.

In [15]:
simple_accurate_99 = pd.read_csv('/content/drive/MyDrive/submission_floor_accurate (1).csv') 

In [16]:
test_data['floor'] = simple_accurate_99['floor'].values

In [17]:
test_data.loc[:,WIFI_RSSI_FEATS+WIFI_TIMEGAP_FEATS+BEACON_RSSI_FEATS+BEACON_TIMEGAP_FEATS+['floor']] = ss.transform(test_data.loc[:,WIFI_RSSI_FEATS+WIFI_TIMEGAP_FEATS+BEACON_RSSI_FEATS+BEACON_TIMEGAP_FEATS+['floor']])

for i in WIFI_BSSID_FEATS:
    test_data.loc[:,i] = le_wifi.transform(test_data.loc[:,i])
    test_data.loc[:,i] = test_data.loc[:,i] + 1
for i in BEACON_MACADDRESS_FEATS:
    test_data.loc[:,i] = le_beacon.transform(test_data.loc[:,i])
    test_data.loc[:,i] = test_data.loc[:,i] + 1
    
test_data.loc[:, 'site'] = le_site.transform(test_data.loc[:, 'site'])

test_data.loc[:,WIFI_RSSI_FEATS+WIFI_TIMEGAP_FEATS+BEACON_RSSI_FEATS+BEACON_TIMEGAP_FEATS+['floor']] = ss.transform(test_data.loc[:,WIFI_RSSI_FEATS+WIFI_TIMEGAP_FEATS+BEACON_RSSI_FEATS+BEACON_TIMEGAP_FEATS+['floor']])

In [18]:
# Number of distinct encoded sites (vocabulary size of the site embedding), and a fresh
# 0..n-1 index so that positional fold indices can be used with .loc.
site_count = data['site'].unique().size
data = data.reset_index(drop=True)

In [19]:
# Seed all random number generators before the model is built and trained.
set_seed(SEED)

## The model
The first Embedding layer is very important. <br>
Thanks to the layer, we can make sense of these BSSID features. <br>
<br>
We concatenate all the features and put them into LSTM. <br>
<br>
If something is theoretically wrong, please correct me. Thank you in advance.

In [20]:
def create_model(input_data):
    """Build and compile the multi-input network that regresses (x, y).

    Args:
        input_data: sequence of seven inputs, in this order: wifi bssid ids, wifi rssi,
            wifi timegap, beacon mac-address ids, beacon rssi, beacon timegap (the caller
            also appends 'floor' to this one), site id. Only the column count
            (``shape[1]``) of the first six is read; the site input is always width 1.

    Returns:
        A compiled ``tf.keras`` model with seven inputs and one 2-unit output named 'xy',
        trained with MSE loss and Adam (learning rate 0.001).

    Note:
        The embedding vocabulary sizes come from the notebook-level
        ``wifi_bssids_size``, ``beacon_macaddress_size`` and ``site_count``; each must be
        strictly greater than the largest id fed to the corresponding input.
    """

    def embedding_branch(width, vocab_size, embed_dim):
        # Integer ids -> one embedding vector per id, flattened to a single feature vector.
        inp = L.Input(shape=(width,))
        out = L.Embedding(vocab_size, embed_dim)(inp)
        out = L.Flatten()(out)
        return inp, out

    def dense_branch(width, units):
        # Numeric features -> batch norm -> ReLU dense projection.
        inp = L.Input(shape=(width,))
        out = L.BatchNormalization()(inp)
        out = L.Dense(units, activation='relu')(out)
        return inp, out

    # Branches are created in the same order as the model inputs.
    input_embd_bssid_layer, x1 = embedding_branch(input_data[0].shape[1], wifi_bssids_size, 64)
    input_wifi_rssi_layer, x2 = dense_branch(input_data[1].shape[1], NUM_WIFI_FEATS * 64)
    input_wifi_timegap_layer, x3 = dense_branch(input_data[2].shape[1], NUM_WIFI_FEATS * 64)
    input_embd_macaddress_layer, x4 = embedding_branch(input_data[3].shape[1], beacon_macaddress_size, 64)
    input_beacon_rssi_layer, x5 = dense_branch(input_data[4].shape[1], NUM_BEACON_FEATS * 64)
    input_beacon_timegap_layer, x6 = dense_branch(input_data[5].shape[1], NUM_BEACON_FEATS * 64)
    input_site_layer, x7 = embedding_branch(1, site_count, 1)

    # main stream
    x = L.Concatenate(axis=1)([x1, x2, x3, x4, x5, x6, x7])

    x = L.BatchNormalization()(x)
    x = L.Dropout(0.3)(x)
    x = L.Dense(1280, activation='relu')(x)

    # The LSTM stack expects a sequence, so the feature vector becomes one timestep.
    x = L.Reshape((1, -1))(x)
    x = L.BatchNormalization()(x)
    x = L.LSTM(1280, dropout=0.3, recurrent_dropout=0.3, return_sequences=True, activation='relu')(x)
    x = L.LSTM(16, dropout=0.1, return_sequences=False, activation='relu')(x)

    output_layer_1 = L.Dense(2, name='xy')(x)

    model = M.Model([input_embd_bssid_layer,
                     input_wifi_rssi_layer,
                     input_wifi_timegap_layer,
                     input_embd_macaddress_layer,
                     input_beacon_rssi_layer,
                     input_beacon_timegap_layer,
                     input_site_layer],
                    [output_layer_1])

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.001),
                  loss='mse', metrics=['mse'])

    return model

In [None]:
score_df = pd.DataFrame()
predictions = list()

# Running sums of the per-fold test predictions; divided by the fold count after the loop.
preds_x, preds_y = 0, 0
preds_f_arr = np.zeros((test_data.shape[0], N_SPLITS))

FEATURE_COLS = (WIFI_BSSID_FEATS + WIFI_RSSI_FEATS + WIFI_TIMEGAP_FEATS
                + BEACON_MACADDRESS_FEATS + BEACON_RSSI_FEATS + BEACON_TIMEGAP_FEATS
                + ['floor', 'site'])


def make_inputs(frame):
    """Split ``frame`` into the seven model inputs, in the order create_model() expects.

    'floor' is passed together with the beacon timegap features.
    """
    return [frame.loc[:, WIFI_BSSID_FEATS],
            frame.loc[:, WIFI_RSSI_FEATS],
            frame.loc[:, WIFI_TIMEGAP_FEATS],
            frame.loc[:, BEACON_MACADDRESS_FEATS],
            frame.loc[:, BEACON_RSSI_FEATS],
            frame.loc[:, BEACON_TIMEGAP_FEATS + ['floor']],
            frame.loc[:, 'site']]


# Folds are stratified on 'path'.
# NOTE(review): rows of one path can land in both the train and validation split this
# way; consider GroupKFold if path-level leakage matters.
kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold, (trn_idx, val_idx) in enumerate(kfold.split(data.loc[:, 'path'], data.loc[:, 'path'])):
    X_train = data.loc[trn_idx, FEATURE_COLS]
    y_train = data.loc[trn_idx, ['x', 'y']]

    X_valid = data.loc[val_idx, FEATURE_COLS]
    y_valid = data.loc[val_idx, ['x', 'y']]

    train_inputs = make_inputs(X_train)
    valid_inputs = make_inputs(X_valid)
    checkpoint_path = f'{base_path}/RNN_{SEED}_{fold}.hdf5'

    model = create_model(train_inputs)
    model.fit(train_inputs, y_train,
              validation_data=(valid_inputs, y_valid),
              batch_size=128, epochs=1000,
              callbacks=[
                  ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, min_delta=1e-4, mode='min'),
                  ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min'),
                  EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=5, mode='min', baseline=None, restore_best_weights=True),
              ])

    # Predict the test set with the best checkpoint of this fold.
    model.load_weights(checkpoint_path)

    pred = model.predict(make_inputs(test_data))
    preds_x += pred[:, 0]
    preds_y += pred[:, 1]

    break # for demonstration, run just one fold as it takes much time.

# Average over the folds that actually ran (fold is the last index reached).
preds_x /= (fold + 1)
preds_y /= (fold + 1)

print("*+"*40)
print("*+"*40)

# The floor column here is only a placeholder: test_data['floor'] has been standardised,
# and the real floor predictions are written into the submission in a later cell.
preds_f = test_data['floor']
test_preds = pd.DataFrame(np.stack((preds_f, preds_x, preds_y))).T
test_preds.columns = subm.columns

# Label the rows with the submission ids. Without this the frame keeps a 0..n-1 index and
# the later reindex(subm.index) finds no matching label, turning every prediction into NaN.
# This relies on test_data being in sample-submission row order, the same assumption the
# positional floor assignment already makes.
assert len(test_preds) == len(subm), "test predictions and sample submission differ in length"
test_preds.index = subm.index

test_preds["floor"] = test_preds["floor"].astype(int)
predictions.append(test_preds)



Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 11/1000
Epoch 12/1000

In [None]:
# Stack the collected prediction frames and order the rows like the sample submission.
# reindex() matches on index labels; a label missing from the predictions yields a NaN row.
all_preds = pd.concat(predictions).reindex(subm.index)

## Fix the floor prediction
So far, the model does not successfully predict the "floor" with this dataset. <br>
To make it right, we can incorporate [@nigelhenry](https://www.kaggle.com/nigelhenry/)'s [excellent work](https://www.kaggle.com/nigelhenry/simple-99-accurate-floor-model). <br>

In [None]:
# Replace the floor column with the output of the dedicated floor model. The file is read
# from base_path like every other input instead of a second hard-coded absolute path.
simple_accurate_99 = pd.read_csv(f'{base_path}/submission_floor_accurate (1).csv')

# Positional assignment (.values): the CSV rows must be in the same order as all_preds.
all_preds['floor'] = simple_accurate_99['floor'].values

In [None]:
# Write the submission (index included) to the current working directory.
all_preds.to_csv('submission_lstm_in_floor_09_before_post.csv')

In [None]:
# Move the submission onto Google Drive. The destination is the full file path rather than
# just the folder: shutil.move() raises if asked to move into a directory that already
# contains a file of that name, which made re-running this cell fail.
shutil.move('submission_lstm_in_floor_09_before_post.csv',
            os.path.join(base_path, 'submission_lstm_in_floor_09_before_post.csv'))

That's it. 

Thank you for reading all of it.

I hope it helps!

Please leave a comment if you find something to point out, or if you have insights or suggestions.