# Conventional Approach for Predicting Indoor Location Using WiFi Fingerprinting
Ha Vu Tran       

markdown

In [1]:
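# Requires scikit-multilearn. Uncomment the next line to install it into the kernel's environment:
# %pip install scikit-multilearn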


In [2]:
# necessary Libraries
import numpy as np
import pandas as pd
import time
import pprint

#Visualizations
import matplotlib.pyplot as plt
import seaborn as sns



#Preprocessing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from scipy.sparse import lil_matrix

#Models
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


#Scoring Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error





  from numpy.core.umath_tests import inner1d


# Preprocess data

In [3]:
df = pd.read_csv("challenge1.csv")

# Eliminate invalid data: drop the columns that are not used as features or targets.
df.drop(['Unnamed: 0', 'USERID', 'PHONEID', 'TIMESTAMP'], axis = 1, inplace=True)

# The first 520 columns are handled as the signal-strength readings.
# Missing readings are replaced by 0. The filled block is assigned back to df rather
# than calling fillna(inplace=True) on df[i]: that chained in-place form acts on a
# temporary object when pandas Copy-on-Write is active and would leave df unchanged.
# NOTE(review): a filled 0 is mapped to 105 by the rescaling below, i.e. the top of
# the rescaled range -- confirm that this is the intended treatment of missing values.
col = df.columns[0:520]
df[col] = df[col].fillna(0)

# Rows without a complete set of targets cannot be used for training or evaluation.
df.dropna(subset=['LONGITUDE','LATITUDE', 'FLOOR', 'BUILDINGID' ], inplace=True)
# df.isnull().sum() can be used to double check that no missing values remain


# Rescale signal strength data: values <= 0 are shifted by +105, values > 0 by -100.
# (Presumably positive values are a "not detected" marker that should end up at 0 --
# TODO confirm against the dataset description.)
df.iloc[:, 0:520] = np.where(df.iloc[:, 0:520] <= 0, 
                        df.iloc[:, 0:520] + 105, 
                        df.iloc[:, 0:520] - 100)

# Process longitude: flip the sign of non-positive values in column 520 so that the
# column is non-negative (assumes column 520 is LONGITUDE after the drop above).
df.iloc[:, 520] = np.where(df.iloc[:, 520] <= 0, 
                        -df.iloc[:, 520], 
                        df.iloc[:, 520])

# Offsets subtracted from columns 520 and 521; the "+ 1" below makes a value equal to
# the offset map to 1. NOTE(review): these look like precomputed column minima -- confirm.
min_LGT = 7300.818990
min_LAT = 4.864746e+06

df.iloc[:,520] = (df.iloc[:, 520] - min_LGT + 1)
df.iloc[:,521] = (df.iloc[:, 521] - min_LAT + 1)

In [4]:
#Select input and output data
def preprocess_data(df):
    """Separate a frame into feature tables and the two kinds of targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns LONGITUDE, LATITUDE, BUILDINGID and FLOOR;
        every other column is kept as a feature.

    Returns
    -------
    X : pandas.DataFrame
        Features for the building/floor task (df without the four target columns).
    y : pandas.DataFrame
        Dummy (one-hot) encoding of BUILDINGID and FLOOR.
    Z : pandas.DataFrame
        Features for the coordinate task; same content as X, built separately.
    v : pandas.DataFrame
        The LONGITUDE and LATITUDE columns.
    """
    target_cols = ['LONGITUDE', 'LATITUDE', 'BUILDINGID', 'FLOOR']

    # Two independent feature tables with identical content, one per task.
    X = df.drop(columns=target_cols)
    Z = df.drop(columns=target_cols)

    # Dummy-encode the categorical targets so they can be fed to the models.
    y = pd.get_dummies(df[['BUILDINGID', 'FLOOR']], columns=['BUILDINGID', 'FLOOR'])

    # Continuous targets are passed through unchanged.
    v = df[['LONGITUDE', 'LATITUDE']]

    return X, y, Z, v

In [5]:
#Split data into training and testing sets
def split_data(X, y):
    """Shuffle and split features and targets into training and testing parts.

    10% of the samples go to the test part; the shuffle uses a fixed seed (42).
    The size of each part is printed.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The four pieces produced by ``train_test_split``, in that order.
    """
    split_options = dict(test_size=0.1, random_state=42, shuffle=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

    # Report how many samples ended up on each side of the split.
    n_train, n_test = X_train.shape[0], X_test.shape[0]
    print("Training set has {} samples.".format(n_train))
    print("Testing set has {} samples.".format(n_test))

    return X_train, X_test, y_train, y_test

In [6]:
# Build the feature/target tables for both tasks from the preprocessed frame.
X, y, Z, v = preprocess_data(df)
# Building/floor task: features X, dummy-encoded targets y.
X_train, X_test, y_train, y_test = split_data(X,y)
# Coordinate task: features Z, targets v. split_data uses the same fixed seed and test
# fraction for both calls, and X and Z hold the same rows.
Z_train, Z_test, v_train, v_test = split_data(Z,v)

Training set has 17243 samples.
Testing set has 1916 samples.
Training set has 17243 samples.
Testing set has 1916 samples.


In [7]:
# Standardise the features with StandardScaler.
# The scaler is fitted on the building/floor training features only.
scaler = StandardScaler().fit(X_train)

# Building/floor features: transform the training and the test part.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Coordinate features: transformed with the same fitted scaler (it is not refitted).
Z_train, Z_test = scaler.transform(Z_train), scaler.transform(Z_test)

In [8]:
# PCA that keeps enough components to explain 95% of the variance.
# Fitted on the (scaled) building/floor training features only.
pca = PCA(.95).fit(X_train)

# Project every feature set with the fitted PCA.
X_train_pca, X_test_pca = (pca.transform(part) for part in (X_train, X_test))
Z_train_pca, Z_test_pca = (pca.transform(part) for part in (Z_train, Z_test))

# Report how many components were kept and how much variance they explain.
print("Number of PCA Components = {}.".format(pca.n_components_))
print("Total Variance Explained by PCA Components = {}.".format(pca.explained_variance_ratio_.sum()))

Number of PCA Components = 470.
Total Variance Explained by PCA Components = 0.9502160510907495.


In [9]:
# Pass every table through scipy's lil_matrix and straight back with .toarray().
# The resulting variables are dense arrays, not sparse matrices.

# Building/floor task
X_train_pca, X_test_pca = (lil_matrix(m).toarray() for m in (X_train_pca, X_test_pca))
y_train, y_test = (lil_matrix(m).toarray() for m in (y_train, y_test))

# Coordinate task
Z_train_pca, Z_test_pca = (lil_matrix(m).toarray() for m in (Z_train_pca, Z_test_pca))
v_train, v_test = (lil_matrix(m).toarray() for m in (v_train, v_test))

# Training and Checking Accuracy

## Predicting Building and Floor

In [10]:
start_time = time.time()

# Multi-label kNN with k=1, trained on the PCA features.
MLKNN_classifier = MLkNN(k=1)
MLKNN_classifier.fit(X_train_pca, y_train)

# Predict the building/floor indicator labels of the test set.
predictions = MLKNN_classifier.predict(X_test_pca)

# Accuracy of the predicted label rows against the true ones.
subset_accuracy = accuracy_score(y_test, predictions)
print("Accuracy = ", subset_accuracy)

# Wall-clock time of the whole cell, in minutes (two decimals).
elapsed_minutes = (time.time() - start_time) / 60
print("--- Run time: %s mins ---" % np.round(elapsed_minutes, 2))

Accuracy =  0.42275574112734865
--- Run time: 1.45 mins ---


In [11]:
start_time = time.time()

# Decision tree with a fixed seed, trained on the PCA features.
DecisionTree_classifier = DecisionTreeClassifier(random_state=0)
DecisionTree_classifier.fit(X_train_pca, y_train)

# Predict the building/floor indicator labels of the test set.
predictions = DecisionTree_classifier.predict(X_test_pca)

# Accuracy of the predicted label rows against the true ones.
subset_accuracy = accuracy_score(y_test, predictions)
print("Accuracy = ", subset_accuracy)

# Wall-clock time of the whole cell, in minutes (two decimals).
elapsed_minutes = (time.time() - start_time) / 60
print("--- Run time: %s mins ---" % np.round(elapsed_minutes, 2))

Accuracy =  0.668580375782881
--- Run time: 0.64 mins ---


In [12]:
start_time = time.time()

# Random forest of 100 trees with a fixed seed, trained on the PCA features.
RandomForest_classifier = RandomForestClassifier(n_estimators = 100, random_state=0)
RandomForest_classifier.fit(X_train_pca, y_train)

# Predict the building/floor indicator labels of the test set.
predictions = RandomForest_classifier.predict(X_test_pca)

# Accuracy of the predicted label rows against the true ones.
subset_accuracy = accuracy_score(y_test, predictions)
print("Accuracy = ", subset_accuracy)

# Wall-clock time of the whole cell, in minutes (two decimals).
elapsed_minutes = (time.time() - start_time) / 60
print("--- Run time: %s mins ---" % np.round(elapsed_minutes, 2))

Accuracy =  0.27713987473903967
--- Run time: 1.66 mins ---


## Predicting Longitude and Latitude

### Using decision tree

In [13]:
start_time = time.time()

# Decision-tree regressor with a fixed seed, fitted on both coordinate targets at once.
DecisionTree_Regressor = DecisionTreeRegressor(random_state=0)
DecisionTree_Regressor.fit(Z_train_pca, v_train)

# Predict the two coordinate columns for the test set.
predictions = DecisionTree_Regressor.predict(Z_test_pca)

# Root-mean-squared error per target column (column 0, then column 1).
rmse_col0 = mean_squared_error(v_test[:, 0], predictions[:, 0]) ** (0.5)
print("RMSE of predicting LONGTITUDE = ", rmse_col0)
rmse_col1 = mean_squared_error(v_test[:, 1], predictions[:, 1]) ** (0.5)
print("RMSE of predicting LATITUDE = ", rmse_col1)

# Wall-clock time of the whole cell, in minutes (two decimals).
elapsed_minutes = (time.time() - start_time) / 60
print("--- Run time: %s mins ---" % np.round(elapsed_minutes, 2))

RMSE of predicting LONGTITUDE =  46.55169957879674
RMSE of predicting LATITUDE =  28.27677954213633
--- Run time: 0.36 mins ---


### Using random forest

In [14]:
start_time = time.time()

# Random-forest regressor (100 trees, fixed seed), fitted on both coordinate targets at once.
RandomForest_Regressor = RandomForestRegressor(n_estimators = 100, random_state=0)
RandomForest_Regressor.fit(Z_train_pca, v_train)

# Predict the two coordinate columns for the test set.
predictions = RandomForest_Regressor.predict(Z_test_pca)

# Root-mean-squared error per target column (column 0, then column 1).
rmse_col0 = mean_squared_error(v_test[:, 0], predictions[:, 0]) ** (0.5)
print("RMSE of predicting LONGTITUDE = ", rmse_col0)
rmse_col1 = mean_squared_error(v_test[:, 1], predictions[:, 1]) ** (0.5)
print("RMSE of predicting LATITUDE = ", rmse_col1)

# Wall-clock time of the whole cell, in minutes (two decimals).
elapsed_minutes = (time.time() - start_time) / 60
print("--- Run time: %s mins ---" % np.round(elapsed_minutes, 2))

RMSE of predicting LONGTITUDE =  31.69892418328501
RMSE of predicting LATITUDE =  19.922173923651574
--- Run time: 19.43 mins ---
