# Conventional Approach for Predicting Indoor Location Using WiFi Fingerprinting
## Ha Vu Tran

       

In [1]:
# necessary Libraries
import numpy as np
import pandas as pd
import time
import pprint

#Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
# magic word for producing visualizations in notebook
get_ipython().run_line_magic('matplotlib', 'inline')


#Preprocessing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from scipy.sparse import lil_matrix

#Models
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


#Scoring Metrics
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as metrics
from sklearn.metrics import f1_score, fbeta_score
from sklearn.metrics import accuracy_score





examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


# Preprocess data

In [2]:
df = pd.read_csv("challenge1.csv")

# Eliminate columns that are not used as model inputs or targets.
df = df.drop(columns=['Unnamed: 0', 'USERID', 'PHONEID', 'TIMESTAMP'])

# After the drop above, the first 520 columns are treated as the
# signal-strength readings.
col = df.columns[0:520]

# Replace missing readings with 0 in a single column-slice assignment.
# The previous per-column `df[i].fillna(0, inplace=True)` is a chained
# in-place update, which does not modify `df` when pandas Copy-on-Write is
# active.
# NOTE(review): a filled 0 satisfies `<= 0` below and is rescaled to 105, the
# same value as a genuine reading of 0 -- confirm that this is intended for
# missing data.
df[col] = df[col].fillna(0)

# Rows lacking any of the location labels cannot be used as targets.
df = df.dropna(subset=['LONGITUDE', 'LATITUDE', 'FLOOR', 'BUILDINGID'])
# One can run df.isnull().sum() to double check that no gaps remain.


# Rescale signal strength data: values <= 0 are shifted up by 105, positive
# values are shifted down by 100 (presumably 100 is a "not detected" sentinel
# that should map to 0 -- TODO confirm against the dataset description).
signal = df.iloc[:, 0:520]
df.iloc[:, 0:520] = np.where(signal <= 0, signal + 105, signal - 100)

In [3]:
#Select input and output data
def preprocess_data(df):
    """Split a frame into model inputs and one-hot encoded targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns LONGITUDE, LATITUDE, BUILDINGID and FLOOR;
        every other column is kept as an input feature.

    Returns
    -------
    (X, y)
        X is `df` without the four location columns; y holds one indicator
        column per distinct BUILDINGID value and per distinct FLOOR value.

    Notes
    -----
    The results are also stored in the module-level names `X` and `y`.
    """
    global X, y

    location_columns = ['LONGITUDE', 'LATITUDE', 'BUILDINGID', 'FLOOR']
    label_columns = ['BUILDINGID', 'FLOOR']

    X = df.drop(columns=location_columns)

    # Dummy (indicator) columns let the labels be fed to multi-label models.
    y = pd.get_dummies(df[label_columns], columns=label_columns)

    return X, y

In [4]:
#Split data into training and testing sets
def split_data(X, y):
    """Shuffle and split inputs/targets into a 70% train and 30% test part.

    The split uses a fixed random_state of 42, prints the number of samples
    in each part, and returns (X_train, X_test, y_train, y_test).

    The four parts are also stored in the module-level names `X_train`,
    `X_test`, `y_train` and `y_test`.
    """
    global X_train, X_test, y_train, y_test

    split_options = dict(test_size=0.3, random_state=42, shuffle=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

    # Report the size of each part.
    n_train = X_train.shape[0]
    n_test = X_test.shape[0]
    print("Training set has {} samples.".format(n_train))
    print("Testing set has {} samples.".format(n_test))

    return X_train, X_test, y_train, y_test

In [5]:
# Build the feature frame and one-hot targets from the cleaned data, then
# hold out 30% of the rows for testing (split_data prints both set sizes).
X, y = preprocess_data(df)
X_train, X_test, y_train, y_test = split_data(X,y)

Training set has 13411 samples.
Testing set has 5748 samples.


In [6]:
# Standardise the features. The scaler's statistics are learned from the
# training set only, so no information from the test set leaks into them.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transform to the training and the test set.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [7]:
# Reduce dimensionality with PCA, keeping enough components to retain 95% of
# the variance. As with scaling, PCA is fitted on the training set only.
pca = PCA(n_components=.95)
pca.fit(X_train)

# Project both the training and the test set onto the fitted components.
X_train_pca, X_test_pca = (pca.transform(part) for part in (X_train, X_test))

component_count = pca.n_components_
variance_retained = pca.explained_variance_ratio_.sum()
print("Number of PCA Components = {}.".format(component_count))
print("Total Variance Explained by PCA Components = {}.".format(variance_retained))

Number of PCA Components = 469.
Total Variance Explained by PCA Components = 0.9503886620587898.


In [8]:
# Hand plain dense NumPy arrays to the classifiers below.
# The previous `lil_matrix(...).toarray()` built a sparse matrix and then
# immediately converted it back to dense, so nothing sparse was ever passed
# on; np.asarray yields the same values and dtype without the round trip.
X_train_pca = np.asarray(X_train_pca)
y_train = np.asarray(y_train)
X_test_pca = np.asarray(X_test_pca)
y_test = np.asarray(y_test)

# Training and Checking Accuracy

In [9]:
start_time = time.time()

# Multi-label kNN configured with a single neighbour (k=1).
MLKNN_classifier = MLkNN(k=1)
MLKNN_classifier.fit(X_train_pca, y_train)

# Predict the label indicators for the held-out test set.
predictions = MLKNN_classifier.predict(X_test_pca)

# Score the predicted indicator rows against the true ones.
mlknn_accuracy = accuracy_score(y_test, predictions)
print("Accuracy = ", mlknn_accuracy)

elapsed_minutes = (time.time() - start_time) / 60
print("--- Run time: %s mins ---" % np.round(elapsed_minutes, 2))

Accuracy =  0.4145789839944328
--- Run time: 2.15 mins ---


In [10]:
start_time = time.time()

# Decision tree with a fixed seed so repeated runs build the same tree.
DecisionTree_classifier = DecisionTreeClassifier(random_state=0)
DecisionTree_classifier.fit(X_train_pca, y_train)

# Predict the label indicators for the held-out test set.
predictions = DecisionTree_classifier.predict(X_test_pca)

# Score the predicted indicator rows against the true ones.
tree_accuracy = accuracy_score(y_test, predictions)
print("Accuracy = ", tree_accuracy)

elapsed_minutes = (time.time() - start_time) / 60
print("--- Run time: %s mins ---" % np.round(elapsed_minutes, 2))

Accuracy =  0.6485734168406402
--- Run time: 0.72 mins ---


In [12]:
start_time = time.time()

# Random forest of 100 trees with a fixed seed for repeatable results.
RandomForest_classifier = RandomForestClassifier(n_estimators=100,
                                                 random_state=0)
RandomForest_classifier.fit(X_train_pca, y_train)

# Predict the label indicators for the held-out test set.
predictions = RandomForest_classifier.predict(X_test_pca)

# Score the predicted indicator rows against the true ones.
forest_accuracy = accuracy_score(y_test, predictions)
print("Accuracy = ", forest_accuracy)

elapsed_minutes = (time.time() - start_time) / 60
print("--- Run time: %s mins ---" % np.round(elapsed_minutes, 2))

Accuracy =  0.22860125260960334
--- Run time: 1.74 mins ---
