In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import glob
import math
import pickle

In [2]:
# training/validation set
train_set = pd.read_csv("mwidata.csv")

In [3]:
train_set.columns.values

array(['a8:0c:ca:03:9d:d7', '74:ee:2a:cd:eb:43', 'a8:0c:ca:83:9d:d7', ...,
       'magnetic', 'x', 'y'], dtype=object)

In [4]:
pd.isnull(train_set)
pd.isnull(train_set).values.any()

False

In [5]:
train_set.iloc[:, 0:2524].min().min()

-93.0

In [6]:
train_set_P = train_set.copy()
train_set_P.iloc[:, 0:2524] = np.where(train_set_P.iloc[:, 0:2524] < 0, 
                train_set_P.iloc[:, 0:2524] + 100, 
                train_set_P.iloc[:, 0:2524])

In [7]:
train_set.iloc[:, 2524:2544].min().min()

-102.0

In [8]:
train_set_P = train_set.copy()
train_set_P.iloc[:, 2524:2544] = np.where(train_set_P.iloc[:, 2524:2544] < 0, 
                train_set_P.iloc[:, 2524:2544] + 105, 
                train_set_P.iloc[:, 2524:2544])

In [9]:
# Create a single label for the model to predict. FLOOR, LATITUDE, FLOOR, and 
# BUILDINGID pinpoints the exact location of a user inside a building. Stack 
# train set and test set first before assigning unique location so that 
# identical locations are assigned the same UNIQUELOCATION value.
combined = pd.concat([train_set_P]) # stack vertically
combined = combined.assign(UNIQUELOCATION = (combined['x'].astype(str) + '_' + combined['y'].astype(str)))
len(combined["UNIQUELOCATION"]) # 1995 unique locations
                           

7841

In [10]:
# split again
train_set_PU = combined.iloc[0:7841, :]
val_set_U = combined.iloc[0:7841, :]

In [11]:
# Change variable types
train_set_PU["UNIQUELOCATION"] = train_set_PU["UNIQUELOCATION"].astype("category")
train_set_PU.dtypes

a8:0c:ca:03:9d:d7                                    float64
74:ee:2a:cd:eb:43                                    float64
a8:0c:ca:83:9d:d7                                    float64
04:33:89:79:fc:7c                                    float64
12:74:9c:2b:13:8f                                    float64
                                                      ...   
E7FC9D3C-EF01-4B70-B280-2CF6D50FA5CA_13394_63898     float64
magnetic                                             float64
x                                                    float64
y                                                    float64
UNIQUELOCATION                                      category
Length: 2548, dtype: object

In [12]:
X_train = train_set_PU.iloc[:, 0:2545]
y_train = train_set_PU.iloc[:, 2545:2548]

In [13]:
test_set_PU = val_set_U.copy()
test_set_PU["UNIQUELOCATION"] = test_set_PU["UNIQUELOCATION"].astype("category")
test_set_PU.dtypes

a8:0c:ca:03:9d:d7                                    float64
74:ee:2a:cd:eb:43                                    float64
a8:0c:ca:83:9d:d7                                    float64
04:33:89:79:fc:7c                                    float64
12:74:9c:2b:13:8f                                    float64
                                                      ...   
E7FC9D3C-EF01-4B70-B280-2CF6D50FA5CA_13394_63898     float64
magnetic                                             float64
x                                                    float64
y                                                    float64
UNIQUELOCATION                                      category
Length: 2548, dtype: object

In [14]:
X_test = test_set_PU.iloc[:, 0:2545]
y_test = test_set_PU.iloc[:, 2545:2548]

In [15]:
# Create a reference table for looking up the 
# POSITION associated with each UNIQUELOCATION value.
ref_table = pd.concat([y_train.iloc[:, [0,1,2]], y_test.iloc[:, [0,1,2]]])
ref_table = ref_table.drop_duplicates()

In [16]:
#--- save data ---#
def save_data(dataframe, filename):
    file_present = glob.glob(filename) # boolean, file already present?
    if not file_present:
        dataframe.to_csv(filename)
    else:
        print('WARNING: This file already exists.')

In [17]:
# #-- delete unneeded datasets created during preprocessing to free up memory --#
# del train_set, train_set_P, train_set_PU, val_set_U, test_set_PU, combined

In [18]:
# Using cross-validation, train best random forest model to predict 
# UNIQUELOCATION. We report the accuracy and kappa on UNIQUELOCATION predictions
# for cross-validation and on the training set. We report location error metrics
# on the test set.

# Using cross-validation, train best k-nn model for predicting UNIQUELOCATION.
# For cross-validation and training set performance metrics, we will simply use 
# the accuracy and kappa of predicting UNIQUELOCATION values. We will evaluate
# the test set performance using a method defined.


if __name__ == '__main__':
    
    # Select model
    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier()
    
    # 'parameters' can be a list of dictionaries for more specificity in 
    # hyperparamter combinations to attempt.
    # hyperparameters: http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
    # for a list of hyperparameters tried, see "tuning_knn.csv"
    hyperparameters = {'n_neighbors': [1],
                       'metric': ['manhattan']}
   
    
    # Apply k-fold cross-validation with grid search
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import cohen_kappa_score
    from sklearn.metrics import make_scorer
    
    scoring = {'accuracy': 'accuracy',
               'kappa': make_scorer(cohen_kappa_score)}
    
    grid = GridSearchCV(estimator = classifier,
                        param_grid = hyperparameters,
                        scoring = scoring,
                        cv = 2,
                        refit = 'accuracy', # what best model is based on, and specifies that the best model will be refitted on the whole training set
                        return_train_score = True,
                        n_jobs = -1) # parallel processing
    
    tic = time.time()
    grid_result = grid.fit(X_train, y_train.iloc[:, 2].squeeze()) # squeeze() makes sure y_train is a Series, as recommended now and required in upcoming sklearn versions.
    toc = time.time()
    run_time = (toc - tic)/60
    import winsound; winsound.Beep(frequency = 1500, duration = 2000) 



In [19]:
#--- cross validation metrics and training set metrics (average of folds) ----#
cv_results_ = pd.DataFrame.from_dict(grid_result.cv_results_) 
cv_results_.insert(loc = 0, column = 'Model', 
                   value = ['KNeighborsClassifier']*cv_results_.shape[0])
cv_results_.insert(loc = 25, column = 'mean train - cross_val accuracy', 
                   value = cv_results_['mean_train_accuracy'] - cv_results_['mean_test_accuracy'])
cv_results_.insert(loc = 26, column = 'mean train - cross_val kappa', 
                   value = cv_results_['mean_train_kappa'] - cv_results_['mean_test_kappa'])
with open('tuning_knn.csv', 'a') as f:
    cv_results_.to_csv(f, header = False, index = False)

In [20]:
grid_result.best_estimator_
grid_result.best_score_
grid_result.best_params_

{'metric': 'manhattan', 'n_neighbors': 1}

In [21]:
#--- save best model ---#

def save_model(model, model_name):
    model_name_present = glob.glob(model_name) # boolean, same model name already present?
    if not model_name_present:
        pickle.dump(grid_result, open(model_name, 'wb'))
    else:
        print('WARNING: This file already exists.')

In [22]:
save_model(grid_result, 'KNeighborsClassifier_model.sav')
grid_result = pickle.load(open('KNeighborsClassifier_model.sav', 'rb'))



In [23]:
y_pred = grid_result.predict(X_test)
np.mean(y_pred == y_test.iloc[:, 2])

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.797474811886239

In [24]:
y_test_pos = y_test.iloc[:, 0:2].values 

dict_loc = {}
m_total = ref_table.shape[0]
for i in range(m_total):
    key = ref_table.iloc[i]['UNIQUELOCATION']
    value = ref_table.iloc[i, 0:4].values
    dict_loc[key] = value

y_pred_pos = np.asarray([dict_loc[i] for i in y_pred])[:, 0:2] 

In [25]:
def euclidean(y_test_pos, y_pred_pos):
    """
    Returns the prediction errors based on euclidean distances for each test 
    example. The prediction error for each test set example is the euclidean 
    distance between the test set's position (ground truth) and the predicted 
    position. A "position" is a pair of LONGITUDE and LATITUDE values, 
    e.g. -7515.92, 4.86489e+06.
    
    Arguments:
    y_test_pos -- test set positions represented by numpy array of shape 
                  (m_test, 2)
    y_pred_pos -- predicted test set position represented by numpy array of shape
                  (m_test, 2)
    
    Returns:
    D_error -- prediction errors between test set positions and predicted test 
               set positions represented by numpy array of shape (m_train, 1)
    """
    m_test = y_test_pos.shape[0]
    D_error = np.sum((y_test_pos - y_pred_pos)**2, axis = 1)**0.5
    
    return D_error

In [26]:
D_error = euclidean(y_test_pos, y_pred_pos) # position errors for each test set example, in order as they appear 
sorted_D_error = sorted(D_error)

m_test = y_test.shape[0]
mean_error = np.mean(D_error) # meters
percentile_25th = sorted_D_error[math.ceil(m_test*0.25) - 1] # -1 since 0-indexed. meters
percentile_50th = sorted_D_error[math.ceil(m_test*0.50) - 1] # meters
percentile_75th = sorted_D_error[math.ceil(m_test*0.75) - 1] # meters
percentile_95th = sorted_D_error[math.ceil(m_test*0.95) - 1] # meters
percentile_100th = sorted_D_error[math.ceil(m_test*1.00) - 1] # meters

In [27]:
print(mean_error)

14.091118828192188
