# KNN

First run the file [/notebooks/CaseStudy1/Clean%20Data%20(From%20Book).ipynb](/notebooks/CaseStudy1/Clean%20Data%20(From%20Book).ipynb)


In [1]:
OFFLINE_SOURCE_CSV = '../data/offline.final.trace.csv'
OFFLINE_SOURCE = '../data/offline.final.trace.txt'
ONLINE_SOURCE = '../data/online.final.trace.txt'

ALL_MACS = c('00:0f:a3:39:dd:cd', '00:0f:a3:39:e1:c0', '00:14:bf:3b:c7:c6', '00:14:bf:b1:97:81', '00:14:bf:b1:97:8a', '00:14:bf:b1:97:8d', '00:14:bf:b1:97:90')

# remove 00:0f:a3:39:dd:cd
ORIGINAL_MACS = c('00:0f:a3:39:e1:c0', '00:14:bf:3b:c7:c6', '00:14:bf:b1:97:81', '00:14:bf:b1:97:8a', '00:14:bf:b1:97:8d', '00:14:bf:b1:97:90')

NEXT_MACS = c('00:0f:a3:39:dd:cd', '00:14:bf:3b:c7:c6', '00:14:bf:b1:97:81', '00:14:bf:b1:97:8a', '00:14:bf:b1:97:8d', '00:14:bf:b1:97:90')


NUM_ROWS_FOR_SAMPLE = 1000 # out of 914,951 rows

In [2]:
roundOrientation = function(angles) {
  refs = seq(0, by = 45, length  = 9)
  q = sapply(angles, function(o) which.min(abs(o - refs)))
  c(refs[1:8], 0)[q]
}
             
processLine =
function(x)
{
  tokens = strsplit(x, "[;=,]")[[1]]
  tmp = matrix(tokens[ - (1:10) ], ncol = 4, byrow = TRUE)
  cbind(matrix(tokens[c(2, 4, 6:8, 10)], nrow = nrow(tmp),
ncol = 6, byrow = TRUE), tmp)
}

readData = 
  function(filename = 'offline.final.trace.txt', 
           subMacs = ALL_MACS)
  {
    txt = readLines(filename)
    lines = txt[ substr(txt, 1, 1) != "#" ]
    tmp = lapply(lines, processLine)
    offline = as.data.frame(do.call("rbind", tmp), 
                            stringsAsFactors= FALSE) 
    
    names(offline) = c("time", "scanMac", 
                       "posX", "posY", "posZ", "orientation", 
                       "mac", "signal", "channel", "type")
    
     # keep only signals from access points
    offline = offline[ offline$type == "3", ]
    
    # drop scanMac, posZ, channel, and type - no info in them
    dropVars = c("scanMac", "posZ", "channel", "type")
    offline = offline[ , !( names(offline) %in% dropVars ) ]
    
    # drop more unwanted access points
    offline = offline[ offline$mac %in% subMacs, ]
    
    # convert numeric values
    numVars = c("time", "posX", "posY", "orientation", "signal")
    offline[ numVars ] = lapply(offline[ numVars ], as.numeric)

    # convert time to POSIX
    offline$rawTime = offline$time
    offline$time = offline$time/1000
    class(offline$time) = c("POSIXt", "POSIXct")
    
    # round orientations to nearest 45
    offline$angle = roundOrientation(offline$orientation)
      
    return(offline)
  }
online = readData(ONLINE_SOURCE)

In [3]:
offline = read.csv(OFFLINE_SOURCE_CSV, row.names=1)

In [4]:
trainSample = offline[sample(nrow(offline), NUM_ROWS_FOR_SAMPLE), ]

In [5]:
#Create a special factor that contains all of the unique combinations 
#of the observed (x, y) pairs for the 166 locations. 

get.summary = function(df) {
    df$posXY = paste(df$posX, df$posY, sep = "-")

    #create a list of data frames for every combination of (x, y), angle, and access point
    byLocAngleAP = with(df,
    by(df, list(posXY, angle, mac),
    function(x) x))

    #create summary statistics on each of the data frames

    signalSummary =
        lapply(byLocAngleAP,
        function(oneLoc) {
            ans = oneLoc[1, ]
            ans$medSignal = median(oneLoc$signal)
            ans$avgSignal = mean(oneLoc$signal)
            ans$num = length(oneLoc$signal)
            ans$sdSignal = sd(oneLoc$signal)
            ans$iqrSignal = IQR(oneLoc$signal)
            return(ans)
        })
    return(do.call("rbind", signalSummary))
}

   


In [6]:
offlineSummary = get.summary(offline)

In [7]:
online$posXY = paste(online$posX, online$posY, sep = "-")
tabonlineXYA = table(online$posXY, online$angle)

keepVars = c("posXY", "posX","posY", "orientation", "angle")
byLoc = with(online,
             by(online, list(posXY),
                function(x) {
                  ans = x[1, keepVars]
                  avgSS = tapply(x$signal, x$mac, mean)
                  y = matrix(avgSS, nrow = 1, ncol = 6)
                  cbind(ans, y)
                }))
onlineSummary = do.call("rbind", byLoc)

“data length [7] is not a sub-multiple or multiple of the number of columns [6]”

In [8]:
# provide a scalar x and y along with a dataframe that has the columns posX and posY
# will return the dataframe sorted by whose posX/posY values are closest to the x/y provided
findNN.with.x.y = function(x, y, trainSubset) {
  diffs = apply(trainSubset[c('posX', 'posY')], 1, function(row) row - c(x, y))
  dists = apply(diffs, 2, function(x) sqrt(sum(x^2)) )
  closest = order(dists)
  return(trainSubset[closest,])
}

In [9]:
# Idealy, this would have a bunch of posX=4, posY=8 rows
head(findNN.with.x.y(4, 8, trainSample))

Unnamed: 0,time,posX,posY,orientation,mac,signal,rawTime,angle
625448,2006-02-11 14:55:23,4,8,45.2,00:14:bf:b1:97:8d,-52,1139691000000.0,45
629130,2006-02-11 14:58:06,4,8,225.0,00:14:bf:b1:97:90,-55,1139691000000.0,225
627609,2006-02-11 14:56:57,4,8,135.1,00:14:bf:b1:97:90,-52,1139691000000.0,135
628287,2006-02-11 14:57:30,4,8,179.5,00:0f:a3:39:dd:cd,-58,1139691000000.0,180
628843,2006-02-11 14:57:57,4,8,225.0,00:14:bf:b1:97:90,-57,1139691000000.0,225
624470,2006-02-11 14:54:32,4,8,359.6,00:14:bf:b1:97:8d,-65,1139691000000.0,0


In [10]:
bossStuff = 0
selectTrain = function (angleNewObs, signals, m, macs=ALL_MACS) {
    refs = seq(0, by = 45, length  = 8)
    nearestAngle = roundOrientation(angleNewObs)
    if (m %% 2 == 1) {
      angles = seq(-45 * (m - 1) /2, 45 * (m - 1) /2, length = m)
    } else {
        m=m+1
        angles = seq(-45 * (m - 1) /2, 45 * (m - 1) /2, length = m)
        if (sign(angleNewObs - nearestAngle) > -1)
            angles = angles[ -1 ]
        else
            angles = angles[ -m ]
    }
    
    angles = angles + nearestAngle
    angles[angles < 0] = angles[ angles < 0 ] + 360
    angles[angles > 360] = angles[ angles > 360 ] - 360
    offlineSubset = signals[ signals$angle %in% angles, ]
    #offlineSubset = subset(offlineSubset, mac %in% macs)

    reshapeSS = function(data, varSignal = "signal",
                     keepVars = c("posXY", "posX","posY")) {
        
        byLocation =
            with(data, by(data, list(posXY),
                          function(x) {
                            ans = x[1, keepVars]
                            avgSS = tapply(x[ , varSignal ], x$mac, mean)
                            #print(avgSS)
                            bossStuff <<- avgSS
                            # Take suset of the data for only rows with mac matching the supplied macs parameter
                            #print(avgSS)
                            avgSS = avgSS[macs]
                            #print(avgSS)
                            #print(class(avgSS))
                            #print(list(ans$posXY, names(avgSS)))
                            # TODO: bring dim.names back perhaps?
                            dim.names = list(ans$posXY, names(avgSS))[2][[1]]
                            #print(length(dim.names))
                            #print(dim.names)
                            y = matrix(avgSS, nrow = 1, dimnames=list(ans$posXY, names(avgSS)))
        cbind(ans, y)
            }))
          newDataSS = do.call("rbind", byLocation)
      return(newDataSS)
    }


    # TODO: I'm getting a bunch of warnings about columns length not matching
    return(suppressWarnings(reshapeSS(offlineSubset, varSignal = "avgSignal")))
}

train130 = selectTrain(130, offlineSummary, m = 3, ALL_MACS)
print(dim(train130))
tail(train130)

[1] 166  10


Unnamed: 0,posXY,posX,posY,00:0f:a3:39:dd:cd,00:0f:a3:39:e1:c0,00:14:bf:3b:c7:c6,00:14:bf:b1:97:81,00:14:bf:b1:97:8a,00:14:bf:b1:97:8d,00:14:bf:b1:97:90
8-7,8-7,8,7,-56.68887,-45.24518,-55.07055,-57.5991,-59.24071,-51.31293,-68.9181
8-8,8-8,8,8,-57.07843,-47.67784,-57.24047,-59.4533,-59.65443,-52.12153,-63.5917
9-3,9-3,9,3,-62.04891,-50.80303,-49.24919,-51.72192,-56.44777,-55.64502,-64.21229
9-4,9-4,9,4,-63.94288,-49.30732,-51.6741,-52.77017,-56.62694,-58.61451,-68.32434
9-7,9-7,9,7,-58.95216,-49.00909,-50.58772,-61.8956,-58.74623,-52.84032,-68.97933
9-8,9-8,9,8,-54.56367,-48.33939,-51.36724,-59.43084,-60.26411,-51.9269,-65.83801


In [11]:
tail(selectTrain(130, offlineSummary, m = 3, ORIGINAL_MACS))

Unnamed: 0,posXY,posX,posY,00:0f:a3:39:e1:c0,00:14:bf:3b:c7:c6,00:14:bf:b1:97:81,00:14:bf:b1:97:8a,00:14:bf:b1:97:8d,00:14:bf:b1:97:90
8-7,8-7,8,7,-45.24518,-55.07055,-57.5991,-59.24071,-51.31293,-68.9181
8-8,8-8,8,8,-47.67784,-57.24047,-59.4533,-59.65443,-52.12153,-63.5917
9-3,9-3,9,3,-50.80303,-49.24919,-51.72192,-56.44777,-55.64502,-64.21229
9-4,9-4,9,4,-49.30732,-51.6741,-52.77017,-56.62694,-58.61451,-68.32434
9-7,9-7,9,7,-49.00909,-50.58772,-61.8956,-58.74623,-52.84032,-68.97933
9-8,9-8,9,8,-48.33939,-51.36724,-59.43084,-60.26411,-51.9269,-65.83801


In [12]:
setdiff(ALL_MACS, ORIGINAL_MACS)

In [13]:
setdiff(ALL_MACS, NEXT_MACS)

In [14]:
setdiff(NEXT_MACS, ORIGINAL_MACS)

In [15]:
head(offline)

time,posX,posY,orientation,mac,signal,rawTime,angle
2006-02-11 01:31:58,0,0,0,00:14:bf:b1:97:8a,-38,1139643000000.0,0
2006-02-11 01:31:58,0,0,0,00:14:bf:b1:97:90,-56,1139643000000.0,0
2006-02-11 01:31:58,0,0,0,00:0f:a3:39:e1:c0,-53,1139643000000.0,0
2006-02-11 01:31:58,0,0,0,00:14:bf:b1:97:8d,-65,1139643000000.0,0
2006-02-11 01:31:58,0,0,0,00:14:bf:b1:97:81,-65,1139643000000.0,0
2006-02-11 01:31:58,0,0,0,00:14:bf:3b:c7:c6,-66,1139643000000.0,0


In [16]:
select.last.n.columns.from.df = function (df, n) {
    df[,(ncol(df)-n+1):ncol(df)]
}
head(select.last.n.columns.from.df(offline, 2))

rawTime,angle
1139643000000.0,0
1139643000000.0,0
1139643000000.0,0
1139643000000.0,0
1139643000000.0,0
1139643000000.0,0


In [17]:
findNN = function(newSignal, trainSubset, macs) {
    mac.columns = select.last.n.columns.from.df(trainSubset, length(macs))
  diffs = apply(mac.columns, 1,
                function(x) x - newSignal)
  dists = apply(diffs, 2, function(x) sqrt(sum(x^2)) )
  closest = order(dists)
  return(trainSubset[closest, 1:3 ])
}

In [18]:
predXY = function(newSignals, newAngles, trainData,
                  numAngles = 1, k = 3, macs = ALL_MACS){
    print(macs)
    print(dim(trainData))
  closeXY = list(length = nrow(newSignals))
  for (i in 1:nrow(newSignals)) {
    trainSS = selectTrain(newAngles[i], trainData, m = numAngles, macs)
    closeXY[[i]] =
      findNN(newSignal = as.numeric(newSignals[i, ]), trainSS, macs)
  }
  estXY = lapply(closeXY,
                 function(x) sapply(x[ , 2:3],
                                    function(x) mean(x[1:k])))
  estXY = do.call("rbind", estXY)
  return(estXY)
}

In [19]:
estXYk1 = predXY(newSignals = onlineSummary[ , 6:11],
                 newAngles = onlineSummary[ , 4],
                 offlineSummary, numAngles = 3, k = 1)

estXYk3 = predXY(newSignals = onlineSummary[ , 6:11],
                 newAngles = onlineSummary[ , 4],
                 offlineSummary, numAngles = 3, k = 3)

[1] "00:0f:a3:39:dd:cd" "00:0f:a3:39:e1:c0" "00:14:bf:3b:c7:c6"
[4] "00:14:bf:b1:97:81" "00:14:bf:b1:97:8a" "00:14:bf:b1:97:8d"
[7] "00:14:bf:b1:97:90"
[1] 9296   14
















































































































































































































































“longer object length is not a multiple of shorter object length”

[1] "00:0f:a3:39:dd:cd" "00:0f:a3:39:e1:c0" "00:14:bf:3b:c7:c6"
[4] "00:14:bf:b1:97:81" "00:14:bf:b1:97:8a" "00:14:bf:b1:97:8d"
[7] "00:14:bf:b1:97:90"
[1] 9296   14
















































































































































































































































“longer object length is not a multiple of shorter object length”

In [20]:
calcError = function(estXY, actualXY)
   sum( rowSums( (estXY - actualXY)^2) )

### Errors comparing K=m1 versus K=3

In [21]:
actualXY = onlineSummary[ , c("posX", "posY")]
sapply(list(estXYk1, estXYk3), calcError, actualXY)

In [22]:
estXYk1 = predXY(newSignals = onlineSummary[ , 6:11],
                 newAngles = onlineSummary[ , 4],
                 offlineSummary, numAngles = 3, k = 1)

estXYk3 = predXY(newSignals = onlineSummary[ , 6:11],
                 newAngles = onlineSummary[ , 4],
                 offlineSummary, numAngles = 3, k = 3)
actualXY = onlineSummary[ , c("posX", "posY")]
sapply(list(estXYk1, estXYk3), calcError, actualXY)

[1] "00:0f:a3:39:dd:cd" "00:0f:a3:39:e1:c0" "00:14:bf:3b:c7:c6"
[4] "00:14:bf:b1:97:81" "00:14:bf:b1:97:8a" "00:14:bf:b1:97:8d"
[7] "00:14:bf:b1:97:90"
[1] 9296   14
















































































































































































































































“longer object length is not a multiple of shorter object length”

[1] "00:0f:a3:39:dd:cd" "00:0f:a3:39:e1:c0" "00:14:bf:3b:c7:c6"
[4] "00:14:bf:b1:97:81" "00:14:bf:b1:97:8a" "00:14:bf:b1:97:8d"
[7] "00:14:bf:b1:97:90"
[1] 9296   14
















































































































































































































































“longer object length is not a multiple of shorter object length”

# The real analysis


In [28]:
getErrorFromMac = function (macs) {
    offline2 = readData(OFFLINE_SOURCE, macs)
    offlineSummary2 = get.summary(offline2)
    signals = select.last.n.columns.from.df(onlineSummary, length(macs))
    kyPrediction.original.macs = predXY(newSignals = signals,
                     newAngles = onlineSummary[ , 4],
                     offlineSummary2, numAngles = 3, k = 3, macs)
    calcError(kyPrediction.original.macs, actualXY)
}
getErrorFromMac(ORIGINAL_MACS[1:2])

“data length exceeds size of matrix”

[1] "00:0f:a3:39:e1:c0" "00:14:bf:3b:c7:c6"
[1] 2656   14


### Using different MACs

In [29]:
getErrorFromMac(NEXT_MACS)

“data length exceeds size of matrix”

[1] "00:0f:a3:39:dd:cd" "00:14:bf:3b:c7:c6" "00:14:bf:b1:97:81"
[4] "00:14:bf:b1:97:8a" "00:14:bf:b1:97:8d" "00:14:bf:b1:97:90"
[1] 7968   14


### Using MACs from the book

In [30]:
getErrorFromMac(ORIGINAL_MACS)

“data length exceeds size of matrix”

[1] "00:0f:a3:39:e1:c0" "00:14:bf:3b:c7:c6" "00:14:bf:b1:97:81"
[4] "00:14:bf:b1:97:8a" "00:14:bf:b1:97:8d" "00:14:bf:b1:97:90"
[1] 7968   14


### All MACS

In [31]:
getErrorFromMac(ALL_MACS)

“data length exceeds size of matrix”

[1] "00:0f:a3:39:dd:cd" "00:0f:a3:39:e1:c0" "00:14:bf:3b:c7:c6"
[4] "00:14:bf:b1:97:81" "00:14:bf:b1:97:8a" "00:14:bf:b1:97:8d"
[7] "00:14:bf:b1:97:90"
[1] 9296   14
