In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#location of the UJIndoorLoc training set; header=0 takes the column names from the first row
training_csv = 'Z:/C5T4 - Capstone/UJIndoorLoc/trainingData.csv'
rawdata = pd.read_csv(training_csv, header=0)

In [3]:
rawdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19937 entries, 0 to 19936
Columns: 529 entries, WAP001 to TIMESTAMP
dtypes: float64(2), int64(527)
memory usage: 80.5 MB


In [4]:
rawdata.head()

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP520,LONGITUDE,LATITUDE,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,USERID,PHONEID,TIMESTAMP
0,100,100,100,100,100,100,100,100,100,100,...,100,-7541.2643,4864921.0,2,1,106,2,2,23,1371713733
1,100,100,100,100,100,100,100,100,100,100,...,100,-7536.6212,4864934.0,2,1,106,2,2,23,1371713691
2,100,100,100,100,100,100,100,-97,100,100,...,100,-7519.1524,4864950.0,2,1,103,2,2,23,1371714095
3,100,100,100,100,100,100,100,100,100,100,...,100,-7524.5704,4864934.0,2,1,102,2,2,23,1371713807
4,100,100,100,100,100,100,100,100,100,100,...,100,-7632.1436,4864982.0,0,0,122,2,11,13,1369909710


In [5]:
rawdata['BUILDINGID'].describe()

count    19937.000000
mean         1.212820
std          0.833139
min          0.000000
25%          0.000000
50%          1.000000
75%          2.000000
max          2.000000
Name: BUILDINGID, dtype: float64

In [6]:
#Effective coding is all about mooching off past programs to save time
#I know from past exploration of the dataset that there are some WAP variables that have all 100s (x54);
#i.e., no readings and thus no predictive value.  We can remove them right now to make things a bit smaller

#note: min=100 for WAP variables means no readings, but it is possible this criterion could exclude other non-WAP variables
#I happen to know that the other 10 variables would not be affected by this criterion, but this could be confirmed with summary statistics

#failed attempts to limit using a single line 
#cleandata = rawdata[c for c in rawdata.columns if rawdata.c.min()<>100]
#cleandata = np.where[c for c in rawdata.columns if rawdata.c.min()<>100]
#cleandata = rawdata[rawdata.min != 100]; cleandata
#cleandata = rawdata[rawdata.index.min() != 100]
#so time to break it down

#returns 2 columns, first is attribute name and second is min of each column
#>rawdata.min(axis=0)
#WAP001             -9.700000e+01
#WAP002             -9.000000e+01
#WAP003              1.000000e+02

#returns 2 columns, first is attribute name and second is boolean of whether min of each column does not equal 100 or not
#only WAP columns are candidates for removal; any other column is always kept, so the
#"min == 100" test can never drop a non-WAP attribute by accident
is_wap = rawdata.columns.str.startswith('WAP')
#boolean Series indexed by attribute name: True = keep the column
#(a WAP column is kept when its minimum is not the no-reading value 100)
mask = (rawdata.min(axis=0) != 100) | ~is_wap
#>print(mask)
#WAP001               True
#WAP002               True
#WAP003              False

#removes non-true (a boolean Series can index itself directly, no "== True" needed)
mask2 = mask[mask]
#WAP001              True
#WAP002              True
#WAP005              True

#>mask2.index
#'WAP001', 'WAP002', 'WAP005',

#calls mask2 by index, which are the attribute names
#.copy() makes cleandata an independent frame rather than a selection tied to rawdata
cleandata = rawdata[mask2.index].copy()

#to circle back, to make it a two liner, the first being 'mask', see below
#cleandata=rawdata[mask[mask == True].index]

#this one took me 3 lines in R using apply to find min, then subset to remove the == 100, then a third to apply the variable list

In [7]:
cleandata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19937 entries, 0 to 19936
Columns: 474 entries, WAP001 to TIMESTAMP
dtypes: float64(2), int64(472)
memory usage: 72.1 MB


In [8]:
#Similarly, there are some observations (x76) with no WAP signal strengths.  These represent a multitude of places, 
#thus can't be effectively lumped into one, so we'll remove them

#we want observations where all of the WAP variables are 100, but the non-WAP attributes can be anything
#there are 9 non-WAP attributes
#there is probably a better way, but I think I will do it like this:
#1) munge a new variable, whether the WAPs are all 100
#2) filter on that variable
#3) remove the munged variable
#if I can get that to work, then maybe I can make it into a more elegant or more succinct one liner

#but first, lets see if we can make an array of 100s and just see if we can filter it out
#temp = cleandata.iloc[:,0:465].values == np.full(465, 100)
#unfortunately this gives us a True/False for each of 474 elements (then x each row)

#this one seems to get what we want, a single value for each row
#one boolean per row: True when every one of the 465 WAP columns holds the no-reading value 100
#(comparing against the scalar broadcasts by itself, so no array of 100s is needed)
temp = (cleandata.iloc[:,0:465].values == 100).all(axis=1)
#>temp.shape
#(19937,)
#>temp
#array([False, False, False, ..., False, False, False])
#>temp[temp==True].shape
#(76,)

#so now we just need to filter by the boolean
#unlike the previous mask, we do not have column names to index by
#>cleandata[temp==True].shape
#(76, 474)
#we want the rest
#~temp inverts the mask; .copy() gives an independent frame, because columns are assigned
#onto cleandata further down and assigning onto a filtered selection triggers SettingWithCopyWarning
cleandata = cleandata[~temp].copy()

#This one took me 1 line in R, using subset with a nested apply

In [9]:
#confirm we haven't screwed the pooch
cleandata.shape
#excellent

(19861, 474)

In [10]:
#My general approach is to munge some variables that might have meaning, based on my understanding of the data
#For small datasets, I keep the originals in too, as to not lose data
#For this one, there are so many variables that I ended up removing all of the WAP variables in R before
#doing the learning model and just sticking with the munged ones.  I think I will do that again here
#The risk is that I will lose possible data by removing those variables.  If my assumptions are wrong, or
#missing a large predictive factor, then my predictions may not be as accurate

#So, starting off with predicting building
#When doing the prediction, we will be looking at WAP readings and trying to guess which building
#Some WAPs may only be associated with one building, some may be associated with more
#Here is how I imagine a WAP based fingerprint at the building level working
#For a given WAPX, we can see how many observations are for building 0, then for building 1, then for building 2
#Then we can store these counts
#Then, over all of the WAPs for an observation, we can add the building 0 counts, building 1 counts, and building 2 counts
#From there, some pairwise variables.  An absolute count for building 0 will not necessarily help us,
#until we also know how many building 1 counts and building 2 counts we have
#So, we can munge ratio variables to compare one to another
#As another possibility, we could normalize the counts by total counts
#Just a straight count could bias the data towards places that are visited more often than others.  That said,
#that may not be a bad thing from a predictive point of view, from a bayesian standpoint
#One thing I noticed in my original run through and tried to correct via weighting is that some WAPs are only observed
#occasionally from a given building.  Counting the observations will incorporate this weighting implicitly

#If we are going to get building counts for each WAP, we'll need to start with a unique list of WAPs
#names of the first 465 columns (the WAP attributes) as a standalone numpy array
WAPs = np.array(cleandata.columns[0:465])

In [11]:
WAPs.shape
(465,)

(465,)

#append the three building counts
WAPs2 = np.concatenate((WAPs, np.empty([465,3])), axis=0)

#doesn't work
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-13-8eeec22417d0> in <module>()
      1 #append the three building counts
----> 2 WAPs2 = np.concatenate((WAPs, np.empty([465,3])), axis=0)

ValueError: all the input arrays must have same number of dimensions



np.empty([465,3]).shape
(465, 3)

WAPs.shape
(465,)

#it's not working (yet)
#maybe assigning the names as an array entry is not what we need anyway; not sure exactly how they will interface
#so I guess will start trying to do the group by to get the values, and then see if we want to add the names on as an attribute,
#or add them as names
#group the observations by building
grouped = cleandata.groupby('BUILDINGID')
#count has to be called: without the parentheses print() only shows the bound-method repr
#(which is what the output recorded below this cell shows, from before this fix)
print(grouped["WAP001"].count())

<bound method SeriesGroupBy.count of <pandas.core.groupby.SeriesGroupBy object at 0x00000000092E4908>>

cleandata.iloc[:,0:465].apply(lambda x: 1*(x<100.0), axis=1).groupby('BUILDINGID').sum(axis=0)

#doesn't work because group by field isn't in data
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-17-b08f5c81b8ac> in <module>()
----> 1 cleandata.iloc[:,0:465].apply(lambda x: 1*(x<100.0), axis=1).groupby('BUILDINGID').sum(axis=0)

~\Anaconda3\lib\site-packages\pandas\core\generic.py in groupby(self, by, axis, level, as_index, sort, group_keys, squeeze, **kwargs)
   5160         return groupby(self, by=by, axis=axis, level=level, as_index=as_index,
   5161                        sort=sort, group_keys=group_keys, squeeze=squeeze,
-> 5162                        **kwargs)
   5163 
   5164     def asfreq(self, freq, method=None, how=None, normalize=False,

~\Anaconda3\lib\site-packages\pandas\core\groupby.py in groupby(obj, by, **kwds)
   1846         raise TypeError('invalid type: %s' % type(obj))
   1847 
-> 1848     return klass(obj, by, **kwds)
   1849 
   1850 

~\Anaconda3\lib\site-packages\pandas\core\groupby.py in __init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze, **kwargs)
    514                                                     level=level,
    515                                                     sort=sort,
--> 516                                                     mutated=self.mutated)
    517 
    518         self.obj = obj

~\Anaconda3\lib\site-packages\pandas\core\groupby.py in _get_grouper(obj, key, axis, level, sort, mutated, validate)
   2932                 in_axis, name, level, gpr = False, None, gpr, None
   2933             else:
-> 2934                 raise KeyError(gpr)
   2935         elif isinstance(gpr, Grouper) and gpr.key is not None:
   2936             # Add key to exclusions

KeyError: 'BUILDINGID'

In [12]:
#I can't seem to get a function to apply to a subset of attributes but then retain the other attributes for grouping purposes
#so will have to break it up into multiple steps

#split the frame into the 465 WAP columns and the 9 trailing non-WAP attributes
wap_readings = cleandata.iloc[:, 0:465]
other_attrs = cleandata.iloc[:, 465:474]
#1 where the WAP was detected (reading below 100), 0 where it was not
wap_detected = (wap_readings < 100.0) * 1
#glue the 0/1 flags back onto the untouched attributes so they can be used for grouping
WAPdata = pd.concat([wap_detected, other_attrs], axis=1)
WAPdata.head(3)

Unnamed: 0,WAP001,WAP002,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,WAP011,WAP012,...,WAP519,LONGITUDE,LATITUDE,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,USERID,PHONEID,TIMESTAMP
0,0,0,0,0,0,0,0,0,0,0,...,0,-7541.2643,4864921.0,2,1,106,2,2,23,1371713733
1,0,0,0,0,0,0,0,0,0,0,...,0,-7536.6212,4864934.0,2,1,106,2,2,23,1371713691
2,0,0,0,0,0,1,0,0,0,0,...,0,-7519.1524,4864950.0,2,1,103,2,2,23,1371714095


In [13]:
WAPdata.shape

(19861, 474)

In [14]:
#per-building column totals; GroupBy.sum already aggregates within each group, so it takes no
#axis argument (current pandas rejects sum(axis=0) on a groupby)
WAPdata.groupby('BUILDINGID').sum()
#i can't for the life of me get python to do a noncontiguous iloc to include building ID but not the other non-WAPs
#my attempts only ever gave iloc 1 range
#even setting it as a list, the list itself only took in 1 range
#(note: iloc does accept a list/array of positions, e.g. np.r_[0:465, 468], which covers noncontiguous ranges)
#dumb dumb dumb dumb dumb
#we'll just have to ignore those others or drop them or something
#...of course, drop probably wouldn't work either, because there are two noncontiguous ranges to drop

Unnamed: 0_level_0,WAP001,WAP002,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,WAP011,WAP012,...,WAP518,WAP519,LONGITUDE,LATITUDE,FLOOR,SPACEID,RELATIVEPOSITION,USERID,PHONEID,TIMESTAMP
BUILDINGID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,18,19,0,0,578,121,595,0,0,0,...,0,3,-40091110.0,25531300000.0,8415,881657,10467,30358,70961,7192845651045
1,0,0,0,280,0,556,0,87,112,104,...,0,0,-38632870.0,25097940000.0,7009,688632,8694,53195,65655,7076691401067
2,0,0,40,28,0,0,0,0,2844,2879,...,22,0,-69525800.0,45991970000.0,17848,1379543,17253,96497,122602,12968239674060


In [15]:
#per-building detection counts: for every WAP, the number of observations in each building that saw it
#columns are picked by name (all WAP columns plus the grouping key) instead of by position,
#and GroupBy.sum is called without the axis argument that current pandas rejects
wap_columns = [col for col in WAPdata.columns if col.startswith('WAP')]
buildingdata = WAPdata[wap_columns + ['BUILDINGID']].groupby('BUILDINGID').sum()
#managed to get it to work.  had to browbeat it manually though, really hard
#drop a range?  nope
#drop a list of numbers?  nope.  rows work i guess, but not columns

In [16]:
#now need sublevels for future
def _wap_counts_by(keys):
    """Sum the 0/1 WAP detection flags of WAPdata for every combination of the `keys` columns.

    Parameters
    ----------
    keys : list of str
        Names of the WAPdata columns to group by.

    Returns
    -------
    pandas.DataFrame
        One row per distinct key combination (the keys form the index) and one column per WAP.
    """
    #select by name so only the WAP flags and the grouping keys take part in the sum
    wap_columns = [col for col in WAPdata.columns if col.startswith('WAP')]
    return WAPdata[wap_columns + list(keys)].groupby(list(keys)).sum()

#detection counts at each finer level of location
floordata = _wap_counts_by(['BUILDINGID','FLOOR'])
roomdata = _wap_counts_by(['BUILDINGID','FLOOR','SPACEID'])
relposdata = _wap_counts_by(['BUILDINGID','FLOOR','SPACEID','RELATIVEPOSITION'])


In [17]:
floordata

Unnamed: 0_level_0,Unnamed: 1_level_0,WAP001,WAP002,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,WAP011,WAP012,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
BUILDINGID,FLOOR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,0,0,0,0,246,18,25,0,0,0,...,0,0,0,0,0,2,0,0,0,0
0,1,0,0,0,0,209,46,90,0,0,0,...,0,0,0,0,0,2,0,0,0,0
0,2,5,9,0,0,119,19,340,0,0,0,...,0,0,0,0,0,37,0,0,0,0
0,3,13,10,0,0,4,38,140,0,0,0,...,0,0,0,0,0,19,0,0,0,3
1,0,0,0,0,226,0,160,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,26,0,336,0,40,13,20,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,28,0,60,0,30,43,47,...,0,0,0,0,0,0,0,0,0,0
1,3,0,0,0,0,0,0,0,17,56,37,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,22,0,0,0,0,0,0,...,0,9,0,0,0,0,1395,495,0,0
2,1,0,0,0,4,0,0,0,0,305,374,...,0,137,8,123,126,0,956,1281,5,0


In [18]:
roomdata

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,WAP001,WAP002,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,WAP011,WAP012,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
BUILDINGID,FLOOR,SPACEID,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0,0,102,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,107,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,110,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,111,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,112,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,113,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,114,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,115,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,116,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
relposdata

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,WAP001,WAP002,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,WAP011,WAP012,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
BUILDINGID,FLOOR,SPACEID,RELATIVEPOSITION,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
0,0,102,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,106,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,107,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,110,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,111,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,112,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,113,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,114,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,115,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,116,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
#alright, so now we have our 4 levels of grouping with counts of non-100 observations
#time to munge the variables

#for building, here is what we want to do for each observation:
#add up the building 0, 1, and 2 counts for each WAP variable
#munge some ratio and difference variables

#let's define some functions
#first, for a particular observation, we want the columns with non-100 values
def applWAPs(observation, no_signal=100):    #applicable WAPs
    """Return the labels of the WAPs that were actually detected in one observation.

    Parameters
    ----------
    observation : pandas.Series
        One row of WAP readings, indexed by WAP attribute name.
    no_signal : scalar, optional
        Placeholder value that means "WAP not detected" (default 100).

    Returns
    -------
    pandas.Index
        Labels of the entries whose value differs from `no_signal`, in their original order.
    """
    #the boolean mask keeps only the entries with a reading; their index holds the attribute names
    return observation[observation != no_signal].index


#test calling the function
cleandata.iloc[1]
applWAPs(cleandata.iloc[0,0:465])

In [21]:
#test2 calling the building data array using the function as the columns
buildingdata[applWAPs(cleandata.iloc[0,0:465])]

Unnamed: 0_level_0,WAP090,WAP091,WAP103,WAP104,WAP105,WAP106,WAP125,WAP126,WAP166,WAP167,WAP168,WAP169,WAP171,WAP172,WAP173,WAP191,WAP192
BUILDINGID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,0,0,6,5,0,0,0,0,55,57,3,6,1,163,155,9,9
1,1006,1012,935,1022,926,939,789,798,1085,1106,654,666,880,1029,1043,569,582
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
#test3 coming up with the by building sums
buildingdata[applWAPs(cleandata.iloc[0,0:465])].sum(axis=1)

BUILDINGID
0      469
1    15041
2        0
dtype: int64

In [23]:
#final individual test, assigning the data
buildingdata[applWAPs(cleandata.iloc[0,0:465])].sum(axis=1)[0]

469

In [24]:
#some test data for doing a group of them
#per-building totals for each of the first four observations, printed one after another
for row in range(4):
    print(buildingdata[applWAPs(cleandata.iloc[row,0:465])].sum(axis=1))

BUILDINGID
0      469
1    15041
2        0
dtype: int64
BUILDINGID
0      367
1    13582
2        0
dtype: int64
BUILDINGID
0     1100
1    13400
2     2599
dtype: int64
BUILDINGID
0     2040
1    13486
2        0
dtype: int64


apply(buildingdata[cleandata.iloc[0:4,0:465].applWAPs],axis=1).sum(axis=1)
#this isn't going to work as is.  Need to make another function that can be used in the apply

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-64-ba036472984f> in <module>()
----> 1 apply(buildingdata[cleandata.iloc[0:4,0:465].applWAPs],axis=1).sum(axis=1)

NameError: name 'apply' is not defined

In [25]:
#function to return the buildingdata[applWAPs(cleandata.iloc[0,0:465])].sum(axis=1) type data
#and then maybe generalize to send back all 3 variables at once to do a single call?
#(efficiency, so that the same aggregation is not happening 3 times)

def buildingsums(observation):
    """Total the per-building detection counts over the WAPs seen in one observation.

    Picks the `buildingdata` columns for the WAPs that `applWAPs` reports for
    `observation`, adds them up per building, and returns the totals for
    buildings 0, 1 and 2 (in that order) as a 3-tuple.
    """
    per_building = buildingdata[applWAPs(observation)].sum(axis=1)
    return tuple(per_building[bldg] for bldg in (0, 1, 2))

In [26]:
#test
buildingsums(cleandata.iloc[0,0:465])
#fantabular

(469, 15041, 0)

In [27]:
#apply method to get all
cleandata.iloc[0:4,0:465].apply(buildingsums,axis=1)

0        (469, 15041, 0)
1        (367, 13582, 0)
2    (1100, 13400, 2599)
3       (2040, 13486, 0)
dtype: object

In [28]:
#to add to an array
temparray = cleandata[0:4]
temparray2 = pd.concat((temparray, temparray.iloc[:,0:465].apply(buildingsums,axis=1)), axis=1)
print(temparray2)

#can we call one of them?
#nope, can't figure out how to call the nested tuple by name or numerical index
#temparray.iloc[0,475][1]

   WAP001  WAP002  WAP005  WAP006  WAP007  WAP008  WAP009  WAP010  WAP011  \
0     100     100     100     100     100     100     100     100     100   
1     100     100     100     100     100     100     100     100     100   
2     100     100     100     100     100     -97     100     100     100   
3     100     100     100     100     100     100     100     100     100   

   WAP012         ...           LONGITUDE      LATITUDE  FLOOR  BUILDINGID  \
0     100         ...          -7541.2643  4.864921e+06      2           1   
1     100         ...          -7536.6212  4.864934e+06      2           1   
2     100         ...          -7519.1524  4.864950e+06      2           1   
3     100         ...          -7524.5704  4.864934e+06      2           1   

   SPACEID  RELATIVEPOSITION  USERID  PHONEID   TIMESTAMP                    0  
0      106                 2       2       23  1371713733      (469, 15041, 0)  
1      106                 2       2       23  1371713691    

In [29]:
#let's see if we can do it other ways
#temparray is a row slice of cleandata rather than an independent copy, so the column assignment
#below is what produces the "value is trying to be set on a copy of a slice" warning in the output
temparray = cleandata[0:4]
#a tuple used as the key is a single column label here: each row's whole 3-tuple is stored in one
#column labelled ('bldg0sum','bldg1sum','bldg2sum'), it is not spread over three separate columns
temparray[('bldg0sum','bldg1sum','bldg2sum')] =temparray.iloc[:,0:465].apply(buildingsums,axis=1)
print(temparray)
#same result

#can we call one of them?
#temparray['bldg0sum']
#nope

   WAP001  WAP002  WAP005  WAP006  WAP007  WAP008  WAP009  WAP010  WAP011  \
0     100     100     100     100     100     100     100     100     100   
1     100     100     100     100     100     100     100     100     100   
2     100     100     100     100     100     -97     100     100     100   
3     100     100     100     100     100     100     100     100     100   

   WAP012               ...                LONGITUDE      LATITUDE  FLOOR  \
0     100               ...               -7541.2643  4.864921e+06      2   
1     100               ...               -7536.6212  4.864934e+06      2   
2     100               ...               -7519.1524  4.864950e+06      2   
3     100               ...               -7524.5704  4.864934e+06      2   

   BUILDINGID  SPACEID  RELATIVEPOSITION  USERID  PHONEID   TIMESTAMP  \
0           1      106                 2       2       23  1371713733   
1           1      106                 2       2       23  1371713691   
2        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [30]:
#lets try a tweaked function

def buildingsums2(observation):
    """Same totals as `buildingsums`, returned as a dict keyed by the target column names.

    The result maps 'bldg0sum', 'bldg1sum' and 'bldg2sum' to the summed `buildingdata`
    counts of buildings 0, 1 and 2 over the WAPs that `applWAPs` reports for `observation`.
    """
    per_building = buildingdata[applWAPs(observation)].sum(axis=1)
    names = ('bldg0sum', 'bldg1sum', 'bldg2sum')
    return {name: per_building[bldg] for bldg, name in enumerate(names)}

In [31]:
#to add to an array
temparray = cleandata[0:4]
temparray2 = pd.concat((temparray, temparray.iloc[:,0:465].apply(buildingsums2,axis=1)), axis=1)
print(temparray2)
#nope, still a tuple

   WAP001  WAP002  WAP005  WAP006  WAP007  WAP008  WAP009  WAP010  WAP011  \
0     100     100     100     100     100     100     100     100     100   
1     100     100     100     100     100     100     100     100     100   
2     100     100     100     100     100     -97     100     100     100   
3     100     100     100     100     100     100     100     100     100   

   WAP012                        ...                          LONGITUDE  \
0     100                        ...                         -7541.2643   
1     100                        ...                         -7536.6212   
2     100                        ...                         -7519.1524   
3     100                        ...                         -7524.5704   

       LATITUDE  FLOOR  BUILDINGID  SPACEID  RELATIVEPOSITION  USERID  \
0  4.864921e+06      2           1      106                 2       2   
1  4.864934e+06      2           1      106                 2       2   
2  4.864950e+06    

In [32]:
#so, let's see if we can do this on the whole enchilada
#and let's concatenate in place so we don't have to make more variables
#what could go wrong?
#one 3-tuple (building 0, 1, 2 totals) per observation
bldg_sums = cleandata.iloc[:,0:465].apply(buildingsums,axis=1)
#spread the tuples over three named columns, keeping the row index so the concat below lines up
bldg_sums = pd.DataFrame(bldg_sums.tolist(), index=bldg_sums.index,
                         columns=["bldg0sum","bldg1sum", "bldg2sum"])
#trim back to the original 474 columns first, so rerunning this cell does not stack duplicate sum columns
#(same guard as the later cells use); no scratch column is written into cleandata any more
cleandata=pd.concat((cleandata.iloc[:,0:474], bldg_sums), axis=1)
print(cleandata.head(5))
#of course, nothing goes wrong.  I never have trouble with Python.  Hooray!

   WAP001  WAP002  WAP005  WAP006  WAP007  WAP008  WAP009  WAP010  WAP011  \
0     100     100     100     100     100     100     100     100     100   
1     100     100     100     100     100     100     100     100     100   
2     100     100     100     100     100     -97     100     100     100   
3     100     100     100     100     100     100     100     100     100   
5     100     100     100     100     100     100     100     100     100   

   WAP012    ...     FLOOR  BUILDINGID  SPACEID  RELATIVEPOSITION  USERID  \
0     100    ...         2           1      106                 2       2   
1     100    ...         2           1      106                 2       2   
2     100    ...         2           1      103                 2       2   
3     100    ...         2           1      102                 2       2   
5     100    ...         2           1      105                 2       2   

   PHONEID   TIMESTAMP  bldg0sum  bldg1sum  bldg2sum  
0       23  1371713

In [33]:
#now, to make the other munged variables for the first learning model to use
#let's start with a function to 'guess' which building is the winner - the max
def bldgguess(observation):
    """Guess the building as the one with the largest of the three building sums.

    `observation` must provide 'bldg0sum', 'bldg1sum' and 'bldg2sum'.
    Returns 0, 1 or 2 as a plain int.

    Ties are not reported separately: a 0/1 tie gives 1, while a 0/2 tie, a 1/2 tie
    and a three-way tie all give 2.  The risk was deemed very low; a separate
    category (e.g. 3) could be added for ties if that turns out to matter.
    """
    b0 = observation['bldg0sum']
    b1 = observation['bldg1sum']
    b2 = observation['bldg2sum']
    #building 0 has to strictly beat both of the others
    if b0 > b1 and b0 > b2:
        return 0
    #building 1 wins when 0 did not beat it and it strictly beats 2
    if not (b0 > b1) and b1 > b2:
        return 1
    #everything else, including all ties involving building 2, falls to 2
    return 2

In [34]:
#test
bldgguess(cleandata.iloc[0,474:477])
#fantabular

1

In [35]:
#apply method to get all
cleandata.iloc[0:4,474:477].apply(bldgguess,axis=1)

0    1
1    1
2    1
3    1
dtype: int64

In [36]:
#so, let's see if we can do this on the whole enchilada

#run this first if rerunning
cleandata = cleandata.iloc[:, :477]

#one guess per observation, computed from the three building-sum columns (positions 474-476)
guesses = cleandata.iloc[:, 474:477].apply(bldgguess, axis=1)
guesses.name = 'bldgguess'
cleandata = pd.concat([cleandata, guesses], axis=1)
print(cleandata.head(5))

   WAP001  WAP002  WAP005  WAP006  WAP007  WAP008  WAP009  WAP010  WAP011  \
0     100     100     100     100     100     100     100     100     100   
1     100     100     100     100     100     100     100     100     100   
2     100     100     100     100     100     -97     100     100     100   
3     100     100     100     100     100     100     100     100     100   
5     100     100     100     100     100     100     100     100     100   

   WAP012    ...      BUILDINGID  SPACEID  RELATIVEPOSITION  USERID  PHONEID  \
0     100    ...               1      106                 2       2       23   
1     100    ...               1      106                 2       2       23   
2     100    ...               1      103                 2       2       23   
3     100    ...               1      102                 2       2       23   
5     100    ...               1      105                 2       2       23   

    TIMESTAMP  bldg0sum  bldg1sum  bldg2sum  bldgguess  

In [37]:
#now, some % of total variables
#to avoid the hassle of tuple unpacking contortions nonsense, will call it 3 times
#and python will just have to suffer through doing the sum 3 times

#start with a function
def bldgratios(observation, bldg):
    """Share of one building's sum in the total of all three building sums.

    `observation` must provide 'bldg0sum', 'bldg1sum' and 'bldg2sum';
    `bldg` is 0, 1 or 2 and selects which building's sum is the numerator.
    """
    var = {0:'bldg0sum', 1:'bldg1sum', 2:'bldg2sum'}
    numerator = observation[var[bldg]]
    total = observation[var[0]] + observation[var[1]] + observation[var[2]]
    return numerator / total

In [38]:
#make enchiladas
#make them spicy/risky by trying apply with extra arg
#run this first if rerunning: trims back to the 478 columns present before the ratio columns are
#added (474 original + 3 sums + bldgguess), matching the rerun guards in the neighbouring cells
cleandata=cleandata.iloc[:,0:478]

#one ratio column per building; the extra bldg argument is forwarded to bldgratios by apply
for bldg in (0, 1, 2):
    ratio = cleandata.iloc[:,474:477].apply(bldgratios,axis=1, bldg=bldg)
    cleandata=pd.concat((cleandata, ratio.rename('bldg{}ratio'.format(bldg))), axis=1)
print(cleandata.head(5))
#mondoriffic, first try!
#I mean, yeah, all the ones above were first try too.  Brilliant

   WAP001  WAP002  WAP005  WAP006  WAP007  WAP008  WAP009  WAP010  WAP011  \
0     100     100     100     100     100     100     100     100     100   
1     100     100     100     100     100     100     100     100     100   
2     100     100     100     100     100     -97     100     100     100   
3     100     100     100     100     100     100     100     100     100   
5     100     100     100     100     100     100     100     100     100   

   WAP012     ...      USERID  PHONEID   TIMESTAMP  bldg0sum  bldg1sum  \
0     100     ...           2       23  1371713733       469     15041   
1     100     ...           2       23  1371713691       367     13582   
2     100     ...           2       23  1371714095      1100     13400   
3     100     ...           2       23  1371713807      2040     13486   
5     100     ...           2       23  1371713841      1623     11490   

   bldg2sum  bldgguess  bldg0ratio  bldg1ratio  bldg2ratio  
0         0          1    0.030

In [39]:
#one more ratio type
#I can't effectively explain why I want to add this one, but the same reason having a normalized ratio is important
#over just the sums, we need something similar for the ratios themselves to give force to the degree the winner exceeds the other
#I expect that in the learning model will interact with bldgguess
#that is, if bldgguess doesn't win outright anyway
#what it is is the degree to which it exceeds the next one
def bldgsuperratio(observation):
    """Factor by which the largest building sum exceeds the runner-up.

    `observation` must provide 'bldg0sum', 'bldg1sum' and 'bldg2sum'.
    Returns largest / second largest, or the arbitrarily high value 999 when the
    second largest sum is 0 (i.e. there is no runner-up to divide by).
    """
    #sorted ascending, the middle entry is the runner-up (median) and the last one is the maximum
    ordered = sorted(observation[key] for key in ('bldg0sum', 'bldg1sum', 'bldg2sum'))
    medone, maxone = ordered[1], ordered[2]
    #no runner up: fall back to the sentinel instead of dividing by zero
    if medone == 0:
        return 999
    return maxone/medone
        

In [40]:
#do it

#run this first if rerunning
cleandata = cleandata.iloc[:, :481]

#winner-over-runner-up factor per observation, computed from the three building-sum columns
superratio = cleandata.iloc[:, 474:477].apply(bldgsuperratio, axis=1)
superratio.name = 'bldgsuperratio'
cleandata = pd.concat([cleandata, superratio], axis=1)
print(cleandata.head(5))

   WAP001  WAP002  WAP005  WAP006  WAP007  WAP008  WAP009  WAP010  WAP011  \
0     100     100     100     100     100     100     100     100     100   
1     100     100     100     100     100     100     100     100     100   
2     100     100     100     100     100     -97     100     100     100   
3     100     100     100     100     100     100     100     100     100   
5     100     100     100     100     100     100     100     100     100   

   WAP012       ...        PHONEID   TIMESTAMP  bldg0sum  bldg1sum  bldg2sum  \
0     100       ...             23  1371713733       469     15041         0   
1     100       ...             23  1371713691       367     13582         0   
2     100       ...             23  1371714095      1100     13400      2599   
3     100       ...             23  1371713807      2040     13486         0   
5     100       ...             23  1371713841      1623     11490         0   

   bldgguess  bldg0ratio  bldg1ratio  bldg2ratio  bldgsu

In [48]:
#and hopefully that will give us enough for the learning models, and maybe then some, to predict building
#which will go super fast with our 8 variables

#imports, copied over from last classification project
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

In [49]:
# Dependent variable: the building each observation was recorded in.
depVar = cleandata['BUILDINGID']

# Features: the eight engineered building-level columns, selected by name
# rather than by position (previously iloc[:, 474:482]).  If cleandata's
# column layout ever shifts, a missing column now raises KeyError instead of
# the model silently training on the wrong columns.
feature_cols = ['bldg0sum', 'bldg1sum', 'bldg2sum', 'bldgguess',
                'bldg0ratio', 'bldg1ratio', 'bldg2ratio', 'bldgsuperratio']
features = cleandata[feature_cols]
print(features.head(5))

# Make train and test sets (train_test_split's default test fraction);
# the fixed random_state keeps the split reproducible between runs.
xtrain, xtest, ytrain, ytest = train_test_split(features, depVar, random_state = 123)
print(xtrain.shape)
print(xtest.shape)

   bldg0sum  bldg1sum  bldg2sum  bldgguess  bldg0ratio  bldg1ratio  \
0       469     15041         0          1    0.030239    0.969761   
1       367     13582         0          1    0.026310    0.973690   
2      1100     13400      2599          1    0.064331    0.783672   
3      2040     13486         0          1    0.131393    0.868607   
5      1623     11490         0          1    0.123770    0.876230   

   bldg2ratio  bldgsuperratio  
0    0.000000       32.070362  
1    0.000000       37.008174  
2    0.151997        5.155829  
3    0.000000        6.610784  
5    0.000000        7.079482  
(14895, 8)
(4966, 8)


In [50]:
#random forest
rf_clf = make_pipeline(RandomForestClassifier(random_state = 123))
param_grid = {'randomforestclassifier__n_estimators': np.arange(1, 26, 5),
             'randomforestclassifier__criterion': ["gini", "entropy"]}
rf_cv = GridSearchCV(rf_clf, param_grid, cv = 10)
%timeit rf_cv.fit(xtrain, ytrain)
print("Tuned rf parameters: {}".format(rf_cv.best_params_))
print("Best score is {}".format(rf_cv.best_score_))
#print(sorted(rf_clf.get_params().keys()))

8.78 s ± 73.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Tuned rf parameters: {'randomforestclassifier__criterion': 'gini', 'randomforestclassifier__n_estimators': 16}
Best score is 0.9991943605236656
['memory', 'randomforestclassifier', 'randomforestclassifier__bootstrap', 'randomforestclassifier__class_weight', 'randomforestclassifier__criterion', 'randomforestclassifier__max_depth', 'randomforestclassifier__max_features', 'randomforestclassifier__max_leaf_nodes', 'randomforestclassifier__min_impurity_decrease', 'randomforestclassifier__min_impurity_split', 'randomforestclassifier__min_samples_leaf', 'randomforestclassifier__min_samples_split', 'randomforestclassifier__min_weight_fraction_leaf', 'randomforestclassifier__n_estimators', 'randomforestclassifier__n_jobs', 'randomforestclassifier__oob_score', 'randomforestclassifier__random_state', 'randomforestclassifier__verbose', 'randomforestclassifier__warm_start', 'steps']


In [51]:
#given the high accuracy on RF for building, probably don't need any more models, but want to set them up anyway
#for use at the other 3 levels of prediction
#gradient boosting
gb_clf = make_pipeline(GradientBoostingClassifier(random_state = 123))
param_grid_gb = { 
        'gradientboostingclassifier__n_estimators': [5, 20, 100], 
        'gradientboostingclassifier__learning_rate': [.05, .1, .5, 1.0],
        'gradientboostingclassifier__max_depth': [3, 5, 7]}
gb_cv = GridSearchCV(gb_clf, param_grid_gb, cv = 10)
%timeit gb_cv.fit(xtrain, ytrain)
print("Tuned gb parameters: {}".format(gb_cv.best_params_))
print("Best score is {}".format(gb_cv.best_score_))
#print(sorted(gb_clf.get_params().keys()))

3min 54s ± 333 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Tuned gb parameters: {'gradientboostingclassifier__learning_rate': 0.1, 'gradientboostingclassifier__max_depth': 3, 'gradientboostingclassifier__n_estimators': 100}
Best score is 0.9992614971466935


In [52]:
#knn
knn_clf = make_pipeline(KNeighborsClassifier())
param_grid_knn = { 
        'kneighborsclassifier__n_neighbors': [3, 5, 7, 9, 15, 25] 
                }
knn_cv = GridSearchCV(knn_clf, param_grid_knn, cv = 10)
%timeit knn_cv.fit(xtrain, ytrain)
print("Tuned knn parameters: {}".format(knn_cv.best_params_))
print("Best score is {}".format(knn_cv.best_score_))
#print(sorted(knn_clf.get_params().keys()))

7.08 s ± 28 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Tuned knn parameters: {'kneighborsclassifier__n_neighbors': 3}
Best score is 0.99906008727761


In [53]:
#decision tree, allows for visualization if can figure out graphviz
dt_clf = make_pipeline(DecisionTreeClassifier(random_state = 123))
param_grid = {'decisiontreeclassifier__criterion': ["gini", "entropy"]}
dt_cv = GridSearchCV(dt_clf, param_grid, cv = 10)
%timeit dt_cv.fit(xtrain, ytrain)
print("Tuned dt parameters: {}".format(dt_cv.best_params_))
print("Best score is {}".format(dt_cv.best_score_))
#print(sorted(dt_clf.get_params().keys()))

543 ms ± 2.75 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Tuned dt parameters: {'decisiontreeclassifier__criterion': 'entropy'}
Best score is 0.998992950654582


In [None]:
#svc
svc_clf = make_pipeline(SVC(random_state = 123))
param_grid_svc = { 
        'svc__kernel': ['linear','rbf'],
        'svc__C': [1, 10],
                    }
svc_cv = GridSearchCV(svc_clf, param_grid_svc, cv = 10)
%timeit svc_cv.fit(xtrain, ytrain)
print("Tuned svc parameters: {}".format(svc_cv.best_params_))
print("Best score is {}".format(svc_cv.best_score_))
#print(sorted(svc_clf.get_params().keys()))