# Methods to be tested
Three methods will be used to predict positioning. All methods will predict location one section at a time in the following order:  

Building > Floor > Position > Space  

Ideally, based on a correct prediction we would then subset the data to only the remaining records. e.g. When building 2 is predicted then the data is filtered to only building 2 records then floor is predicted and so on.  

Since there a many possible scenarios of predictions results we will test this theory by filtering the largest possible subset of data and smallest possible subset of data between each prediction. We can take an average from method 1 and 2 to get a good idea what kind of accuracy we can expect from subsetting the data. Additionally, our last method we will not subset at all between predictions.  

•	Method 1: Filter down the data by largest possible subset between predictions.  
•	Method 2: Filter down the data by smallest possible subset between predictions.  
    o	Average to be taken from methods 1 and 2 to estimate score of many possible subsetting scenarios  

•	Method 3: Use full dataset each time for predictions.


### Import Libraries

In [2]:
#numpy, pandas, scipy, math, matplotlib, time
import numpy as np
import pandas as pd
import scipy
from math import sqrt
import matplotlib.pyplot as plt
from time import time

#preprocessing/feature selection
from sklearn.feature_selection import VarianceThreshold

#estimators
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

#model metrics
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

#cross validation/tuning
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Set random seed
np.random.seed(0)

### Data Import

In [4]:
#data
wifi = pd.read_csv('prepdata.csv')
wifi.head()

Unnamed: 0,longitude,latitude,position,floor,building,space,location,userID,phoneID,timestamp,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
0,-7541.2643,4864920.778,2,2,1,106,221106,2,23,1371713733,...,100,100,100,100,100,100,100,100,100,100
1,-7536.6212,4864934.225,2,2,1,106,221106,2,23,1371713691,...,100,100,100,100,100,100,100,100,100,100
2,-7519.1524,4864949.532,2,2,1,103,221103,2,23,1371714095,...,100,100,100,100,100,100,100,100,100,100
3,-7524.5704,4864934.093,2,2,1,102,221102,2,23,1371713807,...,100,100,100,100,100,100,100,100,100,100
4,-7632.1436,4864982.217,2,0,0,122,200122,11,13,1369909710,...,100,100,100,100,100,100,100,100,100,100


## Predict Building

### Select Features

In [5]:
#features
features = wifi.iloc[:,10:475]
print('Summary of feature sample')
features.head()

Summary of feature sample


Unnamed: 0,WAP001,WAP002,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,WAP011,WAP012,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
0,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
1,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
2,100,100,100,100,100,-97,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
3,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
4,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100


### Select Dependent Variable

In [6]:
#dependent variable
depVar = wifi['building']

### Split Train/Test Data at a 70/30 Ratio

In [7]:
X_train, X_test, y_train, y_test = train_test_split(features, depVar, test_size=.30, random_state=0)
print(" X_train",X_train.shape,'\n',"X_test", X_test.shape,'\n',"y_train", y_train.shape,'\n',"y_test", y_test.shape)

 X_train (13955, 465) 
 X_test (5982, 465) 
 y_train (13955,) 
 y_test (5982,)


### Build Models

In [8]:
modelRF = RandomForestClassifier( random_state=0)
modelSV = SVC(random_state=0)
modelKN = KNeighborsClassifier()
modelGB = GradientBoostingClassifier(random_state=0)

### Train and score all models

In [9]:
#Random Forest
modelRF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [10]:
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train,y_train)

[0.99763543 0.99806535 0.9972049 ]


0.9980652096022931

In [38]:
#Support Vector Classification
modelSV.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [39]:
print(cross_val_score(modelSV, X_train, y_train))
modelSV.score(X_train,y_train)

[0.61865864 0.6177988  0.62266179]


0.9980652096022931

In [40]:
#K Nearest Neighbor
modelKN.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [41]:
print(cross_val_score(modelKN, X_train, y_train))
modelKN.score(X_train,y_train)

[0.99699054 0.99742046 0.99677489]


0.9977069150841992

In [42]:
#Gradient Boosting Classifier
modelGB.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)

In [43]:
print(cross_val_score(modelGB, X_train, y_train))
modelGB.score(X_train,y_train)

[0.99785039 0.99828031 0.99784992]


0.9980652096022931

### Building Predictions

In [11]:
#Random Forest
RFpred = modelRF.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,RFpred),"\n","  Kappa: ",cohen_kappa_score(y_test,RFpred))

Accuracy:  0.9976596456034771 
   Kappa:  0.9963315199000853


In [45]:
#Support Vector Classification
SVpred = modelSV.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,SVpred),"\n","  Kappa: ",cohen_kappa_score(y_test,SVpred))

Accuracy:  0.6559679037111334 
   Kappa:  0.3930650918939512


In [46]:
#K Nearest Neighbor
KNpred = modelKN.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,KNpred),"\n","  Kappa: ",cohen_kappa_score(y_test,KNpred))

Accuracy:  0.9973253092611167 
   Kappa:  0.9958083965290517


In [47]:
#Gradient Boosted Classification
GBpred = modelGB.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,GBpred),"\n","  Kappa: ",cohen_kappa_score(y_test,GBpred))

Accuracy:  0.9976596456034771 
   Kappa:  0.9963319334781235


# Method 1:
####  Filter data by largest subset.

## Predict Floor

In [50]:
#filter data to only largest building subset
building2 = wifi.query('building==2')
building2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9492 entries, 46 to 19933
Columns: 475 entries, longitude to WAP519
dtypes: float64(2), int64(473)
memory usage: 34.5 MB


### Select Features

In [51]:
#features
features = building2.iloc[:,10:475]
print('Summary of feature sample')
features.head()

Summary of feature sample


Unnamed: 0,WAP001,WAP002,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,WAP011,WAP012,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
46,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,-93,100,100
47,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,-89,100,100
49,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,-91,100
50,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,-90,100,100
51,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,-92,100,100


### Select Dependent Variable

In [53]:
#dependent variable
depVar = building2['floor']

### Split Train/Test Data at a 70/30 Ratio

In [54]:
X_train, X_test, y_train, y_test = train_test_split(features, depVar, test_size=.30, random_state=0)
print(" X_train",X_train.shape,'\n',"X_test", X_test.shape,'\n',"y_train", y_train.shape,'\n',"y_test", y_test.shape)

 X_train (6644, 465) 
 X_test (2848, 465) 
 y_train (6644,) 
 y_test (2848,)


### Build Models

In [55]:
modelRF = RandomForestClassifier( random_state=0)
modelSV = SVC(random_state=0)
modelKN = KNeighborsClassifier()
modelGB = GradientBoostingClassifier(random_state=0)

### Train and score all models

In [56]:
#Random Forest
modelRF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [57]:
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train,y_train)

[0.97969314 0.9751693  0.97424311]


0.9996989765201686

In [59]:
#Support Vector Classification
modelSV.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [60]:
print(cross_val_score(modelSV, X_train, y_train))
modelSV.score(X_train,y_train)

[0.3984657  0.40225734 0.40352463]


1.0

In [61]:
#K Nearest Neighbor
modelKN.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [62]:
print(cross_val_score(modelKN, X_train, y_train))
modelKN.score(X_train,y_train)

[0.97066787 0.97020316 0.96972436]


0.9890126429861529

In [63]:
#Gradient Boosting Classifier
modelGB.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)

In [64]:
print(cross_val_score(modelGB, X_train, y_train))
modelGB.score(X_train,y_train)

[0.99548736 0.99277652 0.9945775 ]


1.0

### Floor Predictions

In [65]:
#Random Forest
RFpred = modelRF.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,RFpred),"\n","  Kappa: ",cohen_kappa_score(y_test,RFpred))

Accuracy:  0.9845505617977528 
   Kappa:  0.9802202710968175


In [66]:
#Support Vector Classification
SVpred = modelSV.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,SVpred),"\n","  Kappa: ",cohen_kappa_score(y_test,SVpred))

Accuracy:  0.43082865168539325 
   Kappa:  0.21795167591415265


In [67]:
#K Nearest Neighbor
KNpred = modelKN.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,KNpred),"\n","  Kappa: ",cohen_kappa_score(y_test,KNpred))

Accuracy:  0.9789325842696629 
   Kappa:  0.9730337163761784


In [68]:
#Gradient Boosted Classification
GBpred = modelGB.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,GBpred),"\n","  Kappa: ",cohen_kappa_score(y_test,GBpred))

Accuracy:  0.9950842696629213 
   Kappa:  0.993710497374221


## Predict Position

In [431]:
#filter data to only largest building/floor subset
floor3 = wifi[(wifi.building == 2) & (wifi.floor == 3)]
floor3

Unnamed: 0,longitude,latitude,position,floor,building,space,location,userID,phoneID,timestamp,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
46,-7331.748000,4864767.318,2,3,2,247,232247,2,23,1371716690,...,100,100,100,100,100,100,100,-93,100,100
47,-7331.400500,4864768.479,2,3,2,248,232248,2,23,1371716637,...,100,100,100,100,100,100,100,-89,100,100
49,-7327.492165,4864766.669,1,3,2,247,132247,2,23,1371716719,...,100,100,100,100,100,100,100,100,-91,100
50,-7336.700400,4864764.479,2,3,2,246,232246,2,23,1371716762,...,100,100,100,100,100,100,100,-90,100,100
51,-7337.394800,4864763.227,2,3,2,245,232245,2,23,1371716797,...,100,100,100,100,100,100,100,-92,100,100
52,-7340.524600,4864757.586,2,3,2,244,232244,2,23,1371716826,...,100,100,100,100,100,100,100,100,100,100
53,-7341.215000,4864756.341,2,3,2,243,232243,2,23,1371716857,...,100,100,100,100,100,100,100,100,100,100
54,-7339.153887,4864749.385,1,3,2,242,132242,2,23,1371716935,...,100,100,100,100,100,100,100,100,100,100
55,-7337.232139,4864752.849,1,3,2,243,132243,2,23,1371716982,...,100,100,100,100,100,100,100,100,100,100
56,-7344.968400,4864749.576,2,3,2,241,232241,2,23,1371717041,...,100,100,100,100,100,100,100,100,100,100


### Select Features

In [432]:
#features
features = floor3.iloc[:,10:475]
print('Summary of feature sample')
features.head()

Summary of feature sample


Unnamed: 0,WAP001,WAP002,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,WAP011,WAP012,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
46,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,-93,100,100
47,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,-89,100,100
49,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,-91,100
50,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,-90,100,100
51,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,-92,100,100


### Select Dependent Variable

In [433]:
#dependent variable
depVar = floor3['position']

### Split Train/Test Data at a 70/30 Ratio

In [434]:
X_train, X_test, y_train, y_test = train_test_split(features, depVar, test_size=.30, random_state=0)
print(" X_train",X_train.shape,'\n',"X_test", X_test.shape,'\n',"y_train", y_train.shape,'\n',"y_test", y_test.shape)

 X_train (1896, 465) 
 X_test (813, 465) 
 y_train (1896,) 
 y_test (813,)


### Build Models

In [438]:
modelRF = RandomForestClassifier( random_state=0)
modelSV = SVC(random_state=0)
modelKN = KNeighborsClassifier()
modelGB = GradientBoostingClassifier(random_state=0)

### Train and score all models

In [439]:
#Random Forest
modelRF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [91]:
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train,y_train)

[0.94462025 0.94936709 0.95411392]


0.9989451476793249

In [92]:
#Support Vector Classification
modelSV.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [93]:
print(cross_val_score(modelSV, X_train, y_train))
modelSV.score(X_train,y_train)

[0.89240506 0.89082278 0.89082278]


1.0

In [94]:
#K Nearest Neighbor
modelKN.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [95]:
print(cross_val_score(modelKN, X_train, y_train))
modelKN.score(X_train,y_train)

[0.94778481 0.93670886 0.93512658]


0.9688818565400844

In [96]:
#Gradient Boosting Classifier
modelGB.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)

In [97]:
print(cross_val_score(modelGB, X_train, y_train))
modelGB.score(X_train,y_train)

[0.93670886 0.94303797 0.92721519]


0.9767932489451476

### Position Predictions

In [440]:
#Random Forest
RFpred = modelRF.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,RFpred),"\n","  Kappa: ",cohen_kappa_score(y_test,RFpred))

Accuracy:  0.974169741697417 
   Kappa:  0.8570280115563371


In [99]:
#Support Vector Classification
SVpred = modelSV.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,SVpred),"\n","  Kappa: ",cohen_kappa_score(y_test,SVpred))

Accuracy:  0.8954489544895449 
   Kappa:  0.11140685877406153


In [100]:
#K Nearest Neighbor
KNpred = modelKN.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,KNpred),"\n","  Kappa: ",cohen_kappa_score(y_test,KNpred))

Accuracy:  0.959409594095941 
   Kappa:  0.7753297324456727


In [101]:
#Gradient Boosted Classification
GBpred = modelGB.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,GBpred),"\n","  Kappa: ",cohen_kappa_score(y_test,GBpred))

Accuracy:  0.948339483394834 
   Kappa:  0.6783655476432688


### Tuning Models

In [119]:
modelRF.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [446]:
RF_grid = {"max_features": [5,50,100,125, 'auto'],
           "min_samples_split": [2,3,4,5],
           "min_samples_leaf": [1,2,3,4],
           "bootstrap": [True, False],
           "criterion": ["gini", "entropy"]}

In [447]:
grid_search = GridSearchCV(modelRF, param_grid=RF_grid)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [4, None], 'max_features': [5, 50, 100, 125, 'auto'], 'min_samples_split': [2, 3, 4, 5], 'min_samples_leaf': [1, 2, 3, 4], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [448]:
grid_search.best_params_

{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 125,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [449]:
grid_search.best_score_

0.9688818565400844

### Rebuild Models with tuned parameters and repredict.

In [558]:
modelRF = RandomForestClassifier(bootstrap = False,
             criterion = 'entropy',
             max_features = 125,                    
             min_samples_leaf = 1,
             min_samples_split = 6,
             random_state = 0)

In [559]:
#Train Random Forest model
modelRF.fit(X_train,y_train)
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train,y_train)

[0.97151899 0.96360759 0.95094937]


1.0

In [560]:
#Random Forest score
RFpred = modelRF.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,RFpred),"\n","  Kappa: ",cohen_kappa_score(y_test,RFpred))

Accuracy:  0.9790897908979089 
   Kappa:  0.8889549504672071


## Predict Space

In [268]:
#filter data to only largest building/floor/position subset
position2 = wifi[(wifi.building == 2) & (wifi.floor == 3) & (wifi.position == 2)]
position2

Unnamed: 0,longitude,latitude,position,floor,building,space,location,userID,phoneID,timestamp,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
46,-7331.7480,4864767.318,2,3,2,247,232247,2,23,1371716690,...,100,100,100,100,100,100,100,-93,100,100
47,-7331.4005,4864768.479,2,3,2,248,232248,2,23,1371716637,...,100,100,100,100,100,100,100,-89,100,100
50,-7336.7004,4864764.479,2,3,2,246,232246,2,23,1371716762,...,100,100,100,100,100,100,100,-90,100,100
51,-7337.3948,4864763.227,2,3,2,245,232245,2,23,1371716797,...,100,100,100,100,100,100,100,-92,100,100
52,-7340.5246,4864757.586,2,3,2,244,232244,2,23,1371716826,...,100,100,100,100,100,100,100,100,100,100
53,-7341.2150,4864756.341,2,3,2,243,232243,2,23,1371716857,...,100,100,100,100,100,100,100,100,100,100
56,-7344.9684,4864749.576,2,3,2,241,232241,2,23,1371717041,...,100,100,100,100,100,100,100,100,100,100
57,-7344.2114,4864750.940,2,3,2,242,232242,2,23,1371716896,...,100,100,100,100,100,100,100,100,100,100
58,-7344.9684,4864749.576,2,3,2,241,232241,2,23,1371717071,...,100,100,100,100,100,100,100,100,100,100
59,-7347.1083,4864748.046,2,3,2,240,232240,2,23,1371717264,...,100,100,100,100,100,100,100,100,100,100


### Select Features

In [269]:
#features
features = position2.iloc[:,10:475]
print('Summary of feature sample')
features.head()

Summary of feature sample


Unnamed: 0,WAP001,WAP002,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,WAP011,WAP012,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
46,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,-93,100,100
47,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,-89,100,100
50,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,-90,100,100
51,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,-92,100,100
52,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100


### Select Dependent Variable

In [270]:
#dependent variable
depVar = position2['space']

### Split Train/Test Data at a 70/30 Ratio

In [271]:
X_train, X_test, y_train, y_test = train_test_split(features, depVar, test_size=.30, random_state=0)
print(" X_train",X_train.shape,'\n',"X_test", X_test.shape,'\n',"y_train", y_train.shape,'\n',"y_test", y_test.shape)

 X_train (1685, 465) 
 X_test (723, 465) 
 y_train (1685,) 
 y_test (723,)


### Build Models

In [115]:
modelRF = RandomForestClassifier( random_state=0)
modelSV = SVC(random_state=0)
modelKN = KNeighborsClassifier()
modelGB = GradientBoostingClassifier(random_state=0)

### Train and score all models

In [116]:
#Random Forest
modelRF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [117]:
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train,y_train)

[0.66723549 0.67667845 0.7260788 ]


0.997626112759644

In [109]:
#Support Vector Classification
modelSV.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [110]:
print(cross_val_score(modelSV, X_train, y_train))
modelSV.score(X_train,y_train)

[0.07508532 0.07243816 0.07879925]


0.9994065281899109

In [111]:
#K Nearest Neighbor
modelKN.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [112]:
print(cross_val_score(modelKN, X_train, y_train))
modelKN.score(X_train,y_train)

[0.54095563 0.57597173 0.58348968]


0.7922848664688428

In [113]:
#Gradient Boosting Classifier
modelGB.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)

In [114]:
print(cross_val_score(modelGB, X_train, y_train))
modelGB.score(X_train,y_train)

[0.57679181 0.58657244 0.630394  ]


0.9994065281899109

### Space Predictions

In [118]:
#Random Forest
RFpred = modelRF.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,RFpred),"\n","  Kappa: ",cohen_kappa_score(y_test,RFpred))

Accuracy:  0.7053941908713693 
   Kappa:  0.7012558973244769


In [116]:
#Support Vector Classification
SVpred = modelSV.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,SVpred),"\n","  Kappa: ",cohen_kappa_score(y_test,SVpred))

Accuracy:  0.08713692946058091 
   Kappa:  0.06330604793592831


In [117]:
#K Nearest Neighbor
KNpred = modelKN.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,KNpred),"\n","  Kappa: ",cohen_kappa_score(y_test,KNpred))

Accuracy:  0.6002766251728907 
   Kappa:  0.5946987110477464


In [118]:
#Gradient Boosted Classification
GBpred = modelGB.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,GBpred),"\n","  Kappa: ",cohen_kappa_score(y_test,GBpred))

Accuracy:  0.6473029045643154 
   Kappa:  0.6423971696685922


### Tuning Models

In [119]:
modelRF.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [143]:
RF_grid = {"max_depth": [4, None],
              "max_features": [85,100,125, 'auto'],
              "min_samples_split": [4,5],
              "min_samples_leaf": [1,4],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [144]:
grid_search = GridSearchCV(modelRF, param_grid=RF_grid)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=100, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [4, None], 'max_features': [85, 100, 125, 'auto'], 'min_samples_split': [4, 5], 'min_samples_leaf': [1, 4], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [145]:
grid_search.best_params_

{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 100,
 'min_samples_leaf': 1,
 'min_samples_split': 4}

In [146]:
grid_search.best_score_

0.7578635014836795

### Rebuild Models with tuned parameters and repredict.

In [427]:
modelRF = RandomForestClassifier(bootstrap = False,
             criterion = 'gini',
             max_features = 100,                    
             max_depth = None,
             min_samples_leaf = 1,
             min_samples_split = 3,
             random_state = 0)

In [428]:
#Train Random Forest model
modelRF.fit(X_train,y_train)
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train,y_train)

[0.75426621 0.74028269 0.74296435]


1.0

In [429]:
#Random Forest score
RFpred = modelRF.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,RFpred),"\n","  Kappa: ",cohen_kappa_score(y_test,RFpred))

Accuracy:  0.7786998616874136 
   Kappa:  0.7755795297801386


# Method 2:
#### Filter data by smallest subset.

## Predict Floor

In [48]:
#filter data to only smallest building subset
building1 = wifi.query('building==1')
building1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5196 entries, 0 to 19936
Columns: 475 entries, longitude to WAP519
dtypes: float64(2), int64(473)
memory usage: 18.9 MB


### Select Features

In [51]:
#features
features = building1.iloc[:,10:475]
print('Summary of feature sample')
features.head()

Summary of feature sample


Unnamed: 0,WAP001,WAP002,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,WAP011,WAP012,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
0,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
1,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
2,100,100,100,100,100,-97,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
3,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
5,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100


### Select Dependent Variable

In [52]:
#dependent variable
depVar = building1['floor']

### Split Train/Test Data at a 70/30 Ratio

In [53]:
X_train, X_test, y_train, y_test = train_test_split(features, depVar, test_size=.30, random_state=0)
print(" X_train",X_train.shape,'\n',"X_test", X_test.shape,'\n',"y_train", y_train.shape,'\n',"y_test", y_test.shape)

 X_train (3637, 465) 
 X_test (1559, 465) 
 y_train (3637,) 
 y_test (1559,)


### Build Models

In [54]:
modelRF = RandomForestClassifier( random_state=0)
modelSV = SVC(random_state=0)
modelKN = KNeighborsClassifier()
modelGB = GradientBoostingClassifier(random_state=0)

### Train and score all models

In [55]:
#Random Forest
modelRF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [56]:
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train,y_train)

[0.98434926 0.98433636 0.99090909]


1.0

In [57]:
#Support Vector Classification
modelSV.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [58]:
print(cross_val_score(modelSV, X_train, y_train))
modelSV.score(X_train,y_train)

[0.55848435 0.56553998 0.57603306]


1.0

In [59]:
#K Nearest Neighbor
modelKN.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [60]:
print(cross_val_score(modelKN, X_train, y_train))
modelKN.score(X_train,y_train)

[0.94728171 0.96207749 0.96115702]


0.9821281275776739

In [61]:
#Gradient Boosting Classifier
modelGB.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)

In [62]:
print(cross_val_score(modelGB, X_train, y_train))
modelGB.score(X_train,y_train)

[0.98270181 0.98516076 0.99090909]


0.9983502886994776

### Floor Predictions

In [63]:
#Random Forest
RFpred = modelRF.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,RFpred),"\n","  Kappa: ",cohen_kappa_score(y_test,RFpred))

Accuracy:  0.9916613213598461 
   Kappa:  0.9887901901027119


In [64]:
#Support Vector Classification
SVpred = modelSV.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,SVpred),"\n","  Kappa: ",cohen_kappa_score(y_test,SVpred))

Accuracy:  0.5997434252726106 
   Kappa:  0.46280920542571446


In [65]:
#K Nearest Neighbor
KNpred = modelKN.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,KNpred),"\n","  Kappa: ",cohen_kappa_score(y_test,KNpred))

Accuracy:  0.9704939063502245 
   Kappa:  0.9603652757060043


In [66]:
#Gradient Boosted Classification
GBpred = modelGB.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,GBpred),"\n","  Kappa: ",cohen_kappa_score(y_test,GBpred))

Accuracy:  0.9878127004490058 
   Kappa:  0.9836160329571011


## Predict Position

In [561]:
#filter data to only smallest building/floor subset
floor3 = wifi[(wifi.building == 1) & (wifi.floor == 3)]
floor3

Unnamed: 0,longitude,latitude,position,floor,building,space,location,userID,phoneID,timestamp,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
4629,-7536.584400,4864918.110,2,3,1,101,231101,8,1,1371714017,...,100,100,100,100,100,100,100,100,100,100
4633,-7519.111800,4864949.510,2,3,1,102,231102,8,1,1371714444,...,100,100,100,100,100,100,100,100,100,100
4634,-7513.104491,4864943.870,1,3,1,102,131102,8,1,1371714370,...,100,100,100,100,100,100,100,100,100,100
4635,-7526.320097,4864919.689,1,3,1,101,131101,8,1,1371714217,...,100,100,100,100,100,100,100,100,100,100
4636,-7524.351300,4864934.490,2,3,1,102,231102,8,1,1371714326,...,100,100,100,100,100,100,100,100,100,100
4637,-7527.402600,4864929.238,2,3,1,101,231101,8,1,1371714102,...,100,100,100,100,100,100,100,100,100,100
4638,-7533.671000,4864939.662,2,3,1,116,231116,8,1,1371714577,...,100,100,100,100,100,100,100,100,100,100
4639,-7524.863818,4864950.191,1,3,1,103,131103,8,1,1371714519,...,100,100,100,100,100,100,100,100,100,100
4640,-7523.601200,4864951.944,2,3,1,103,231103,8,1,1371714485,...,100,100,100,100,100,100,100,100,100,100
4641,-7537.092714,4864945.534,1,3,1,116,131116,8,1,1371714616,...,100,100,100,100,100,100,100,100,100,100


### Select Features

In [562]:
#features
features = floor3.iloc[:,10:475]
print('Summary of feature sample')
features.head()

Summary of feature sample


Unnamed: 0,WAP001,WAP002,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,WAP011,WAP012,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
4629,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
4633,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
4634,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
4635,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
4636,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100


### Select Dependent Variable

In [563]:
#dependent variable
depVar = floor3['position']

### Split Train/Test Data at a 70/30 Ratio

In [564]:
X_train, X_test, y_train, y_test = train_test_split(features, depVar, test_size=.30, random_state=0)
print(" X_train",X_train.shape,'\n',"X_test", X_test.shape,'\n',"y_train", y_train.shape,'\n',"y_test", y_test.shape)

 X_train (663, 465) 
 X_test (285, 465) 
 y_train (663,) 
 y_test (285,)


### Build Models

In [565]:
modelRF = RandomForestClassifier( random_state=0)
modelSV = SVC(random_state=0)
modelKN = KNeighborsClassifier()
modelGB = GradientBoostingClassifier(random_state=0)

### Train and score all models

In [566]:
#Random Forest
modelRF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [567]:
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train,y_train)

[0.86936937 0.88687783 0.90909091]


0.9773755656108597

In [76]:
#Support Vector Classification
modelSV.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [77]:
print(cross_val_score(modelSV, X_train, y_train))
modelSV.score(X_train,y_train)

[0.82882883 0.84162896 0.82272727]


0.9638009049773756

In [78]:
#K Nearest Neighbor
modelKN.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [79]:
print(cross_val_score(modelKN, X_train, y_train))
modelKN.score(X_train,y_train)

[0.84234234 0.88687783 0.87272727]


0.8989441930618401

In [80]:
#Gradient Boosting Classifier
modelGB.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)

In [81]:
print(cross_val_score(modelGB, X_train, y_train))
modelGB.score(X_train,y_train)

[0.89189189 0.87330317 0.88636364]


0.9532428355957768

### Position Predictions

In [568]:
#Random Forest
RFpred = modelRF.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,RFpred),"\n","  Kappa: ",cohen_kappa_score(y_test,RFpred))

Accuracy:  0.9403508771929825 
   Kappa:  0.7816879196142927


In [83]:
#Support Vector Classification
SVpred = modelSV.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,SVpred),"\n","  Kappa: ",cohen_kappa_score(y_test,SVpred))

Accuracy:  0.8736842105263158 
   Kappa:  0.42655935613682094


In [84]:
#K Nearest Neighbor
KNpred = modelKN.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,KNpred),"\n","  Kappa: ",cohen_kappa_score(y_test,KNpred))

Accuracy:  0.887719298245614 
   Kappa:  0.5784023668639053


In [85]:
#Gradient Boosted Classification
GBpred = modelGB.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,GBpred),"\n","  Kappa: ",cohen_kappa_score(y_test,GBpred))

Accuracy:  0.9298245614035088 
   Kappa:  0.7318656505786056


### Tuning Models

In [119]:
modelRF.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [569]:
RF_grid = {"max_features": [5,50,100,125, 'auto'],
           "min_samples_split": [2,3,4,5],
           "min_samples_leaf": [1,2,3,4],
           "bootstrap": [True, False],
           "criterion": ["gini", "entropy"]}

In [570]:
grid_search = GridSearchCV(modelRF, param_grid=RF_grid)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_features': [5, 50, 100, 125, 'auto'], 'min_samples_split': [2, 3, 4, 5], 'min_samples_leaf': [1, 2, 3, 4], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [571]:
grid_search.best_params_

{'bootstrap': False,
 'criterion': 'gini',
 'max_features': 50,
 'min_samples_leaf': 1,
 'min_samples_split': 4}

In [572]:
grid_search.best_score_

0.9064856711915535

### Rebuild Models with tuned parameters and repredict.

In [672]:
modelRF = RandomForestClassifier(bootstrap = False,
             criterion = 'gini',
             max_features = 30,                    
             min_samples_leaf = 1,
             min_samples_split = 5,
             random_state = 0)

In [673]:
#Train Random Forest model
modelRF.fit(X_train,y_train)
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train,y_train)

[0.90540541 0.89140271 0.90909091]


0.9773755656108597

In [674]:
#Random Forest score
RFpred = modelRF.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,RFpred),"\n","  Kappa: ",cohen_kappa_score(y_test,RFpred))

Accuracy:  0.9543859649122807 
   Kappa:  0.8301938677299601


## Predict Space

In [150]:
#filter data to only smallest building/floor/position subset
position1 = wifi[(wifi.building == 1) & (wifi.floor == 3) & (wifi.position == 1)]
position1

Unnamed: 0,longitude,latitude,position,floor,building,space,location,userID,phoneID,timestamp,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
4634,-7513.104491,4864943.870,1,3,1,102,131102,8,1,1371714370,...,100,100,100,100,100,100,100,100,100,100
4635,-7526.320097,4864919.689,1,3,1,101,131101,8,1,1371714217,...,100,100,100,100,100,100,100,100,100,100
4639,-7524.863818,4864950.191,1,3,1,103,131103,8,1,1371714519,...,100,100,100,100,100,100,100,100,100,100
4641,-7537.092714,4864945.534,1,3,1,116,131116,8,1,1371714616,...,100,100,100,100,100,100,100,100,100,100
4643,-7535.227970,4864893.274,1,3,1,113,131113,8,1,1371715052,...,100,100,100,100,100,100,100,100,100,100
4644,-7539.413493,4864923.973,1,3,1,104,131104,8,1,1371714921,...,100,100,100,100,100,100,100,100,100,100
4683,-7526.320097,4864919.689,1,3,1,101,131101,8,1,1371714204,...,100,100,100,100,100,100,100,100,100,100
4685,-7513.104491,4864943.870,1,3,1,102,131102,8,1,1371714354,...,100,100,100,100,100,100,100,100,100,100
4687,-7524.863818,4864950.191,1,3,1,103,131103,8,1,1371714503,...,100,100,100,100,100,100,100,100,100,100
4689,-7537.092714,4864945.534,1,3,1,116,131116,8,1,1371714602,...,100,100,100,100,100,100,100,100,100,100


### Select Features

In [151]:
#features
features = position1.iloc[:,10:475]
print('Summary of feature sample')
features.head()

Summary of feature sample


Unnamed: 0,WAP001,WAP002,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,WAP011,WAP012,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
4634,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
4635,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
4639,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
4641,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
4643,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100


### Select Dependent Variable

In [152]:
#dependent variable
depVar = position1['space']

### Split Train/Test Data at a 70/30 Ratio

In [153]:
X_train, X_test, y_train, y_test = train_test_split(features, depVar, test_size=.30, random_state=0)
print(" X_train",X_train.shape,'\n',"X_test", X_test.shape,'\n',"y_train", y_train.shape,'\n',"y_test", y_test.shape)

 X_train (125, 465) 
 X_test (54, 465) 
 y_train (125,) 
 y_test (54,)


### Build Models

In [154]:
modelRF = RandomForestClassifier( random_state=0)
modelSV = SVC(random_state=0)
modelKN = KNeighborsClassifier()
modelGB = GradientBoostingClassifier(random_state=0)

### Train and score all models

In [155]:
#Random Forest
modelRF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [156]:
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train,y_train)

[0.68888889 0.76190476 0.78947368]


0.944

In [95]:
#Support Vector Classification
modelSV.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [96]:
print(cross_val_score(modelSV, X_train, y_train))
modelSV.score(X_train,y_train)

[0.37777778 0.4047619  0.39473684]


0.936

In [97]:
#K Nearest Neighbor
modelKN.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [98]:
print(cross_val_score(modelKN, X_train, y_train))
modelKN.score(X_train,y_train)

[0.57777778 0.61904762 0.5       ]


0.768

In [99]:
#Gradient Boosting Classifier
modelGB.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)

In [100]:
print(cross_val_score(modelGB, X_train, y_train))
modelGB.score(X_train,y_train)

[0.64444444 0.66666667 0.73684211]


0.944

### Space Predictions

In [157]:
#Random Forest
RFpred = modelRF.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,RFpred),"\n","  Kappa: ",cohen_kappa_score(y_test,RFpred))

Accuracy:  0.8333333333333334 
   Kappa:  0.8154897494305239


In [102]:
#Support Vector Classification
SVpred = modelSV.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,SVpred),"\n","  Kappa: ",cohen_kappa_score(y_test,SVpred))

Accuracy:  0.35185185185185186 
   Kappa:  0.3007769145394006


In [103]:
#K Nearest Neighbor
KNpred = modelKN.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,KNpred),"\n","  Kappa: ",cohen_kappa_score(y_test,KNpred))

Accuracy:  0.6851851851851852 
   Kappa:  0.6524043922756532


In [104]:
#Gradient Boosted Classification
GBpred = modelGB.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,GBpred),"\n","  Kappa: ",cohen_kappa_score(y_test,GBpred))

Accuracy:  0.8148148148148148 
   Kappa:  0.7942857142857143


### Tuning Models

In [14]:
modelRF.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [165]:
RF_grid = {"max_depth": [None,1],
              "max_features": ['auto',1,2,3],
              "min_samples_split": [2,3,4],
              "min_samples_leaf": [1,2],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [166]:
grid_search = GridSearchCV(modelRF, param_grid=RF_grid)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [None, 1], 'max_features': ['auto', 1, 2, 3], 'min_samples_split': [2, 3, 4], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [167]:
grid_search.best_params_

{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 4}

In [168]:
grid_search.best_score_

0.784

### Rebuild Models with tuned parameters and repredict.

In [265]:
modelRF = RandomForestClassifier(bootstrap = False,
             criterion = 'gini',
             max_features = 8,                    
             max_depth = None,
             min_samples_leaf = 1,
             min_samples_split = 3,
             random_state = 0)

In [266]:
#Train Random Forest model
modelRF.fit(X_train,y_train)
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train,y_train)

[0.73333333 0.69047619 0.73684211]


0.944

In [267]:
#Random Forest score
RFpred = modelRF.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,RFpred),"\n","  Kappa: ",cohen_kappa_score(y_test,RFpred))

Accuracy:  0.9074074074074074 
   Kappa:  0.8966309341500766


# Method 3:
#### Use full data set for each prediction.

## Predict Floor

In [135]:
#features
features = wifi.iloc[:,10:475]
print('Summary of feature sample')
features.head()

Summary of feature sample


Unnamed: 0,WAP001,WAP002,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,WAP011,WAP012,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
0,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
1,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
2,100,100,100,100,100,-97,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
3,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
4,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100


### Select Dependent Variable

In [136]:
#dependent variable
depVar = wifi['floor']

### Split Train/Test Data at a 70/30 Ratio

In [137]:
X_train, X_test, y_train, y_test = train_test_split(features, depVar, test_size=.30, random_state=0)
print(" X_train",X_train.shape,'\n',"X_test", X_test.shape,'\n',"y_train", y_train.shape,'\n',"y_test", y_test.shape)

 X_train (13955, 465) 
 X_test (5982, 465) 
 y_train (13955,) 
 y_test (5982,)


### Build Models

In [138]:
modelRF = RandomForestClassifier( random_state=0)
modelSV = SVC(random_state=0)
modelKN = KNeighborsClassifier()
modelGB = GradientBoostingClassifier(random_state=0)

### Train and score all models

In [139]:
#Random Forest
modelRF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [140]:
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train,y_train)

[0.977219   0.98129837 0.97935484]


0.9974919383733429

In [141]:
#Support Vector Classification
modelSV.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [142]:
print(cross_val_score(modelSV, X_train, y_train))
modelSV.score(X_train,y_train)

[0.42725124 0.42024936 0.42473118]


0.9980652096022931

In [143]:
#K Nearest Neighbor
modelKN.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [144]:
print(cross_val_score(modelKN, X_train, y_train))
modelKN.score(X_train,y_train)

[0.9516441  0.95292347 0.95075269]


0.9762809029021856

In [145]:
#Gradient Boosting Classifier
modelGB.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)

In [146]:
print(cross_val_score(modelGB, X_train, y_train))
modelGB.score(X_train,y_train)

[0.95981087 0.96603611 0.96537634]


0.9756359727696167

### Floor Predictions

In [147]:
#Random Forest
RFpred = modelRF.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,RFpred),"\n","  Kappa: ",cohen_kappa_score(y_test,RFpred))

Accuracy:  0.9864593781344032 
   Kappa:  0.9824928080402533


In [148]:
#Support Vector Classification
SVpred = modelSV.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,SVpred),"\n","  Kappa: ",cohen_kappa_score(y_test,SVpred))

Accuracy:  0.472584419926446 
   Kappa:  0.3020110017937152


In [149]:
#K Nearest Neighbor
KNpred = modelKN.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,KNpred),"\n","  Kappa: ",cohen_kappa_score(y_test,KNpred))

Accuracy:  0.9682380474757606 
   Kappa:  0.9589434370330212


In [150]:
#Gradient Boosted Classification
GBpred = modelGB.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,GBpred),"\n","  Kappa: ",cohen_kappa_score(y_test,GBpred))

Accuracy:  0.9722500835840856 
   Kappa:  0.9641200107183397


## Predict Position

In [151]:
#features
features = wifi.iloc[:,10:475]
print('Summary of feature sample')
features.head()

Summary of feature sample


Unnamed: 0,WAP001,WAP002,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,WAP011,WAP012,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
0,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
1,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
2,100,100,100,100,100,-97,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
3,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
4,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100


### Select Dependent Variable

In [152]:
#dependent variable
depVar = wifi['position']

### Split Train/Test Data at a 70/30 Ratio

In [153]:
X_train, X_test, y_train, y_test = train_test_split(features, depVar, test_size=.30, random_state=0)
print(" X_train",X_train.shape,'\n',"X_test", X_test.shape,'\n',"y_train", y_train.shape,'\n',"y_test", y_test.shape)

 X_train (13955, 465) 
 X_test (5982, 465) 
 y_train (13955,) 
 y_test (5982,)


### Build Models

In [154]:
modelRF = RandomForestClassifier( random_state=0)
modelSV = SVC(random_state=0)
modelKN = KNeighborsClassifier()
modelGB = GradientBoostingClassifier(random_state=0)

### Train and score all models

In [155]:
#Random Forest
modelRF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [156]:
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train,y_train)

[0.93294649 0.93248764 0.93829284]


0.9960587603009674

In [157]:
#Support Vector Classification
modelSV.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [158]:
print(cross_val_score(modelSV, X_train, y_train))
modelSV.score(X_train,y_train)

[0.84891468 0.84949473 0.84970974]


0.994267287710498

In [159]:
#K Nearest Neighbor
modelKN.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [160]:
print(cross_val_score(modelKN, X_train, y_train))
modelKN.score(X_train,y_train)

[0.91424887 0.91356697 0.91485702]


0.9525618058043712

In [161]:
#Gradient Boosting Classifier
modelGB.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)

In [162]:
print(cross_val_score(modelGB, X_train, y_train))
modelGB.score(X_train,y_train)

[0.87792822 0.87938078 0.88045582]


0.89064851307775

### Position Predictions

In [163]:
#Random Forest
RFpred = modelRF.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,RFpred),"\n","  Kappa: ",cohen_kappa_score(y_test,RFpred))

Accuracy:  0.9490137077900368 
   Kappa:  0.8042780748663101


In [164]:
#Support Vector Classification
SVpred = modelSV.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,SVpred),"\n","  Kappa: ",cohen_kappa_score(y_test,SVpred))

Accuracy:  0.8542293547308593 
   Kappa:  0.20365296803652966


In [165]:
#K Nearest Neighbor
KNpred = modelKN.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,KNpred),"\n","  Kappa: ",cohen_kappa_score(y_test,KNpred))

Accuracy:  0.9269475091942494 
   Kappa:  0.715433036683308


In [166]:
#Gradient Boosted Classification
GBpred = modelGB.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,GBpred),"\n","  Kappa: ",cohen_kappa_score(y_test,GBpred))

Accuracy:  0.8813105984620528 
   Kappa:  0.4226077527785308


## Predict Space

In [675]:
#features
features = wifi.iloc[:,10:475]
print('Summary of feature sample')
features.head()

Summary of feature sample


Unnamed: 0,WAP001,WAP002,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,WAP011,WAP012,...,WAP510,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519
0,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
1,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
2,100,100,100,100,100,-97,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
3,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
4,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100


### Select Dependent Variable

In [676]:
#dependent variable
depVar = wifi['space']

### Split Train/Test Data at a 70/30 Ratio

In [677]:
X_train, X_test, y_train, y_test = train_test_split(features, depVar, test_size=.30, random_state=0)
print(" X_train",X_train.shape,'\n',"X_test", X_test.shape,'\n',"y_train", y_train.shape,'\n',"y_test", y_test.shape)

 X_train (13955, 465) 
 X_test (5982, 465) 
 y_train (13955,) 
 y_test (5982,)


### Build Models

In [678]:
modelRF = RandomForestClassifier( random_state=0)
modelSV = SVC(random_state=0)
modelKN = KNeighborsClassifier()
modelGB = GradientBoostingClassifier(random_state=0)

### Train and score all models

In [679]:
#Random Forest
modelRF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [680]:
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train,y_train)

[0.65111821 0.65282857 0.64172631]


0.9667502687208885

In [173]:
#Support Vector Classification
modelSV.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [174]:
print(cross_val_score(modelSV, X_train, y_train))
modelSV.score(X_train,y_train)

[0.16975506 0.15917402 0.1667751 ]


0.9641705481906127

In [175]:
#K Nearest Neighbor
modelKN.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [176]:
print(cross_val_score(modelKN, X_train, y_train))
modelKN.score(X_train,y_train)

[0.54824281 0.55345236 0.54651919]


0.7433894661411681

In [177]:
#Gradient Boosting Classifier
modelGB.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)

In [178]:
print(cross_val_score(modelGB, X_train, y_train))
modelGB.score(X_train,y_train)

[0.52992545 0.51086255 0.51810887]


0.9269079183088499

### Space Predictions

In [681]:
#Random Forest
RFpred = modelRF.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,RFpred),"\n","  Kappa: ",cohen_kappa_score(y_test,RFpred))

Accuracy:  0.7039451688398529 
   Kappa:  0.6999086224630227


In [180]:
#Support Vector Classification
SVpred = modelSV.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,SVpred),"\n","  Kappa: ",cohen_kappa_score(y_test,SVpred))

Accuracy:  0.21631561350718823 
   Kappa:  0.20101213353365877


In [181]:
#K Nearest Neighbor
KNpred = modelKN.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,KNpred),"\n","  Kappa: ",cohen_kappa_score(y_test,KNpred))

Accuracy:  0.6203610832497493 
   Kappa:  0.6151372838449369


In [182]:
#Gradient Boosted Classification
GBpred = modelGB.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,GBpred),"\n","  Kappa: ",cohen_kappa_score(y_test,GBpred))

Accuracy:  0.5738883316616517 
   Kappa:  0.5682128019326065


### Tuning Models

In [116]:
modelRF.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [685]:
RF_grid = {"max_features": [5,10,100,125, 'auto'],
           "min_samples_split": [2,4,5],
           "min_samples_leaf": [1,4],
           "bootstrap": [True, False],
           "criterion": ["gini", "entropy"]}

In [686]:
grid_search = GridSearchCV(modelRF, param_grid=RF_grid)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_features': [5, 10, 100, 125, 'auto'], 'min_samples_split': [2, 4, 5], 'min_samples_leaf': [1, 4], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [687]:
grid_search.best_params_

{'bootstrap': False,
 'criterion': 'entropy',
 'max_features': 125,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [688]:
grid_search.best_score_

0.7052669294159799

### Rebuild Models with tuned parameters and repredict.

In [737]:
modelRF = RandomForestClassifier(bootstrap = False,
             criterion = 'entropy',
             max_features = 125,                    
             min_samples_leaf = 1,
             min_samples_split = 6,
             random_state = 0)

In [738]:
#Train Random Forest model
modelRF.fit(X_train,y_train)
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train,y_train)

[0.68924388 0.69025597 0.68965517]


0.9697599426728771

In [739]:
#Random Forest score
RFpred = modelRF.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,RFpred),"\n","  Kappa: ",cohen_kappa_score(y_test,RFpred))

Accuracy:  0.7594450016716817 
   Kappa:  0.7562344219979844


# Summary

Random Forest was the best model across all methods. 

Method1:  
75% Accuracy  
67% Kappa
         
Method2:  
86% Accuracy  
73% Kappa
         
Method3:  
71% Accuracy  
60% Kappa