In [1]:
# necessary Libraries
import numpy as np
import pandas as pd
import time
import pprint

#Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
# magic word for producing visualizations in notebook
get_ipython().run_line_magic('matplotlib', 'inline')


#Preprocessing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from scipy.sparse import lil_matrix

#Models
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN
from sklearn.ensemble import RandomForestRegressor


#Scoring Metrics
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as metrics
from sklearn.metrics import f1_score, fbeta_score
from sklearn.metrics import accuracy_score

import tensorflow as tf
from keras.models import Sequential
from keras.layers import *

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))
Using TensorFlow backend.


In [2]:
df = pd.read_csv("challenge1.csv")

df.drop(['Unnamed: 0', 'USERID', 'PHONEID', 'TIMESTAMP'], axis = 1, inplace=True)
col = df.columns[0:520]
for i in col:
    df[i].fillna(0, inplace=True)
df.dropna(subset=['LONGITUDE','LATITUDE', 'FLOOR', 'BUILDINGID' ], inplace=True)
#trainingData.isnull().sum()

df.iloc[:, 0:520] = np.where(df.iloc[:, 0:520] <= 0, 
                        df.iloc[:, 0:520] + 105, 
                        df.iloc[:, 0:520] - 100)

In [3]:
def preprocess_data(df):
    """
    Separates trainingData into Features and Targets
    Will also be applied to validationData
    
    INPUT: Cleaned trainingData DataFrame
    OUTPUT: trainingData as Features and Targets
    """
    
    global X1
    global y1
    global X2
    global y2
    # split the data set into features and targets(Floor and BuildingID)
    X1 = df.drop(['LONGITUDE', 'LATITUDE', 'BUILDINGID','FLOOR'], axis=1)
    y1 = df[[ 'BUILDINGID']]
    
    X2 = df.drop(['LONGITUDE', 'LATITUDE','FLOOR'], axis=1)
    y2 = df[['FLOOR']]
    
    #create Dummies for the targets to feed into the model
    y1 = pd.get_dummies(data=y1, columns=[ 'BUILDINGID'])
    y2 = pd.get_dummies(data=y2, columns=[ 'FLOOR'])   
    
    return X1, y1, X2, y2

In [4]:
def split_data(X, y):
# TO AVOID OVERFITTING: Split the training data into training and testing sets 
    global X_train
    global X_test
    global y_train
    global y_test
    
    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        y, 
                                                        test_size = 0.3, 
                                                        random_state = 42,
                                                        shuffle=True)

    # Show the results of the split
    print("Training set has {} samples.".format(X_train.shape[0]))
    print("Testing set has {} samples.".format(X_test.shape[0]))
    return X_train, X_test, y_train, y_test

In [5]:
X1, y1, X2, y2 = preprocess_data(df)
X_train1, X_test1, y_train1, y_test1 = split_data(X1,y1)
X_train2, X_test2, y_train2, y_test2 = split_data(X2,y2)

Training set has 13411 samples.
Testing set has 5748 samples.
Training set has 13411 samples.
Testing set has 5748 samples.


In [6]:
#Scale Data with Standard Scaler

scaler = StandardScaler()

#Fit only the training set
#this will help us transform the validation data 
scaler.fit(X_train1)
    
# Apply transform to both the training set and the test set.
X_train1 = scaler.transform(X_train1)
X_test1 = scaler.transform(X_test1)

scaler.fit(X_train2)
    
# Apply transform to both the training set and the test set.
X_train2 = scaler.transform(X_train2)
X_test2 = scaler.transform(X_test2)

In [7]:
#Apply PCA while keeping 95% of the variation in the data
pca = PCA(.95)

    
#Fit only the training set    
pca.fit(X_train1)

# Apply PCA transform to both the training set and the test set.    
X_train_pca1 = pca.transform(X_train1)
X_test_pca1 = pca.transform(X_test1)

print("Number of PCA Components = {}.".format(pca.n_components_))
#print(pca.n_components_)
print("Total Variance Explained by PCA Components = {}.".format(pca.explained_variance_ratio_.sum()))
#print(pca.explained_variance_ratio_.sum())


#Fit only the training set    
pca.fit(X_train2)

# Apply PCA transform to both the training set and the test set.    
X_train_pca2 = pca.transform(X_train2)
X_test_pca2 = pca.transform(X_test2)

print("Number of PCA Components = {}.".format(pca.n_components_))
#print(pca.n_components_)
print("Total Variance Explained by PCA Components = {}.".format(pca.explained_variance_ratio_.sum()))
#print(pca.explained_variance_ratio_.sum())



Number of PCA Components = 469.
Total Variance Explained by PCA Components = 0.9503886620587898.
Number of PCA Components = 469.
Total Variance Explained by PCA Components = 0.9502079771996365.


In [8]:
y_train1 = lil_matrix(y_train1).toarray()
y_test1 = lil_matrix(y_test1).toarray()
y_train2 = lil_matrix(y_train2).toarray()
y_test2 = lil_matrix(y_test2).toarray()

In [9]:
start_time = time.time()

# Define the model
model_1 = Sequential()
model_1.add(Dense(50, input_dim=pca.n_components_, activation='relu'))
model_1.add(Dense(80, activation='relu'))
model_1.add(Dense(50, activation='relu'))
model_1.add(Dense(3, activation='softmax'))
model_1.compile(loss='binary_crossentropy', optimizer='adam')

# Train the model
model_1.fit(
    X_train_pca1,
    y_train1,
    epochs=100,
    shuffle=True,
    verbose=2
)

predictions1 = np.round(model_1.predict(X_test_pca1))

# accuracy
print("Accuracy of predicting buildings = ",accuracy_score(y_test1,predictions1))

print("--- Run time: %s mins ---" % np.round(((time.time() - start_time)/60),2))

Epoch 1/100
 - 2s - loss: 0.1847
Epoch 2/100
 - 1s - loss: 0.0349
Epoch 3/100
 - 1s - loss: 0.0145
Epoch 4/100
 - 1s - loss: 0.0064
Epoch 5/100
 - 1s - loss: 0.0122
Epoch 6/100
 - 1s - loss: 0.0120
Epoch 7/100
 - 1s - loss: 0.0056
Epoch 8/100
 - 1s - loss: 0.0011
Epoch 9/100
 - 1s - loss: 4.7929e-04
Epoch 10/100
 - 1s - loss: 8.8522e-05
Epoch 11/100
 - 1s - loss: 4.0280e-05
Epoch 12/100
 - 1s - loss: 2.7476e-05
Epoch 13/100
 - 1s - loss: 1.9562e-05
Epoch 14/100
 - 2s - loss: 1.4323e-05
Epoch 15/100
 - 1s - loss: 1.0599e-05
Epoch 16/100
 - 1s - loss: 7.9789e-06
Epoch 17/100
 - 1s - loss: 6.0986e-06
Epoch 18/100
 - 1s - loss: 4.6744e-06
Epoch 19/100
 - 1s - loss: 3.6085e-06
Epoch 20/100
 - 1s - loss: 2.8006e-06
Epoch 21/100
 - 1s - loss: 2.1814e-06
Epoch 22/100
 - 1s - loss: 1.7090e-06
Epoch 23/100
 - 1s - loss: 1.3483e-06
Epoch 24/100
 - 2s - loss: 1.0573e-06
Epoch 25/100
 - 1s - loss: 8.4412e-07
Epoch 26/100
 - 1s - loss: 6.7886e-07
Epoch 27/100
 - 1s - loss: 5.5291e-07
Epoch 28/100
 -

In [10]:
start_time = time.time()

# Define the model
model_2 = Sequential()
model_2.add(Dense(50, input_dim=pca.n_components_, activation='relu'))
model_2.add(Dense(80, activation='relu'))
model_2.add(Dense(50, activation='relu'))
model_2.add(Dense(5, activation='softmax'))
model_2.compile(loss='binary_crossentropy', optimizer='adam')

# Train the model
model_2.fit(
    X_train_pca2,
    y_train2,
    epochs=100,
    shuffle=True,
    verbose=2
)

predictions2 = np.round(model_2.predict(X_test_pca2))

# accuracy
print("Accuracy of predicting floors = ",accuracy_score(y_test2,predictions2))

print("--- Run time: %s mins ---" % np.round(((time.time() - start_time)/60),2))

Epoch 1/100
 - 2s - loss: 0.3219
Epoch 2/100
 - 1s - loss: 0.1707
Epoch 3/100
 - 1s - loss: 0.1133
Epoch 4/100
 - 1s - loss: 0.0765
Epoch 5/100
 - 1s - loss: 0.0510
Epoch 6/100
 - 1s - loss: 0.0360
Epoch 7/100
 - 1s - loss: 0.0268
Epoch 8/100
 - 1s - loss: 0.0192
Epoch 9/100
 - 1s - loss: 0.0232
Epoch 10/100
 - 1s - loss: 0.0257
Epoch 11/100
 - 1s - loss: 0.0153
Epoch 12/100
 - 1s - loss: 0.0090
Epoch 13/100
 - 1s - loss: 0.0121
Epoch 14/100
 - 1s - loss: 0.0143
Epoch 15/100
 - 1s - loss: 0.0144
Epoch 16/100
 - 1s - loss: 0.0116
Epoch 17/100
 - 1s - loss: 0.0123
Epoch 18/100
 - 1s - loss: 0.0099
Epoch 19/100
 - 1s - loss: 0.0134
Epoch 20/100
 - 1s - loss: 0.0110
Epoch 21/100
 - 1s - loss: 0.0061
Epoch 22/100
 - 1s - loss: 0.0079
Epoch 23/100
 - 1s - loss: 0.0077
Epoch 24/100
 - 1s - loss: 0.0095
Epoch 25/100
 - 1s - loss: 0.0129
Epoch 26/100
 - 1s - loss: 0.0107
Epoch 27/100
 - 1s - loss: 0.0076
Epoch 28/100
 - 1s - loss: 0.0075
Epoch 29/100
 - 1s - loss: 0.0054
Epoch 30/100
 - 1s - lo

In [11]:
predictions = np.hstack((predictions1, predictions2)) 
y_test = np.hstack((y_test1, y_test2))  
# accuracy
print("Total Accuracy = ",accuracy_score(y_test,predictions))

print("--- Run time: %s mins ---" % np.round(((time.time() - start_time)/60),2))

Total Accuracy =  0.7994084899095337
--- Run time: 2.0 mins ---
