In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the three CSV parts; each part stays available under its own name.
data_1, data_2, data_3 = (
    pd.read_csv(csv_name)
    for csv_name in ("datatraining.csv", "datatest.csv", "datatest2.csv")
)

# Stack the parts, parse the 'date' column as timestamps and use it as the index.
uci_data = pd.concat([data_1, data_2, data_3])
uci_data['date'] = pd.to_datetime(uci_data['date'])
uci_data = uci_data.set_index('date')

# Resampling

In [3]:
# Average every column over consecutive 5-minute bins of the datetime index.
five_minute_bins = uci_data.resample('5min')
uci_sampling = five_minute_bins.mean()

# Fill Missing Values

In [4]:
# Linearly interpolate NaN entries, filling in the forward direction only.
uci_sampling = uci_sampling.interpolate(
    method='linear',
    limit_direction='forward',
)

In [5]:
# Round the averaged/interpolated Occupancy values up, so any fractional
# value becomes the next whole number.
uci_sampling['Occupancy'] = np.ceil(uci_sampling['Occupancy'])

# Normalize

In [6]:
def normalize(dataset, label_col='Occupancy'):
    """Min-max scale every column to [0, 1], leaving the label column untouched.

    Args:
        dataset: DataFrame of numeric columns that includes ``label_col``.
        label_col: Name of the column copied through unscaled
            (defaults to ``'Occupancy'``).

    Returns:
        A new DataFrame with the same index and columns as ``dataset``; the
        input frame is not modified.

    Raises:
        KeyError: If ``label_col`` is not a column of ``dataset``.
    """
    col_min = dataset.min()
    col_range = dataset.max() - col_min
    # A constant column has zero range; dividing by it would turn the whole
    # column into NaN (0/0). Divide by 1 instead so such columns map to 0.
    col_range = col_range.replace(0, 1)
    dataNorm = (dataset - col_min) / col_range
    # The label must keep its original values, so restore it after scaling.
    dataNorm[label_col] = dataset[label_col]
    return dataNorm

In [7]:
# Min-max scale the resampled columns; normalize() copies 'Occupancy' through unscaled.
datanorm = normalize(uci_sampling)

In [8]:
# The first five columns are the model inputs; the sixth column is the label,
# cast to integers.
X = datanorm.iloc[:, :5]
y = datanorm.iloc[:, 5].astype(int)

# Oversampling Imbalanced Data

In [9]:
from collections import Counter
from imblearn.over_sampling import SMOTE

print('Original dataset shape %s' % Counter(y))

# SMOTE draws synthetic samples at random; a fixed seed makes the resampled
# set (and every score computed from it) repeatable across runs.
sm = SMOTE(k_neighbors=1, random_state=6)
X_res, y_res = sm.fit_resample(X, y)
print('Resampled dataset shape %s' % Counter(y_res))

Using TensorFlow backend.


Original dataset shape Counter({0: 3124, 1: 1425})
Resampled dataset shape Counter({1: 3124, 0: 3124})


In [10]:
# fit_resample can hand back NumPy arrays or pandas objects depending on the
# imbalanced-learn version. Passing a DataFrame together with `columns=` makes
# pandas *select* those columns (producing all-NaN data when the names differ),
# so convert to plain arrays first and then attach the short column names.
X_res = pd.DataFrame(np.asarray(X_res), columns=['temp', 'humid', 'light', 'co2', 'humid_ratio'])
y_res = pd.DataFrame(np.asarray(y_res), columns=['Occupancy'])

# Put features and label side by side in one frame.
frames = [X_res, y_res]
df = pd.concat(frames, axis=1, sort=False)

In [11]:
# Split the resampled frame back into its five inputs and the integer label.
X_new = df.iloc[:, :5]
y_new = df.iloc[:, 5].astype(int)

# Stratified K-fold

In [12]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Ten stratified folds taken in dataset order (no shuffling).
skf = StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

feature_columns = ['temp', 'humid', 'light', 'co2', 'humid_ratio']
Xtrain, ytrain, Xtest, ytest = [], [], [], []

for train_index, test_index in skf.split(X_new, y_new):
    X_train, X_test = X_new.values[train_index], X_new.values[test_index]
    y_train, y_test = y_new.values[train_index], y_new.values[test_index]
    # Keep each fold as labelled DataFrames so later cells can index by fold.
    Xtrain.append(pd.DataFrame(X_train, columns=feature_columns))
    ytrain.append(pd.DataFrame(y_train, columns=['Occupancy']))
    Xtest.append(pd.DataFrame(X_test, columns=feature_columns))
    ytest.append(pd.DataFrame(y_test, columns=['Occupancy']))

In [17]:
# Load both CSVs and drop their 'Unnamed: 0' column (presumably a saved index).
predicted_value, ground_truth = (
    pd.read_csv(csv_name).drop(columns=['Unnamed: 0'])
    for csv_name in ('predictedbyLSTM.csv', 'groundtruthforLSTM.csv')
)

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score

# Random forest fitted on the complete resampled set.
clfrf = RandomForestClassifier(
    n_estimators=100,
    max_depth=80,
    max_features=2,
    min_samples_leaf=3,
    min_samples_split=12,
    bootstrap=True,
    oob_score=True,
    random_state=6,
)
RFfit = clfrf.fit(X_new, y_new)
# Note: this score is measured on the same rows the forest was trained on.
print(RFfit.score(X_new, y_new))


0.9950384122919335


In [19]:

# Score the fitted random forest on the rows loaded from predictedbyLSTM.csv,
# using groundtruthforLSTM.csv as the labels.
print(clfrf.score(predicted_value, ground_truth))

0.9782844733984799


In [45]:

from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
# Decision tree with library defaults (no random_state is passed).
model = DecisionTreeClassifier()

# Fit on the whole resampled set, then collect 10-fold cross-validation scores.
modelfitness = model.fit(X_new, y_new)
cross = cross_val_score(model, X_new, y_new, cv=10)
# Balanced accuracy on the same rows that were used for fitting.
DTCpredicted = modelfitness.predict(X_new)
dtcbalanced = balanced_accuracy_score(y_new, DTCpredicted)
# Balanced accuracy on the predicted_value / ground_truth pair loaded from CSV.
DTC_occ = modelfitness.predict(predicted_value)
balanced_DTC = balanced_accuracy_score(ground_truth, DTC_occ)

In [46]:
# Average of the cross-validation scores currently stored in `cross`.
np.mean(cross)

0.9630514254116489

In [47]:
# Score the fitted decision tree on predicted_value against ground_truth.
modelfitness.score(predicted_value,ground_truth)

0.9315960912052117

In [48]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbours with Manhattan distance and distance-weighted votes.
neigh = KNeighborsClassifier(
    n_neighbors=3,
    weights='distance',
    metric='manhattan',
)
# fit() hands back the estimator itself, so both names refer to the fitted model.
knnfitness = neigh.fit(X_new, y_new)
knnpredict = knnfitness.predict(X_new)
cross = cross_val_score(neigh, X_new, y_new, cv=10)

In [50]:
# Average of the cross-validation scores currently stored in `cross`.
np.mean(cross)

0.9590163430818383

In [51]:
# Score the fitted KNN model on predicted_value against ground_truth.
knnfitness.score(predicted_value,ground_truth)

0.9685124864277959

# SVC

In [13]:
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score

In [14]:
accsvm_4 = []
for i in range(10):
    # Linear-kernel SVM with C=25; every other hyperparameter stays at the
    # library default. The previous call pasted a full repr, including
    # gamma='auto_deprecated', a private sentinel that newer scikit-learn
    # releases reject (gamma is not used by the linear kernel anyway).
    mysvm = SVC(C=25, kernel='linear')
    mysvm = mysvm.fit(Xtrain[i], ytrain[i])
    my_svmpredict = mysvm.predict(Xtest[i])
    # Balanced accuracy on the held-out part of fold i.
    acc_svmtest = balanced_accuracy_score(ytest[i], my_svmpredict)
    accsvm_4.append(acc_svmtest)
print(mysvm)
print("balanced accuracy =", np.mean(accsvm_4))

SVC(C=25, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)
balanced accuracy = 0.987204063242402


# Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

accrf = []
# Walk the ten stored folds in order: fit on the training part, then measure
# balanced accuracy on the held-out part.
for i, (X_fit, y_fit) in enumerate(zip(Xtrain[:10], ytrain[:10])):
    clfrf = RandomForestClassifier(
        n_estimators=100,
        max_depth=80,
        max_features=2,
        min_samples_leaf=3,
        min_samples_split=12,
        bootstrap=True,
        random_state=6,
    )
    RFfit = clfrf.fit(X_fit, y_fit)
    RFpred = RFfit.predict(Xtest[i])
    acc_rftest = balanced_accuracy_score(ytest[i], RFpred)
    accrf.append(acc_rftest)

print("balanced accuracy =", np.mean(accrf))

balanced accuracy = 0.9800022528057672


# Decision Tree

In [16]:
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

balanced = []
y_preddtc = []
for i in range(10):
    # Fit a default decision tree on fold i and predict its held-out part.
    model = DecisionTreeClassifier()
    modelfitness = model.fit(Xtrain[i], ytrain[i])
    DTCpredicted = modelfitness.predict(Xtest[i])
    y_preddtc.append(DTCpredicted)
    dtcbalanced = balanced_accuracy_score(ytest[i], DTCpredicted)
    balanced.append(dtcbalanced)

# Average once after all folds are scored. The value is a mean balanced
# accuracy (not an MAE), so label it the same way as the other model sections.
averagedtc = np.mean(balanced)
print("balanced accuracy =", averagedtc)

MAE= 0.9684996313590564


# MLP

In [17]:
from sklearn.neural_network import MLPClassifier

accmlp = []
for i in range(10):
    # Two hidden layers (10 and 3 units) trained with the L-BFGS solver.
    # NOTE(review): learning_rate_init and n_iter_no_change appear to apply
    # only to the 'sgd'/'adam' solvers — confirm against the library docs.
    clfMLP = MLPClassifier(
        hidden_layer_sizes=(10, 3),
        solver='lbfgs',
        alpha=1e-5,
        learning_rate_init=0.3,
        n_iter_no_change=1000,
        random_state=6,
    )
    MLPfitness = clfMLP.fit(Xtrain[i], ytrain[i])
    ypredmlp = MLPfitness.predict(Xtest[i])
    # Balanced accuracy on the held-out part of fold i.
    acc_mlptest = balanced_accuracy_score(ytest[i], ypredmlp)
    accmlp.append(acc_mlptest)
print(clfMLP)
print("balanced accuracy =", np.mean(accmlp))

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(10, 3), learning_rate='constant',
              learning_rate_init=0.3, max_iter=200, momentum=0.9,
              n_iter_no_change=1000, nesterovs_momentum=True, power_t=0.5,
              random_state=6, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)
balanced accuracy = 0.9795158515605801


# KNN

In [18]:
from sklearn.neighbors import KNeighborsClassifier

accknn = []
for i in range(10):
    # 3-nearest-neighbours, Manhattan distance, distance-weighted votes.
    neigh = KNeighborsClassifier(
        n_neighbors=3,
        weights='distance',
        metric='manhattan',
    )
    # fit() returns the estimator itself, so predict through the fitted handle.
    knnfitness = neigh.fit(Xtrain[i], ytrain[i])
    knnpredict = knnfitness.predict(Xtest[i])
    acc_knntest = balanced_accuracy_score(ytest[i], knnpredict)
    accknn.append(acc_knntest)
print(neigh)
print("balanced accuracy =", np.mean(accknn))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='distance')
balanced accuracy = 0.9626955845006963


# ANN

In [19]:
# Plain NumPy views of the resampled inputs and labels for the Keras model.
features, target = X_new.values, y_new.values

In [20]:
import numpy as np
from keras import models
from keras import layers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification

# Seed NumPy's global random number generator.
# NOTE(review): presumably intended to make the Keras runs repeatable; the
# deep-learning backend's own RNG is not seeded here — confirm.
np.random.seed(0)

In [21]:
def create_network():
    """Build and compile the binary classifier used in the ANN section.

    The model is one 30-unit ReLU hidden layer followed by a single sigmoid
    output unit. The input width is read from the notebook-level ``features``
    array, which must exist before this function is called.

    Returns:
        A compiled ``Sequential`` model using binary cross-entropy loss, the
        RMSprop optimizer and accuracy as the reported metric.
    """
    n_inputs = features.shape[1]

    network = models.Sequential([
        layers.Dense(units=30, activation='relu', input_shape=(n_inputs,)),
        layers.Dense(units=1, activation='sigmoid'),
    ])

    network.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return network

In [22]:
# Wrap the Keras builder so scikit-learn utilities can train and score it;
# the training settings are forwarded each time the wrapper fits a model.
neural_network = KerasClassifier(
    build_fn=create_network,
    epochs=100, batch_size=100, verbose=0,
)

In [23]:
# 10-fold cross-validation of the wrapped Keras classifier on the resampled data.
result = cross_val_score(neural_network, features, target, cv=10)

In [24]:
# Average of the ten cross-validation scores for the Keras classifier.
np.mean(result)

0.9812769278180905

In [25]:
# Train a fresh network on all resampled rows, with a 20% validation split.
# (`model` is rebound here; earlier cells used the same name for a decision tree.)
model = create_network()
model.fit(
    features,
    target,
    epochs=100,
    batch_size=100,
    verbose=0,
    validation_split=0.2,
)

<keras.callbacks.History at 0x21242c871d0>

In [26]:
# Load the spreadsheet and run the trained network on its raw values.
# NOTE(review): the network was trained on min-max scaled features — confirm
# the spreadsheet columns are on that same scale and in the same order.
data_pred = pd.read_excel('prediction_new_rev.xlsx')
result_pred = model.predict(data_pred.values)

In [27]:
# Display the network's outputs for the spreadsheet rows.
result_pred

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],

In [None]:
# Run the network on the original (non-synthetic) rows from position 3628 on.
# The network was trained on min-max scaled inputs, so the five features are
# taken from `datanorm`; the raw `uci_sampling` values are on a different
# scale than the one the model was fitted to.
data_ori = datanorm.values[3628:, 0:5]
result_clas = model.predict(data_ori)