In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the three CSV files and stack them row-wise into a single frame.
data_1, data_2, data_3 = (
    pd.read_csv(csv_name)
    for csv_name in ("datatraining.csv", "datatest.csv", "datatest2.csv")
)
uci_data = pd.concat([data_1, data_2, data_3])

# Parse the 'date' column to datetimes and use it as the index so the frame
# can be resampled by time.
uci_data['date'] = pd.to_datetime(uci_data['date'])
uci_data = uci_data.set_index('date')

In [3]:
# Aggregate the readings into 5-minute bins of the datetime index, taking the
# mean of every column within each bin.
uci_sampling = uci_data.resample(rule='5min').mean()

In [4]:
# Fill missing values by linear interpolation, propagating forward only
# (NaNs before the first valid value of a column are left as they are).
uci_sampling = uci_sampling.interpolate(method='linear', limit_direction='forward')

In [5]:
# Round the averaged/interpolated Occupancy values up to the next integer, so
# any bin with a fractional occupancy counts as occupied (1).
uci_sampling['Occupancy'] = np.ceil(uci_sampling['Occupancy'])

In [6]:
# Reference labels used for scoring: every row from position 3628 onward,
# taken from the column at position 5.
# NOTE(review): position 5 is presumably the Occupancy column and 3628 the
# start of the evaluation period -- both are magic numbers; confirm against
# the CSV column order and the rows contained in the prediction file.
eval_rows = slice(3628, None)
true_class = uci_sampling.values[eval_rows, 5]

In [7]:
def normalize(dataset):
    """Min-max scale every column of ``dataset`` and keep 'Occupancy' unscaled.

    Each column is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a zero range; it is mapped to 0
    instead of NaN (0/0), so downstream estimators do not receive NaN features.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Numeric frame that contains an 'Occupancy' column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns; the 'Occupancy' column
        holds the original, unscaled values. ``dataset`` is not modified.

    Raises
    ------
    KeyError
        If ``dataset`` has no 'Occupancy' column.
    """
    col_min = dataset.min()
    col_range = dataset.max() - col_min
    # Guard against division by zero for constant columns: dividing the
    # all-zero numerator by 1 yields 0 rather than NaN.
    col_range = col_range.where(col_range != 0, 1)
    dataNorm = (dataset - col_min) / col_range
    # The label column must not be rescaled; restore the original values.
    dataNorm['Occupancy'] = dataset['Occupancy']
    return dataNorm

In [8]:
# Min-max scale the resampled frame; normalize() leaves 'Occupancy' unscaled.
datanorm = normalize(dataset=uci_sampling)

In [9]:
# Features: the first five columns (positions 0-4) of the normalized frame.
X = datanorm.iloc[:, :5]
# Target: the column at position 5, rounded up and cast to integer labels.
y = np.ceil(datanorm.iloc[:, 5]).astype(int)

In [10]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
# Build stratified train/test folds of (X, y) and store each fold as a
# DataFrame in the parallel lists Xtrain / ytrain / Xtest / ytest.
# The fold count lives in one constant so the splitter and the collected lists
# cannot drift apart (previously a separate hard-coded range(10) had to be
# kept in sync with n_splits by hand).
N_SPLITS = 10
# Column names given to the feature frames of every fold.
FOLD_FEATURE_COLUMNS = ['temp', 'humid', 'light', 'co2', 'humid_ratio']

# shuffle=False keeps the original row order inside each fold, so the split is
# deterministic without a random_state.
skf = StratifiedKFold(n_splits=N_SPLITS, random_state=None, shuffle=False)

Xtrain = []
ytrain = []
Xtest = []
ytest = []

for train_index, test_index in skf.split(X, y):
    # Positional indexing on the underlying arrays; the original index of
    # X / y is therefore not carried over into the fold frames.
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    Xtrain.append(pd.DataFrame(columns=FOLD_FEATURE_COLUMNS, data=X_train))
    ytrain.append(pd.DataFrame(columns=['Occupancy'], data=y_train))
    Xtest.append(pd.DataFrame(columns=FOLD_FEATURE_COLUMNS, data=X_test))
    ytest.append(pd.DataFrame(columns=['Occupancy'], data=y_test))

In [11]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with fixed hyper-parameters and a fixed seed for repeatability.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=80,
    max_features=2,
    min_samples_leaf=3,
    min_samples_split=12,
    bootstrap=True,
    random_state=6,
)
# NOTE(review): the model is fitted on every row of X / y, which includes the
# rows from position 3628 onward that true_class is sliced from. If the
# prediction file covers that same period, the reported score is not a
# held-out estimate -- confirm the intended train/evaluation split.
rf_learn = rf_classifier.fit(X, y)

In [13]:
# Load the rows to classify from the Excel workbook.
# NOTE(review): presumably this holds the same five feature columns, in the
# same order and on the same min-max scale as X -- verify before predicting.
predict_new = pd.read_excel(io='prediction_new_rev.xlsx')

In [14]:
# Predict a class label for every row of the prediction frame with the fitted
# random forest.
new_predict_RF = rf_learn.predict(X=predict_new)

In [16]:
from sklearn.metrics import balanced_accuracy_score
# Balanced accuracy (mean of per-class recall) of the predictions against the
# reference labels.
# NOTE(review): the recorded result (~0.19) is well below the 0.5 that chance
# gives for two classes, which usually points to misaligned rows between
# true_class and predict_new, or to inputs on a different scale than the
# training data -- investigate before trusting this number.
acc_RF = balanced_accuracy_score(y_true=true_class, y_pred=new_predict_RF)
acc_RF

0.1908177905308465

In [17]:
new_predict_RF

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,