In [2]:
from lxml import objectify
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [3]:
path = r'CLEAN_House2.csv'
csvapp = pd.read_csv(path,index_col=0,parse_dates=[1])

# making the index a datetime object
import datetime as datetime
csvapp.set_index(pd.DatetimeIndex(csvapp.index),inplace=True)

clean_csv = csvapp.resample('5T').mean()

clean_csv.columns = ['aggragate','fridge','washing_machine','dishwasher','TV','Microwave','Toaster','Hifi','Kettle','Fan','issues']
clean_csv = clean_csv[clean_csv.issues == 0]

# create time of day column
cols = ['time','month','hour','minute','day_of_the_week','week','fridge','washing_machine','dishwasher','TV','Microwave','Toaster','Hifi','Kettle','Fan']

usage2 = pd.DataFrame(index = clean_csv.index, columns=cols)

usage2.hour = clean_csv.index.hour
usage2.minute = clean_csv.index.minute

usage2.month=clean_csv.index.month

usage2.day_of_the_week = clean_csv.index.dayofweek
usage2.time = clean_csv.index.hour*60+clean_csv.index.minute

usage2.week = clean_csv.index.week

usage2.fillna(0, inplace = True)

usage2.TV[(clean_csv.TV > 20)] = 1

usage2.Kettle[(clean_csv.Kettle > 20)] =1

usage2.Microwave[(clean_csv.Microwave>15)] =1

usage2.Toaster[(clean_csv.Toaster > 15)] = 1



# Microwave

In [63]:
## undersample

ones = len(usage2.Microwave[(usage2.Microwave == 1)])
zero_indices = usage2[usage2.Microwave == 0].index
random_indices = np.random.choice(zero_indices,ones, replace=False)
ones = usage2[usage2.Microwave == 1].index
under_sample_indices = np.concatenate([ones,random_indices])
under_sample = usage2.loc[under_sample_indices]

cols = ['month','hour','minute','day_of_the_week','week']

X = under_sample[cols]
y = under_sample.Microwave

under_sample = usage2.loc[under_sample_indices]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


#### RTF

clf_rf = RandomForestClassifier(n_estimators=25, random_state=12)
clf_rf.fit(X_train, y_train)

y_pred_class = clf_rf.predict(X_test)
print('Undersampled RTF')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class),'\n')


#### KNN


knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)
y_pred_class = knn.predict(X_test)
print('Under KNN')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class),'\n')

#### Logistic regression

logr =LogisticRegression(solver = 'lbfgs')
logr.fit(X_train,y_train)
y_pred_class = logr.predict(X_test)
print('Under logreg')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class),'\n')

#### Bagging

bg = BaggingClassifier()
bg.fit(X_train,y_train)
y_pred_class = bg.predict(X_test)
print('Under Bagging')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class),'\n')

## Over sample

cols = ['month','hour','minute','day_of_the_week','week']
X = usage2[cols]
y = usage2.Microwave

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
sm = SMOTE(random_state=12, ratio = 1.0)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

#### RTF


clf_rf = RandomForestClassifier(n_estimators=25, random_state=12)
clf_rf.fit(X_train_res, y_train_res)

y_pred_class = clf_rf.predict(X_test)
print('Over RTF')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')


#### KNN

knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train_res, y_train_res)
y_pred_class = knn.predict(X_test)
print('Over KNN')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')
#### Logistic regression

logr =LogisticRegression(solver = 'lbfgs')
logr.fit(X_train_res,y_train_res)
y_pred_class = logr.predict(X_test)
print('Over Logreg')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))

#### Bagging

bg = BaggingClassifier()
bg.fit(X_train_res,y_train_res)
y_pred_class = bg.predict(X_test)
print('Over Bagging')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')
## Regular


cols = ['month','hour','minute','day_of_the_week','week']
X = usage2[cols]
y = usage2.Microwave

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


#### RTF

clf_rf = RandomForestClassifier(n_estimators=25, random_state=12)
clf_rf.fit(X_train, y_train)

y_pred_class = clf_rf.predict(X_test)
print('RTF')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')

#### KNN 

knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)
y_pred_class = knn.predict(X_test)
print('KNN')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')
#### Logistic regression

logr =LogisticRegression(solver = 'lbfgs')
logr.fit(X_train,y_train)
y_pred_class = logr.predict(X_test)
print('logreg')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')
#### Bagging

bg = BaggingClassifier()
bg.fit(X_train,y_train)
y_pred_class = bg.predict(X_test)
print('bagging')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))

#### Bagging



Undersampled RTF
0.8028846153846154
0.8020304568527918 

Under KNN
0.6730769230769231
0.8121827411167513 

Under logreg
0.5841346153846154
0.6395939086294417 

Under Bagging
0.7956730769230769
0.7614213197969543 

Over RTF
0.9612732264861759
0.419811320754717

Over KNN
0.8831492241874721
0.5188679245283019

Over Logreg
0.5916927399272077
0.589622641509434
Over Bagging
0.9587829640508269
0.42452830188679247

RTF
0.9907413319711385
0.08490566037735849

KNN
0.9932315944064875
0.0

logreg
0.9932315944064875
0.0

bagging
0.9907094055296597
0.09433962264150944


# TV

In [75]:
## undersample

ones = len(usage2.TV[(usage2.TV == 1)])
zero_indices = usage2[usage2.TV == 0].index
random_indices = np.random.choice(zero_indices,ones, replace=False)
ones = usage2[usage2.TV == 1].index
under_sample_indices = np.concatenate([ones,random_indices])
under_sample = usage2.loc[under_sample_indices]

cols = ['month','hour','minute','day_of_the_week','week']

X = under_sample[cols]
y = under_sample.TV

under_sample = usage2.loc[under_sample_indices]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


#### RTF

clf_rf = RandomForestClassifier(n_estimators=25, random_state=12)
clf_rf.fit(X_train, y_train)

y_pred_class = clf_rf.predict(X_test)
print('Undersampled RTF')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class),'\n')


#### KNN


knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)
y_pred_class = knn.predict(X_test)
print('Under KNN')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class),'\n')

#### Logistic regression

logr =LogisticRegression(solver = 'lbfgs')
logr.fit(X_train,y_train)
y_pred_class = logr.predict(X_test)
print('Under logreg')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class),'\n')

#### Bagging

bg = BaggingClassifier()
bg.fit(X_train,y_train)
y_pred_class = bg.predict(X_test)
print('Under Bagging')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class),'\n')

## Over sample

cols = ['month','hour','minute','day_of_the_week','week']
X = usage2[cols]
y = usage2.TV

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
sm = SMOTE(random_state=12, ratio = 1.0)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

#### RTF


clf_rf = RandomForestClassifier(n_estimators=25, random_state=12)
clf_rf.fit(X_train_res, y_train_res)

y_pred_class = clf_rf.predict(X_test)
print('Over RTF')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')


#### KNN

knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train_res, y_train_res)
y_pred_class = knn.predict(X_test)
print('Over KNN')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')
#### Logistic regression

logr =LogisticRegression(solver = 'lbfgs')
logr.fit(X_train_res,y_train_res)
y_pred_class = logr.predict(X_test)
print('Over Logreg')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))

#### Bagging

bg = BaggingClassifier()
bg.fit(X_train_res,y_train_res)
y_pred_class = bg.predict(X_test)
print('Over Bagging')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')
## Regular


cols = ['month','hour','minute','day_of_the_week','week']
X = usage2[cols]
y = usage2.TV

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


#### RTF

clf_rf = RandomForestClassifier(n_estimators=25, random_state=12)
clf_rf.fit(X_train, y_train)

y_pred_class = clf_rf.predict(X_test)
print('RTF')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')

#### KNN 

knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)
y_pred_class = knn.predict(X_test)
print('KNN')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')
#### Logistic regression

logr =LogisticRegression(solver = 'lbfgs')
logr.fit(X_train,y_train)
y_pred_class = logr.predict(X_test)
print('logreg')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')
#### Bagging

bg = BaggingClassifier()
bg.fit(X_train,y_train)
y_pred_class = bg.predict(X_test)
print('bagging')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))

#### Bagging



Undersampled RTF
0.8740993553280243
0.9134687735139202 

Under KNN
0.6839211224876753
0.7562076749435666 

Under logreg
0.5515737580583997
0.5654627539503386 

Under Bagging
0.8593098217671596
0.8792325056433409 

Over RTF
0.8626524487580615
0.6757176105508146

Over KNN
0.7480684502905306
0.7106283941039565

Over Logreg
0.5451120618095907
0.54344453064391
Over Bagging
0.85968328970053
0.6625290923196276

RTF
0.9280697273481898
0.4712955779674166

KNN
0.918268309814188
0.02948021722265322

logreg
0.9176936338675691
0.0

bagging
0.9309111806398059
0.5100853374709077


# Toaster

In [65]:
## undersample

ones = len(usage2.Toaster[(usage2.Toaster == 1)])
zero_indices = usage2[usage2.Toaster == 0].index
random_indices = np.random.choice(zero_indices,ones, replace=False)
ones = usage2[usage2.Toaster == 1].index
under_sample_indices = np.concatenate([ones,random_indices])
under_sample = usage2.loc[under_sample_indices]

cols = ['month','hour','minute','day_of_the_week','week']

X = under_sample[cols]
y = under_sample.Toaster

under_sample = usage2.loc[under_sample_indices]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


#### RTF

clf_rf = RandomForestClassifier(n_estimators=25, random_state=12)
clf_rf.fit(X_train, y_train)

y_pred_class = clf_rf.predict(X_test)
print('Undersampled RTF')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class),'\n')


#### KNN


knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)
y_pred_class = knn.predict(X_test)
print('Under KNN')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class),'\n')

#### Logistic regression

logr =LogisticRegression(solver = 'lbfgs')
logr.fit(X_train,y_train)
y_pred_class = logr.predict(X_test)
print('Under logreg')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class),'\n')

#### Bagging

bg = BaggingClassifier()
bg.fit(X_train,y_train)
y_pred_class = bg.predict(X_test)
print('Under Bagging')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class),'\n')

## Over sample

cols = ['month','hour','minute','day_of_the_week','week']
X = usage2[cols]
y = usage2.Toaster

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
sm = SMOTE(random_state=12, ratio = 1.0)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

#### RTF


clf_rf = RandomForestClassifier(n_estimators=25, random_state=12)
clf_rf.fit(X_train_res, y_train_res)

y_pred_class = clf_rf.predict(X_test)
print('Over RTF')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')


#### KNN

knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train_res, y_train_res)
y_pred_class = knn.predict(X_test)
print('Over KNN')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')
#### Logistic regression

logr =LogisticRegression(solver = 'lbfgs')
logr.fit(X_train_res,y_train_res)
y_pred_class = logr.predict(X_test)
print('Over Logreg')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))

#### Bagging

bg = BaggingClassifier()
bg.fit(X_train_res,y_train_res)
y_pred_class = bg.predict(X_test)
print('Over Bagging')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')
## Regular


cols = ['month','hour','minute','day_of_the_week','week']
X = usage2[cols]
y = usage2.Toaster

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


#### RTF

clf_rf = RandomForestClassifier(n_estimators=25, random_state=12)
clf_rf.fit(X_train, y_train)

y_pred_class = clf_rf.predict(X_test)
print('RTF')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')

#### KNN 

knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)
y_pred_class = knn.predict(X_test)
print('KNN')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')
#### Logistic regression

logr =LogisticRegression(solver = 'lbfgs')
logr.fit(X_train,y_train)
y_pred_class = logr.predict(X_test)
print('logreg')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')
#### Bagging

bg = BaggingClassifier()
bg.fit(X_train,y_train)
y_pred_class = bg.predict(X_test)
print('bagging')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))

#### Bagging



Undersampled RTF
0.6981132075471698
0.7162162162162162 

Under KNN
0.5974842767295597
0.6351351351351351 

Under logreg
0.5723270440251572
0.6891891891891891 

Under Bagging
0.7295597484276729
0.7297297297297297 

Over RTF
0.9754166400612988
0.06741573033707865

Over KNN
0.9082114807483558
0.12359550561797752

Over Logreg
0.5620650022348509
0.34831460674157305
Over Bagging
0.9733414213651747
0.07865168539325842

RTF
0.9962965327884554
0.0449438202247191

KNN
0.9971585467083839
0.0

logreg
0.9971585467083839
0.0

bagging
0.9964561649958495
0.11235955056179775


# Kettle

In [74]:
## undersample

ones = len(usage2.Kettle[(usage2.Kettle == 1)])
zero_indices = usage2[usage2.Kettle == 0].index
random_indices = np.random.choice(zero_indices,ones, replace=False)
ones = usage2[usage2.Kettle == 1].index
under_sample_indices = np.concatenate([ones,random_indices])
under_sample = usage2.loc[under_sample_indices]

cols = ['month','hour','minute','day_of_the_week','week']

X = under_sample[cols]
y = under_sample.Kettle

under_sample = usage2.loc[under_sample_indices]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


#### RTF

clf_rf = RandomForestClassifier(n_estimators=25, random_state=12)
clf_rf.fit(X_train, y_train)

y_pred_class = clf_rf.predict(X_test)
print('Undersampled RTF')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class),'\n')


#### KNN


knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)
y_pred_class = knn.predict(X_test)
print('Under KNN')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class),'\n')

#### Logistic regression

logr =LogisticRegression(solver = 'lbfgs')
logr.fit(X_train,y_train)
y_pred_class = logr.predict(X_test)
print('Under logreg')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class),'\n')

#### Bagging

bg = BaggingClassifier()
bg.fit(X_train,y_train)
y_pred_class = bg.predict(X_test)
print('Under Bagging')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class),'\n')

## Over sample

cols = ['month','hour','minute','day_of_the_week','week']
X = usage2[cols]
y = usage2.Kettle

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
sm = SMOTE(random_state=12, ratio = 1.0)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

#### RTF


clf_rf = RandomForestClassifier(n_estimators=25, random_state=12)
clf_rf.fit(X_train_res, y_train_res)

y_pred_class = clf_rf.predict(X_test)
print('Over RTF')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')


#### KNN

knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train_res, y_train_res)
y_pred_class = knn.predict(X_test)
print('Over KNN')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')
#### Logistic regression

logr =LogisticRegression(solver = 'lbfgs')
logr.fit(X_train_res,y_train_res)
y_pred_class = logr.predict(X_test)
print('Over Logreg')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))

#### Bagging

bg = BaggingClassifier()
bg.fit(X_train_res,y_train_res)
y_pred_class = bg.predict(X_test)
print('Over Bagging')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')
## Regular


cols = ['month','hour','minute','day_of_the_week','week']
X = usage2[cols]
y = usage2.Kettle

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


#### RTF

clf_rf = RandomForestClassifier(n_estimators=25, random_state=12)
clf_rf.fit(X_train, y_train)

y_pred_class = clf_rf.predict(X_test)
print('RTF')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')

#### KNN 

knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)
y_pred_class = knn.predict(X_test)
print('KNN')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')
#### Logistic regression

logr =LogisticRegression(solver = 'lbfgs')
logr.fit(X_train,y_train)
y_pred_class = logr.predict(X_test)
print('logreg')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))
print('')
#### Bagging

bg = BaggingClassifier()
bg.fit(X_train,y_train)
y_pred_class = bg.predict(X_test)
print('bagging')
print(accuracy_score(y_test, y_pred_class))
print(recall_score(y_test, y_pred_class))

#### Bagging



Undersampled RTF
0.72480181200453
0.7738359201773836 

Under KNN
0.6568516421291053
0.6829268292682927 

Under logreg
0.578708946772367
0.6008869179600886 

Under Bagging
0.7089467723669309
0.7117516629711752 

Over RTF
0.9072856139454697
0.2296983758700696

Over KNN
0.7939148202541345
0.41299303944315546

Over Logreg
0.5637890300747078
0.5661252900232019
Over Bagging
0.9027201328139965
0.22273781902552203

RTF
0.9815145903837559
0.013921113689095127

KNN
0.986239703722623
0.0

logreg
0.986239703722623
0.0

bagging
0.9815145903837559
0.04408352668213457
