In [1]:
import pandas as pd
import matplotlib as pyplot
import numpy as np
import seaborn as sns
import sklearn
import math
import matplotlib.pyplot as plt

from collections import OrderedDict
from hmmlearn import hmm

In [2]:
# Load the readings table; the path is relative to the notebook's working directory.
devices_dataset = pd.read_csv("house_devices.csv")

In [3]:
# Preview the first rows to check the column layout (a time column followed by one column per device).
devices_dataset.head()

Unnamed: 0,time,lighting2,lighting5,lighting4,refrigerator,microwave
0,1302930703,180,23,195,117,2
1,1302930721,181,23,195,119,2
2,1302930738,180,23,195,117,2
3,1302930765,181,23,195,117,2
4,1302930782,180,23,195,118,2


In [4]:
# (rows, columns) of the loaded table.
devices_dataset.shape

(14999, 6)

In [5]:
# Summary statistics for each numeric column.
devices_dataset.describe()

Unnamed: 0,time,lighting2,lighting5,lighting4,refrigerator,microwave
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,1303161000.0,46.44463,17.85299,45.995933,46.283086,6.710247
std,131571.0,91.600463,45.832236,70.079602,66.730693,88.110723
min,1302931000.0,0.0,0.0,0.0,0.0,0.0
25%,1303068000.0,0.0,1.0,0.0,0.0,2.0
50%,1303144000.0,0.0,1.0,0.0,1.0,2.0
75%,1303299000.0,22.0,21.0,63.0,115.0,2.0
max,1303371000.0,362.0,211.0,203.0,916.0,1798.0


In [6]:
# Number of distinct values in each column.
devices_dataset.nunique()

time            14999
lighting2          58
lighting5          61
lighting4          96
refrigerator      109
microwave          50
dtype: int64

In [8]:
# Every column after the leading "time" column is treated as a device channel.
devices_names = devices_dataset.columns[1:]
# Candidate hidden-state counts: the even integers in [4, 8), i.e. 4 and 6.
hidden_states = list(range(4, 8, 2))


class ModelsContainer:
    """Index-aligned lists of candidate models, their scores and their state counts."""

    __slots__ = ['models', 'scores', 'states']

    def __init__(self):
        self.models, self.scores, self.states = [], [], []

    def add_model(self, model, score, state):
        """Append one candidate; entry ``k`` of each list describes the same candidate."""
        recorded = (
            (self.models, model),
            (self.scores, score),
            (self.states, state),
        )
        for bucket, item in recorded:
            bucket.append(item)

    def get_best_model(self):
        """Return ``(model, score, state)`` for the highest recorded score.

        When several candidates share the top score, the earliest one added is
        returned. Raises ``ValueError`` if nothing has been added yet.
        """
        top_score = max(self.scores)
        winner = self.scores.index(top_score)
        return self.models[winner], self.scores[winner], self.states[winner]


def train_test(dataset, train_fraction=0.8):
    """Randomly split ``dataset`` into disjoint train and test frames.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table to split; it must contain a ``"time"`` column, which is used to
        order both returned frames.
    train_fraction : float, default 0.8
        Share of rows assigned to the training frame. The row count is
        truncated with ``int``.

    Returns
    -------
    (train_set, test_set) : tuple of pandas.DataFrame
        Row-disjoint frames that together contain every row of ``dataset``,
        each sorted by ``"time"``.

    Notes
    -----
    Rows are drawn with NumPy's global random state, so call
    ``np.random.seed`` beforehand for a reproducible split.
    """
    train_size = int(dataset.shape[0] * train_fraction)
    # replace=False is essential: np.random.choice samples WITH replacement by
    # default, which repeats rows in the training frame and leaves far more
    # than the intended remainder in the test frame.
    train_sample = np.random.choice(dataset.index, size=train_size, replace=False)
    train_set = dataset.loc[train_sample, :]
    test_set_sample = dataset.index.difference(train_set.index)
    test_set = dataset.loc[test_set_sample, :]

    # Restore chronological order after the random draw.
    train_set = train_set.sort_values(by="time")
    test_set = test_set.sort_values(by="time")
    return train_set, test_set


def extract_column(X_train, X_test, col_name):
    """Return the ``col_name`` column of each split as a one-column frame.

    A list selector is used so each result stays two-dimensional rather than
    collapsing to a Series.
    """
    selector = [col_name]
    train_col = X_train[selector]
    test_col = X_test[selector]
    return train_col, test_col


def select_hidden_state(X_train, X_test, hidden_states):
    """Fit one Gaussian HMM per candidate state count and return the best one.

    Each candidate is fitted on ``X_train`` (10 EM iterations at most) and
    scored on ``X_test``; the candidate with the highest score wins.

    Returns
    -------
    (model, score, n_states) : tuple
        The winning fitted model, its score on ``X_test`` and its number of
        hidden states.
    """
    candidates = ModelsContainer()
    for n_states in hidden_states:
        candidate = hmm.GaussianHMM(n_components=n_states, n_iter=10)
        candidate.fit(X_train)
        held_out_score = candidate.score(X_test)
        candidates.add_model(candidate, held_out_score, n_states)
    return candidates.get_best_model()


def select_best_model(dataset, col_name, hidden_states, reps=3):
    """Repeat the split/fit/select cycle ``reps`` times and keep the top scorer.

    Every repetition draws a new random train/test split of ``dataset``,
    restricts both parts to ``col_name`` and picks the best state count for
    that split. The best result across repetitions is returned.

    Returns
    -------
    (model, score, n_states) : tuple
        The highest-scoring model over all repetitions, with its score and
        number of hidden states.
    """
    winners = ModelsContainer()
    for _ in range(reps):
        train_part, test_part = train_test(dataset)
        train_col, test_col = extract_column(train_part, test_part, col_name)
        # select_hidden_state returns (model, score, n_states), matching add_model's arguments.
        winners.add_model(*select_hidden_state(train_col, test_col, hidden_states))
    return winners.get_best_model()

In [None]:
# Best (model, score, n_states) per device column, kept in column order.
collected_models = OrderedDict()
for devices_cols in devices_dataset.columns[1:]:
    collected_models[devices_cols] = select_best_model(
        dataset=devices_dataset,
        col_name=devices_cols,
        hidden_states=hidden_states,
    )

In [None]:
# Cross-score every selected model against every device's test readings.
dev_len = len(devices_names)
results_mtrx = np.zeros((dev_len, dev_len))
# NOTE(review): this is a fresh random split, independent of the splits the
# models were fitted on, so these test rows may include rows a model has
# already seen during training.
X_train, X_test = train_test(devices_dataset)
for i, (model_name, model_info) in enumerate(collected_models.items()):
    # Each entry is the (model, score, n_states) tuple from select_best_model.
    model = model_info[0]
    for j, dn in enumerate(devices_names):
        _, X_test_col = extract_column(X_train, X_test, dn)
        # X_test_col is already a one-column frame; [[dn]] re-selects that same column.
        model_prediction = model.score(X_test_col[[dn]])
        # Row j = device whose readings were scored; column i = device whose model scored them.
        results_mtrx[j, i] = model_prediction