In [None]:
import pandas as pd
import json
import datetime
import time
import os
import numpy as np
from config import DATA_CONSUMPTION_PROCESSED_FILE, DATA_CLUSTERS_FILE, DATA_METADATA_PROCESSED_FILE
import matplotlib
from sklearn import mixture

In [None]:
# Single seed shared by every stochastic step in this notebook.
random_state = 170
# Seed NumPy's global RNG. Constructing np.random.RandomState(random_state)
# without keeping the returned object seeds nothing, so it was a no-op.
np.random.seed(random_state)

In [None]:
# First and last day (both inclusive) of the pre-intervention window.
pre_intervention_period = ['2018-02-01', '2018-04-20']
# One entry per calendar day in that window; used further down to keep only
# the daily readings that fall inside it.
range_pre_intervention_period = pd.date_range(*pre_intervention_period, freq='D')
range_pre_intervention_period

In [None]:
# Consumption readings; timestamps are parsed so they can be resampled by day.
real_consumption_df = pd.read_csv(DATA_CONSUMPTION_PROCESSED_FILE)
real_consumption_df['timestamp'] = pd.to_datetime(real_consumption_df['timestamp'])

# Sensor metadata from the SENSORS sheet. The read_excel keyword is
# `sheet_name`; `sheets` is not a read_excel parameter, so the sheet was never
# actually selected by name.
metadata_df = pd.read_excel(DATA_METADATA_PROCESSED_FILE, sheet_name='SENSORS')[['smapee', 'TREATMENT']]

# Attach TREATMENT to each reading (default inner join: readings whose sensor
# is missing from the metadata are dropped), then remove repeated readings.
data = real_consumption_df.merge(metadata_df, on='smapee')
data = data.drop_duplicates(['consumption_kWh', 'smapee', 'timestamp'])

# Daily mean per sensor. The result is indexed by (smapee, timestamp).
# NOTE(review): on pandas >= 2.0 .mean() raises on non-numeric columns; if
# TREATMENT is stored as text, select the numeric columns first -- confirm.
data = data.set_index('timestamp').groupby('smapee').resample('D').mean()
# Rebuild the two key columns from the index levels before flattening it.
data['smapee'] = data.index.get_level_values(0)
data['timestamp'] = data.index.get_level_values(1)
data = data.reset_index(drop=True)

# Keep only days inside the pre-intervention window, then average per sensor.
data = data[data['timestamp'].isin(range_pre_intervention_period)]
data = data.groupby('smapee').mean()
data

In [None]:
# Fit one Gaussian mixture per candidate number of components and record its
# Akaike information criterion (AIC); the minimum is picked in the next cell.
X_cluster = data['consumption_kWh'].values.reshape(-1, 1)
cv_type = 'tied'
# GaussianMixture.fit refuses n_components > n_samples, so the scan (1..49)
# is capped at the number of available rows instead of failing outright.
max_components = min(49, len(X_cluster))
n_componentssss = np.arange(1, max_components + 1)
models = [
    mixture.GaussianMixture(n_components=int(n), covariance_type=cv_type, random_state=random_state).fit(X_cluster)
    for n in n_componentssss
]
aic_list = np.array([m.aic(X_cluster) for m in models])
# Index the curve by component count so the x-axis reads as "number of
# components" rather than the 0-based list position.
aic_ax = pd.Series(aic_list, index=n_componentssss, name='AIC').plot(title='AIC by number of mixture components')
aic_ax.set(xlabel='number of components', ylabel='AIC');

In [None]:
# aic_list[i] belongs to the model fitted with n_componentssss[i] components,
# and the candidates start at 1. np.argmin returns the 0-based position, so it
# has to be mapped back to a component count; using the position directly
# selects one component too few (and 0 when a single component is best).
n_components = int(n_componentssss[np.argmin(aic_list)])
n_components

In [None]:
def calc_clusters(data, n_components, random_state):
    """Fit a tied-covariance Gaussian mixture on ``consumption_kWh`` and label each row.

    Parameters
    ----------
    data : object with a ``consumption_kWh`` column (copied, never modified).
    n_components : number of mixture components given to ``GaussianMixture``.
    random_state : seed given to ``GaussianMixture``.

    Returns
    -------
    A copy of ``data`` with two extra columns: ``CLUSTER_ID`` (the component
    predicted for the row) and ``CLUSTER_VALUES`` (the fitted mean of that
    component, rounded to 2 decimals).
    """
    labelled = data.copy()
    consumption = labelled['consumption_kWh'].values.reshape(-1, 1)

    model = mixture.GaussianMixture(
        n_components=n_components,
        covariance_type='tied',
        random_state=random_state,
    )
    # The mixture is fitted and then queried on the very same observations.
    assigned = model.fit(consumption).predict(consumption)

    # The input has a single feature, so column 0 holds every component mean.
    component_means = model.means_[:, 0]
    labelled['CLUSTER_VALUES'] = [round(component_means[label], 2) for label in assigned]
    labelled['CLUSTER_ID'] = assigned
    return labelled

In [None]:
# Label every row of `data` using the component count chosen from the AIC scan.
data_final = calc_clusters(data, n_components, random_state)
data_final

In [None]:
# Distinct component labels that were assigned to at least one row.
data_final.loc[:, 'CLUSTER_ID'].unique()

In [None]:
# Write only the two cluster columns; the frame's index goes out as the first CSV column.
data_final.loc[:, ['CLUSTER_VALUES', 'CLUSTER_ID']].to_csv(DATA_CLUSTERS_FILE)