In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import json
import datetime
import time
import os
import numpy as np
from config import INTERVENTION_CALENDAR, DATA_CONSUMPTION_PROCESSED_FILE, DATA_WEATHER_PROCESSED_FILE,  DATA_METADATA_PROCESSED_FILE, DATA_HOLIDAYS_PROCESSED_FILE, DATA_ISO_CONSUMPTION_PROCESSED_FILE, DATA_ENTHALPY_GRADIENTS_PROCESSED_FILE, DATA_SOLAR_GAINS_PROCESSED_FILE, DATA_GBM_CONSUMPTION_PROCESSED_FILE
import matplotlib
import matplotlib.pyplot as plt
import os
import sys
from causalimpact import CausalImpact
import warnings
warnings.filterwarnings('ignore')

In [None]:
def _read_timestamped_csv(path):
    """Read a processed CSV file and convert its 'timestamp' column to datetime."""
    frame = pd.read_csv(path)
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])
    return frame


# Every processed CSV carries a 'timestamp' column that later merges join on.
real_consumption_df = _read_timestamped_csv(DATA_CONSUMPTION_PROCESSED_FILE)
weather_data_df = _read_timestamped_csv(DATA_WEATHER_PROCESSED_FILE)
holidays_df = _read_timestamped_csv(DATA_HOLIDAYS_PROCESSED_FILE)
gradients_df = _read_timestamped_csv(DATA_ENTHALPY_GRADIENTS_PROCESSED_FILE)
solar_gains_df = _read_timestamped_csv(DATA_SOLAR_GAINS_PROCESSED_FILE)
iso_consumption_df = _read_timestamped_csv(DATA_ISO_CONSUMPTION_PROCESSED_FILE)
gbm_consumption_df = _read_timestamped_csv(DATA_GBM_CONSUMPTION_PROCESSED_FILE)

# Sensor metadata lives in the 'SENSORS' worksheet of the metadata workbook.
# `sheet_name` is the read_excel keyword that selects a worksheet; `sheets` is not a
# read_excel parameter, so the requested worksheet was never actually selected.
METADATA_COLUMNS = ['smapee', 'INTERVENTION', 'ID_CEA', 'EXPERIMENT',
                    'TREATMENT', 'ID_SUBJECT', 'SALARY', 'BEDROOMS']
metadata_df = pd.read_excel(DATA_METADATA_PROCESSED_FILE, sheet_name='SENSORS')[METADATA_COLUMNS]

In [None]:
# Join every source onto the measured consumption, one merge per source:
#   - sensor metadata by (smapee, INTERVENTION)
#   - weather, holidays and enthalpy gradients by timestamp
#   - solar gains and ISO consumption by (timestamp, ID_CEA)
#   - GBM consumption by (timestamp, smapee, INTERVENTION)
# All merges use the pandas default join type, so only keys present on both sides survive.
df = (
    real_consumption_df
    .merge(metadata_df, on=['smapee', 'INTERVENTION'])
    .merge(weather_data_df, on='timestamp')
    .merge(holidays_df, on='timestamp')
    .merge(gradients_df, on='timestamp')
    .merge(solar_gains_df, on=['timestamp', 'ID_CEA'])
    .merge(iso_consumption_df, on=['timestamp', 'ID_CEA'])
    .merge(gbm_consumption_df, on=['timestamp', 'smapee', 'INTERVENTION'])
    .reset_index(drop=True)
)
df

In [None]:
def prepare_data(df, corrupted, INTERVENTION):
    """Build the per-timestamp model table for one intervention group.

    Rows of ``df`` whose ``INTERVENTION`` equals ``INTERVENTION`` are kept, rows whose
    ``ID_SUBJECT`` appears in ``corrupted`` are dropped, and the remaining rows are
    averaged per ``timestamp``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``timestamp`` (datetime dtype, because calendar attributes are read
        from the grouped index), ``INTERVENTION``, ``ID_SUBJECT`` and every column named
        in ``averaged_columns`` below.
    corrupted : list-like
        ``ID_SUBJECT`` values to exclude.
    INTERVENTION : scalar
        Value compared against ``df['INTERVENTION']``.

    Returns
    -------
    pandas.DataFrame
        Indexed by timestamp in ascending order, with columns
        ``y`` (mean ``consumption_kWh``) and
        ``x1`` DEG_C_kJperKg, ``x2`` DEG_DEHUM_kJperKg, ``x3`` solar_gain_Whm2,
        ``x4`` ISO_consumption_Whm2, ``x5`` GBM_consumption_kWh, ``x6`` Wind_ms,
        ``x7`` teaching_time, ``x8`` school_holiday, ``x9`` holiday,
        ``x10`` zero-based month, ``x11`` day of week (0 = Monday),
        ``x12`` 1 for Monday-Friday and 0 otherwise.
    """
    averaged_columns = ['GBM_consumption_kWh', 'consumption_kWh',
                        'solar_gain_Whm2', 'DEG_C_kJperKg', 'ISO_consumption_Whm2',
                        'DEG_DEHUM_kJperKg', 'Wind_ms', 'teaching_time',
                        'school_holiday', 'holiday']

    keep = (df['INTERVENTION'] == INTERVENTION) & (~df['ID_SUBJECT'].isin(corrupted))
    # A single group-by replaces the former pair of group-bys that were merged back
    # together on their (identical) timestamp index.
    data_mean = df[keep].groupby('timestamp')[averaged_columns].mean().sort_index()

    # Calendar features derived from the timestamp index. Only the features that are
    # returned are computed: year / dayofyear / weekofyear never reached the output,
    # and DatetimeIndex.weekofyear no longer exists in pandas >= 2.0.
    month = np.array(data_mean.index.month, dtype=np.uint8) - 1       # 0 = January
    dayofweek = np.array(data_mean.index.dayofweek, dtype=np.uint8)   # 0 = Monday
    weekday = (dayofweek < 5).astype(np.int64)                        # 1 on Mon-Fri

    data = pd.DataFrame({'y': data_mean['consumption_kWh'],
                         'x1': data_mean['DEG_C_kJperKg'],
                         'x2': data_mean['DEG_DEHUM_kJperKg'],
                         'x3': data_mean['solar_gain_Whm2'],
                         'x4': data_mean['ISO_consumption_Whm2'],
                         'x5': data_mean['GBM_consumption_kWh'],
                         'x6': data_mean['Wind_ms'],
                         'x7': data_mean['teaching_time'],
                         'x8': data_mean['school_holiday'],
                         'x9': data_mean['holiday'],
                         'x10': month,
                         'x11': dayofweek,
                         'x12': weekday},
                        index=data_mean.index)
    return data

In [None]:
def prepare_data_control(df, corrupted, INTERVENTION):
    """Pair the summed consumption of a treated group with that of its control group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``timestamp``, ``INTERVENTION``, ``EXPERIMENT``, ``TREATMENT``,
        ``ID_SUBJECT`` and ``consumption_kWh``.
    corrupted : list-like
        ``ID_SUBJECT`` values removed from the CONTROL rows.
    INTERVENTION : str
        Value compared against ``df['INTERVENTION']``. Its first character is converted
        with ``int`` and compared against ``df['EXPERIMENT']``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the timestamps present in both groups, with columns
        ``y`` (sum of ``consumption_kWh`` over the treated rows) and
        ``x1`` (sum of ``consumption_kWh`` over the control rows).
    """
    # Treated side: every row of the requested intervention, summed per timestamp.
    # NOTE(review): `corrupted` is not applied here although it is applied to the control
    # rows below and to the treated rows in prepare_data -- confirm this is intended.
    treated_rows = df[df['INTERVENTION'] == INTERVENTION]
    treated_total = treated_rows.groupby('timestamp')['consumption_kWh'].sum().sort_index()

    # Control side: CONTROL rows of the same experiment, without the corrupted subjects.
    # Only the first character of INTERVENTION is used as the experiment number, so this
    # presumably expects single-digit experiments -- TODO confirm.
    experiment_number = int(list(INTERVENTION)[0])
    control_rows = df[(df['EXPERIMENT'] == experiment_number) &
                      (df['TREATMENT'] == 'CONTROL') &
                      (~df['ID_SUBJECT'].isin(corrupted))]
    control_total = control_rows.groupby('timestamp')['consumption_kWh'].sum()

    # Calendar features were previously computed here but never returned; they are
    # dropped because DatetimeIndex.weekofyear no longer exists in pandas >= 2.0.
    data = treated_total.to_frame('y').merge(control_total.to_frame('x1'),
                                             left_index=True, right_index=True)
    return data

In [None]:
#check graph
def graph(ci, end_intervention_date):
    """Plot a fitted CausalImpact result using the notebook's font settings.

    Parameters
    ----------
    ci : object
        Fitted result exposing ``plot(figsize=..., end_intervention_date=...)``.
    end_intervention_date
        Forwarded unchanged to ``ci.plot``.

    Returns
    -------
    Whatever ``ci.plot`` returns.
    """
    font = {'family': 'Arial',
            'size': 18}
    # Configure the font before drawing: applying it after ci.plot() left the first
    # figure with the previous font and only affected figures drawn later.
    matplotlib.rc('font', **font)
    ax = ci.plot(figsize=(6, 8), end_intervention_date=end_intervention_date)
    return ax

# All treated groups: Experiment 1

In [None]:
# Short alias for the intervention calendar imported from config. The cells below look up
# an entry by experiment number and read it positionally: [0] the interventions to iterate
# over, [1] pre_period, [2] post_period, [3] end_intervention_date.
interventions = INTERVENTION_CALENDAR

In [None]:
# Experiment 1: fit and plot one CausalImpact model per intervention of the experiment.
corrupted = ['S620','S638']  # ID_SUBJECT values that prepare_data excludes
experiment = 1
intervention_data = interventions[experiment]
# The analysis windows do not depend on the intervention, so they are read once.
pre_period = intervention_data[1]
post_period = intervention_data[2]
end_intervention_date = intervention_data[3]
for intervention in intervention_data[0]:
    data = prepare_data(df, corrupted, intervention)
    # The keyword is spelled `standardize`; the misspelling `standarize` is not a keyword
    # pycausalimpact reads. NOTE(review): confirm against the installed causalimpact build.
    ci = CausalImpact(data, pre_period, post_period, prior_level_sd=None, standardize=True)
    graph(ci, end_intervention_date)

In [None]:
# Experiment 2: fit and plot one CausalImpact model per intervention of the experiment.
corrupted = ['S620', 'S638']  # ID_SUBJECT values that prepare_data excludes
experiment = 2
intervention_data = interventions[experiment]
# The analysis windows do not depend on the intervention, so they are read once.
pre_period = intervention_data[1]
post_period = intervention_data[2]
end_intervention_date = intervention_data[3]
for intervention in intervention_data[0]:
    data = prepare_data(df, corrupted, intervention)
    ci = CausalImpact(data, pre_period, post_period, prior_level_sd=None)
    graph(ci, end_intervention_date)

In [None]:
# Experiment 3: fit and plot one CausalImpact model per intervention of the experiment.
corrupted = ['S620','S638']  # ID_SUBJECT values that prepare_data excludes
experiment = 3
intervention_data = interventions[experiment]
# The analysis windows do not depend on the intervention, so they are read once.
pre_period = intervention_data[1]
post_period = intervention_data[2]
end_intervention_date = intervention_data[3]
for intervention in intervention_data[0]:
    data = prepare_data(df, corrupted, intervention)
    # The keyword is spelled `standardize`; the misspelling `standarize` is not a keyword
    # pycausalimpact reads. NOTE(review): confirm against the installed causalimpact build.
    ci = CausalImpact(data, pre_period, post_period, prior_level_sd=None, standardize=True)
    graph(ci, end_intervention_date)