Import Libraries

In [21]:
%matplotlib inline

import torch
import matplotlib.pyplot as plt
import requests
import os
import zipfile
import pandas as pd
from tqdm import tqdm
from numpy import isnan
from datetime import timedelta

Download Data

In [4]:
# Download the household power-consumption archive once and unpack it to
# data/household_power_consumption.csv, the path the preprocessing cell reads.
zip_path = 'data/electricity.zip'
txt_path = 'data/household_power_consumption.txt'
csv_path = 'data/household_power_consumption.csv'

# A fresh checkout may not have the data directory yet; open() would fail without it.
os.makedirs('data', exist_ok=True)

if not os.path.exists(zip_path):
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip'
    # Write to a temporary name first so an interrupted download never leaves a
    # truncated file that the existence check above would mistake for a complete one.
    part_path = zip_path + '.part'

    with requests.get(url, allow_redirects=True, stream=True, timeout=60) as response:
        # Fail loudly on 4xx/5xx instead of saving an HTML error page as the zip.
        response.raise_for_status()

        with open(part_path, "wb") as handle:
            # iter_content() defaults to 1-byte chunks; use 1 MiB chunks so the
            # loop runs a handful of times rather than once per byte.
            for data in tqdm(response.iter_content(chunk_size=1024 * 1024), unit='MiB'):
                handle.write(data)

    os.replace(part_path, zip_path)

# Extraction is guarded separately so that a run which downloaded the archive but
# failed before producing the CSV can recover without downloading again.
if not os.path.exists(csv_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall("data")

    os.replace(txt_path, csv_path)


Preprocess Data

In [49]:
# Load the raw semicolon-separated readings and reduce them to a single
# datetime-indexed series sampled every `resolution` rows.
resolution = 60  # keep every 60th row of the file

# '?' marks missing readings in the file; map it to NaN while parsing.
df_raw = pd.read_csv('data/household_power_consumption.csv', sep=';', index_col=False, low_memory=False, na_values='?')

# Keep the first three columns (Date, Time and the first measurement column),
# starting at row 21996 and stepping by `resolution`.
# NOTE(review): 21996 presumably aligns the first kept row with a midnight
# boundary so each day starts at 00:00 -- confirm against the file.
# .copy() makes the slice an independent frame, so the column assignment below
# does not write into a view of the full table (SettingWithCopyWarning).
df_raw = df_raw.iloc[21996::resolution,:3].copy()

# An explicit day-first format replaces the deprecated infer_datetime_format
# flag and avoids per-element format guessing.
df_raw['datetime'] = pd.to_datetime(df_raw['Date']+' '+df_raw['Time'], format='%d/%m/%Y %H:%M:%S')
df_raw = df_raw.drop(['Date','Time'],axis=1)
df_raw = df_raw.set_index(['datetime'])

In [53]:
# Turn the datetime-indexed series into one row per calendar day:
#   [month, weekday, sample_0, ..., sample_{n-1}]
# Days with a NaN sample or with the wrong number of samples are skipped.
g = df_raw.groupby(df_raw.index.floor('d'))

daily_data = []
my_day, final_day = df_raw.index[0].date(), df_raw.index[-1].date()

# Inclusive upper bound: the last calendar day is examined too and is kept only
# if it passes the same completeness checks as every other day.
while my_day <= final_day:
    try:
        # Group keys are midnight Timestamps, so look the day up with a Timestamp
        # rather than relying on implicit date -> Timestamp coercion.
        # The frame has a single value column at this point; take it as a list.
        day_data = g.get_group(pd.Timestamp(my_day)).iloc[:, 0].tolist()
    except KeyError:
        # get_group raises KeyError when no rows fall on this day. Catching only
        # that keeps genuine errors (and Ctrl-C) from being reported as "missing".
        print("Date "+str(my_day)+" is missing.")
    else:
        if (not isnan(day_data).any()) and len(day_data) == 24*60//resolution:
            row_data = [my_day.month, my_day.weekday()] + day_data
            daily_data.append(row_data)
    my_day += timedelta(days=1)

print('Dataset Size:'+str(len(daily_data)))



Dataset Size:1396


In [56]:
# Build the header for the daily table: two calendar features followed by one
# 'HH:MM' label per sample slot in a day, then write the table to CSV.
slots_per_day = 24*60//resolution
slot_step = timedelta(minutes=resolution)

# The date itself is arbitrary; only the time-of-day part is formatted.
my_day = pd.Timestamp('2012-01-01')
time_labels = []
while len(time_labels) < slots_per_day:
    time_labels.append(my_day.strftime('%H:%M'))
    my_day = my_day + slot_step

column_names = ['month', 'weekday'] + time_labels

print(column_names)

# %%
# Each row of daily_data is expected to line up with column_names.
df_daily = pd.DataFrame(daily_data, columns=column_names)
df_daily.to_csv('data/daily_data.csv', index=False)

['month', 'weekday', '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00', '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00', '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00']
