#### Import packages

In [None]:
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

#### Load the data

In [1]:
n_lines = 10

# Peek at the first raw lines of the file before parsing it, to confirm the
# header row and the delimiter.
with open('new_household_power_consumption.txt') as file:
    # zip() stops as soon as either side runs out, so a file with fewer than
    # n_lines lines yields a shorter list instead of raising StopIteration
    # (which a bare next(file) inside the comprehension would do).
    head = [raw_line for _idx, raw_line in zip(range(n_lines), file)]

display(head)

['Date-Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3\n',
 '2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0\n',
 '2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0\n',
 '2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0\n',
 '2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0\n',
 '2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0\n',
 '2006-12-16 17:29:00,3.52,0.522,235.02,15.0,0.0,2.0,17.0\n',
 '2006-12-16 17:30:00,3.702,0.52,235.09,15.8,0.0,1.0,17.0\n',
 '2006-12-16 17:31:00,3.7,0.52,235.22,15.8,0.0,1.0,17.0\n',
 '2006-12-16 17:32:00,3.668,0.51,233.99,15.8,0.0,1.0,17.0\n']

In [None]:
# Load the whole comma-separated readings file; its first line supplies the column names.
df = pd.read_csv(
    "new_household_power_consumption.txt",
    sep=",",
)

In [None]:
# Column names of the readings file, in the order they appear in its header row.
lst = [
    "Date-Time",
    "Global_active_power",
    "Global_reactive_power",
    "Voltage",
    "Global_intensity",
    "Sub_metering_1",
    "Sub_metering_2",
    "Sub_metering_3",
]

##### Global Active Power

- In this example, we'll want to predict the global active power, which is the household's total ("global") minute-averaged active power, in kilowatts. So, below, I am getting just that column of data and displaying the resultant plot.

In [None]:
# Take an independent copy of the target column (a Series, despite the name)
# so later changes to `df` do not affect it, then show how many readings it holds.
power_df = df.loc[:, 'Global_active_power'].copy()
power_df.shape

In [None]:
# Plot every reading of the series on a single wide figure.
fig, ax = plt.subplots(figsize=(12, 6))
power_df.plot(ax=ax, title='Global active power', color='blue')
plt.show()

- Since the data is recorded each minute, the above plot contains a lot of values. So, I'm also showing just a slice of data, below.

In [None]:
# Plot only the first day's worth of the per-minute readings.
end_mins = 24 * 60  # 1440 mins = 1 day

fig, ax = plt.subplots(figsize=(12, 6))
power_df.iloc[:end_mins].plot(ax=ax, title='Global active power, over one day', color='blue')
plt.show()

##### Hourly vs Daily

There is a lot of data, collected every minute, and so I could go one of two ways with my analysis:
1. Create many, short time series, say a week or so long, in which I record energy consumption every hour, and try to predict the energy consumption over the following hours or days.
2. Create fewer, long time series with data recorded daily that I could use to predict usage in the following weeks or months.

- Both tasks are interesting! It depends on whether you want to predict time patterns over a day/week or over a longer time period, like a month.
- With the amount of data I have, I think it would be interesting to see longer, recurring trends that happen over several months or over a year.
- So, I will resample the 'Global active power' values, recording daily data points as averages over 24-hr periods.

In [None]:
# Display the Global_active_power series (pandas truncates long output).
power_df

In [None]:
# List the column labels of the loaded DataFrame.
df.columns

In [None]:
# Summarize the DataFrame (column dtypes, memory usage) before 'Date-Time' is converted.
df.info()

In [None]:
# Parse the timestamp strings into datetime values and store them back in the same column.
parsed_timestamps = pd.to_datetime(df['Date-Time'])
df['Date-Time'] = parsed_timestamps

In [None]:
# Re-check the summary: 'Date-Time' should now report a datetime dtype.
df.info()

In [None]:
# Keep an independent copy of the timestamp column and display it.
# NOTE(review): the name is spelled "datatime" (sic); kept because a later cell refers to it.
datatime_df = df.loc[:, 'Date-Time'].copy()
datatime_df

In [None]:
# Daily mean of every measurement column.
# resample() needs datetime-like bins; a Series of timestamps sitting on the
# default integer index has none and raises TypeError. Resampling the
# DataFrame keyed on its 'Date-Time' column gives one row per calendar day.
# NOTE(review): assumes 'Date-Time' was already converted to datetime in an
# earlier cell and that the remaining columns are numeric — confirm.
mean_datatime_df = df.resample("D", on="Date-Time").mean()

In [None]:
# resample into 24-hour bins (one value per day)
freq = '24h'

# resample() only works on a datetime-like index, and power_df still carries
# the default integer index from read_csv, so pair each reading with its
# timestamp first. pd.to_datetime is a no-op if the column is already parsed.
power_by_time = pd.Series(
    power_df.to_numpy(),
    index=pd.DatetimeIndex(pd.to_datetime(df['Date-Time'])),
    name=power_df.name,
)

# calculate the mean active power for each day; without .mean() this would be
# an unaggregated Resampler object rather than a series of daily values
mean_power_df = power_by_time.resample(freq).mean()

In [None]:
# Plot the daily-mean series on a larger figure with tightened margins.
fig, ax = plt.subplots(figsize=(15, 8))
mean_power_df.plot(ax=ax, title='Global active power, mean per day', color='blue')
fig.tight_layout()
plt.show()