In [2]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

In [24]:
# Load the raw readings. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk, which avoids the
# mixed-type DtypeWarning that chunked inference can raise on this read.
energy_data = pd.read_csv(
    'data/household_power_consumption.txt',
    delimiter=';',
    low_memory=False,
)

# convert numerical columns to float values; anything that cannot be parsed
# as a number becomes NaN (errors='coerce') and is removed by dropna() below
cols_to_convert = ['Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2']
energy_data[cols_to_convert] = energy_data[cols_to_convert].apply(pd.to_numeric, errors='coerce')

# as we learned from our exploratory data analysis, we have around
# 26,000 null values from sub metering 3. let's drop those rows.
energy_data = energy_data.dropna()

# parse the Date and Time strings once, up front
date_dt = pd.to_datetime(energy_data['Date'], dayfirst=True)
time_dt = pd.to_datetime(energy_data['Time'], format='%H:%M:%S')

# let's convert Date to a day of year value and Time to hour and minute
# columns so we can use them in our machine learning algorithm.
# Building the result with assign()/drop() returns a new frame rather than
# writing columns onto the frame returned by dropna(), so no
# SettingWithCopyWarning is triggered. Date keeps its position; hour and
# minute are appended at the end.
energy_data = (
    energy_data
    .assign(
        Date=date_dt.dt.dayofyear,
        hour=time_dt.dt.hour,
        minute=time_dt.dt.minute,
    )
    .drop(columns='Time')
)

energy_data.head(10)

  energy_data = pd.read_csv('data/household_power_consumption.txt', delimiter=';')


Unnamed: 0,Date,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,hour,minute
0,350,4.216,0.418,234.84,18.4,0.0,1.0,17.0,17,24
1,350,5.36,0.436,233.63,23.0,0.0,1.0,16.0,17,25
2,350,5.374,0.498,233.29,23.0,0.0,2.0,17.0,17,26
3,350,5.388,0.502,233.74,23.0,0.0,1.0,17.0,17,27
4,350,3.666,0.528,235.68,15.8,0.0,1.0,17.0,17,28
5,350,3.52,0.522,235.02,15.0,0.0,2.0,17.0,17,29
6,350,3.702,0.52,235.09,15.8,0.0,1.0,17.0,17,30
7,350,3.7,0.52,235.22,15.8,0.0,1.0,17.0,17,31
8,350,3.668,0.51,233.99,15.8,0.0,1.0,17.0,17,32
9,350,3.662,0.51,233.86,15.8,0.0,2.0,16.0,17,33


In [None]:
# Isolation Forest algorithm: train on every column of energy_data.
# contamination=0.05 sets the expected share of anomalous rows and
# random_state=42 fixes the seed so reruns produce the same model.
clf = IsolationForest(random_state=42, contamination=0.05).fit(energy_data)

# score the same rows the model was trained on; scikit-learn labels
# anomalies as -1 and normal points as 1
predictions = clf.predict(energy_data)

In [30]:
# attach the model's labels as a new 'Anomaly' column on a fresh frame;
# assign() returns a new DataFrame, so energy_data itself is left untouched
data_pred = energy_data.assign(Anomaly=predictions)
data_pred.head()

Unnamed: 0,Date,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,hour,minute,Anomaly
0,350,4.216,0.418,234.84,18.4,0.0,1.0,17.0,17,24,-1
1,350,5.36,0.436,233.63,23.0,0.0,1.0,16.0,17,25,-1
2,350,5.374,0.498,233.29,23.0,0.0,2.0,17.0,17,26,-1
3,350,5.388,0.502,233.74,23.0,0.0,1.0,17.0,17,27,-1
4,350,3.666,0.528,235.68,15.8,0.0,1.0,17.0,17,28,-1


In [33]:
# mean of every feature within each Anomaly label; the grouping key is
# excluded from the aggregated columns automatically, so no explicit
# column selection is needed
avg_anomaly = data_pred.groupby('Anomaly').mean()
avg_anomaly.head()

Unnamed: 0_level_0,Date,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,hour,minute
Anomaly,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
-1,178.638088,4.029838,0.216819,237.256957,17.098511,20.03086,15.11893,12.03855,15.395397,29.145339
1,181.151866,0.936972,0.118814,241.028432,3.971404,0.126716,0.57113,6.164758,11.299091,29.519124
