# Internet of Things (2IMN25)
### Machine Learning Assignment

### Introduction

*Add assignment description here*

In [97]:
""" Import all libraries here """
import numpy as np
import os
import math
from datetime import datetime
import pandas as pd
from dateutil import parser


## Reading the dataset

In the code box below, please read the data-sets into suitable data structures. The key task here is to construct the feature vectors which will be fed to the regression algorithm.

The datasets for the weather, rain and energy consumption have been provided in the zip file. Extract this zip file into the same folder as the Python notebook.

*Hint : Be sure to check if there are any missing fields in the provided data*

In [70]:
data_path = os.path.join(os.getcwd(),'data')

""" Read in the weather"""
# Column 0 is parsed as the time-stamp; only columns 0,2,4,6,7,8 are loaded.
weather_array = pd.read_csv(os.path.join(data_path,'Weather-Eindhoven.csv'),
                         delimiter = ',',
                         header=0,
                         parse_dates = [0],
                         usecols = [0,2,4,6,7,8]) #Drop "chill", too many missing values
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the equivalent
# accessor and is available on both old and new pandas versions.
weather_array  = weather_array.values

""" Read in the rain data"""
# Column 0 is parsed as the time-stamp; all columns of the file are loaded.
rain_array = pd.read_csv(os.path.join(data_path,'Rain-Best.csv'),
                         delimiter = ',',
                         header=0,
                         parse_dates = [0])

# DataFrame.as_matrix() was removed in pandas 1.0; .values is the equivalent
# accessor and is available on both old and new pandas versions.
rain_array = rain_array.values

# Synchronize the 2 data-sets by removing the extra data from the rainfall
# dataset: every rain row whose time-stamp minute is not a multiple of 10
# is marked for removal.
remove_rows = [
    row_idx
    for row_idx, time_stamp in enumerate(rain_array[:,0])
    if time_stamp.minute % 10 != 0
]

synced_rain_array = np.delete(rain_array, remove_rows, axis=0)

# Show how many rows each data-set has after synchronization.
print(synced_rain_array.shape)
print(weather_array[:,0].shape)

""" Merge the weather and rain data when time-stamps are equal """
# Index the weather rows by time-stamp once, instead of rebuilding and
# linearly scanning the time-stamp list for every rain row.
# setdefault keeps the FIRST row for a repeated time-stamp, which is the row
# list.index() would have returned.
weather_row_by_time = {}
for weather_idx, weather_time in enumerate(weather_array[:,0]):
    weather_row_by_time.setdefault(weather_time, weather_idx)

merged_array = []
for rainElem in synced_rain_array:
    valid_row = weather_row_by_time.get(rainElem[0])
    if valid_row is None:
        print('Did not find matching timestamp in the weather data, continuing')
        continue
    # Merged row = all weather features (time-stamp first) followed by the rain value.
    merged_row = list(weather_array[valid_row])
    merged_row.append(rainElem[1])
    merged_array.append(np.asarray(merged_row))

merged_array = np.asarray(merged_array)

(43844, 2)
(43872,)
Did not find matching timestamp in the weather data, continuing
Did not find matching timestamp in the weather data, continuing
Did not find matching timestamp in the weather data, continuing
Did not find matching timestamp in the weather data, continuing
Did not find matching timestamp in the weather data, continuing
Did not find matching timestamp in the weather data, continuing
Did not find matching timestamp in the weather data, continuing
Did not find matching timestamp in the weather data, continuing
Did not find matching timestamp in the weather data, continuing
Did not find matching timestamp in the weather data, continuing
Did not find matching timestamp in the weather data, continuing
Did not find matching timestamp in the weather data, continuing
Did not find matching timestamp in the weather data, continuing
Did not find matching timestamp in the weather data, continuing
Did not find matching timestamp in the weather data, continuing
Did not find matchin

In [80]:
# Clean up the data by removing rows with missing data points.
# A row is dropped when either
#   * its time-stamp (column 0) falls in a month after October, since there is
#     no energy data for the month of November, or
#   * any of its remaining columns is NaN (the time-stamp itself cannot be
#     "checked" with isnan, so it is skipped).
# Each row index is collected at most once.
missing_data_rows = [
    idx
    for idx, row in enumerate(merged_array)
    if row[0].month > 10 or any(math.isnan(value) for value in row[1:])
]
clean_data_array = np.delete(merged_array, missing_data_rows, axis=0)
print(clean_data_array.shape)

(43501, 7)


In [None]:
""" Some helper APIs """
def energy_lookup(timeStamp,dataFolder):
    """
    Looks up the energy consumption based on the time-stamp.

    The energy file is expected at <dataFolder>/<MM>/<DD>, with month and day
    zero-padded to two digits. Every row of that file whose time (column 1)
    has the same hour and minute as timeStamp contributes its energy value
    (column 3) to the result.
    NOTE(review): no file extension is appended to the day file name --
    confirm this matches the files on disk.

    Parameters:
        timeStamp  : object exposing .month, .day, .hour and .minute
        dataFolder : folder that contains the per-month sub-folders

    Returns the mean energy consumption for the provided time-stamp, or NaN
    when no row of the file matches it.
    """
    # '{:02d}' pads every single-digit value (1..9); the previous "< 9" check
    # left month/day 9 unpadded, and the day file was looked up in dataFolder
    # instead of the month folder for days that needed no padding.
    monthFolder = os.path.join(dataFolder,'{:02d}'.format(timeStamp.month))
    energyFile = os.path.join(monthFolder,'{:02d}'.format(timeStamp.day))

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent.
    energy_array = pd.read_csv(energyFile,
                               delimiter = ',',
                               header=0
                               ).values

    energy_values = []
    for row in energy_array:
        try :
            timeObj = parse_time(row[1])
        except ValueError:
            # Report the time string that could not be parsed and skip the row.
            print('{}'.format(row[1]))
            continue
        if timeObj.hour == timeStamp.hour and timeObj.minute == timeStamp.minute:
            energy_values.append(row[3])

    if not energy_values:
        # No sample for this time-stamp: return NaN explicitly rather than
        # taking the mean of an empty array.
        return float('nan')

    return(np.mean(np.asarray(energy_values)))

def parse_time(timeString):
    """
    Parses timeString and returns the time-of-day part of the result
    as a time object which can be easily analyzed
    """
    parsed = parser.parse(timeString)
    return parsed.time()

def create_label_vector(feature_array,labelDataFolder):
    """
    Builds the list of labels for feature_array: one energy_lookup result per
    feature vector, keyed on the time-stamp stored in its first element
    """
    return [
        energy_lookup(vector[0], labelDataFolder)
        for vector in feature_array
    ]

# Folder handed to the label lookup: the '2017' sub-folder of the data directory.
labelDataFolder = os.path.join(data_path,'2017')
# One label per row of the cleaned feature array, looked up from each row's time-stamp (column 0).
labels = create_label_vector(clean_data_array,labelDataFolder)
    

