In [2]:
import pandas as pd
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_percentage_error
import seaborn as sns 

FILES_FOLDER = {
    "data_for_data_challenge": [
        "data_for_data_challenge/building_data.feather",
        "data_for_data_challenge/meter_data.feather",
        "data_for_data_challenge/weather_data.feather" # The dataset test will be released one hour before the end of the Data Challenge
    ]
}


def load_data(folder, data_dir, dict_files=FILES_FOLDER):
    files = dict_files[folder]
    dataframes = []
    print(":: Start loading data")
    for name_file in tqdm(files):
        dataframe = pd.read_feather(os.path.join(data_dir, name_file))
        dataframes.append(dataframe)
    return dataframes


building, meters, weather = load_data("data_for_data_challenge", "")



:: Start loading data


100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 17.46it/s]


In [3]:
meters.head()

Unnamed: 0,index,building_id,timestamp,meter,meter_reading
0,0,83,2017-01-11 09:00:00,0,1.0989
1,1,83,2016-03-21 22:00:00,0,0.0
2,2,83,2016-04-29 20:00:00,0,0.0
3,3,83,2017-01-15 06:00:00,0,2.7405
4,4,83,2017-02-06 20:00:00,0,2.1296


In [4]:
print(meters.shape)
meters["meter"].unique()

(976896, 5)


array([ 0,  1, -1,  3,  2], dtype=int32)

In [5]:
print("Number of electricity measures :",len(meters[meters['meter']==0]))
print("Number of chilledwater measures :",len(meters[meters['meter']==1]))
print("Number of steam measures :",len(meters[meters['meter']==2]))
print("Number of hotwater measures :",len(meters[meters['meter']==3]))
print("Number of measures with unknown type:",len(meters[meters['meter']==-1]))

Number of electricity measures : 581842
Number of chilledwater measures : 203081
Number of steam measures : 135721
Number of hotwater measures : 56239
Number of measures with unknown type: 13


In [6]:
meters[meters['meter']==-1]

Unnamed: 0,index,building_id,timestamp,meter,meter_reading
8933,8933,7,2017-06-15 21:00:00,-1,552.532471
40077,40078,21,2016-10-18 02:00:00,-1,54.0588
444752,444767,836,2016-12-27 17:00:00,-1,13.9583
639852,639869,1076,2016-04-27 16:00:00,-1,54.6875
741070,741091,1140,2016-11-23 16:00:00,-1,563.002014
741979,742000,1140,2016-07-29 01:00:00,-1,935.625977
782528,782550,1247,2016-04-09 08:00:00,-1,97.199997
799287,799309,1286,2016-03-11 04:00:00,-1,337.579987
807045,807067,1235,2016-05-12 09:00:00,-1,360.187988
811798,811820,1301,2017-02-16 12:00:00,-1,1916.55542


On décide de supprimer les lignes avec pour meter -1, qui ne correspond à rien de connu comme type d'énergie.

In [7]:
new_meters=meters[meters['meter']!=-1]

Il n'y a plus de NaN dans le dataset, on peut l'enregistrer tel quel. On applique juste l'algo de tanguy pour modifier les outliers

In [8]:
def remove_meters_outliers(meters):
    qt2 = meters.groupby('meter').quantile(q = 0.75)
    qt1 = meters.groupby('meter').quantile(q = 0.25) #this is a dataframe
    iqr = qt2.meter_reading - qt1.meter_reading #this is a dataframe
    print(iqr)
    print(iqr.loc[2])
    for k in range(0,4):
        print(qt1.loc[k, 'meter_reading'])
        print(f"clipping values lower than {qt1.loc[k, 'meter_reading'] - 2*iqr.loc[k]} and greater than {qt2.loc[k,'meter_reading'] + 2*iqr.loc[k]}")
        filter_meter_k = meters.loc[(meters.meter == k) & ((meters.meter_reading > qt2.loc[k, 'meter_reading'] + 2*iqr.loc[k] ) | (meters.meter_reading <qt1.loc[k, 'meter_reading'] - 2*iqr.loc[k]))]
        meters = meters.drop(filter_meter_k.index)
    return meters
new_meters = remove_meters_outliers(new_meters)

meter
0     152.350668
1     450.295398
2    1165.055519
3     213.763202
Name: meter_reading, dtype: float64
1165.055519104004
19.55132484436035
clipping values lower than -285.15001106262207 and greater than 476.603328704834
12.027600288391113
clipping values lower than -888.5631952285767 and greater than 1362.9137935638428
67.0999984741211
clipping values lower than -2263.0110397338867 and greater than 3562.266555786133
0.08079999685287476
clipping values lower than -427.44560354948044 and greater than 641.3704053163528


In [10]:
new_meters.head()

Unnamed: 0,index,building_id,timestamp,meter,meter_reading
0,0,83,2017-01-11 09:00:00,0,1.0989
1,1,83,2016-03-21 22:00:00,0,0.0
2,2,83,2016-04-29 20:00:00,0,0.0
3,3,83,2017-01-15 06:00:00,0,2.7405
4,4,83,2017-02-06 20:00:00,0,2.1296


In [11]:
new_meters.to_csv("clean_data/meters.csv")