In [57]:
import pandas as pd
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_percentage_error
import seaborn as sns 

# Relative paths of the feather files to load, grouped by dataset folder.
# NOTE: the test dataset will be released one hour before the end of the
# Data Challenge, so it is not listed here.
FILES_FOLDER = {
    folder_name: [f"{folder_name}/{stem}.feather" for stem in file_stems]
    for folder_name, file_stems in {
        "data_for_data_challenge": ("building_data", "meter_data", "weather_data"),
    }.items()
}


def load_data(folder, data_dir, dict_files=FILES_FOLDER):
    """Read every feather file registered for ``folder`` into a DataFrame.

    Args:
        folder: Key of ``dict_files`` selecting the list of files to read.
        data_dir: Directory prepended to each relative file path.
        dict_files: Mapping from folder name to relative feather file paths.

    Returns:
        A list of DataFrames, in the same order as the paths listed in
        ``dict_files[folder]``.

    Raises:
        KeyError: If ``folder`` is not a key of ``dict_files``.
    """
    relative_paths = dict_files[folder]
    print(":: Start loading data")
    # tqdm only wraps the iterable to display a progress bar.
    return [
        pd.read_feather(os.path.join(data_dir, relative_path))
        for relative_path in tqdm(relative_paths)
    ]


# Load the three tables; the unpacking order follows the file order in FILES_FOLDER.
building, meters, weather = load_data(folder="data_for_data_challenge", data_dir="")

# Naive baseline: predict the global mean meter reading for every row.
meters["naive_pred"] = meters["meter_reading"].mean()

# NOTE(review): the recorded output is ~1.5e18. MAPE divides by |y_true|, so
# this presumably comes from zero (or near-zero) meter readings -- confirm
# before using this figure as a reference score.
naive_mape = mean_absolute_percentage_error(
    y_true=meters["meter_reading"],
    y_pred=meters["naive_pred"],
)
print("Naive performance on the training set :", naive_mape)



:: Start loading data


100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 22.06it/s]

Naive performance on the training set : 1.497798e+18





In [60]:
# Preview the first rows of the weather table (columns and visible NaNs).
weather.head()

Unnamed: 0,index,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2017-01-11 09:00:00,13.9,,12.8,0.0,1028.0,120.0,1.5
1,1,0,2016-03-21 22:00:00,18.299999,4.0,-3.3,0.0,1022.0,330.0,4.6
2,2,0,2016-04-29 20:00:00,31.700001,2.0,16.1,0.0,1013.0,,2.6
3,3,0,2017-01-15 06:00:00,18.9,6.0,16.700001,0.0,1026.0,30.0,3.1
4,4,0,2017-02-06 20:00:00,26.1,,8.9,0.0,1020.5,0.0,0.0


In [61]:
# Count the missing values in the wind_speed column.
weather['wind_speed'].isna().sum()

1392

Comme il y a beaucoup de NaN dans cloud_coverage (50%) et precip_depth_1_hr (30%), on décide de se débarrasser de ces colonnes, qui ne nous paraissent par ailleurs pas déterminantes. Au vu de la grande quantité de données et du faible pourcentage de lignes qui contiennent un NaN pour air_temperature (environ 0.6%, soit 1070 lignes sur 186368), on décide de nettoyer le dataset en supprimant ces lignes.

In [72]:
# Shape of the weather table as (rows, columns).
print(weather.shape)

(186368, 10)


In [73]:
# Fraction of weather rows with a missing wind_speed, computed from the data
# instead of hand-copied counts (1392 NaNs out of 186368 rows in the recorded
# run), so the value stays correct if the dataset changes.
# float() keeps the displayed value a plain Python float.
float(weather["wind_speed"].isna().mean())

0.007469093406593407

In [74]:
# Keep only the rows that have an air_temperature value; `weather` itself is
# left untouched because dropna returns a new frame by default.
new_weather = weather.dropna(subset=["air_temperature"])

In [75]:
# Shape of the cleaned table as (rows, columns), to see how many rows remain.
print(new_weather.shape)

(185298, 10)


In [76]:
# Remove the two columns the notebook text judges too sparse to be useful.
# NOTE(review): this cell is not idempotent -- re-running it raises KeyError
# because the columns are already gone.
new_weather = new_weather.drop(columns=["cloud_coverage", "precip_depth_1_hr"])

Une petite recherche sur Internet nous indique, au sujet de la température du point de rosée (dew_temperature) : "They found that the mean dew point temperature — the temperature at which air is saturated with water vapor — is the best predictor of increased energy demand."
Ainsi, on décide de garder la colonne et d'éliminer les lignes du dataset qui ne contiennent pas de nombre pour dew_temperature (il n'en reste que 85 une fois qu'on a éliminé les lignes ne contenant pas de nombre pour air_temperature).

In [79]:
# Keep only the rows that have a dew_temperature value. The name is rebound
# to the filtered frame instead of mutating in place (inplace=True), so the
# step reads like the other cleaning cells and can be chained.
new_weather = new_weather.dropna(subset=["dew_temperature"])

In [82]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs("clean_data", exist_ok=True)
# The DataFrame index is written as the first (unnamed) CSV column.
new_weather.to_csv("clean_data/weather.csv")