<hr style="margin-bottom: 50px;">
<center>
    <h1 style="margin-top: 0; margin-bottom: 0;">
        <b><u>Prétraitement des données</u></b>
    </h1>
</center>
<hr style="margin-top: 50px;">

# <b>1. Configuration du notebook</b>

In [6]:
# Imports des modules du projet
import config, src

# Imports classiques
import numpy as np
import pandas as pd

# Imports spécifiques
import holidays

In [None]:
# Load the raw dataset from the location configured for the project
data = src.load_data(config.RAW_DATA_FILE)

# Parse the timestamp column into proper datetime values
data = data.assign(date=pd.to_datetime(data['date']))

In [8]:
# Mapping from the raw column names to the readable names used in this project.
# Suffix convention: _T = temperature, _H = humidity, _P = pressure.
rename_dict = {
    # Timestamp and target / lighting consumption
    "date": "Date",
    "Appliances": "Energy",
    "lights": "Lights",
    # Indoor (and one outdoor) sensors, one temperature/humidity pair per location
    "T1": "Kitchen_T", "RH_1": "Kitchen_H",
    "T2": "Living_T", "RH_2": "Living_H",
    "T3": "Laundry_T", "RH_3": "Laundry_H",
    "T4": "Office_T", "RH_4": "Office_H",
    "T5": "Bath_T", "RH_5": "Bath_H",
    "T6": "Outside_T", "RH_6": "Outside_H",
    "T7": "Ironing_T", "RH_7": "Ironing_H",
    "T8": "TeenRoom_T", "RH_8": "TeenRoom_H",
    "T9": "ParentsRoom_T", "RH_9": "ParentsRoom_H",
    # Weather measurements
    "T_out": "Weather_T",
    "Press_mm_hg": "Weather_P",
    "RH_out": "Weather_H",
    "Windspeed": "Weather_WindSpeed",
    "Visibility": "Weather_Visibility",
    "Tdewpoint": "Weather_Dewpoint",
    # Random variables shipped with the dataset
    "rv1": "RandVar_1",
    "rv2": "RandVar_2",
}

# Rebind instead of mutating in place; labels absent from the frame are ignored by rename.
data = data.rename(columns=rename_dict)

In [None]:
# Persist the frame with its renamed columns under the name 'renamed_data'
# (storage location and file format are decided by src.save_data, not visible here).
src.save_data(data, 'renamed_data')

In [None]:
# Weekday index -> day name, using the pandas convention where 0 is Monday
days = dict(enumerate([
    'monday',
    'tuesday',
    'wednesday',
    'thursday',
    'friday',
    'saturday',
    'sunday',
]))

# Show the names in calendar order
days.values()

In [None]:
# One-hot encode the day of the week: one 0/1 column per day, Monday first.
# The timestamp column is read as 'Date' because the rename cell above maps
# 'date' -> 'Date'; data['date'] raises KeyError on a fresh Restart & Run All.
day_names = list(days.values())
weekday_labels = data['Date'].dt.weekday.map(days)

# reindex guarantees all seven columns in calendar order and fills a day that
# never occurs in the data with 0 instead of raising KeyError.
one_hot = (
    pd.get_dummies(weekday_labels)
    .reindex(columns=day_names, fill_value=0)
    .astype(int)
)

# Drop day columns left over from a previous execution so that re-running this
# cell does not create duplicated columns.
data = pd.concat([data.drop(columns=day_names, errors='ignore'), one_hot], axis=1)
data.info()

In [None]:
# Preview the first rows to check the newly added day-of-week columns
data.head()

In [34]:
# Cyclical encoding of the hour of day: projecting the hour onto a circle keeps
# 23h and 0h close to each other, which a raw 0-23 integer would not.
# The timestamp column is 'Date' (renamed from 'date' by the rename cell above).
# Only the hour component is used; minutes are ignored, as in the original encoding.
hour_angle = 2 * np.pi * data['Date'].dt.hour / 24

# A local variable replaces the temporary 'hour' column, so nothing has to be dropped.
data['sin_hour'] = np.sin(hour_angle)
data['cos_hour'] = np.cos(hour_angle)

In [None]:
# Flag rows that fall on a Belgian public holiday.
# The timestamp column is 'Date' (renamed from 'date' by the rename cell above).
holiday_years = data['Date'].dt.year.unique().tolist()  # plain Python ints
be_holidays = holidays.Belgium(years=holiday_years)
be_holidays = {pd.Timestamp(date): name for date, name in be_holidays.items()}

# Holiday keys are midnight timestamps, so each row is compared on its calendar
# day (normalize() resets the time to 00:00). Comparing the raw timestamps would
# only flag the single 00:00 sample of every holiday.
row_days = data['Date'].dt.normalize()
data['is_holiday'] = row_days.isin(list(be_holidays)).astype(int)

# Count distinct holiday days rather than flagged rows
n_holidays = row_days[data['is_holiday'] == 1].nunique()
print(f'Il y a {n_holidays} jour(s) férié(s) dans le dataset.')

In [36]:
# Belgian school holiday periods as (first day, last day) pairs, both inclusive.
be_school_holidays_2016 = [
    ("2016-02-08", "2016-02-14"),  # Carnival
    ("2016-03-28", "2016-04-10"),  # Easter
    ("2016-07-01", "2016-08-31"),  # Summer
    ("2016-10-31", "2016-11-06"),  # All Saints
    ("2016-12-26", "2017-01-08")   # Christmas
]

def is_school_holiday(date, holidays):
    """Tell whether a timestamp falls inside one of the given holiday periods.

    Args:
        date: Timestamp (or anything ``pd.Timestamp`` accepts) to test.
        holidays: Iterable of ``(start, end)`` pairs, both bounds inclusive.
            The parameter name shadows the imported ``holidays`` module inside
            this function only; it is kept for backward compatibility.

    Returns:
        True if the calendar day of ``date`` lies within any period, else False.
    """
    # Compare on the calendar day: the end bound parses to midnight, so a raw
    # comparison would exclude every intraday timestamp of the last holiday day.
    day = pd.Timestamp(date).normalize()
    return any(
        pd.Timestamp(start) <= day <= pd.Timestamp(end)
        for start, end in holidays
    )

In [None]:
# Flag rows that fall within a school holiday period (1 = holiday, 0 = not).
# The timestamp column is 'Date' (renamed from 'date' by the rename cell above).
data['is_school_holiday'] = (
    data['Date']
    .apply(is_school_holiday, args=(be_school_holidays_2016,))
    .astype(int)
)

# Number of distinct calendar days flagged; the column holds ints, so compare with 1.
flagged = data['is_school_holiday'] == 1
n_school_holidays = data.loc[flagged, 'Date'].dt.normalize().nunique()
print(f'Il y a {n_school_holidays} jour(s) de vacances scolaires dans le dataset.')

In [38]:
def get_season(date):
    """Return the season code of a date.

    Codes: 0 = spring (Mar 21 - Jun 20), 1 = summer (Jun 21 - Sep 20),
    2 = fall (Sep 21 - Dec 20), 3 = winter (Dec 21 - Mar 20).

    Args:
        date: Object exposing ``month`` and ``day`` (e.g. ``pd.Timestamp``).

    Returns:
        The integer season code.
    """
    # Compare (month, day) only, so the time of day cannot push a timestamp past
    # a boundary. Comparing against midnight of the 20th sent every intraday
    # sample of Jun 20, Sep 20 and Dec 20 to the winter fallback.
    month_day = (date.month, date.day)
    if (3, 21) <= month_day <= (6, 20):
        return 0  # spring
    if (6, 21) <= month_day <= (9, 20):
        return 1  # summer
    if (9, 21) <= month_day <= (12, 20):
        return 2  # fall
    return 3  # winter: Dec 21 - Dec 31 and Jan 1 - Mar 20

In [39]:
# Season code -> season name, in the order of the codes returned by get_season
# (0 = spring, 1 = summer, 2 = fall, 3 = winter)
index_to_season = dict(enumerate(['spring', 'summer', 'fall', 'winter']))

In [None]:
# Derive the season code of every row with get_season.
# The timestamp column is 'Date' (renamed from 'date' by the rename cell above).
data['season'] = data['Date'].apply(get_season)

# Number of distinct seasons present in the data
n_seasons = data['season'].nunique()
print(f'Il y a {n_seasons} saisons différentes dans le dataset.')

In [None]:
# Number of rows per season code, as produced by get_season
data['season'].value_counts()

In [42]:
# Re-encode the season as a binary feature: spring (0) stays 0, winter (3) becomes 1.
# NOTE(review): codes 1 (summer) and 2 (fall) are left untouched, so summer would
# collide with the recoded winter if it were present - this presumably relies on
# the dataset containing only winter and spring; confirm with the counts above.
encode_winter_spring = {0: 0, 3: 1}

recoded_season = data['season'].replace(encode_winter_spring)
data['season'] = recoded_season

In [None]:
# Number of rows per season code after the winter/spring re-encoding
data['season'].value_counts()

In [None]:
# Preview the first rows of the frame with all engineered features
data.head()

In [None]:
# Count fully duplicated rows (True) versus unique rows (False)
data.duplicated().value_counts()

In [None]:
# Persist the fully pre-processed frame under the name 'processed_data'.
# save_data lives in the project's src module (only `src` is imported, and the
# earlier save cell calls it the same way); the bare name raises NameError.
src.save_data(data, 'processed_data')