# 1_prepare_data

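A notebook that prepares the data for the tests: it merges the daily electricity consumption and weather data, adds population-weighted weather features and calendar features, and saves the pre-2020 data, the 2020 data, and a train/test split as CSV files.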

In [None]:
from datetime import datetime

import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the two raw CSV inputs: the daily consumption table and the weather table.
dfp_consumption = pd.read_csv(
    filepath_or_buffer='./data/rtu/rte_daily_consumption.csv',
)
dfp_weather = pd.read_csv(
    filepath_or_buffer='./data/rtu/nasa_weather.csv',
)

In [None]:
# Preview the first five rows of the consumption data.
dfp_consumption.head(n=5)

In [None]:
# Preview the first five rows of the weather data.
dfp_weather.head(n=5)

In [None]:
# Combine the electrical consumption and the weather data: keep the rows whose
# 'date' value appears in both tables (inner join), then parse the 'date'
# column into pandas datetimes.
dfp_data = pd.merge(
    left=dfp_consumption,
    right=dfp_weather,
    how='inner',
    on=['date'],
)
dfp_data['date'] = pd.to_datetime(dfp_data['date'])

In [None]:
# Compute population-weighted averages of the temperature and precipitation
# columns over the 11 French cities listed below. Each city's weight is its
# population divided by the total population of the listed cities.
dict_cities_population = {
    'bordeaux': 257068,
    'lille': 233098,
    'paris': 2175601,
    'rennes': 217728,
    'nantes': 314138,
    'toulouse': 486828,
    'marseille': 868277,
    'lyon': 518635,
    'nice': 341138,
    'strasbourg': 284677,
    'montpellier': 290053,
}

# Total population of the listed cities; dividing by it makes the weights sum to 1.
sum_population_cities = sum(dict_cities_population.values())
for parameter in ['t2m', 't2m_min', 't2m_max', 'prectot']:
    # Column-wise arithmetic: scale each '<parameter>_<city>' column by its
    # city's weight and add the scaled columns together, in the dictionary's
    # order. This avoids calling a Python function once per row
    # (DataFrame.apply with axis=1) while performing the same operations.
    dfp_data[f'weighted_{parameter}'] = sum(
        (population / sum_population_cities) * dfp_data[f'{parameter}_{city}']
        for city, population in dict_cities_population.items()
    )

In [None]:
# Derive calendar features from each row's 'date' timestamp:
# the weekday number, the month number and the week number.
dfp_data['weekday'] = dfp_data['date'].apply(
    lambda timestamp: timestamp.weekday()
)
dfp_data['month'] = dfp_data['date'].apply(
    lambda timestamp: timestamp.month
)
dfp_data['week_number'] = dfp_data['date'].apply(
    lambda timestamp: timestamp.week
)

In [None]:
# Preview the first five rows of the merged data with the added features.
dfp_data.head(n=5)

In [None]:
# Partition the rows around 1 January 2020: rows dated strictly before it are
# kept for modelling, rows dated on or after it form the 2020 set.
dfp_data_model = dfp_data.loc[dfp_data['date'] < datetime(2020, 1, 1)]
dfp_data_2020 = dfp_data.loc[dfp_data['date'] >= datetime(2020, 1, 1)]

In [None]:
# Save the data in files.
# index=False leaves the DataFrame index out of the CSV; the explicit boolean
# replaces index=None, which only worked because None is falsy.
dfp_data_model.to_csv('./data/rtu/model_data.csv', index=False)
dfp_data_2020.to_csv('./data/rtu/2020_data.csv', index=False)

In [None]:
# Build a training and testing set (save them in files).
# 20% of the pre-2020 rows go to the test set; random_state=0 makes the split
# reproducible.
# NOTE(review): train_test_split shuffles rows by default, so train and test
# rows are interleaved in time — confirm this is intended for dated data.
dfp_train, dfp_test = train_test_split(dfp_data_model, test_size=0.2, random_state=0)
# index=False leaves the DataFrame index out of the CSV; the explicit boolean
# replaces index=None, which only worked because None is falsy.
dfp_train.to_csv('./data/rtu/model_train_data.csv', index=False)
dfp_test.to_csv('./data/rtu/model_test_data.csv', index=False)