# Let's parse the Irish dataset

# Data files

In [None]:
%%time
from pathlib import Path
import pandas as pd
import numpy as np
from irish_preprocess import preprocess_irish_data

In [None]:
# All inputs and outputs of this notebook live under a single dataset root.
dataset_root = Path('/cw/dtaidata/ml/2023-irishElectricityProfiles/irish')
irish_path = dataset_root / 'original_dataset'     # input handed to preprocess_irish_data
output_path = dataset_root / 'before_raw_data'     # pickles produced by preprocess_irish_data
result_path = dataset_root / 'raw_data'            # filtered pickles written by this notebook
preprocessed_path = dataset_root / 'preprocessed'  # target of the final transformation step

# Only the leaf directories are created: a missing dataset root (e.g. the
# shared drive is not mounted) should fail here rather than be created silently.
for directory in (output_path, result_path, preprocessed_path):
    directory.mkdir(exist_ok=True)

# Re-run the preprocessing unless *every* pickle read below is already cached.
# Checking only one of them would crash in read_pickle on a partially written cache.
# NOTE(review): assumes preprocess_irish_data writes all three files - confirm.
raw_pickle_names = ('raw_data_df.pkl', 'raw_allocation_df.pkl', 'raw_yearly_info_df.pkl')
if not all((output_path / name).exists() for name in raw_pickle_names):
    preprocess_irish_data(irish_path, output_path)

raw_data_df = pd.read_pickle(output_path / 'raw_data_df.pkl')
allocation_df = pd.read_pickle(output_path / 'raw_allocation_df.pkl')
yearly_info_df = pd.read_pickle(output_path / 'raw_yearly_info_df.pkl')

## Data df

In [None]:
# Keep only the profiles of type 'Residential' that have answered the survey,
# i.e. that also appear in the index of yearly_info_df.
# 'Other' profiles have answered the survey as well, but they have LOTS of missing data.
is_residential = allocation_df['type'] == 'Residential'
residential_profiles = allocation_df.loc[is_residential].index
surveyed_residential = yearly_info_df.index.intersection(residential_profiles)
data_df = raw_data_df.loc[surveyed_residential]


In [None]:
# Display the filtered data as the cell output for a quick visual check.
data_df

In [None]:
# Save the filtered data; this pickle is passed by path to
# transform_raw_data_and_save further down.
data_df.to_pickle(result_path/'raw_data_df.pkl')

# Info df 

In [None]:
# Restrict the info table to exactly the profiles kept in data_df
# (.loc with the index of data_df also puts the rows in that order).
yearly_info_df = yearly_info_df.loc[data_df.index]

In [None]:
# Save the filtered info table as-is, before it is converted to a purely
# numerical representation in the next cell.
yearly_info_df.to_pickle(result_path/'raw_info_df_features.pkl')

In [None]:
# Placeholder written into each column with missing values; the numeric ones
# must be filled before the integer casts below, since NumPy integer dtypes
# cannot represent NaN.
missing_value_fill = {
    'age': -1,
    'home_type': 'Unknown',
    'build_year': -1,
    'home_age': -1,
    'floor_area': -1,
    'number_of_bedrooms': -1,
}
# Integer dtype each numeric column is cast to.
# NOTE(review): int8 tops out at 127 - confirm 'age' and 'home_age' never exceed it.
compact_dtypes = {
    'age': 'int8',
    'build_year': 'int16',
    'home_age': 'int8',
    'floor_area': 'int32',
    'number_of_bedrooms': 'int8',
}
categorical_columns = ['home_type', 'cooking']

filled_info_df = yearly_info_df.fillna(missing_value_fill).astype(compact_dtypes)
# One-hot encode the categorical columns: get_dummies removes the originals and
# appends the indicator columns after the remaining ones. 'cooking' is not
# filled above, so a missing value there yields all-zero 'cooking_*' indicators.
yearly_info_df = pd.get_dummies(
    filled_info_df,
    columns=categorical_columns,
    prefix=categorical_columns,
)
yearly_info_df.to_pickle(result_path / 'raw_info_df_numerical.pkl')

## Daily info df

In [None]:
from util import check_dataset, transform_raw_data_and_save

# The transformation takes its inputs as paths to the pickles saved by the
# earlier cells, not as the in-memory frames.
filtered_data_pickle = result_path / 'raw_data_df.pkl'
numerical_info_pickle = result_path / 'raw_info_df_numerical.pkl'

yearly_data_df, daily_data_df, yearly_info_df, daily_info_df = transform_raw_data_and_save(
    raw_data_df=filtered_data_pickle,
    yearly_info_df=numerical_info_pickle,
    result_path=preprocessed_path,
    weather_city='Dublin',
    holiday_country='ireland',
    year_to_use_as_index=2010,
)
# Run the project's checks on the four resulting tables.
check_dataset((yearly_data_df, daily_data_df, yearly_info_df, daily_info_df))

In [None]:
# Display the yearly data returned by transform_raw_data_and_save.
yearly_data_df

In [None]:
# Show the dtypes of the daily data rather than the (larger) table itself.
daily_data_df.dtypes

In [None]:
# Display the yearly info table returned by transform_raw_data_and_save.
yearly_info_df

In [None]:
# Display the daily info table returned by transform_raw_data_and_save.
daily_info_df