## Import Libraries

In [26]:
# Import libraries
import warnings
import pandas as pd
from unidecode import unidecode

# Silence every warning for the rest of the session: no category or module
# filter is given, so the 'ignore' action applies to all of them.
# NOTE(review): this also hides pandas warnings raised by later cells — confirm
# that is intended before relying on a clean output.
warnings.filterwarnings('ignore')

## Load data
   
   Data source: https://datos.madrid.es/portal/site/egob

In [27]:
# Read the working calendar from a semicolon-separated flat file whose first
# row holds the column names.
# NOTE(review): the path below looks like a placeholder — set it to the real file.
working_calendar = "add_path_flat_file"
working_calendar_df = pd.read_csv(
    working_calendar,
    sep=';',
    header=0,
)

## Support Functions

In [33]:
# Function to replace placeholder values with a day-type label ('workingday' / 'weekend')
def enrich_values_based_on_day_of_week(df, column_to_update, day_of_week_column, value_to_replace='-'):
    """Replace placeholder entries of a column with a label derived from the day of the week.

    Rows whose ``column_to_update`` equals ``value_to_replace`` receive
    ``'workingday'`` when the day of the week is 0-4 (Monday to Friday) and
    ``'weekend'`` when it is 5 or 6 (Saturday and Sunday). All other rows are
    left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the result is built on a copy.
    column_to_update : str
        Name of the column whose placeholder entries are replaced.
    day_of_week_column : str
        Name of the column holding the day of the week as an integer (0-6).
    value_to_replace : object, default '-'
        Placeholder value that marks an entry to be replaced.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``column_to_update`` enriched.
    """
    day_type_by_day_of_week = {
        0: 'workingday',
        1: 'workingday',
        2: 'workingday',
        3: 'workingday',
        4: 'workingday',
        5: 'weekend',
        6: 'weekend',
    }

    # Vectorised lookup; days outside 0-6 (or missing) yield NaN.
    replacement_values = df[day_of_week_column].map(day_type_by_day_of_week)

    # Only overwrite a placeholder when a label is available, so an unexpected
    # day-of-week value never turns the placeholder back into a missing value.
    is_placeholder = df[column_to_update] == value_to_replace
    needs_update = is_placeholder & replacement_values.notna()

    # Work on a copy so the caller's frame is not mutated as a side effect.
    enriched_df = df.copy()
    enriched_df[column_to_update] = df[column_to_update].where(~needs_update, replacement_values)

    return enriched_df

## Data Cleaning and Preprocessing

In [28]:
# Discard the two 'Unnamed' columns, then preview the first rows
working_calendar_df = working_calendar_df.drop(
    ['Unnamed: 5', 'Unnamed: 6'],
    axis='columns',
)
working_calendar_df.head()


Unnamed: 0,Dia,Dia_semana,laborable / festivo / domingo festivo,Tipo de Festivo,Festividad
0,01/01/2013,martes,festivo,Festivo nacional,Año Nuevo
1,02/01/2013,miercoles,laborable,,
2,03/01/2013,jueves,laborable,,
3,04/01/2013,viernes,laborable,,
4,05/01/2013,sabado,sabado,,


In [29]:
# Give the Spanish column headers English names (the frame is renamed in place)
working_calendar_df.rename(
    columns=dict([
        ('Dia', 'date'),
        ('Dia_semana', 'week_day'),
        ('laborable / festivo / domingo festivo', 'WorkingDay_Holiday_Sunday'),
        ('Tipo de Festivo', 'holiday_type'),
        ('Festividad', 'holiday_desc'),
    ]),
    inplace=True,
)
display(working_calendar_df)

Unnamed: 0,date,week_day,WorkingDay_Holiday_Sunday,holiday_type,holiday_desc
0,01/01/2013,martes,festivo,Festivo nacional,Año Nuevo
1,02/01/2013,miercoles,laborable,,
2,03/01/2013,jueves,laborable,,
3,04/01/2013,viernes,laborable,,
4,05/01/2013,sabado,sabado,,
...,...,...,...,...,...
4377,27/12/2024,viernes,,,
4378,28/12/2024,sábado,,,
4379,29/12/2024,domingo,,,
4380,30/12/2024,lunes,,,


In [30]:
# Parse the day/month/year strings and derive year, month, day and day of the week
working_calendar_df['date'] = pd.to_datetime(working_calendar_df['date'], format='%d/%m/%Y')
working_calendar_df['year'] = working_calendar_df['date'].dt.year
working_calendar_df['month'] = working_calendar_df['date'].dt.month
working_calendar_df['day'] = working_calendar_df['date'].dt.day
working_calendar_df['day_of_week'] = working_calendar_df['date'].dt.dayofweek
display(working_calendar_df.dtypes)

# We are only interested in FY2019.
# Take an explicit copy: later cells assign into this frame, and assigning into
# the bare result of a boolean filter is chained assignment.
working_calendar_df = working_calendar_df[working_calendar_df['year'] == 2019].copy()
working_calendar_df.head()

date                         datetime64[ns]
week_day                             object
WorkingDay_Holiday_Sunday            object
holiday_type                         object
holiday_desc                         object
year                                  int32
month                                 int32
day                                   int32
day_of_week                           int32
dtype: object

Unnamed: 0,date,week_day,WorkingDay_Holiday_Sunday,holiday_type,holiday_desc,year,month,day,day_of_week
2190,2019-01-01,martes,festivo,Festivo nacional,Año Nuevo,2019,1,1,1
2191,2019-01-02,miercoles,laborable,,,2019,1,2,2
2192,2019-01-03,jueves,laborable,,,2019,1,3,3
2193,2019-01-04,viernes,laborable,,,2019,1,4,4
2194,2019-01-05,sabado,sabado,,,2019,1,5,5


In [31]:
# Remove accents from week_day.
# na_action='ignore' leaves missing entries missing instead of turning them
# into the literal string 'nan' via str().
working_calendar_df['week_day'] = working_calendar_df['week_day'].map(
    lambda x: unidecode(str(x)), na_action='ignore'
)
print(working_calendar_df['week_day'].unique())

# Lowercase values from 'WorkingDay_Holiday_Sunday' and 'holiday_type'
working_calendar_df['WorkingDay_Holiday_Sunday'] = working_calendar_df['WorkingDay_Holiday_Sunday'].str.lower()
working_calendar_df['holiday_type'] = working_calendar_df['holiday_type'].str.lower()
print(working_calendar_df['WorkingDay_Holiday_Sunday'].unique())
print(working_calendar_df['holiday_type'].unique())

['martes' 'miercoles' 'jueves' 'viernes' 'sabado' 'domingo' 'lunes']
['festivo' 'laborable' 'sabado' 'domingo' nan]
['festivo nacional' nan 'festivo de la comunidad de madrid'
 'festivo local de la ciudad de madrid']


In [32]:
# Translation tables: Spanish value -> English value, one table per column
replace_dict_week_day = {
    'lunes': 'monday',
    'martes': 'tuesday',
    'miercoles': 'wednesday',
    'jueves': 'thursday',
    'viernes': 'friday',
    'sabado': 'saturday',
    'domingo': 'sunday',
}
replace_dict_work_hol_weekend = {
    'sabado': 'weekend',
    'domingo': 'weekend',
    'festivo': 'holiday',
    'laborable': 'workingday',
}
replace_dict_hol_type = {
    'festivo nacional': 'national_holiday',
    'festivo de la comunidad de madrid': 'madrid_community_public_holiday',
    'festivo local de la ciudad de madrid': 'local_holiday_in_the_city_of_madrid',
}

# Apply each table to its column and print the resulting distinct values
for column_name, translation in (
    ('week_day', replace_dict_week_day),
    ('WorkingDay_Holiday_Sunday', replace_dict_work_hol_weekend),
    ('holiday_type', replace_dict_hol_type),
):
    working_calendar_df[column_name] = working_calendar_df[column_name].replace(translation)
    print(working_calendar_df[column_name].unique())

['tuesday' 'wednesday' 'thursday' 'friday' 'saturday' 'sunday' 'monday']
['holiday' 'workingday' 'weekend' nan]
['national_holiday' nan 'madrid_community_public_holiday'
 'local_holiday_in_the_city_of_madrid']


In [39]:
replace_val = '-'

# Replace NaN values with a placeholder value.
# Reassign rather than using inplace=True: the frame comes from a boolean
# filter, and in-place modification of such a frame is chained assignment.
working_calendar_df = working_calendar_df.fillna(replace_val)

# Replace the placeholder in each column with a label derived from day_of_week
for col_name in ['WorkingDay_Holiday_Sunday', 'holiday_type', 'holiday_desc']:
    working_calendar_df = enrich_values_based_on_day_of_week(
        working_calendar_df, col_name, 'day_of_week', value_to_replace=replace_val
    )


In [40]:
# Show the calendar DataFrame as it stands after the previous cells
display(working_calendar_df)

Unnamed: 0,date,week_day,WorkingDay_Holiday_Sunday,holiday_type,holiday_desc,year,month,day,day_of_week
2190,2019-01-01,tuesday,holiday,national_holiday,Año Nuevo,2019,1,1,1
2191,2019-01-02,wednesday,workingday,workingday,workingday,2019,1,2,2
2192,2019-01-03,thursday,workingday,workingday,workingday,2019,1,3,3
2193,2019-01-04,friday,workingday,workingday,workingday,2019,1,4,4
2194,2019-01-05,saturday,weekend,weekend,weekend,2019,1,5,5
...,...,...,...,...,...,...,...,...,...
2550,2019-12-27,friday,workingday,workingday,workingday,2019,12,27,4
2551,2019-12-28,saturday,weekend,weekend,weekend,2019,12,28,5
2552,2019-12-29,sunday,weekend,weekend,weekend,2019,12,29,6
2553,2019-12-30,monday,workingday,workingday,workingday,2019,12,30,0


## Export preprocessed dataset

In [41]:
# Write the processed calendar to a semicolon-separated CSV, leaving out the index
working_calendar_df.to_csv(
    'Working_Calendar_FY2019.csv',
    sep=';',
    index=False,
)