# 1. Import libraries

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# 2. Load data

In [2]:
def weather_no_info(x):
    """Turn a measurement status code into a binary "no information" flag.

    Returns 1 when ``x`` compares equal to 8 and 0 for every other value.

    NOTE(review): 8 is presumably the status code marking a missing
    measurement in the source data - confirm against the data provider.
    """
    return 1 if x == 8 else 0

## 2.0 Hierarchy

In [3]:
# Read the raw hierarchy sheet; `hierarchy` itself is left unmodified.
hierarchy = pd.read_excel('../data/hierarchy.xlsx')

# Build the prepared frame in one chain: rename the source headers to
# snake_case names, then replace missing station ids with 0 so that both
# id columns can be stored as plain integers.
prepared_hierarchy = (
    hierarchy
    .rename(columns={
        'meteo - id': 'id_meteo',
        'meteo - nazwa': 'name_meteo',
        'hydro - id': 'id_hydro',
        'hydro - nazwa': 'name_hydro',
        'cz-ta-sama-lokalizacja': 'the_same_location',
    })
    .fillna({'id_meteo': 0, 'id_hydro': 0})
    .astype({'id_meteo': int, 'id_hydro': int})
)

prepared_hierarchy

Unnamed: 0,id_meteo,name_meteo,id_hydro,name_hydro
0,251160080,GŁOGÓW,151160060,GŁOGÓW
1,350180540,RACIBÓRZ,150180060,RACIBÓRZ-MIEDONIA


## 2.1 Hydro

In [4]:
# The 'hydro' sheet is read with a two-row header (rows 1 and 2 of the sheet).
hydro = pd.read_excel('../data/hydro.xlsx', sheet_name='hydro', header=[1, 2])

# Flatten the two-level header: the first column becomes 'Data', every other
# column is named after its level-0 label with the measurement suffix appended.
hydro.columns = ['Data'] + [
    f'{station} Stan wody [cm]'
    for station in hydro.columns.get_level_values(0)[1:]
]

# Parse the date column into datetime values.
hydro['Data'] = pd.to_datetime(hydro['Data'], format='%Y-%m-%d')

hydro

Unnamed: 0,Data,GŁOGÓW (151160060) Stan wody [cm],RACIBÓRZ-MIEDONIA (150180060) Stan wody [cm]
0,2011-11-01,199,118
1,2011-11-02,200,118
2,2011-11-03,194,119
3,2011-11-04,195,118
4,2011-11-05,192,116
...,...,...,...
3648,2021-10-27,225,126
3649,2021-10-28,224,126
3650,2021-10-29,214,124
3651,2021-10-30,208,122


In [5]:
# Print the column dtypes and non-null counts of the hydro frame
# (quick check that the date column was parsed and no readings are missing).
hydro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3653 entries, 0 to 3652
Data columns (total 3 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   Data                                          3653 non-null   datetime64[ns]
 1   GŁOGÓW (151160060) Stan wody [cm]             3653 non-null   int64         
 2   RACIBÓRZ-MIEDONIA (150180060) Stan wody [cm]  3653 non-null   int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 85.7 KB


## 2.2 Meteo

In [6]:
meteo = pd.read_excel('../data/meteo.xlsx', sheet_name='dane', header=[0, 1])

# rename the status header - only the information about missing measurements is used
meteo.rename({'Status sumy opadów': 'Brak pomiaru'}, axis=1, inplace=True)

# combine multiindex columns into single "<level 0> <level 1>" names
meteo.columns = meteo.columns.map(' '.join).str.strip(' ')
meteo.rename({'Unnamed: 0_level_0 Data': 'Data'}, axis=1, inplace=True)

# Resolve the per-station column lists ONCE, before any derived column exists.
# The regex ".*Suma opadów" also matches the 'średnia Suma opadów [mm]' column
# added below, so re-evaluating it later would feed the mean into the median.
rainfall_cols = list(meteo.filter(regex=".*Suma opadów").columns)
status_cols = list(meteo.filter(regex=".*Brak pomiaru").columns)

# keep only the information about missing measurements: blank out every
# precipitation value whose status marks it as "no measurement"
# NOTE(review): columns are paired by position - assumes both lists follow the
# same station order; verify against the sheet layout.
for rainfall_col, status_col in zip(rainfall_cols, status_cols):
    meteo[status_col] = meteo[status_col].apply(weather_no_info).astype(int)
    meteo.loc[meteo[status_col] == 1, rainfall_col] = np.nan

# set the column type for column with date
meteo['Data'] = pd.to_datetime(meteo['Data'], format='%d-%m-%Y')

# select cols; .copy() gives an independent frame so the assignments below
# do not raise SettingWithCopyWarning
meteo = meteo[['Data'] + rainfall_cols].copy()

# row-wise statistics over the station columns only
meteo['średnia Suma opadów [mm]'] = meteo[rainfall_cols].mean(axis=1)
meteo['mediana Suma opadów [mm]'] = meteo[rainfall_cols].median(axis=1)

meteo

Unnamed: 0,Data,BARDO ŚLĄSKIE (250160410) Suma opadów [mm],BIERUTÓW (251170270) Suma opadów [mm],BOLESŁAWÓW (250160610) Suma opadów [mm],BOLKÓW (250160030) Suma opadów [mm],BORÓW (250160070) Suma opadów [mm],BRZEG (250170050) Suma opadów [mm],BRZEG DOLNY (251160230) Suma opadów [mm],CHWAŁKOWICE (251160170) Suma opadów [mm],CIESZYN (249180130) Suma opadów [mm],...,TARNÓW (250160360) Suma opadów [mm],TRZEBNICA (251170210) Suma opadów [mm],TWARDOCICE (251150280) Suma opadów [mm],WALIM (250160270) Suma opadów [mm],WROCŁAW-STRACHOWICE (351160424) Suma opadów [mm],ZIELENIEC (250160530) Suma opadów [mm],ZIELINA (250170280) Suma opadów [mm],ŹRÓDŁA (251160370) Suma opadów [mm],średnia Suma opadów [mm],mediana Suma opadów [mm]
0,2012-01-01,0.2,0.5,0.5,0.7,0.2,,0.8,0.8,0.2,...,0.8,,0.5,0.0,0.3,7.0,0.5,,0.721875,0.5
1,2012-01-02,,1.1,1.0,0.8,0.0,0.3,0.6,1.3,4.0,...,0.0,0.5,2.2,0.1,0.2,1.3,,,0.888889,0.6
2,2012-01-03,2.2,0.9,5.6,,0.0,,0.1,0.0,0.1,...,0.0,0.1,,0.2,0.1,9.5,,,1.406667,0.2
3,2012-01-04,2.4,0.1,12.5,,,0.6,0.5,1.5,11.0,...,1.1,1.8,0.2,5.2,0.0,12.6,0.9,,3.620270,1.5
4,2012-01-05,8.9,7.1,23.0,10.2,0.5,1.0,3.0,6.5,4.3,...,6.6,8.8,9.3,12.9,2.5,46.0,6.1,,10.271795,7.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3648,2021-12-27,0.0,0.0,0.0,0.0,,,0.0,0.0,,...,0.0,,0.0,0.0,0.0,0.0,0.0,,0.000000,0.0
3649,2021-12-28,2.4,0.0,2.8,1.2,5.7,3.1,2.8,2.2,,...,0.6,,3.0,0.2,3.5,6.7,4.5,3.8,2.206579,2.1
3650,2021-12-29,0.2,2.3,0.0,0.5,0.7,1.0,0.5,0.7,,...,0.1,,0.8,0.0,1.3,3.2,1.1,0.9,0.562500,0.5
3651,2021-12-30,1.6,6.2,2.0,3.4,4.4,5.5,4.7,6.3,,...,0.9,,2.4,1.5,4.6,10.2,5.6,4.2,4.215789,4.2


# 3. Data Preprocessing

In [7]:
# Later of the two series' first dates: rows before it are dropped below.
first_common_date = np.max([hydro['Data'].min(), meteo['Data'].min()])

# Outer-join both sources on the date, keep rows from the cutoff date onwards
# and renumber the index from zero.
prepared_dataset = (
    hydro
    .merge(meteo, how='outer', on=['Data'])
    .loc[lambda merged: merged['Data'] >= first_common_date, :]
    .reset_index(drop=True)
)

prepared_dataset

Unnamed: 0,Data,GŁOGÓW (151160060) Stan wody [cm],RACIBÓRZ-MIEDONIA (150180060) Stan wody [cm],BARDO ŚLĄSKIE (250160410) Suma opadów [mm],BIERUTÓW (251170270) Suma opadów [mm],BOLESŁAWÓW (250160610) Suma opadów [mm],BOLKÓW (250160030) Suma opadów [mm],BORÓW (250160070) Suma opadów [mm],BRZEG (250170050) Suma opadów [mm],BRZEG DOLNY (251160230) Suma opadów [mm],...,TARNÓW (250160360) Suma opadów [mm],TRZEBNICA (251170210) Suma opadów [mm],TWARDOCICE (251150280) Suma opadów [mm],WALIM (250160270) Suma opadów [mm],WROCŁAW-STRACHOWICE (351160424) Suma opadów [mm],ZIELENIEC (250160530) Suma opadów [mm],ZIELINA (250170280) Suma opadów [mm],ŹRÓDŁA (251160370) Suma opadów [mm],średnia Suma opadów [mm],mediana Suma opadów [mm]
0,2012-01-01,199.0,119.0,0.2,0.5,0.5,0.7,0.2,,0.8,...,0.8,,0.5,0.0,0.3,7.0,0.5,,0.721875,0.5
1,2012-01-02,197.0,116.0,,1.1,1.0,0.8,0.0,0.3,0.6,...,0.0,0.5,2.2,0.1,0.2,1.3,,,0.888889,0.6
2,2012-01-03,197.0,118.0,2.2,0.9,5.6,,0.0,,0.1,...,0.0,0.1,,0.2,0.1,9.5,,,1.406667,0.2
3,2012-01-04,204.0,127.0,2.4,0.1,12.5,,,0.6,0.5,...,1.1,1.8,0.2,5.2,0.0,12.6,0.9,,3.620270,1.5
4,2012-01-05,205.0,153.0,8.9,7.1,23.0,10.2,0.5,1.0,3.0,...,6.6,8.8,9.3,12.9,2.5,46.0,6.1,,10.271795,7.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3648,2021-12-27,,,0.0,0.0,0.0,0.0,,,0.0,...,0.0,,0.0,0.0,0.0,0.0,0.0,,0.000000,0.0
3649,2021-12-28,,,2.4,0.0,2.8,1.2,5.7,3.1,2.8,...,0.6,,3.0,0.2,3.5,6.7,4.5,3.8,2.206579,2.1
3650,2021-12-29,,,0.2,2.3,0.0,0.5,0.7,1.0,0.5,...,0.1,,0.8,0.0,1.3,3.2,1.1,0.9,0.562500,0.5
3651,2021-12-30,,,1.6,6.2,2.0,3.4,4.4,5.5,4.7,...,0.9,,2.4,1.5,4.6,10.2,5.6,4.2,4.215789,4.2


# 4. Save data

In [8]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (keeps Restart & Run All working on a fresh checkout).
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)

# Persist both prepared frames without the positional index.
prepared_dataset.to_csv(results_dir / 'prepared_data.csv', index=False)
prepared_hierarchy.to_csv(results_dir / 'prepared_hierarchy.csv', index=False)