# 1. Import libraries

In [1]:
import numpy as np
import pandas as pd

# 2. Load data

In [2]:
def weather_no_info(x):
    """Turn a measurement status value into a binary flag.

    Returns 1 when ``x`` compares equal to 8 and 0 for every other value.
    The notebook applies this to the status columns so that only the
    "missing measurement" information is kept.
    """
    return 1 if x == 8 else 0

## 2.0 Hierarchy

In [3]:
# Load the station hierarchy and switch the original column labels to
# snake_case English names.
hierarchy = pd.read_excel('../data/hierarchy.xlsx').rename(
    columns={
        'meteo - id': 'id_meteo',
        'meteo - nazwa': 'name_meteo',
        'hydro - id': 'id_hydro',
        'hydro - nazwa': 'name_hydro',
        'cz-ta-sama-lokalizacja': 'the_same_location',
    }
)

# Missing station ids are replaced by 0 so both id columns can be cast to int.
for id_col in ('id_meteo', 'id_hydro'):
    hierarchy[id_col] = hierarchy[id_col].fillna(0).astype(int)

# Encode the 'tak'/'nie' (Polish yes/no) flag as 1/0.
same_location_codes = {'tak': 1, 'nie': 0}
hierarchy['the_same_location'] = hierarchy['the_same_location'].replace(same_location_codes)

hierarchy

Unnamed: 0,id_meteo,name_meteo,id_hydro,name_hydro,the_same_location
0,251160360,GŁOGÓW,151160060,GŁOGÓW,1
1,249180550,CIESZYN,149180060,CIESZYN,1
2,249180550,CIESZYN,149180070,CIESZYN,1
3,249180550,CIESZYN,149180030,ŁAZISKA,0
4,249180550,CIESZYN,149180020,CHAŁUPKI,0
...,...,...,...,...,...
87,250160650,MIĘDZYLESIE,150160190,MIĘDZYLESIE,1
88,250160520,LĄDEK-ZDRÓJ,150160230,LĄDEK-ZDRÓJ,1
89,350160520,KŁODZKO,150160110,SZALEJÓW DOLNY,0
90,250160840,SZALEJÓW GÓRNY,150160080,TŁUMACZÓW,0


## 2.1 Hydro

In [4]:
# Read the water-level sheet; spreadsheet rows 1 and 2 form a two-level header.
hydro = pd.read_excel('../data/hydro.xlsx', sheet_name='hydro', header=[1, 2])

# Flatten the header: the first column becomes 'Data', every other column is
# labelled with its top-level header followed by the measured quantity.
top_level_names = hydro.columns.get_level_values(0)[1:]
hydro.columns = ['Data'] + [f'{col_name} Stan wody [cm]' for col_name in top_level_names]

# Parse the year-month-day date column into datetime64.
hydro = hydro.assign(Data=pd.to_datetime(hydro['Data'], format='%Y-%m-%d'))

hydro

Unnamed: 0,Data,GŁOGÓW (151160060) Stan wody [cm],RACIBÓRZ-MIEDONIA (150180060) Stan wody [cm]
0,2011-11-01,199,118
1,2011-11-02,200,118
2,2011-11-03,194,119
3,2011-11-04,195,118
4,2011-11-05,192,116
...,...,...,...
3648,2021-10-27,225,126
3649,2021-10-28,224,126
3650,2021-10-29,214,124
3651,2021-10-30,208,122


In [5]:
# Sanity check: print the dtypes and non-null counts of the loaded hydro frame.
hydro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3653 entries, 0 to 3652
Data columns (total 3 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   Data                                          3653 non-null   datetime64[ns]
 1   GŁOGÓW (151160060) Stan wody [cm]             3653 non-null   int64         
 2   RACIBÓRZ-MIEDONIA (150180060) Stan wody [cm]  3653 non-null   int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 85.7 KB


## 2.2 Meteo

In [6]:
# Read the precipitation sheet (two header rows) and relabel the status
# columns: only their "missing measurement" information is used below.
meteo = (
    pd.read_excel('../data/meteo.xlsx', sheet_name='dane', header=[0, 1])
    .rename({'Status sumy opadów': 'Brak pomiaru'}, axis=1)
)

# Collapse the two header levels into single, space-separated column names.
flat_names = meteo.columns.map(' '.join).str.strip(' ')
meteo.columns = flat_names
meteo = meteo.rename({'Unnamed: 0_level_0 Data': 'Data'}, axis=1)

# Reduce every status column to a 0/1 missing-measurement flag.
status_cols = meteo.filter(regex=".*Brak pomiaru").columns
for status_col in status_cols:
    meteo[status_col] = meteo[status_col].apply(weather_no_info).astype(int)

# Parse the day-month-year date column into datetime64.
meteo['Data'] = pd.to_datetime(meteo['Data'], format='%d-%m-%Y')

# Replace every remaining missing value with 0.05.
meteo = meteo.fillna(0.05)

meteo

Unnamed: 0,Data,BARDO ŚLĄSKIE (250160410) Suma opadów [mm],BARDO ŚLĄSKIE (250160410) Brak pomiaru,BIERUTÓW (251170270) Suma opadów [mm],BIERUTÓW (251170270) Brak pomiaru,BOLESŁAWÓW (250160610) Suma opadów [mm],BOLESŁAWÓW (250160610) Brak pomiaru,BOLKÓW (250160030) Suma opadów [mm],BOLKÓW (250160030) Brak pomiaru,BORÓW (250160070) Suma opadów [mm],...,WALIM (250160270) Suma opadów [mm],WALIM (250160270) Brak pomiaru,WROCŁAW-STRACHOWICE (351160424) Suma opadów [mm],WROCŁAW-STRACHOWICE (351160424) Brak pomiaru,ZIELENIEC (250160530) Suma opadów [mm],ZIELENIEC (250160530) Brak pomiaru,ZIELINA (250170280) Suma opadów [mm],ZIELINA (250170280) Brak pomiaru,ŹRÓDŁA (251160370) Suma opadów [mm],ŹRÓDŁA (251160370) Brak pomiaru
0,2012-01-01,0.20,0,0.5,0,0.5,0,0.70,0,0.20,...,0.0,0,0.3,0,7.0,0,0.50,0,0.05,0
1,2012-01-02,0.05,0,1.1,0,1.0,0,0.80,0,0.00,...,0.1,0,0.2,0,1.3,0,0.05,0,0.05,0
2,2012-01-03,2.20,0,0.9,0,5.6,0,0.05,0,0.00,...,0.2,0,0.1,0,9.5,0,0.05,0,0.05,0
3,2012-01-04,2.40,0,0.1,0,12.5,0,0.05,0,0.05,...,5.2,0,0.0,0,12.6,0,0.90,0,0.05,0
4,2012-01-05,8.90,0,7.1,0,23.0,0,10.20,0,0.50,...,12.9,0,2.5,0,46.0,0,6.10,0,0.05,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3648,2021-12-27,0.00,0,0.0,0,0.0,0,0.00,0,0.05,...,0.0,0,0.0,0,0.0,0,0.00,0,0.05,0
3649,2021-12-28,2.40,0,0.0,0,2.8,0,1.20,0,5.70,...,0.2,0,3.5,0,6.7,0,4.50,0,3.80,0
3650,2021-12-29,0.20,0,2.3,0,0.0,0,0.50,0,0.70,...,0.0,0,1.3,0,3.2,0,1.10,0,0.90,0
3651,2021-12-30,1.60,0,6.2,0,2.0,0,3.40,0,4.40,...,1.5,0,4.6,0,10.2,0,5.60,0,4.20,0


# 3. Data Preprocessing

In [7]:
# Outer-join water levels and precipitation on the date column.
prepared_dataset = hydro.merge(meteo, how='outer', on=['Data'])

# Keep only rows starting at the later of the two series' first dates.
# NOTE(review): the end of the range is not trimmed, so dates covered by only
# one source keep NaN in the other source's columns, and the outer join leaves
# the 0/1 flag columns as float64.
first_dates = [hydro['Data'].min(), meteo['Data'].min()]
from_common_start = prepared_dataset['Data'] >= np.max(first_dates)
prepared_dataset = prepared_dataset.loc[from_common_start, :].reset_index(drop=True)

prepared_dataset

Unnamed: 0,Data,GŁOGÓW (151160060) Stan wody [cm],RACIBÓRZ-MIEDONIA (150180060) Stan wody [cm],BARDO ŚLĄSKIE (250160410) Suma opadów [mm],BARDO ŚLĄSKIE (250160410) Brak pomiaru,BIERUTÓW (251170270) Suma opadów [mm],BIERUTÓW (251170270) Brak pomiaru,BOLESŁAWÓW (250160610) Suma opadów [mm],BOLESŁAWÓW (250160610) Brak pomiaru,BOLKÓW (250160030) Suma opadów [mm],...,WALIM (250160270) Suma opadów [mm],WALIM (250160270) Brak pomiaru,WROCŁAW-STRACHOWICE (351160424) Suma opadów [mm],WROCŁAW-STRACHOWICE (351160424) Brak pomiaru,ZIELENIEC (250160530) Suma opadów [mm],ZIELENIEC (250160530) Brak pomiaru,ZIELINA (250170280) Suma opadów [mm],ZIELINA (250170280) Brak pomiaru,ŹRÓDŁA (251160370) Suma opadów [mm],ŹRÓDŁA (251160370) Brak pomiaru
0,2012-01-01,199.0,119.0,0.20,0.0,0.5,0.0,0.5,0.0,0.70,...,0.0,0.0,0.3,0.0,7.0,0.0,0.50,0.0,0.05,0.0
1,2012-01-02,197.0,116.0,0.05,0.0,1.1,0.0,1.0,0.0,0.80,...,0.1,0.0,0.2,0.0,1.3,0.0,0.05,0.0,0.05,0.0
2,2012-01-03,197.0,118.0,2.20,0.0,0.9,0.0,5.6,0.0,0.05,...,0.2,0.0,0.1,0.0,9.5,0.0,0.05,0.0,0.05,0.0
3,2012-01-04,204.0,127.0,2.40,0.0,0.1,0.0,12.5,0.0,0.05,...,5.2,0.0,0.0,0.0,12.6,0.0,0.90,0.0,0.05,0.0
4,2012-01-05,205.0,153.0,8.90,0.0,7.1,0.0,23.0,0.0,10.20,...,12.9,0.0,2.5,0.0,46.0,0.0,6.10,0.0,0.05,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3648,2021-12-27,,,0.00,0.0,0.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.05,0.0
3649,2021-12-28,,,2.40,0.0,0.0,0.0,2.8,0.0,1.20,...,0.2,0.0,3.5,0.0,6.7,0.0,4.50,0.0,3.80,0.0
3650,2021-12-29,,,0.20,0.0,2.3,0.0,0.0,0.0,0.50,...,0.0,0.0,1.3,0.0,3.2,0.0,1.10,0.0,0.90,0.0
3651,2021-12-30,,,1.60,0.0,6.2,0.0,2.0,0.0,3.40,...,1.5,0.0,4.6,0.0,10.2,0.0,5.60,0.0,4.20,0.0


In [8]:
# Sanity check: print the row count and dtypes of the merged dataset.
prepared_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3653 entries, 0 to 3652
Columns: 175 entries, Data to ŹRÓDŁA (251160370) Brak pomiaru
dtypes: datetime64[ns](1), float64(174)
memory usage: 4.9 MB


In [9]:
# Ids of the hydro stations whose water levels are in the hydro frame
# (GŁOGÓW and RACIBÓRZ-MIEDONIA).
hydro_station_ids = [151160060, 150180060]

# Keep only the hierarchy rows of those stations and drop the location flag,
# which is not exported. Built with non-mutating calls so the cell can be
# re-run safely and `hierarchy` itself is never modified.
prepared_hierarchy = (
    hierarchy.loc[hierarchy['id_hydro'].isin(hydro_station_ids), :]
    .reset_index(drop=True)
    .drop(['the_same_location'], axis=1)
)
prepared_hierarchy

Unnamed: 0,id_meteo,name_meteo,id_hydro,name_hydro
0,251160360,GŁOGÓW,151160060,GŁOGÓW
1,350180540,RACIBÓRZ,150180060,RACIBÓRZ-MIEDONIA


# 4. Save data

In [10]:
# Write both prepared frames to CSV, leaving out the index column.
for frame, csv_path in (
    (prepared_dataset, '../results/prepared_data.csv'),
    (prepared_hierarchy, '../results/prepared_hierarchy.csv'),
):
    frame.to_csv(csv_path, index=False)