# 1. Import libraries

In [1]:
import pandas as pd

In [2]:
# settings
dates_idx = pd.date_range(start='2011-01-01', end='2022-12-31', 
                          freq='1D') # When there will be new data then to change, only for the purpose of test data

# 2. Load data

In [3]:
def weather_no_info(x):
    if x == 8:
        return 1
    else:
        return 0

## 2.1 Hydro

In [4]:
hydro = pd.read_excel('../data/hydro - sample.xlsx', sheet_name='hydro', header=[1, 2])

# taking only Date and level of water
hydro = hydro.loc[:, hydro.columns.get_level_values(1).isin(['Data', 'Stan wody [cm]'])]

hydro.columns = ['Data'] + [f'{col_name} Stan wody [cm]' for col_name in hydro.columns.get_level_values(0)][1:]

# set the column type for column with date
hydro['Data'] = pd.to_datetime(hydro['Data'], format='%Y-%m-%d')

################# Add new dates for modeling purposes | To change when will be new datasets
hydro = hydro.set_index('Data').reindex(dates_idx).reset_index().rename({'index': 'Data'}, axis=1)
hydro = hydro.bfill().ffill()
###############################################################################################

# Change col type with level of water to int
for col_name in hydro.columns[1:]:
    hydro[col_name] = hydro[col_name].astype(int)

hydro

Unnamed: 0,Data,GŁOGÓW (151160060) Stan wody [cm],RACIBÓRZ-MIEDONIA (150180060) Stan wody [cm]
0,2011-01-01,199,118
1,2011-01-02,199,118
2,2011-01-03,199,118
3,2011-01-04,199,118
4,2011-01-05,199,118
...,...,...,...
4378,2022-12-27,203,114
4379,2022-12-28,203,114
4380,2022-12-29,203,114
4381,2022-12-30,203,114


## 2.2 Meteo

In [5]:
meteo = pd.read_excel('../data/meteo - sample.xlsx', sheet_name='dane', header=[0, 1])

# rename col with the status - will be only use information about missing measurements 
meteo.rename({'Status sumy opadów': 'Brak pomiaru'}, axis=1, inplace=True)

# combine multiindex columns
meteo.columns = meteo.columns.map(' '.join).str.strip(' ')
meteo.rename({'Unnamed: 0_level_0 Data': 'Data'}, axis=1, inplace=True)

# leaving information only about missing measurements
for lack_measurements_col in meteo.filter(regex=".*Brak pomiaru").columns:
    meteo[lack_measurements_col] = meteo[lack_measurements_col].apply(weather_no_info)
    meteo[lack_measurements_col] = meteo[lack_measurements_col].astype(int)

# set the column type for column with date
meteo['Data'] = pd.to_datetime(meteo['Data'], format='%d-%m-%Y')

# Fill in the missing values with the number 0.05
meteo = meteo.fillna(0.05)

################ Add new dates for modeling purposes | To change when will be new datasets
meteo = meteo.set_index('Data').reindex(dates_idx).reset_index().rename({'index': 'Data'}, axis=1)
meteo = meteo.bfill().ffill()

meteo

Unnamed: 0,Data,BARDO ŚLĄSKIE (250160410) Suma opadów [mm],BARDO ŚLĄSKIE (250160410) Brak pomiaru,BIERUTÓW (251170270) Suma opadów [mm],BIERUTÓW (251170270) Brak pomiaru,BOLESŁAWÓW (250160610) Suma opadów [mm],BOLESŁAWÓW (250160610) Brak pomiaru,BOLKÓW (250160030) Suma opadów [mm],BOLKÓW (250160030) Brak pomiaru,BORÓW (250160070) Suma opadów [mm],BORÓW (250160070) Brak pomiaru,BRZEG (250170050) Suma opadów [mm],BRZEG (250170050) Brak pomiaru,BRZEG DOLNY (251160230) Suma opadów [mm],BRZEG DOLNY (251160230) Brak pomiaru,CHWAŁKOWICE (251160170) Suma opadów [mm],CHWAŁKOWICE (251160170) Brak pomiaru
0,2011-01-01,0.2,0.0,0.5,0.0,0.5,0.0,0.7,0.0,0.2,0.0,0.05,0.0,0.8,0.0,0.8,0.0
1,2011-01-02,0.2,0.0,0.5,0.0,0.5,0.0,0.7,0.0,0.2,0.0,0.05,0.0,0.8,0.0,0.8,0.0
2,2011-01-03,0.2,0.0,0.5,0.0,0.5,0.0,0.7,0.0,0.2,0.0,0.05,0.0,0.8,0.0,0.8,0.0
3,2011-01-04,0.2,0.0,0.5,0.0,0.5,0.0,0.7,0.0,0.2,0.0,0.05,0.0,0.8,0.0,0.8,0.0
4,2011-01-05,0.2,0.0,0.5,0.0,0.5,0.0,0.7,0.0,0.2,0.0,0.05,0.0,0.8,0.0,0.8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4378,2022-12-27,3.8,0.0,6.8,0.0,2.0,0.0,4.7,0.0,4.8,0.0,1.70,0.0,5.1,0.0,5.2,0.0
4379,2022-12-28,3.8,0.0,6.8,0.0,2.0,0.0,4.7,0.0,4.8,0.0,1.70,0.0,5.1,0.0,5.2,0.0
4380,2022-12-29,3.8,0.0,6.8,0.0,2.0,0.0,4.7,0.0,4.8,0.0,1.70,0.0,5.1,0.0,5.2,0.0
4381,2022-12-30,3.8,0.0,6.8,0.0,2.0,0.0,4.7,0.0,4.8,0.0,1.70,0.0,5.1,0.0,5.2,0.0


# 3. Data Preprocessing

In [6]:
prepared_dataset = pd.merge(hydro, meteo, how='left', on=['Data'])
prepared_dataset

Unnamed: 0,Data,GŁOGÓW (151160060) Stan wody [cm],RACIBÓRZ-MIEDONIA (150180060) Stan wody [cm],BARDO ŚLĄSKIE (250160410) Suma opadów [mm],BARDO ŚLĄSKIE (250160410) Brak pomiaru,BIERUTÓW (251170270) Suma opadów [mm],BIERUTÓW (251170270) Brak pomiaru,BOLESŁAWÓW (250160610) Suma opadów [mm],BOLESŁAWÓW (250160610) Brak pomiaru,BOLKÓW (250160030) Suma opadów [mm],BOLKÓW (250160030) Brak pomiaru,BORÓW (250160070) Suma opadów [mm],BORÓW (250160070) Brak pomiaru,BRZEG (250170050) Suma opadów [mm],BRZEG (250170050) Brak pomiaru,BRZEG DOLNY (251160230) Suma opadów [mm],BRZEG DOLNY (251160230) Brak pomiaru,CHWAŁKOWICE (251160170) Suma opadów [mm],CHWAŁKOWICE (251160170) Brak pomiaru
0,2011-01-01,199,118,0.2,0.0,0.5,0.0,0.5,0.0,0.7,0.0,0.2,0.0,0.05,0.0,0.8,0.0,0.8,0.0
1,2011-01-02,199,118,0.2,0.0,0.5,0.0,0.5,0.0,0.7,0.0,0.2,0.0,0.05,0.0,0.8,0.0,0.8,0.0
2,2011-01-03,199,118,0.2,0.0,0.5,0.0,0.5,0.0,0.7,0.0,0.2,0.0,0.05,0.0,0.8,0.0,0.8,0.0
3,2011-01-04,199,118,0.2,0.0,0.5,0.0,0.5,0.0,0.7,0.0,0.2,0.0,0.05,0.0,0.8,0.0,0.8,0.0
4,2011-01-05,199,118,0.2,0.0,0.5,0.0,0.5,0.0,0.7,0.0,0.2,0.0,0.05,0.0,0.8,0.0,0.8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4378,2022-12-27,203,114,3.8,0.0,6.8,0.0,2.0,0.0,4.7,0.0,4.8,0.0,1.70,0.0,5.1,0.0,5.2,0.0
4379,2022-12-28,203,114,3.8,0.0,6.8,0.0,2.0,0.0,4.7,0.0,4.8,0.0,1.70,0.0,5.1,0.0,5.2,0.0
4380,2022-12-29,203,114,3.8,0.0,6.8,0.0,2.0,0.0,4.7,0.0,4.8,0.0,1.70,0.0,5.1,0.0,5.2,0.0
4381,2022-12-30,203,114,3.8,0.0,6.8,0.0,2.0,0.0,4.7,0.0,4.8,0.0,1.70,0.0,5.1,0.0,5.2,0.0


In [7]:
prepared_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4383 entries, 0 to 4382
Data columns (total 19 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   Data                                          4383 non-null   datetime64[ns]
 1   GŁOGÓW (151160060) Stan wody [cm]             4383 non-null   int32         
 2   RACIBÓRZ-MIEDONIA (150180060) Stan wody [cm]  4383 non-null   int32         
 3   BARDO ŚLĄSKIE (250160410) Suma opadów [mm]    4383 non-null   float64       
 4   BARDO ŚLĄSKIE (250160410) Brak pomiaru        4383 non-null   float64       
 5   BIERUTÓW (251170270) Suma opadów [mm]         4383 non-null   float64       
 6   BIERUTÓW (251170270) Brak pomiaru             4383 non-null   float64       
 7   BOLESŁAWÓW (250160610) Suma opadów [mm]       4383 non-null   float64       
 8   BOLESŁAWÓW (250160610) Brak pomiaru           4383 non-null   float6

# 4. Save data

In [8]:
prepared_dataset.to_csv('../results/prepared_data.csv', index=False)