# 1. Import libraries

In [3]:
import pandas as pd

import os

In [28]:
water_directory = '../data/imgw/stan_wody'
rain_directory = '../data/imgw/opady'

id_water_level = [151160060, 151160140, 151160170, 151170030]

id_rain = [251160360, 251160110, 251160230, 251170420]

# 2. Load data

## 2.1 stan wody

In [16]:
water_period_df = pd.read_csv('../data/imgw/stan_wody/codz_2018_01.csv', encoding = "ISO-8859-1", header=None)
water_period_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,149180020,CHA£UPKI,Odra (1),2018,1,1,191,76.70,99.9,11
1,149180020,CHA£UPKI,Odra (1),2018,1,2,176,65.30,99.9,11
2,149180020,CHA£UPKI,Odra (1),2018,1,3,163,55.70,99.9,11
3,149180020,CHA£UPKI,Odra (1),2018,1,4,152,48.00,99.9,11
4,149180020,CHA£UPKI,Odra (1),2018,1,5,143,42.40,99.9,11
...,...,...,...,...,...,...,...,...,...,...
24995,149190250,JAB£ONKA,Piekielnik (82224),2018,1,26,149,1.57,99.9,11
24996,149190250,JAB£ONKA,Piekielnik (82224),2018,1,27,151,1.71,99.9,11
24997,149190250,JAB£ONKA,Piekielnik (82224),2018,1,28,146,1.30,99.9,11
24998,149190250,JAB£ONKA,Piekielnik (82224),2018,1,29,144,1.12,99.9,11


In [19]:
raw_water_level = pd.DataFrame()

for water_period in os.listdir(water_directory):
    water_period_df = pd.read_csv(os.path.join(water_directory, water_period), encoding = "ISO-8859-1", header=None)
    raw_water_level = pd.concat([raw_water_level, water_period_df], axis=0)
    
raw_water_level.reset_index(drop=True, inplace=True)
raw_water_level

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,149180020,CHA£UPKI,Odra (1),2018,1,1,191,76.70,99.9,11
1,149180020,CHA£UPKI,Odra (1),2018,1,2,176,65.30,99.9,11
2,149180020,CHA£UPKI,Odra (1),2018,1,3,163,55.70,99.9,11
3,149180020,CHA£UPKI,Odra (1),2018,1,4,152,48.00,99.9,11
4,149180020,CHA£UPKI,Odra (1),2018,1,5,143,42.40,99.9,11
...,...,...,...,...,...,...,...,...,...,...
1214483,149190250,JAB£ONKA,Piekielnik (82224),2021,12,27,141,0.27,99.9,10
1214484,149190250,JAB£ONKA,Piekielnik (82224),2021,12,28,141,0.28,99.9,10
1214485,149190250,JAB£ONKA,Piekielnik (82224),2021,12,29,141,0.28,99.9,10
1214486,149190250,JAB£ONKA,Piekielnik (82224),2021,12,30,141,0.29,99.9,10


## 2.2 opady

In [21]:
raw_rain_level = pd.DataFrame()

for rain_period in os.listdir(rain_directory):    
    rain_period_df = pd.read_csv(os.path.join(rain_directory, rain_period), encoding = "ISO-8859-1", header=None)
    raw_rain_level = pd.concat([raw_rain_level, rain_period_df], axis=0)
    
raw_rain_level.reset_index(drop=True, inplace=True)
raw_rain_level

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,249180020,WARSZOWICE,2017,1,2,3.6,,S,0,9.0,0,9.0,,9.0,.,
1,249180020,WARSZOWICE,2017,1,3,4.8,,S,2,,2,,1.0,,*,
2,249180020,WARSZOWICE,2017,1,4,1.8,,S,10,,9,,1.0,,*,
3,249180020,WARSZOWICE,2017,1,5,1.0,,S,11,,2,,1.0,,*,
4,249180020,WARSZOWICE,2017,1,6,0.0,9.0,,11,,0,9.0,1.0,,*,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
648502,254230020,SEJNY,2021,12,27,0.0,9.0,,7,,2,,3.0,,*,
648503,254230020,SEJNY,2021,12,28,0.6,,S,6,,0,9.0,3.0,,*,
648504,254230020,SEJNY,2021,12,29,0.2,,S,5,,0,9.0,3.0,,*,
648505,254230020,SEJNY,2021,12,30,1.5,,S,4,,1,,3.0,,*,


# 3. Data Preprocessing

## 3.1 stan wody

In [34]:
def add_corect_year(row):
    year = row['Rok_hydrologiczny']
    if row['miesiąc']>=11:
        year = year - 1
    return year

In [45]:
raw_water_level_prepared = raw_water_level.copy()

# rename columns
water_renaming_cols = {0: 'Id', 1: 'Miejscowość', 2: 'Rzeka', 3: 'Rok_hydrologiczny', 5: 'Dzień_tygodnia', 
                       6: 'Stan wody [cm]', 9: 'miesiąc'}
raw_water_level_prepared = raw_water_level_prepared[water_renaming_cols.keys()]
raw_water_level_prepared.rename(water_renaming_cols, axis=1, inplace=True)

# select Ids
raw_water_level_prepared = raw_water_level_prepared.loc[raw_water_level_prepared['Id'].isin(id_water_level), :]

# Add correct year
raw_water_level_prepared['Rok'] = raw_water_level_prepared.apply(add_corect_year, axis=1)

# Create variable with a date
raw_water_level_prepared['Dzień_tygodnia'] = raw_water_level_prepared['Dzień_tygodnia'].astype(str)
raw_water_level_prepared['Dzień_tygodnia'] = raw_water_level_prepared['Dzień_tygodnia'].apply(lambda x: x.zfill(2))
raw_water_level_prepared['Data'] = raw_water_level_prepared['Rok'].astype(str) + '-' \
                                    + raw_water_level_prepared['miesiąc'].astype(str) + '-'\
                                    + raw_water_level_prepared['Dzień_tygodnia'].astype(str)
raw_water_level_prepared['Data'] = pd.to_datetime(raw_water_level_prepared['Data'], format='%Y-%m-%d')

# connect ID and Location

raw_water_level_prepared['Miejscowość_Id'] = raw_water_level_prepared['Miejscowość'].astype(str) + \
    ' (' + raw_water_level_prepared['Id'].astype(str) + ')'



raw_water_level_prepared.reset_index(drop=True, inplace=True)
raw_water_level_prepared

Unnamed: 0,Id,Miejscowość,Rzeka,Rok_hydrologiczny,Dzień_tygodnia,Stan wody [cm],miesiąc,Rok,Data,Miejscowość_Id
0,151170030,TRESTNO,Odra (1),2018,01,334,11,2017,2017-11-01,TRESTNO (151170030)
1,151170030,TRESTNO,Odra (1),2018,02,337,11,2017,2017-11-02,TRESTNO (151170030)
2,151170030,TRESTNO,Odra (1),2018,03,336,11,2017,2017-11-03,TRESTNO (151170030)
3,151170030,TRESTNO,Odra (1),2018,04,333,11,2017,2017-11-04,TRESTNO (151170030)
4,151170030,TRESTNO,Odra (1),2018,05,333,11,2017,2017-11-05,TRESTNO (151170030)
...,...,...,...,...,...,...,...,...,...,...
5839,151160140,OSETNO,Barycz (14),2021,27,177,10,2021,2021-10-27,OSETNO (151160140)
5840,151160140,OSETNO,Barycz (14),2021,28,177,10,2021,2021-10-28,OSETNO (151160140)
5841,151160140,OSETNO,Barycz (14),2021,29,177,10,2021,2021-10-29,OSETNO (151160140)
5842,151160140,OSETNO,Barycz (14),2021,30,177,10,2021,2021-10-30,OSETNO (151160140)


## 3.2 opady

## 3.3 Merge

# 4. Save data

In [8]:
# prepared_dataset.to_csv('../data/prepared_data.csv', index=False)