# 1. Import libraries

In [1]:
import pandas as pd

import os

In [2]:
water_directory = '../data/imgw/stan_wody'
rain_directory = '../data/imgw/opady'

id_water_level = [151160060, 151160140, 151160170, 151170030]

id_rain = [251160360, 251160110, 251160230, 251170420]

# 2. Load data

## 2.1 stan wody

In [3]:
water_period_df = pd.read_csv('../data/imgw/stan_wody/codz_2018_01.csv', encoding = "ISO-8859-1", header=None)
water_period_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,149180020,CHA£UPKI,Odra (1),2018,1,1,191,76.70,99.9,11
1,149180020,CHA£UPKI,Odra (1),2018,1,2,176,65.30,99.9,11
2,149180020,CHA£UPKI,Odra (1),2018,1,3,163,55.70,99.9,11
3,149180020,CHA£UPKI,Odra (1),2018,1,4,152,48.00,99.9,11
4,149180020,CHA£UPKI,Odra (1),2018,1,5,143,42.40,99.9,11
...,...,...,...,...,...,...,...,...,...,...
24995,149190250,JAB£ONKA,Piekielnik (82224),2018,1,26,149,1.57,99.9,11
24996,149190250,JAB£ONKA,Piekielnik (82224),2018,1,27,151,1.71,99.9,11
24997,149190250,JAB£ONKA,Piekielnik (82224),2018,1,28,146,1.30,99.9,11
24998,149190250,JAB£ONKA,Piekielnik (82224),2018,1,29,144,1.12,99.9,11


In [4]:
raw_water_level = pd.DataFrame()

for water_period in os.listdir(water_directory):
    water_period_df = pd.read_csv(os.path.join(water_directory, water_period), encoding = "ISO-8859-1", header=None)
    raw_water_level = pd.concat([raw_water_level, water_period_df], axis=0)
    
raw_water_level.reset_index(drop=True, inplace=True)
raw_water_level

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,149180020,CHA£UPKI,Odra (1),2018,1,1,191,76.70,99.9,11
1,149180020,CHA£UPKI,Odra (1),2018,1,2,176,65.30,99.9,11
2,149180020,CHA£UPKI,Odra (1),2018,1,3,163,55.70,99.9,11
3,149180020,CHA£UPKI,Odra (1),2018,1,4,152,48.00,99.9,11
4,149180020,CHA£UPKI,Odra (1),2018,1,5,143,42.40,99.9,11
...,...,...,...,...,...,...,...,...,...,...
1214483,149190250,JAB£ONKA,Piekielnik (82224),2021,12,27,141,0.27,99.9,10
1214484,149190250,JAB£ONKA,Piekielnik (82224),2021,12,28,141,0.28,99.9,10
1214485,149190250,JAB£ONKA,Piekielnik (82224),2021,12,29,141,0.28,99.9,10
1214486,149190250,JAB£ONKA,Piekielnik (82224),2021,12,30,141,0.29,99.9,10


## 2.2 opady

In [5]:
raw_rain_level = pd.DataFrame()

for rain_period in os.listdir(rain_directory):    
    rain_period_df = pd.read_csv(os.path.join(rain_directory, rain_period), encoding = "ISO-8859-1", header=None)
    raw_rain_level = pd.concat([raw_rain_level, rain_period_df], axis=0)
    
raw_rain_level.reset_index(drop=True, inplace=True)
raw_rain_level

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,249180020,WARSZOWICE,2017,1,2,3.6,,S,0,9.0,0,9.0,,9.0,.,
1,249180020,WARSZOWICE,2017,1,3,4.8,,S,2,,2,,1.0,,*,
2,249180020,WARSZOWICE,2017,1,4,1.8,,S,10,,9,,1.0,,*,
3,249180020,WARSZOWICE,2017,1,5,1.0,,S,11,,2,,1.0,,*,
4,249180020,WARSZOWICE,2017,1,6,0.0,9.0,,11,,0,9.0,1.0,,*,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
648502,254230020,SEJNY,2021,12,27,0.0,9.0,,7,,2,,3.0,,*,
648503,254230020,SEJNY,2021,12,28,0.6,,S,6,,0,9.0,3.0,,*,
648504,254230020,SEJNY,2021,12,29,0.2,,S,5,,0,9.0,3.0,,*,
648505,254230020,SEJNY,2021,12,30,1.5,,S,4,,1,,3.0,,*,


# 3. Data Preprocessing

## 3.1 stan wody

In [6]:
def add_corect_year(row):
    year = row['Rok_hydrologiczny']
    if row['miesiąc']>=11:
        year = year - 1
    return year

In [7]:
raw_water_level_prepared = raw_water_level.copy()

# rename columns
water_renaming_cols = {0: 'Id', 1: 'Miejscowość', 2: 'Rzeka', 3: 'Rok_hydrologiczny', 5: 'Dzień_tygodnia', 
                       6: 'Stan wody [cm]', 9: 'miesiąc'}
raw_water_level_prepared = raw_water_level_prepared[water_renaming_cols.keys()]
raw_water_level_prepared.rename(water_renaming_cols, axis=1, inplace=True)

# Change type of col with level of water
raw_water_level_prepared['Stan wody [cm]'] = raw_water_level_prepared['Stan wody [cm]'].astype(int)

# select Ids
raw_water_level_prepared = raw_water_level_prepared.loc[raw_water_level_prepared['Id'].isin(id_water_level), :]

# Add correct year
raw_water_level_prepared['Rok'] = raw_water_level_prepared.apply(add_corect_year, axis=1)

# Create variable with a date
raw_water_level_prepared['Dzień_tygodnia'] = raw_water_level_prepared['Dzień_tygodnia'].astype(str)
raw_water_level_prepared['Dzień_tygodnia'] = raw_water_level_prepared['Dzień_tygodnia'].apply(lambda x: x.zfill(2))
raw_water_level_prepared['Data'] = raw_water_level_prepared['Rok'].astype(str) + '-' \
                                    + raw_water_level_prepared['miesiąc'].astype(str) + '-'\
                                    + raw_water_level_prepared['Dzień_tygodnia'].astype(str)
raw_water_level_prepared['Data'] = pd.to_datetime(raw_water_level_prepared['Data'], format='%Y-%m-%d')

# connect ID and Location
raw_water_level_prepared['Miejscowość_Id'] = raw_water_level_prepared['Miejscowość'].astype(str) + \
    ' (' + raw_water_level_prepared['Id'].astype(str) + ')'

# Create pivot table
raw_water_level_prepared = raw_water_level_prepared.pivot(index='Data', columns='Miejscowość_Id', values='Stan wody [cm]')

raw_water_level_prepared.columns = [f'{col} Stan wody [cm]' for col in raw_water_level_prepared.columns]
raw_water_level_prepared.reset_index(inplace=True)

raw_water_level_prepared

Unnamed: 0,Data,BRZEG DOLNY (151160170) Stan wody [cm],G£OGÓW (151160060) Stan wody [cm],OSETNO (151160140) Stan wody [cm],TRESTNO (151170030) Stan wody [cm]
0,2017-11-01,286,354,276,334
1,2017-11-02,285,358,281,337
2,2017-11-03,258,353,288,336
3,2017-11-04,218,352,296,333
4,2017-11-05,244,326,304,333
...,...,...,...,...,...
1456,2021-10-27,322,225,177,320
1457,2021-10-28,317,224,177,318
1458,2021-10-29,317,214,177,319
1459,2021-10-30,316,208,177,319


## 3.2 opady

In [8]:
def weather_no_info(x):
    if x == 8:
        return 1
    else:
        return 0

In [9]:
raw_rain_level_prepared = raw_rain_level.copy()

# rename columns
rain_renaming_cols = {0: 'Id', 1: 'Miejscowość', 2: 'Rok', 3: 'miesiąc', 4: 'Dzień_tygodnia', 
                      5: 'Suma opadów [mm]', 6: 'Brak pomiaru'}
raw_rain_level_prepared = raw_rain_level_prepared[rain_renaming_cols.keys()]
raw_rain_level_prepared.rename(rain_renaming_cols, axis=1, inplace=True)

# select Ids
raw_rain_level_prepared = raw_rain_level_prepared.loc[raw_rain_level_prepared['Id'].isin(id_rain), :]

# set info about lack of measurments
raw_rain_level_prepared['Brak pomiaru'] = raw_rain_level_prepared['Brak pomiaru'].apply(weather_no_info)

# Create variable with a date
raw_rain_level_prepared['Dzień_tygodnia'] = raw_rain_level_prepared['Dzień_tygodnia'].astype(str)
raw_rain_level_prepared['Dzień_tygodnia'] = raw_rain_level_prepared['Dzień_tygodnia'].apply(lambda x: x.zfill(2))
raw_rain_level_prepared['Data'] = raw_rain_level_prepared['Rok'].astype(str) + '-' \
                                    + raw_rain_level_prepared['miesiąc'].astype(str) + '-'\
                                    + raw_rain_level_prepared['Dzień_tygodnia'].astype(str)
raw_rain_level_prepared['Data'] = pd.to_datetime(raw_rain_level_prepared['Data'], format='%Y-%m-%d')

# connect ID and Location
raw_rain_level_prepared['Miejscowość_Id'] = raw_rain_level_prepared['Miejscowość'].astype(str) + \
    ' (' + raw_rain_level_prepared['Id'].astype(str) + ')'

# Create pivot table
raw_rain_level_prepared = raw_rain_level_prepared.pivot(index='Data', columns='Miejscowość_Id', values=['Suma opadów [mm]', 
                                                                                                        'Brak pomiaru'])

raw_rain_level_prepared.columns = [f'{multiindex[1]} {multiindex[0]}' for multiindex in raw_rain_level_prepared.columns]

# add missing dates
rain_dates = pd.date_range(start=raw_rain_level_prepared.index.min(), end=raw_rain_level_prepared.index.max(), freq='1D')
raw_rain_level_prepared = raw_rain_level_prepared.reindex(rain_dates)

# Fill missing values
# rainings
for city_id in raw_rain_level_prepared.filter(regex=("Suma opadów")).columns:
    first_date = raw_rain_level_prepared.loc[~pd.isnull(raw_rain_level_prepared[city_id]), :].index.min()
    raw_rain_level_prepared.loc[pd.isnull(raw_rain_level_prepared[city_id])&(raw_rain_level_prepared.index>=first_date), 
                                city_id] = 0
# without measurments
raw_rain_level_prepared.loc[:, raw_rain_level_prepared.filter(regex=("Brak pomiaru")).columns] = raw_rain_level_prepared.loc[:,
                                                 raw_rain_level_prepared.filter(regex=("Brak pomiaru")).columns].fillna(0)


raw_rain_level_prepared.reset_index(inplace=True)
raw_rain_level_prepared.rename({'index': 'Data'}, axis=1, inplace=True)
raw_rain_level_prepared

Unnamed: 0,Data,BRZEG DOLNY (251160230) Suma opadów [mm],OSETNO (251160110) Suma opadów [mm],£ANY (251170420) Suma opadów [mm],BRZEG DOLNY (251160230) Brak pomiaru,OSETNO (251160110) Brak pomiaru,£ANY (251170420) Brak pomiaru
0,2017-01-01,,0.1,,0.0,0.0,0.0
1,2017-01-02,,1.1,,0.0,0.0,0.0
2,2017-01-03,,1.6,,0.0,0.0,0.0
3,2017-01-04,,10.5,,0.0,0.0,0.0
4,2017-01-05,,0.0,,0.0,0.0,0.0
...,...,...,...,...,...,...,...
2153,2022-11-24,0.0,0.0,0.0,0.0,0.0,0.0
2154,2022-11-25,2.1,0.9,0.2,0.0,0.0,0.0
2155,2022-11-26,6.0,2.5,4.9,0.0,0.0,0.0
2156,2022-11-27,2.1,0.7,0.0,0.0,0.0,0.0


## 3.3 Merge

In [10]:
prepared_dataset = pd.merge(raw_water_level_prepared, raw_rain_level_prepared, how='left', on='Data')

prepared_dataset

Unnamed: 0,Data,BRZEG DOLNY (151160170) Stan wody [cm],G£OGÓW (151160060) Stan wody [cm],OSETNO (151160140) Stan wody [cm],TRESTNO (151170030) Stan wody [cm],BRZEG DOLNY (251160230) Suma opadów [mm],OSETNO (251160110) Suma opadów [mm],£ANY (251170420) Suma opadów [mm],BRZEG DOLNY (251160230) Brak pomiaru,OSETNO (251160110) Brak pomiaru,£ANY (251170420) Brak pomiaru
0,2017-11-01,286,354,276,334,,0.7,,0.0,0.0,0.0
1,2017-11-02,285,358,281,337,,10.0,,0.0,0.0,0.0
2,2017-11-03,258,353,288,336,,0.1,,0.0,0.0,0.0
3,2017-11-04,218,352,296,333,,0.0,,0.0,0.0,0.0
4,2017-11-05,244,326,304,333,,0.6,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1456,2021-10-27,322,225,177,320,0.0,0.0,0.0,0.0,0.0,0.0
1457,2021-10-28,317,224,177,318,0.0,0.0,0.0,0.0,0.0,0.0
1458,2021-10-29,317,214,177,319,0.0,0.0,0.0,0.0,0.0,0.0
1459,2021-10-30,316,208,177,319,0.0,0.0,0.0,0.0,0.0,0.0


# 4. Save data

In [11]:
prepared_dataset.to_csv('../data/prepared_data.csv', index=False)