# 1. Import libraries

In [None]:
import numpy as np
import pandas as pd

import os

In [None]:
# Input folders, relative to this notebook. Each one is scanned with os.listdir
# and every file inside is read as a header-less CSV by the loading cells.
imgw_data_root = '../data/imgw'
water_directory = f'{imgw_data_root}/stan_wody'
rain_directory = f'{imgw_data_root}/opady'

# 2. Load data

## 2.0 Hierarchy

In [None]:
# Read the station hierarchy and translate the spreadsheet headers to snake_case names.
hierarchy = pd.read_excel('../data/hierarchy.xlsx').rename(
    columns={
        'meteo - id': 'id_meteo',
        'meteo - nazwa': 'name_meteo',
        'hydro - id': 'id_hydro',
        'hydro - nazwa': 'name_hydro',
        'cz-ta-sama-lokalizacja': 'the_same_location',
    }
)

# Station ids become plain integers (a missing id is stored as 0) and the
# 'tak'/'nie' (yes/no) flag becomes 1/0; any other flag value is left as it is.
hierarchy = hierarchy.assign(
    id_meteo=hierarchy['id_meteo'].fillna(0).astype(int),
    id_hydro=hierarchy['id_hydro'].fillna(0).astype(int),
    the_same_location=hierarchy['the_same_location'].replace({'tak': 1, 'nie': 0}),
)

# Disabled hard-coded alternative to the spreadsheet, kept for reference:
# hierarchy = pd.DataFrame({'id_hydro': [151160060, 151160140, 151160170, 151170030], 
#                           'name_hydro': ['G£OGÓW', 'OSETNO', 'BRZEG DOLNY', 'TRESTNO'], 
#                           'id_meteo': [0, 251160110, 251160230, 251170420], 
#                           'name_meteo': ['no_info', 'OSETNO', 'BRZEG DOLNY', '£ANY']})

hierarchy

## 2.1 Water level (stan wody)

In [None]:
# Read every file in `water_directory` and stack them into one frame.
# The files are read without a header row, so the columns are labelled 0..N-1.
#
# NOTE(review): if station names come out garbled (e.g. '£' where 'Ł' is expected),
# the files are probably cp1250 rather than ISO-8859-1. Confirm before changing the
# encoding, because the decoded names end up in the output column labels.
water_period_frames = []

# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# row order reproducible between runs and machines.
for water_period in sorted(os.listdir(water_directory)):
    water_period_path = os.path.join(water_directory, water_period)
    if not os.path.isfile(water_period_path):
        # Skip sub-directories (for example .ipynb_checkpoints), which read_csv cannot open.
        continue
    water_period_df = pd.read_csv(water_period_path, encoding="ISO-8859-1", header=None)
    water_period_frames.append(water_period_df)

# Concatenate once instead of growing a frame inside the loop (which copies all
# previous rows on every iteration). An empty directory still yields an empty frame.
if water_period_frames:
    raw_water_level = pd.concat(water_period_frames, axis=0, ignore_index=True)
else:
    raw_water_level = pd.DataFrame()

raw_water_level

## 2.2 Precipitation (opady)

In [None]:
# Read every file in `rain_directory` and stack them into one frame.
# The files are read without a header row, so the columns are labelled 0..N-1.
#
# NOTE(review): if station names come out garbled (e.g. '£' where 'Ł' is expected),
# the files are probably cp1250 rather than ISO-8859-1. Confirm before changing the
# encoding, because the decoded names end up in the output column labels.
rain_period_frames = []

# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# row order reproducible between runs and machines.
for rain_period in sorted(os.listdir(rain_directory)):
    rain_period_path = os.path.join(rain_directory, rain_period)
    if not os.path.isfile(rain_period_path):
        # Skip sub-directories (for example .ipynb_checkpoints), which read_csv cannot open.
        continue
    rain_period_df = pd.read_csv(rain_period_path, encoding="ISO-8859-1", header=None)
    rain_period_frames.append(rain_period_df)

# Concatenate once instead of growing a frame inside the loop (which copies all
# previous rows on every iteration). An empty directory still yields an empty frame.
if rain_period_frames:
    raw_rain_level = pd.concat(rain_period_frames, axis=0, ignore_index=True)
else:
    raw_rain_level = pd.DataFrame()

raw_rain_level

# 3. Data Preprocessing

## 3.1 Water level (stan wody)

In [None]:
def add_corect_year(row):
    """Return the year for a record that is labelled with a hydrological year.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'Rok_hydrologiczny' (hydrological year) and 'miesiąc' (month).

    Returns
    -------
    The hydrological year minus one when the month is 11 or greater,
    otherwise the hydrological year unchanged.
    """
    hydrological_year = row['Rok_hydrologiczny']
    return hydrological_year - 1 if row['miesiąc'] >= 11 else hydrological_year

In [None]:
# Turn the raw water-level rows into one row per date with one column per station.

# Keep only the positional columns that are used below and give them readable names.
# Selecting with a list and renaming without `inplace` returns a fresh frame, so
# `raw_water_level` stays untouched and this cell can be re-run safely.
water_renaming_cols = {0: 'Id', 1: 'Miejscowość', 2: 'Rzeka', 3: 'Rok_hydrologiczny', 5: 'Dzień_tygodnia', 
                       6: 'Stan wody [cm]', 9: 'miesiąc'}
raw_water_level_prepared = raw_water_level[list(water_renaming_cols)].rename(columns=water_renaming_cols)

# Change type of col with level of water (done before filtering, so a non-integer
# value at any station still raises here).
raw_water_level_prepared['Stan wody [cm]'] = raw_water_level_prepared['Stan wody [cm]'].astype(int)

# Select the stations listed in the hierarchy. The explicit copy makes the result an
# independent frame, so the column assignments below do not act on a slice
# (avoids SettingWithCopyWarning).
raw_water_level_prepared = raw_water_level_prepared.loc[
    raw_water_level_prepared['Id'].isin(hierarchy['id_hydro'])
].copy()

# Add correct year: rows with month >= 11 get the hydrological year minus one.
# Vectorised form of `add_corect_year`; unlike a row-wise apply it also works on an
# empty selection.
raw_water_level_prepared['Rok'] = raw_water_level_prepared['Rok_hydrologiczny'] \
    - (raw_water_level_prepared['miesiąc'] >= 11).astype(int)

# Create variable with a date. Despite its name, 'Dzień_tygodnia' is used here as the
# day-of-month component of the date.
raw_water_level_prepared['Dzień_tygodnia'] = raw_water_level_prepared['Dzień_tygodnia'].astype(str).str.zfill(2)
raw_water_level_prepared['Data'] = pd.to_datetime(
    raw_water_level_prepared['Rok'].astype(str) + '-'
    + raw_water_level_prepared['miesiąc'].astype(str) + '-'
    + raw_water_level_prepared['Dzień_tygodnia'],
    format='%Y-%m-%d',
)

# connect ID and Location into a single label, e.g. "NAME (id)"
raw_water_level_prepared['Miejscowość_Id'] = raw_water_level_prepared['Miejscowość'].astype(str) + \
    ' (' + raw_water_level_prepared['Id'].astype(str) + ')'

# Create pivot table: dates as rows, stations as columns.
# pivot raises ValueError if a (date, station) pair occurs more than once.
raw_water_level_prepared = raw_water_level_prepared.pivot(index='Data', columns='Miejscowość_Id', values='Stan wody [cm]')

raw_water_level_prepared.columns = [f'{col} Stan wody [cm]' for col in raw_water_level_prepared.columns]
raw_water_level_prepared = raw_water_level_prepared.reset_index()

raw_water_level_prepared

## 3.2 Precipitation (opady)

In [None]:
def weather_no_info(x):
    """Convert a raw status code into a 0/1 flag.

    Returns 1 when `x` compares equal to 8, and 0 for every other value.
    """
    return 1 if x == 8 else 0

In [None]:
# Turn the raw rainfall rows into one row per calendar day with two columns per
# station: the rainfall sum and a 0/1 "no measurement" flag.

# Keep only the positional columns that are used below and give them readable names.
# Selecting with a list and renaming without `inplace` returns a fresh frame, so
# `raw_rain_level` stays untouched and this cell can be re-run safely.
rain_renaming_cols = {0: 'Id', 1: 'Miejscowość', 2: 'Rok', 3: 'miesiąc', 4: 'Dzień_tygodnia', 
                      5: 'Suma opadów [mm]', 6: 'Brak pomiaru'}
raw_rain_level_prepared = raw_rain_level[list(rain_renaming_cols)].rename(columns=rain_renaming_cols)

# Select the stations listed in the hierarchy. The explicit copy makes the result an
# independent frame, so the column assignments below do not act on a slice
# (avoids SettingWithCopyWarning).
raw_rain_level_prepared = raw_rain_level_prepared.loc[
    raw_rain_level_prepared['Id'].isin(hierarchy['id_meteo'])
].copy()

# set info about lack of measurements: 1 where the raw code equals 8, else 0
raw_rain_level_prepared['Brak pomiaru'] = raw_rain_level_prepared['Brak pomiaru'].apply(weather_no_info)

# Create variable with a date. Despite its name, 'Dzień_tygodnia' is used here as the
# day-of-month component of the date.
raw_rain_level_prepared['Dzień_tygodnia'] = raw_rain_level_prepared['Dzień_tygodnia'].astype(str).str.zfill(2)
raw_rain_level_prepared['Data'] = pd.to_datetime(
    raw_rain_level_prepared['Rok'].astype(str) + '-'
    + raw_rain_level_prepared['miesiąc'].astype(str) + '-'
    + raw_rain_level_prepared['Dzień_tygodnia'],
    format='%Y-%m-%d',
)

# connect ID and Location into a single label, e.g. "NAME (id)"
raw_rain_level_prepared['Miejscowość_Id'] = raw_rain_level_prepared['Miejscowość'].astype(str) + \
    ' (' + raw_rain_level_prepared['Id'].astype(str) + ')'

# Create pivot table: dates as rows, (value, station) pairs as columns.
# pivot raises ValueError if a (date, station) pair occurs more than once.
raw_rain_level_prepared = raw_rain_level_prepared.pivot(index='Data', columns='Miejscowość_Id',
                                                        values=['Suma opadów [mm]', 'Brak pomiaru'])

# Flatten the (value, station) column pairs into "<station> <value>" labels
raw_rain_level_prepared.columns = [f'{multiindex[1]} {multiindex[0]}' for multiindex in raw_rain_level_prepared.columns]

# add missing dates so the index is a gap-free daily range
rain_dates = pd.date_range(start=raw_rain_level_prepared.index.min(), end=raw_rain_level_prepared.index.max(), freq='1D')
raw_rain_level_prepared = raw_rain_level_prepared.reindex(rain_dates)

# Fill missing values
# rainings: gaps on or after a station's first recorded value become 0; dates before
# that first value stay NaN. A column with no values at all gives first_date = NaT,
# for which the comparison is False everywhere, so nothing is filled.
rain_sum_columns = raw_rain_level_prepared.filter(regex=("Suma opadów")).columns
for rain_column in rain_sum_columns:
    first_date = raw_rain_level_prepared[rain_column].dropna().index.min()
    is_gap = raw_rain_level_prepared[rain_column].isna() & (raw_rain_level_prepared.index >= first_date)
    raw_rain_level_prepared.loc[is_gap, rain_column] = 0

# without measurements: a missing flag is stored as 0
no_measurement_columns = raw_rain_level_prepared.filter(regex=("Brak pomiaru")).columns
raw_rain_level_prepared.loc[:, no_measurement_columns] = \
    raw_rain_level_prepared.loc[:, no_measurement_columns].fillna(0)

# The reindexed date index has no name, so reset_index() emits it as 'index'
raw_rain_level_prepared = raw_rain_level_prepared.reset_index().rename(columns={'index': 'Data'})
raw_rain_level_prepared

## 3.3 Merge

In [None]:
# Left join on 'Data': every water-level date is kept, and the rainfall columns are
# NaN on dates that have no matching rainfall row.
prepared_dataset = raw_water_level_prepared.merge(raw_rain_level_prepared, on='Data', how='left')

prepared_dataset

# 4. Save data

In [None]:
# Write the merged dataset and the cleaned hierarchy as CSV files without the index.
# to_csv does not create missing folders, so make sure the output folder exists first.
os.makedirs('../results', exist_ok=True)
prepared_dataset.to_csv('../results/prepared_data.csv', index=False)
hierarchy.to_csv('../results/prepared_hierarchy.csv', index=False)