In [1]:
import os
import pandas as pd
from pathlib import Path

In [2]:
# Root directory of the project; every input and output path in this notebook is
# resolved against it. Indexing os.environ (instead of os.getenv) makes a missing
# variable fail right here with a KeyError that names PROJECT_ROOT, rather than
# with an opaque TypeError raised by Path(None).
PROJECT_ROOT = Path(os.environ["PROJECT_ROOT"])

# Directory holding the parameter dictionary (kody_parametr.csv) and the
# <year>/<month>/ measurement files read further down.
DATA_DIR = PROJECT_ROOT / 'data/input/weather/'

In [3]:
# Dictionary of measurement parameters: one row per code ("Kod") with its
# description ("Nazwa"). The file is semicolon-separated and read as windows-1250.
df_parameters = pd.read_csv(
    DATA_DIR / 'kody_parametr.csv',
    sep=';',
    encoding='windows-1250',
)
df_parameters

Unnamed: 0,Kod,Nazwa
0,B00300S,Temperatura powietrza (oficjalna)
1,B00305A,Temperatura gruntu (czujnik)
2,B00202A,Kierunek wiatru (czujnik)
3,B00702A,Średnia prędkość wiatru czujnik 10 minut
4,B00703A,Prędkość maksymalna (czujnik)
5,B00608S,Suma opadu 10 minutowego
6,B00604S,Suma opadu dobowego
7,B00606S,Suma opadu godzinowego
8,B00802A,Wilgotność względna powietrza (czujnik)
9,B00714A,Największy poryw w okresie 10min ze stacji Syn...


In [4]:
# Parameter codes used in the rest of the notebook. They are listed by code rather
# than picked by row position so the selection does not silently change if
# kody_parametr.csv is reordered (these are rows 1, 6 and 8 of the table above):
#   B00305A - ground temperature (sensor)
#   B00604S - daily precipitation total
#   B00802A - relative air humidity (sensor)
# NOTE(review): B00305A is the *ground* temperature, yet it is later renamed to plain
# 'temperature'; B00300S is the official air temperature - confirm this is intended.
selected_codes = ['B00305A', 'B00604S', 'B00802A']

# Fail early if the dictionary no longer contains one of the expected codes.
known_codes = set(df_parameters['Kod'])
missing_codes = [code for code in selected_codes if code not in known_codes]
if missing_codes:
    raise ValueError(f"Parameter codes missing from kody_parametr.csv: {missing_codes}")

params = list(selected_codes)
params

['B00305A', 'B00604S', 'B00802A']

In [5]:
# Stations chosen for the analysis, with the voivodeship and location of each one.
# The file sits next to the weather folder, hence DATA_DIR.parent.
df_weather_stations = pd.read_csv(
    DATA_DIR.parent / 'voivodeships/weather_stations.csv'
)
df_weather_stations

Unnamed: 0,station_id,voivodeship,location
0,353150210,zachodniopomorskie,Resno-Smólsko
1,254170140,pomorskie,Kościerzyna
2,353200272,warmińsko-mazurskie,Olsztyn
3,351150400,lubuskie,Zielona Góra
4,352160330,wielkopolskie,Poznań
5,353180250,kujawsko-pomorskie,Toruń
6,351160418,mazowieckie,Leszno
7,353230295,podlaskie,Białystok
8,250160090,dolnośląskie,Pszenno
9,350170530,opolskie,Opole


In [6]:
# Ids of the selected stations; used below to filter the raw measurement files.
stations = df_weather_stations.loc[:, 'station_id']

In [7]:
years = list(range(2018, 2023))
months = [f"{i:02}" for i in range(1, 13)]
column_names = ['station_id', 'param_id', 'date', 'value']


def load_measurements(year, month, param):
    """Read one parameter's measurement file for one month, limited to the selected stations.

    The file is looked up at DATA_DIR/<year>/<month>/<param>_<year>_<month>.csv; it is
    semicolon-separated and has no header row.

    Parameters
    ----------
    year : int
        Year of the file to read.
    month : str
        Zero-padded month ("01" .. "12").
    param : str
        Parameter code; also used as the name of the value column in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``station_id``, ``date`` and ``<param>`` (float), containing only
        rows whose ``station_id`` is in ``stations``.
    """
    raw = pd.read_csv(
        DATA_DIR / str(year) / month / f'{param}_{year}_{month}.csv',
        sep=';',
        header=None,
        names=column_names,
        index_col=False,
        # Values are written with a decimal comma; swap it for a dot so the
        # column can be cast to float below.
        converters={'value': lambda x: x.replace(',', '.')},
    )

    # Filtering through .loc and chaining produces a fresh frame, so the float cast
    # no longer assigns into a slice of `raw` (which raised SettingWithCopyWarning).
    return (
        raw.loc[raw['station_id'].isin(stations), ['station_id', 'date', 'value']]
        .rename(columns={'value': param})
        .astype({param: float})
    )


dfs = []
for year in years:
    for month in months:
        frames = [load_measurements(year, month, param) for param in params]

        # Inner-join the per-parameter frames so that each row carries every selected
        # parameter for one (station_id, date) pair.
        merged = frames[0]
        for frame in frames[1:]:
            merged = pd.merge(merged, frame, on=['station_id', 'date'])

        # The date is only needed as a join key; afterwards month and year are enough.
        dfs.append(merged.drop(columns=['date']).assign(month=month, year=year))


df_weather = pd.concat(dfs, axis=0)
df_weather

Unnamed: 0,station_id,B00305A,B00604S,B00802A,month,year
0,249190280,2.03,0.5,85.93,01,2018
1,249190280,-1.87,0.0,100.00,01,2018
2,249190280,-1.33,0.2,94.58,01,2018
3,249190280,0.21,1.1,93.02,01,2018
4,249190280,2.66,0.0,81.43,01,2018
...,...,...,...,...,...,...
489,353230295,3.30,12.0,85.00,12,2022
490,353230295,0.40,0.0,90.00,12,2022
491,353230295,2.40,2.4,95.00,12,2022
492,353230295,3.80,0.0,81.00,12,2022


In [8]:
# Replace the parameter codes with readable column names, then reduce the
# measurements to one row per station, year and month:
# mean temperature, total rainfall and mean humidity.
readable_names = {'B00305A': 'temperature', 'B00604S': 'rainfall', 'B00802A': 'humidity'}

df_grouped = (
    df_weather
    .rename(columns=readable_names)
    .groupby(['station_id', 'year', 'month'])
    .agg(
        temperature=('temperature', 'mean'),
        rainfall=('rainfall', 'sum'),
        humidity=('humidity', 'mean'),
    )
    .reset_index()
)

df_grouped

Unnamed: 0,station_id,year,month,temperature,rainfall,humidity
0,249190280,2018,01,-1.149677,17.7,89.855484
1,249190280,2018,02,-2.167143,20.0,92.866429
2,249190280,2018,03,-0.725484,14.9,87.644194
3,249190280,2018,04,10.383333,17.7,72.174667
4,249190280,2018,05,17.081613,102.7,77.249677
...,...,...,...,...,...,...
924,353230295,2022,08,21.509677,11.5,78.806452
925,353230295,2022,09,10.533333,85.2,91.133333
926,353230295,2022,10,7.632258,45.3,94.612903
927,353230295,2022,11,1.273333,21.4,93.933333


In [9]:
# Look up each station's voivodeship, then keep only the reporting columns;
# station_id and location are discarded once the voivodeship is attached.
# Note: this cell consumes station_id, so it cannot be re-run on its own output.
df_grouped = (
    df_grouped
    .merge(df_weather_stations, how='left', on='station_id')
    .drop(columns=['station_id', 'location'])
    .reindex(columns=['year', 'month', 'voivodeship', 'temperature', 'rainfall', 'humidity'])
)
df_grouped

Unnamed: 0,year,month,voivodeship,temperature,rainfall,humidity
0,2018,01,małopolskie,-1.149677,17.7,89.855484
1,2018,02,małopolskie,-2.167143,20.0,92.866429
2,2018,03,małopolskie,-0.725484,14.9,87.644194
3,2018,04,małopolskie,10.383333,17.7,72.174667
4,2018,05,małopolskie,17.081613,102.7,77.249677
...,...,...,...,...,...,...
924,2022,08,podlaskie,21.509677,11.5,78.806452
925,2022,09,podlaskie,10.533333,85.2,91.133333
926,2022,10,podlaskie,7.632258,45.3,94.612903
927,2022,11,podlaskie,1.273333,21.4,93.933333


In [10]:
# Rainfall is not part of the exported table.
# NOTE(review): it is aggregated above and discarded here - confirm it is not needed.
df_grouped = df_grouped.drop(columns=['rainfall'])

In [11]:
# Save the monthly per-voivodeship table. The target directory is created first so
# that a missing data/intermediate/ folder does not make the final step fail.
output_path = PROJECT_ROOT / "data/intermediate/weather_measurements.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
df_grouped.to_csv(output_path, index=False)