In [35]:
!pip install meteostat



In [36]:
from datetime import datetime
from meteostat import Stations, Daily
import pandas as pd
from geopy.distance import geodesic
import os

# Daily weather columns that the notebook scores stations on, interpolates,
# and aggregates to weekly values.
main_vars = [
    'tmax',  # maximum temperature
    'tavg',  # average temperature
    'tmin',  # minimum temperature
    'prcp',  # precipitation
    'wspd',  # wind speed
    'pres',  # presumably air pressure - confirm against the Meteostat column reference
]

def get_nearest_stations(city_name, lat, lon, start, end, max_stations=5):
    """
    Rank the weather stations nearest to a city and optionally load the best one.

    Up to ``max_stations`` stations closest to (``lat``, ``lon``) are looked up.
    For each one the daily data between ``start`` and ``end`` is downloaded and
    scored by, in this order: calendar days of the period without any row,
    number of NaN cells across ``main_vars``, and geodesic distance to the city.
    The ranking table is displayed, the top-ranked station is printed as the
    recommendation, and the user is asked to confirm loading it.

    Parameters:
    - city_name: label printed in the report and written to the 'city' column
    - lat, lon: coordinates of the city used for the station search and distance
    - start, end: bounds of the period requested from ``Daily``
    - max_stations: maximum number of nearby stations to evaluate

    Returns:
    - DataFrame with the station's index reset to a column, the ``main_vars``
      columns and added 'city' / 'station_id' columns, when the user confirms
    - None when the user declines or no station could be evaluated
    """
    nearby = Stations().nearby(lat, lon).fetch(max_stations)
    city_coords = (lat, lon)
    station_info = []
    fetched = {}  # station id -> downloaded daily frame, reused if the user confirms

    for station_id, row in nearby.iterrows():
        name = row['name']

        # Distance between the city and the station, in kilometres
        distance = geodesic(city_coords, (row['latitude'], row['longitude'])).km

        try:
            data = Daily(station_id, start, end).fetch()
            # Calendar days of the period that have no row at all
            total_days_missing = pd.date_range(start, end).difference(data.index).size
            # NaN cells across the key variables
            missing_values = int(data[main_vars].isna().sum().sum())
        except Exception as exc:
            # Best effort: one unusable station must not abort the ranking,
            # but report why it was dropped instead of skipping it silently.
            print(f"Skipping station {station_id} ({name}): {exc}")
            continue

        fetched[station_id] = data
        station_info.append({
            'id': station_id,
            'name': name,
            'distance_km': round(distance, 2),
            'missing_days': total_days_missing,
            'missing_values': missing_values
        })

    print(f"\nCity: {city_name}")

    # NOTE(review): the emoji prefixes in the messages below look mis-encoded
    # (mojibake) - confirm the notebook's encoding before changing them.
    if not station_info:
        # A frame built from an empty list has no columns, so sorting it would
        # raise KeyError; report the situation and stop before that happens.
        print("‚ö†Ô∏è No suitable stations found.")
        return None

    ranking = (
        pd.DataFrame(station_info)
        .sort_values(['missing_days', 'missing_values', 'distance_km'], ascending=[True, True, True])
        .reset_index(drop=True)
    )
    display(ranking)

    best = ranking.iloc[0]
    print(f"\nüìå Recommended station: {best['name']} (ID: {best['id']})")
    print(f"üìç Distance: {best['distance_km']} km, ‚ùó Missing days: {best['missing_days']}, üï≥Ô∏è Missing values: {best['missing_values']}")

    confirm = input(f"\nLoad data from this station? (yes/no): ").strip().lower()
    if confirm not in ('yes', 'y'):
        print("üîÑ Please select another station from the list manually.")
        return None

    # Reuse the frame downloaded while scoring instead of fetching it again
    data = fetched[best['id']][main_vars].reset_index()
    data['city'] = city_name
    data['station_id'] = best['id']
    return data

def process_weather_data(df, main_vars, start, end):
    """
    Clean one station's daily weather data and aggregate it to weekly means.

    Steps:
    - parse 'time', use it as a sorted datetime index and keep rows in [start, end]
    - report calendar days of the period that have no row
    - interpolate NaN values in ``main_vars`` (in both directions) and report
      anything still missing afterwards
    - resample ``main_vars`` to weekly means, rounded to one decimal
    - copy 'city' and 'station_id' from the first daily row to every weekly row

    Days that have no row at all are only reported; they are not re-created,
    so a weekly mean is computed over the rows that exist in that week.

    Parameters:
    - df: daily data with a 'time' column, the ``main_vars`` columns, and
      'city' / 'station_id' columns; the caller's frame is not modified
    - main_vars: list of column names to interpolate and aggregate
    - start, end: bounds of the period to keep

    Returns:
    - DataFrame with columns 'time', ``main_vars``, 'city', 'station_id' and one
      row per week; an empty frame with the same columns when no row falls
      inside the period
    """
    df = df.copy()
    df['time'] = pd.to_datetime(df['time'])
    df = df.set_index('time').sort_index()

    # Keep only the requested period (label-based slice on the sorted index)
    df = df.loc[start:end]

    if df.empty:
        # Without rows there is no first 'city' / 'station_id' value to copy and
        # nothing to aggregate; hand back an empty frame with the usual columns.
        print(f"No rows between {start} and {end}; nothing to aggregate.")
        return pd.DataFrame(columns=['time', *main_vars, 'city', 'station_id'])

    # NOTE(review): the emoji prefixes in the messages below look mis-encoded
    # (mojibake) - confirm the notebook's encoding before changing them.

    # Check for calendar days that have no row
    full_dates = pd.date_range(start, end, freq='D')
    missing_dates = full_dates.difference(df.index)

    if len(missing_dates) > 0:
        print(f"‚ö†Ô∏è Missing {len(missing_dates)} days between {missing_dates[0].date()} and {missing_dates[-1].date()}")
    else:
        print("‚úÖ All dates are present.")

    # Fill NaN values in the key variables; 'both' also covers gaps at the
    # very start and end of the series
    df[main_vars] = df[main_vars].interpolate(limit_direction='both')

    # Report columns that could not be filled
    remaining_nans = df[main_vars].isna().sum()
    if remaining_nans.sum() > 0:
        print("‚ö†Ô∏è Remaining missing values after interpolation:")
        print(remaining_nans[remaining_nans > 0])
    else:
        print("‚úÖ No missing values after interpolation.")

    # Aggregate weekly means; the first and last week may cover fewer than
    # seven days when the period does not start or end on a week boundary
    weekly = df[main_vars].resample('W').mean().reset_index()
    weekly[main_vars] = weekly[main_vars].round(1)

    # Add city and station info columns (take first row values)
    weekly['city'] = df['city'].iloc[0]
    weekly['station_id'] = df['station_id'].iloc[0]

    return weekly



In [38]:
import os

def save_weather_to_csv(df, city_name, folder="output"):
    """
    Write a weekly weather table to ``<folder>/<city>_weather_weekly.csv``.

    The city part of the file name is ``city_name`` lower-cased with spaces
    replaced by underscores. The folder is created when it does not exist yet,
    and the written path is printed afterwards.

    Parameters:
    - df: DataFrame to write (the index is not written)
    - city_name: name of the city, used to build the file name
    - folder: target directory
    """
    # Create the target directory first; an existing one is fine
    os.makedirs(folder, exist_ok=True)

    # Build "<city>_weather_weekly.csv" inside the target directory
    city_slug = "_".join(city_name.lower().split(" "))
    filepath = os.path.join(folder, city_slug + "_weather_weekly.csv")

    # Write the table and tell the user where it went
    df.to_csv(filepath, index=False)
    print(f"‚úÖ Dane zapisane do pliku: {filepath}")

In [39]:
# Analysis window used for both the station ranking and the weekly aggregation
start = datetime(2022, 1, 1)
end = datetime(2024, 1, 31)

# Rank the stations around Warsaw and load the one the user confirms
warsaw_df = get_nearest_stations("Warszawa", 52.2298, 21.0118, start, end)

if warsaw_df is None:
    print("Data loading was cancelled or failed.")
else:
    weekly_warsaw = process_weather_data(warsaw_df, main_vars, start, end)
    display(weekly_warsaw.head())


City: Warszawa


Unnamed: 0,id,name,distance_km,missing_days,missing_values
0,EPMO0,Warszawa / Modlin,34.76,0,0
1,12376,Minsk / Jan√≥w,43.75,0,23
2,12375,Warszawa-Okecie,7.67,0,90
3,12488,Kozienice,82.56,0,146
4,EPRP0,Radom / Piast√≥w,83.82,761,0



üìå Recommended station: Warszawa / Modlin (ID: EPMO0)
üìç Distance: 34.76 km, ‚ùó Missing days: 0, üï≥Ô∏è Missing values: 0

Load data from this station? (yes/no): yes
‚úÖ All dates are present.
‚úÖ No missing values after interpolation.


Unnamed: 0,time,tmax,tavg,tmin,prcp,wspd,pres,city,station_id
0,2022-01-02,9.5,7.0,3.5,4.4,16.8,1015.1,Warszawa,EPMO0
1,2022-01-09,3.4,1.5,-1.4,2.6,16.0,1005.2,Warszawa,EPMO0
2,2022-01-16,1.4,-0.8,-3.3,0.2,18.2,1024.8,Warszawa,EPMO0
3,2022-01-23,1.6,-0.5,-3.0,1.0,22.9,1017.1,Warszawa,EPMO0
4,2022-01-30,3.4,1.3,-0.7,2.8,23.7,1015.5,Warszawa,EPMO0


In [40]:
# Write Warsaw's weekly table to the default output folder
save_weather_to_csv(df=weekly_warsaw, city_name="Warszawa")

‚úÖ Dane zapisane do pliku: output/warszawa_weather_weekly.csv


In [22]:
# Analysis window used for both the station ranking and the weekly aggregation
start = datetime(2022, 1, 1)
end = datetime(2024, 1, 31)

# Rank the stations around Krakow and load the one the user confirms
krakow_df = get_nearest_stations("Krakow", 50.0647, 19.9450, start, end)

if krakow_df is None:
    print("Data loading was cancelled or failed.")
else:
    weekly_krakow = process_weather_data(krakow_df, main_vars, start, end)
    display(weekly_krakow.head())


City: Krakow


Unnamed: 0,id,name,distance_km,missing_days,missing_values
0,EPNT0,Nowy Targ / Nowytarg,67.37,0,0
1,EPKM0,Katowice / Bryn√≥w,67.89,0,0
2,12566,Krakow,10.58,0,48
3,12560,Katowice,67.81,0,146
4,12660,Nowy Sacz,73.7,0,146



üìå Recommended station: Nowy Targ / Nowytarg (ID: EPNT0)
üìç Distance: 67.37 km, ‚ùó Missing days: 0, üï≥Ô∏è Missing values: 0

Load data from this station? (yes/no): yes
‚úÖ All dates are present.
‚úÖ No missing values after interpolation.


Unnamed: 0,time,tmax,tavg,tmin,prcp,wspd,pres,city,station_id
0,2022-01-02,7.8,5.5,3.2,7.4,11.7,1020.4,Krakow,EPNT0
1,2022-01-09,2.9,-0.5,-3.7,2.3,10.6,1010.1,Krakow,EPNT0
2,2022-01-16,-0.2,-2.8,-5.2,0.3,10.7,1028.6,Krakow,EPNT0
3,2022-01-23,-2.5,-4.6,-6.7,3.4,15.0,1023.5,Krakow,EPNT0
4,2022-01-30,-0.8,-2.9,-5.7,5.2,14.9,1023.3,Krakow,EPNT0


In [41]:
# Write Krakow's weekly table to the default output folder
save_weather_to_csv(df=weekly_krakow, city_name="Krakow")

‚úÖ Dane zapisane do pliku: output/krakow_weather_weekly.csv


In [42]:
# Analysis window used for both the station ranking and the weekly aggregation
start = datetime(2022, 1, 1)
end = datetime(2024, 1, 31)

# Rank the stations around Katowice and load the one the user confirms
katowice_df = get_nearest_stations("Katowice", 50.2649, 19.0238, start, end)

if katowice_df is not None:
    weekly_katowice = process_weather_data(katowice_df, main_vars, start, end)
    display(weekly_katowice.head())
    # Save only when this run produced data: outside this branch
    # weekly_katowice is either undefined (NameError) or left over from an
    # earlier run of the cell.
    save_weather_to_csv(weekly_katowice, "Katowice")
else:
    print("Data loading was cancelled or failed.")



City: Katowice


Unnamed: 0,id,name,distance_km,missing_days,missing_values
0,EPKM0,Katowice / Bryn√≥w,3.07,0,0
1,12566,Krakow,59.01,0,48
2,12560,Katowice,3.58,0,146
3,12600,Bielsko-Biala,51.74,0,146
4,12555,Pyrzowice,24.66,11,750



üìå Recommended station: Katowice / Bryn√≥w (ID: EPKM0)
üìç Distance: 3.07 km, ‚ùó Missing days: 0, üï≥Ô∏è Missing values: 0

Load data from this station? (yes/no): yes
‚úÖ All dates are present.
‚úÖ No missing values after interpolation.


Unnamed: 0,time,tmax,tavg,tmin,prcp,wspd,pres,city,station_id
0,2022-01-02,10.6,9.3,7.6,4.7,14.3,1019.9,Katowice,EPKM0
1,2022-01-09,5.0,1.9,-1.2,2.4,11.8,1009.6,Katowice,EPKM0
2,2022-01-16,1.3,-1.1,-3.9,0.3,10.2,1029.4,Katowice,EPKM0
3,2022-01-23,0.6,-1.2,-3.0,1.9,11.5,1024.0,Katowice,EPKM0
4,2022-01-30,2.1,0.5,-1.8,2.2,14.8,1023.2,Katowice,EPKM0


‚úÖ Dane zapisane do pliku: output/katowice_weather_weekly.csv


In [28]:
# Analysis window used for both the station ranking and the weekly aggregation
start = datetime(2022, 1, 1)
end = datetime(2024, 1, 31)

# Rank the stations around Gdansk and load the one the user confirms
gdansk_df = get_nearest_stations("Gda≈Ñsk", 54.3520, 18.6466, start, end)

if gdansk_df is None:
    print("Data loading was cancelled or failed.")
else:
    weekly_gdansk = process_weather_data(gdansk_df, main_vars, start, end)
    display(weekly_gdansk.head())



City: Gda≈Ñsk


Unnamed: 0,id,name,distance_km,missing_days,missing_values
0,12155,Gdansk-Swibno,18.76,0,4
1,12160,Elblag,55.26,0,4
2,12125,Lebork,62.19,0,4
3,12150,Gdansk-Rebiechowo,12.2,0,146
4,12135,Hel,29.73,0,146



üìå Recommended station: Gdansk-Swibno (ID: 12155)
üìç Distance: 18.76 km, ‚ùó Missing days: 0, üï≥Ô∏è Missing values: 4

Load data from this station? (yes/no): yes
‚úÖ All dates are present.
‚úÖ No missing values after interpolation.


Unnamed: 0,time,tmax,tavg,tmin,prcp,wspd,pres,city,station_id
0,2022-01-02,8.8,6.2,2.9,3.3,15.5,1013.3,Gda≈Ñsk,12155
1,2022-01-09,3.6,1.6,-0.5,0.9,16.0,1003.2,Gda≈Ñsk,12155
2,2022-01-16,4.1,1.2,-1.4,0.0,17.2,1024.3,Gda≈Ñsk,12155
3,2022-01-23,3.2,1.7,0.1,0.7,21.7,1016.9,Gda≈Ñsk,12155
4,2022-01-30,5.6,3.3,1.5,2.2,20.8,1013.3,Gda≈Ñsk,12155


In [43]:
# Write Gdansk's weekly table to the default output folder
save_weather_to_csv(df=weekly_gdansk, city_name="Gda≈Ñsk")

‚úÖ Dane zapisane do pliku: output/gda≈Ñsk_weather_weekly.csv


In [30]:
# Analysis window used for both the station ranking and the weekly aggregation
start = datetime(2022, 1, 1)
end = datetime(2024, 1, 31)

# Rank the stations around Wroclaw and load the one the user confirms
wroclaw_df = get_nearest_stations("Wroc≈Çaw", 51.1079, 17.0385, start, end)

if wroclaw_df is None:
    print("Data loading was cancelled or failed.")
else:
    weekly_wroclaw = process_weather_data(wroclaw_df, main_vars, start, end)
    display(weekly_wroclaw.head())


City: Wroc≈Çaw


Unnamed: 0,id,name,distance_km,missing_days,missing_values
0,12424,Wroclaw Copernicus Airport,10.91,0,97
1,12415,Legnica Bartoszow,59.55,0,146
2,12520,Klodzko,80.73,0,146
3,12530,Opole,81.7,0,146
4,12425,Wroclaw I,4.79,761,0



üìå Recommended station: Wroclaw Copernicus Airport (ID: 12424)
üìç Distance: 10.91 km, ‚ùó Missing days: 0, üï≥Ô∏è Missing values: 97

Load data from this station? (yes/no): yes
‚úÖ All dates are present.
‚úÖ No missing values after interpolation.


Unnamed: 0,time,tmax,tavg,tmin,prcp,wspd,pres,city,station_id
0,2022-01-02,12.0,10.1,6.9,0.0,14.5,1018.8,Wroc≈Çaw,12424
1,2022-01-09,5.6,3.1,0.1,0.0,14.6,1007.7,Wroc≈Çaw,12424
2,2022-01-16,3.5,0.1,-3.9,0.0,11.9,1029.3,Wroc≈Çaw,12424
3,2022-01-23,3.0,0.7,-2.3,0.6,16.1,1024.1,Wroc≈Çaw,12424
4,2022-01-30,5.4,3.1,1.2,0.3,20.3,1022.6,Wroc≈Çaw,12424


In [44]:
# Write Wroclaw's weekly table to the default output folder
save_weather_to_csv(df=weekly_wroclaw, city_name="Wroc≈Çaw")

‚úÖ Dane zapisane do pliku: output/wroc≈Çaw_weather_weekly.csv


*1) Które zmienne z przedstawianych przez meteostat w Twojej opinii są ważne w kontekście predykcji sprzedaży wody gazowanej?*

Najważniejsze zmienne pogodowe wpływające na sprzedaż wody gazowanej to:

**tavg** (średnia temperatura) – wyższa temperatura zwiększa pragnienie;

**tmax i tmin** (maksymalna i minimalna temperatura) – dla zrozumienia ekstremów pogodowych;

**prcp** (opady) – deszcz może zmniejszać liczbę klientów na zewnątrz;

**wspd** (prędkość wiatru) – silny wiatr może wpływać na aktywność na świeżym powietrzu.

*2) Jakie statystyki agregaty (np. średnia) z danych dziennych warto by wykorzystać także do przeprowadzenia analizy na danych tygodniowych?*

Średnia (mean) – podstawowa statystyka do wykrywania trendów;

Mediana (median) – mniej wrażliwa na wartości odstające;

Suma (sum) – np. dla opadów lub nasłonecznienia (łączna suma tygodniowa);

Wariancja lub odchylenie standardowe (std) – do oceny zmienności parametrów w tygodniu.
Jednak moim zdaniem, biorąc pod uwagę warunki klimatyczne w naszym przypadku, odpowiednia jest agregacja średnią (mean).

*3) Jak poradzić z pojedynczymi brakami danych dla poszczególnych zmiennych?*

Zastosować interpolację (np. liniową) do uzupełnienia braków;

Jeśli interpolacja nie jest możliwa – zastąpić brakujące wartości średnią lub medianą z sąsiednich dni;

Jeśli braków jest dużo, rozważyć zmianę stacji pomiarowej lub pominięcie danej zmiennej;

W ostateczności oznaczyć braki i uwzględnić je jako cechę w modelu.

*4) Czy warto brać pod uwagę zmienne pogodowe z dużą liczbą braków danych?*

Zazwyczaj nie, duża liczba braków obniża jakość danych i modeli;

Lepiej znaleźć inną stację z pełniejszymi danymi;

Jeśli zmienna jest kluczowa, można spróbować uzupełnić braki metodami statystycznymi lub ML, ale z ostrożnością.

*5) Które dodatkowe zmienne sezonowe (oprócz pogodowych, np. pora roku) warto uwzględnić, starając się wyjaśnić zmienność sprzedaży wody gazowanej?*

Pora roku (wiosna, lato, jesień, zima) – wpływa na temperaturę i zachowania klientów;

Dzień tygodnia (weekend/roboczy) – sprzedaż może się różnić;

Święta i dni wolne od pracy – wpływają na ruch w sklepach;

Wydarzenia lokalne (festiwale, imprezy plenerowe), napływ turystów – mogą zwiększać popyt.



