## Step 1: Install Required Libraries

In [1]:
# Install required package
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "openmeteo-requests", "requests-cache"])
print("✓ Libraries installed successfully")

✓ Libraries installed successfully


## Step 2: Import Libraries and Load Data

In [2]:
import pandas as pd
import numpy as np
import openmeteo_requests
import requests_cache
from retry_requests import retry
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Load the dataset with missing weather data
df = pd.read_csv('/workspaces/Room_7_Bakery_Prediction/0_DataPreparation/0.3 Additional Features/complete_dataset_with_additional_features.csv')
df['date'] = pd.to_datetime(df['date'])

print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print(f"Missing Wettercode values: {df['Wettercode'].isnull().sum()} ({df['Wettercode'].isnull().sum()/len(df)*100:.2f}%)")

Dataset loaded: 11164 rows, 50 columns
Date range: 2013-07-01 00:00:00 to 2019-07-30 00:00:00
Missing Wettercode values: 2662 (23.84%)


## Step 3: Fetch Historical Weather Data from Open-Meteo

In [3]:
# Setup Open-Meteo client with caching
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Kiel, Germany coordinates
latitude = 54.3233
longitude = 10.1348

# Get date range from dataset
start_date = df['date'].min().strftime('%Y-%m-%d')
end_date = df['date'].max().strftime('%Y-%m-%d')

print(f"Fetching weather data for Kiel (lat: {latitude}, lon: {longitude})")
print(f"Date range: {start_date} to {end_date}")
print("This may take a minute...")

# Make request to Open-Meteo
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": latitude,
    "longitude": longitude,
    "start_date": start_date,
    "end_date": end_date,
    "daily": ["weather_code", "temperature_2m_max", "temperature_2m_min", "temperature_2m_mean", "cloud_cover_mean"],
    "timezone": "Europe/Berlin"
}

responses = openmeteo.weather_api(url, params=params)
response = responses[0]

print(f"✓ Data fetched successfully!")
print(f"Response coordinates: ({response.Latitude()}°E, {response.Longitude()}°N)")

Fetching weather data for Kiel (lat: 54.3233, lon: 10.1348)
Date range: 2013-07-01 to 2019-07-30
This may take a minute...
✓ Data fetched successfully!
Response coordinates: (54.3057975769043°E, 10.1953125°N)


## Step 4: Process Weather Data

In [4]:
# Extract daily data
daily = response.Daily()

# Create DataFrame from API response
weather_data = pd.DataFrame({
    'date': pd.date_range(
        start = pd.to_datetime(daily.TimeStart(), unit = "s", utc = True).tz_localize('UTC').tz_convert('Europe/Berlin'),
        periods = len(daily.Variables(0).ValuesAsNumpy()),
        freq = pd.Timedelta(seconds = daily.IntervalSeconds())
    ).date,
    'weather_code': daily.Variables(0).ValuesAsNumpy(),
    'temp_max': daily.Variables(1).ValuesAsNumpy(),
    'temp_min': daily.Variables(2).ValuesAsNumpy(),
    'temp_mean': daily.Variables(3).ValuesAsNumpy(),
    'cloud_cover': daily.Variables(4).ValuesAsNumpy()
})

weather_data['date'] = pd.to_datetime(weather_data['date'])
weather_data['weather_code'] = weather_data['weather_code'].astype('Int64')  # Handle NaN as Int64

print(f"Weather data processed: {weather_data.shape[0]} records")
print(f"\nFirst 5 records:")
print(weather_data.head())
print(f"\nWeather code distribution:")
print(weather_data['weather_code'].value_counts().sort_index())

AttributeError: 'VariablesWithTime' object has no attribute 'TimeStart'

## Step 5: Map WMO Weather Codes to Your Format

In [None]:
# WMO Weather Code mapping (Open-Meteo uses WMO codes)
# Check your existing Wettercode values to ensure compatibility
print("Existing Wettercode values in dataset (non-null):")
print(df[df['Wettercode'].notna()]['Wettercode'].unique())
print(f"\nUnique values: {sorted(df[df['Wettercode'].notna()]['Wettercode'].unique())}")

# WMO Code Description
wmo_description = {
    0: "Clear sky",
    1: "Mainly clear",
    2: "Partly cloudy",
    3: "Overcast",
    45: "Foggy",
    48: "Depositing rime fog",
    51: "Light drizzle",
    53: "Moderate drizzle",
    55: "Dense drizzle",
    61: "Slight rain",
    63: "Moderate rain",
    65: "Heavy rain",
    71: "Slight snow",
    73: "Moderate snow",
    75: "Heavy snow",
    80: "Slight rain showers",
    81: "Moderate rain showers",
    82: "Violent rain showers",
    85: "Slight snow showers",
    86: "Heavy snow showers",
    95: "Thunderstorm",
    96: "Thunderstorm with slight hail",
    99: "Thunderstorm with heavy hail"
}

# Map the weather codes
weather_data['weather_description'] = weather_data['weather_code'].map(wmo_description)

print("\nWMO Weather Codes in fetched data:")
print(weather_data[['date', 'weather_code', 'weather_description', 'temp_mean', 'cloud_cover']].head(10))

## Step 6: Merge Weather Data with Dataset

In [None]:
# Create date column for merging (remove time component)
df['date_only'] = df['date'].dt.date
weather_data['date_only'] = weather_data['date'].dt.date

# Merge on date
df_merged = df.merge(
    weather_data[['date_only', 'weather_code', 'weather_description', 'temp_mean', 'cloud_cover']], 
    on='date_only', 
    how='left'
)

# Fill missing Wettercode with fetched data
print(f"Before filling: {df['Wettercode'].isnull().sum()} missing Wettercode values")

# Create backup of original
df_merged['Wettercode_original'] = df_merged['Wettercode'].copy()

# Fill missing values
mask = df_merged['Wettercode'].isnull() & df_merged['weather_code'].notna()
df_merged.loc[mask, 'Wettercode'] = df_merged.loc[mask, 'weather_code']

print(f"After filling: {df_merged['Wettercode'].isnull().sum()} missing Wettercode values")
print(f"\nRecords filled: {mask.sum()}")

# Display comparison
print("\nSample of filled data:")
sample = df_merged[mask].head(10)[['date', 'Wettercode_original', 'Wettercode', 'weather_description', 'temp_mean']]
print(sample)

## Step 7: Recalculate Derived Weather Features

In [None]:
# Now recalculate wettercode_category based on updated Wettercode values
wmo_numeric = pd.to_numeric(df_merged['Wettercode'], errors='coerce')

wmo_map = {
    0: 'Clear',
    1: 'Mainly_Clear',
    2: 'Partly_Cloudy',
    3: 'Overcast',
    45: 'Fog',
    48: 'Fog',
    51: 'Drizzle',
    53: 'Drizzle',
    55: 'Drizzle',
    61: 'Rain',
    63: 'Rain',
    65: 'Rain',
    71: 'Snow',
    73: 'Snow',
    75: 'Snow',
    80: 'Showers',
    81: 'Showers',
    82: 'Showers',
    85: 'Showers',
    86: 'Showers',
    95: 'Thunderstorm',
    96: 'Thunderstorm',
    99: 'Thunderstorm'
}

df_merged['wettercode_category'] = wmo_numeric.map(wmo_map)

print(f"Updated wettercode_category - Missing values: {df_merged['wettercode_category'].isnull().sum()}")
print(f"\nweathercode_category value counts:")
print(df_merged['wettercode_category'].value_counts())

## Step 8: Clean Up and Save Updated Dataset

In [None]:
# Drop temporary columns
df_final = df_merged.drop(columns=['date_only', 'date_only', 'weather_code', 'weather_description', 'temp_mean', 'cloud_cover', 'Wettercode_original'])

# Ensure column order matches original
df_final = df_final[df.columns]

print(f"Final dataset shape: {df_final.shape}")
print(f"Missing Wettercode: {df_final['Wettercode'].isnull().sum()}")
print(f"Missing wettercode_category: {df_final['wettercode_category'].isnull().sum()}")

# Save updated dataset
output_path = '/workspaces/Room_7_Bakery_Prediction/0_DataPreparation/0.3 Additional Features/complete_dataset_with_additional_features.csv'
df_final.to_csv(output_path, index=False)

print(f"\n✓ Updated dataset saved to:")
print(f"  {output_path}")
print(f"\nSummary of changes:")
print(f"  - Wettercode values filled: {mask.sum()}")
print(f"  - Remaining missing values: {df_final['Wettercode'].isnull().sum()}")

## Step 9: Validation

In [None]:
# Load the saved file to verify
df_verify = pd.read_csv(output_path)

print("Verification of updated dataset:")
print(f"\nShape: {df_verify.shape}")
print(f"\nMissing values summary:")
missing_summary = df_verify.isnull().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)
print(missing_summary)

print(f"\nWettercode range: {df_verify['Wettercode'].min()} to {df_verify['Wettercode'].max()}")
print(f"\nFirst few rows with recently filled Wettercode:")
print(df_verify[df_verify['Wettercode'].notna()].head())