In [16]:
import pandas as pd
import numpy as np
import openmeteo_requests
import requests_cache
from retry_requests import retry
import warnings
warnings.filterwarnings('ignore')

print("Loading datasets...")
# Load original dataset
df_original = pd.read_csv('/workspaces/Room_7_Bakery_Prediction/0_DataPreparation/0.3 Additional Features/complete_dataset_with_additional_features.csv')
df_original['date'] = pd.to_datetime(df_original['date'])

print(f"✓ Original dataset loaded: {df_original.shape}")

Loading datasets...
✓ Original dataset loaded: (11164, 50)


## 1. Analyze Existing Weather Codes

In [17]:
print("="*80)
print("EXISTING WEATHER CODES IN DATASET")
print("="*80)

# Get non-null Wettercode values
existing_codes = df_original[df_original['Wettercode'].notna()]['Wettercode'].astype(int).unique()
existing_codes_sorted = sorted(existing_codes)

print(f"\nTotal unique codes: {len(existing_codes_sorted)}")
print(f"Code range: {min(existing_codes_sorted)} to {max(existing_codes_sorted)}")
print(f"\nAll existing codes:")
print(existing_codes_sorted)

print(f"\nDistribution of existing codes:")
code_counts = df_original[df_original['Wettercode'].notna()]['Wettercode'].astype(int).value_counts().sort_index()
print(code_counts)

# Check for non-integer codes
non_integer = df_original[df_original['Wettercode'].notna()][~df_original[df_original['Wettercode'].notna()]['Wettercode'].astype(str).str.match(r'^\d+\.?0*$')]
if len(non_integer) > 0:
    print(f"\n⚠️  WARNING: Found {len(non_integer)} non-integer codes:")
    print(non_integer['Wettercode'].unique())
else:
    print("\n✓ All codes are integers (good!)")

EXISTING WEATHER CODES IN DATASET

Total unique codes: 24
Code range: 0 to 95

All existing codes:
[np.int64(0), np.int64(3), np.int64(5), np.int64(10), np.int64(17), np.int64(20), np.int64(21), np.int64(22), np.int64(28), np.int64(45), np.int64(49), np.int64(53), np.int64(55), np.int64(61), np.int64(63), np.int64(65), np.int64(68), np.int64(69), np.int64(71), np.int64(73), np.int64(75), np.int64(77), np.int64(79), np.int64(95)]

Distribution of existing codes:
Wettercode
0     1044
3        5
5      774
10     865
17      10
20     307
21    1193
22     115
28      74
45     106
49      53
53      48
55       5
61    2777
63     580
65     111
68      22
69      27
71      91
73      48
75      25
77      45
79       5
95     172
Name: count, dtype: int64

✓ All codes are integers (good!)


## 2. Fetch and Analyze Open-Meteo Weather Codes

In [18]:
print("\nFetching Open-Meteo weather data...")

# Setup Open-Meteo client
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Kiel coordinates
latitude = 54.3233
longitude = 10.1348

# Date range
start_date = df_original['date'].min().strftime('%Y-%m-%d')
end_date = df_original['date'].max().strftime('%Y-%m-%d')

# Fetch data
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": latitude,
    "longitude": longitude,
    "start_date": start_date,
    "end_date": end_date,
    "daily": "weather_code",
    "timezone": "Europe/Berlin"
}

responses = openmeteo.weather_api(url, params=params)
response = responses[0]
daily = response.Daily()

# Create DataFrame
weather_data = pd.DataFrame({
    'date': pd.date_range(start=start_date, periods=len(daily.Variables(0).ValuesAsNumpy()), freq='D'),
    'weather_code': daily.Variables(0).ValuesAsNumpy()
})

weather_data['weather_code'] = weather_data['weather_code'].astype(int)

print(f"✓ Open-Meteo data fetched: {weather_data.shape[0]} records")


Fetching Open-Meteo weather data...
✓ Open-Meteo data fetched: 2221 records


## 3. Analyze Open-Meteo Codes

In [19]:
print("\n" + "="*80)
print("OPEN-METEO WEATHER CODES")
print("="*80)

openmeteo_codes = sorted(weather_data['weather_code'].unique())

print(f"\nTotal unique codes: {len(openmeteo_codes)}")
print(f"Code range: {min(openmeteo_codes)} to {max(openmeteo_codes)}")
print(f"\nAll Open-Meteo codes:")
print(openmeteo_codes)

print(f"\nDistribution of Open-Meteo codes:")
om_counts = weather_data['weather_code'].value_counts().sort_index()
print(om_counts)


OPEN-METEO WEATHER CODES

Total unique codes: 13
Code range: 0 to 75

All Open-Meteo codes:
[np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(51), np.int64(53), np.int64(55), np.int64(61), np.int64(63), np.int64(65), np.int64(71), np.int64(73), np.int64(75)]

Distribution of Open-Meteo codes:
weather_code
0      33
1      46
2      67
3     719
51    530
53    292
55    108
61    213
63     66
65      3
71     71
73     63
75     10
Name: count, dtype: int64


## 4. Compare Codes - Compatibility Check

In [20]:
print("\n" + "="*80)
print("CODE COMPATIBILITY ANALYSIS")
print("="*80)

existing_set = set(existing_codes_sorted)
openmeteo_set = set(openmeteo_codes)

# Codes only in existing data
only_existing = existing_set - openmeteo_set
# Codes only in Open-Meteo
only_openmeteo = openmeteo_set - existing_set
# Codes in both
common_codes = existing_set & openmeteo_set

print(f"\nCodes in BOTH datasets: {len(common_codes)} codes")
print(f"  {sorted(common_codes)}")

print(f"\nCodes ONLY in existing data: {len(only_existing)} codes")
if only_existing:
    print(f"  {sorted(only_existing)}")
    print("  ⚠️  WARNING: These codes won't be found in Open-Meteo data")
else:
    print("  (none - good!)")

print(f"\nCodes ONLY in Open-Meteo: {len(only_openmeteo)} codes")
if only_openmeteo:
    print(f"  {sorted(only_openmeteo)}")
    print("  ✓ These are new weather conditions from Open-Meteo")
else:
    print("  (none)")

print(f"\n" + "="*80)
print(f"OVERLAP: {len(common_codes)}/{len(existing_set)} existing codes appear in Open-Meteo")
print(f"OVERLAP %: {len(common_codes)/len(existing_set)*100:.1f}%")
print("="*80)


CODE COMPATIBILITY ANALYSIS

Codes in BOTH datasets: 10 codes
  [np.int64(0), np.int64(3), np.int64(53), np.int64(55), np.int64(61), np.int64(63), np.int64(65), np.int64(71), np.int64(73), np.int64(75)]

Codes ONLY in existing data: 14 codes
  [np.int64(5), np.int64(10), np.int64(17), np.int64(20), np.int64(21), np.int64(22), np.int64(28), np.int64(45), np.int64(49), np.int64(68), np.int64(69), np.int64(77), np.int64(79), np.int64(95)]

Codes ONLY in Open-Meteo: 3 codes
  [np.int64(1), np.int64(2), np.int64(51)]
  ✓ These are new weather conditions from Open-Meteo

OVERLAP: 10/24 existing codes appear in Open-Meteo
OVERLAP %: 41.7%


## 5. WMO Weather Code Standard Check

In [21]:
# Standard WMO codes
standard_wmo_codes = {
    0: "Clear sky",
    1: "Mainly clear",
    2: "Partly cloudy",
    3: "Overcast",
    45: "Foggy",
    48: "Depositing rime fog",
    51: "Light drizzle",
    53: "Moderate drizzle",
    55: "Dense drizzle",
    61: "Slight rain",
    63: "Moderate rain",
    65: "Heavy rain",
    71: "Slight snow",
    73: "Moderate snow",
    75: "Heavy snow",
    80: "Slight rain showers",
    81: "Moderate rain showers",
    82: "Violent rain showers",
    85: "Slight snow showers",
    86: "Heavy snow showers",
    95: "Thunderstorm",
    96: "Thunderstorm with slight hail",
    99: "Thunderstorm with heavy hail"
}

print("\n" + "="*80)
print("WMO WEATHER CODE STANDARD COMPLIANCE")
print("="*80)

print("\nExisting data codes vs WMO standard:")
existing_valid = existing_set & set(standard_wmo_codes.keys())
existing_invalid = existing_set - set(standard_wmo_codes.keys())

print(f"  Valid WMO codes: {len(existing_valid)}/{len(existing_set)}")
if existing_invalid:
    print(f"  Invalid/Non-standard codes: {sorted(existing_invalid)}")
    print("  ⚠️  These codes are NOT in the WMO standard!")
else:
    print(f"  ✓ All existing codes are valid WMO codes!")

print("\nOpen-Meteo codes vs WMO standard:")
openmeteo_valid = openmeteo_set & set(standard_wmo_codes.keys())
openmeteo_invalid = openmeteo_set - set(standard_wmo_codes.keys())

print(f"  Valid WMO codes: {len(openmeteo_valid)}/{len(openmeteo_set)}")
if openmeteo_invalid:
    print(f"  Invalid/Non-standard codes: {sorted(openmeteo_invalid)}")
else:
    print(f"  ✓ All Open-Meteo codes are valid WMO codes!")

print(f"\n{'='*80}")


WMO WEATHER CODE STANDARD COMPLIANCE

Existing data codes vs WMO standard:
  Valid WMO codes: 12/24
  Invalid/Non-standard codes: [np.int64(5), np.int64(10), np.int64(17), np.int64(20), np.int64(21), np.int64(22), np.int64(28), np.int64(49), np.int64(68), np.int64(69), np.int64(77), np.int64(79)]
  ⚠️  These codes are NOT in the WMO standard!

Open-Meteo codes vs WMO standard:
  Valid WMO codes: 13/13
  ✓ All Open-Meteo codes are valid WMO codes!



## 6. Mapping Table - What will be filled

In [22]:
print("\n" + "="*80)
print("WMO CODE REFERENCE WITH MAPPING")
print("="*80)

# Create mapping table
mapping_table = pd.DataFrame([
    {'Code': code, 'Description': standard_wmo_codes.get(code, 'Unknown'), 
     'In_Existing_Data': 'Yes' if code in existing_set else 'No',
     'In_OpenMeteo': 'Yes' if code in openmeteo_set else 'No'}
    for code in sorted(set(standard_wmo_codes.keys()) | existing_set | openmeteo_set)
])

print("\nMapping reference (showing which codes are in each source):")
print(mapping_table.to_string(index=False))

print(f"\n{'='*80}")


WMO CODE REFERENCE WITH MAPPING

Mapping reference (showing which codes are in each source):
 Code                   Description In_Existing_Data In_OpenMeteo
    0                     Clear sky              Yes          Yes
    1                  Mainly clear               No          Yes
    2                 Partly cloudy               No          Yes
    3                      Overcast              Yes          Yes
    5                       Unknown              Yes           No
   10                       Unknown              Yes           No
   17                       Unknown              Yes           No
   20                       Unknown              Yes           No
   21                       Unknown              Yes           No
   22                       Unknown              Yes           No
   28                       Unknown              Yes           No
   45                         Foggy              Yes           No
   48           Depositing rime fog             

## 7. Final Validation Report

In [23]:
print("\n" + "="*80)
print("FINAL VALIDATION REPORT")
print("="*80)

# Check for critical issues
issues = []
warnings_list = []

# Check 1: All existing codes are WMO standard
if existing_invalid:
    issues.append(f"❌ Non-standard codes in existing data: {sorted(existing_invalid)}")
else:
    print(f"✓ PASS: All existing codes are WMO standard")

# Check 2: All Open-Meteo codes are WMO standard
if openmeteo_invalid:
    issues.append(f"❌ Non-standard codes in Open-Meteo: {sorted(openmeteo_invalid)}")
else:
    print(f"✓ PASS: All Open-Meteo codes are WMO standard")

# Check 3: Good overlap between datasets
overlap_pct = len(common_codes) / len(existing_set) * 100 if existing_set else 0
if overlap_pct >= 70:
    print(f"✓ PASS: Good code overlap ({overlap_pct:.1f}% of existing codes in Open-Meteo)")
elif overlap_pct >= 50:
    warnings_list.append(f"⚠️  Moderate code overlap ({overlap_pct:.1f}% of existing codes in Open-Meteo)")
else:
    warnings_list.append(f"⚠️  Low code overlap ({overlap_pct:.1f}% of existing codes in Open-Meteo)")

# Check 4: No codes-only-in-existing (except if expected)
if only_existing:
    warnings_list.append(f"⚠️  Codes only in existing data (won't be filled): {sorted(only_existing)}")
else:
    print(f"✓ PASS: No codes exist only in existing data")

# Check 5: Data type consistency
if df_original[df_original['Wettercode'].notna()]['Wettercode'].dtype in [np.int64, np.int32, 'int64', 'int32']:
    print(f"✓ PASS: Existing data is integer type")
else:
    print(f"✓ CHECK: Existing data type is {df_original[df_original['Wettercode'].notna()]['Wettercode'].dtype}")

if len(issues) > 0:
    print(f"\n{'='*80}")
    print("CRITICAL ISSUES:")
    for issue in issues:
        print(issue)
    print(f"{'='*80}")
    print("\n⛔ RECOMMENDATION: Review issues before merging!")
else:
    print(f"\n{'='*80}")
    if warnings_list:
        print("WARNINGS:")
        for warning in warnings_list:
            print(warning)
        print(f"{'='*80}")
        print("\n✓ READY TO MERGE (with warnings noted)")
    else:
        print(f"{'='*80}")
        print("\n✓✓✓ ALL CHECKS PASSED - SAFE TO MERGE ✓✓✓")

print(f"\n{'='*80}")


FINAL VALIDATION REPORT
✓ PASS: All Open-Meteo codes are WMO standard
✓ CHECK: Existing data type is float64

CRITICAL ISSUES:
❌ Non-standard codes in existing data: [np.int64(5), np.int64(10), np.int64(17), np.int64(20), np.int64(21), np.int64(22), np.int64(28), np.int64(49), np.int64(68), np.int64(69), np.int64(77), np.int64(79)]

⛔ RECOMMENDATION: Review issues before merging!



open-meteo only supports a select few (about 28) of the wmo-codes we are using in our dataset.