[CDC - Observations Germany](https://opendata.dwd.de/climate_environment/CDC/observations_germany/)

# Wind data
[Wind historical](https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/wind/historical/) <br>
[Wind recent](https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/wind/recent/) <br><br>
[Extreme Wind historical](https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/extreme_wind/historical/) <br>
[Extreme Wind recent](https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/extreme_wind/recent/)

# Precipitation data
[Precipitation historical](https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/precipitation/historical/) <br>
[Precipitation recent](https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/precipitation/recent/)

## 1. Download data
Run the following script completely to get the desired data from Deutscher Wetterdienst.

Information:
Since 'recent' only covers the last 520 days, the 'historical' data has to be downloaded as well. The script therefore loops over two periods:

1. PERIOD = ['2020 - 2022', 'recent']
2. PERIOD = ['2020 - 2020', 'historical']

Desired config parameters:

DATA = ['air_temperature', 'pressure', 'precipitation', 'wind']

STATIONS_ID = ['691', '1420'] --> Bremen and Frankfurt a. M.

TEMPORAL_RES = ['hourly']


In [None]:
# import libraries
import os
import requests
from bs4 import BeautifulSoup as bs
import zipfile

import pandas as pd
import glob

In [None]:
# Configuration for the DWD download: which periods, metrics, temporal
# resolutions and stations to fetch, and where to store the files.

# list of periods to loop over; each entry is [year range, period folder]
periods = [
    ['2020 - 2022', 'recent'],
    ['2020 - 2020', 'historical'],
]


# not all data are available in every temporal resolution!
DATA = [
    'air_temperature',
    # 'cloud_type',
    # 'cloudiness',
    # 'dew_point',
    #'extreme_wind',
    # 'moisture',
    'precipitation',
    'pressure',
    # 'soil',
    # 'soil_temperature',
    # 'solar',
    # 'sun',
    # 'standard_format',
    # 'visibility',
    # 'weather_phenomena',
    'wind',
    # 'wind_test',
    # 'wind_synop',
]

TEMPORAL_RES = [
    # '1_minute',
    #'10_minutes',
    'hourly',
    # 'subdaily',
    # 'daily',
    # 'monthly',
    # 'annual',
    # 'multi_annual',
]

# now dynamically looped over to cover different periods
# PERIOD = [
#     # 'start - 2020', # in hourly data

#     # '1991', # in 10_minutes data
#     # '2000 - 2009', # in 10_minutes data
#     # '2010 - 2019', # in 10_minutes data
#     '2020 - 2020', # in 10_minutes data
#     #'recent', 
#     'historical'
# ]

STATIONS_ID = [
    '691', # Bremen
    '1420', # Frankfurt a. M.
]

ROOT_URL = "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/"

DOWNLOAD_DIR = os.path.join(os.curdir, "../data", "DeutscherWetterdienst", "")

# make target directory if it doesn't exist; makedirs also creates the parent
# '../data' folder, which a plain os.mkdir would fail on when it is missing
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# ensure that every station id has 5 digits (left-padded with zeros)
STATIONS_ID = [s_id.zfill(5) for s_id in STATIONS_ID]


# Download and unpack the DWD archives for every configured period.
for PERIOD in periods:
    # get urls of the directory listings to search for downloadable data
    urls_root = []
    for temp_res in TEMPORAL_RES:
        for dat in DATA:
            if 'recent' in PERIOD:
                urls_root.append(f'{ROOT_URL}{temp_res}/{dat}/recent/')
            if len(PERIOD) > 1 or PERIOD[0] != 'recent':
                urls_root.append(f'{ROOT_URL}{temp_res}/{dat}/historical/')

    # get relevant years, 'akt' for recent data
    # take the last part of a 'start - end' range; splitting on the bare '-'
    # and stripping accepts both '2020 - 2022' and '2020-2022'
    years = [y.split('-')[-1].strip() for y in PERIOD]
    if 'recent' in PERIOD:
        years.append('akt')

    # get names and urls of desired files
    downloads = []
    for url in urls_root:
        # get html of the directory listing
        r = requests.get(url, timeout=50)
        if not r.ok:
            # not every metric exists in every temporal resolution
            print(f'skipping {url} (HTTP {r.status_code})')
            continue
        soup = bs(r.text, 'html.parser')
        # find download links and filter for .zip files, station and relevant time periods
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue
            link_text = str(link)
            if '.zip' in link_text and \
                    any(station in link_text for station in STATIONS_ID) and \
                    any(year in link_text for year in years):
                downloads.append((href, url + href))

    # download files
    for name, url in downloads:
        file_path = os.path.join(DOWNLOAD_DIR, name)
        # NOTE(review): the extracted files appear to be named 'produkt*.txt'
        # (see the glob in read_city_data), so this check likely never matches
        # and archives are downloaded again on every run -- confirm.
        file_path_txt = os.path.join(DOWNLOAD_DIR, name.split('.')[0] + '.txt')
        if not os.path.isfile(file_path) and not os.path.isfile(file_path_txt):
            response = requests.get(url, timeout=50)
            print(url)
            if not response.ok:
                # do not store an error page under a .zip name
                print(f'download failed (HTTP {response.status_code}): {url}')
                continue
            with open(file_path, 'wb') as f:
                f.write(response.content)

        # unzip the archive (also one left over from an interrupted run),
        # then always delete the .zip
        if os.path.isfile(file_path):
            try:
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    zip_ref.extractall(DOWNLOAD_DIR)
            except zipfile.BadZipFile:
                print(f'corrupt archive, removed without extracting: {file_path}')
            finally:
                os.remove(file_path)


## 2. Combine data of cities and metrics

In [None]:
# Read all meteorological data for a city from 'data/DeutscherWetterdienst'
def read_city_data(city_codes, path=r'../data/DeutscherWetterdienst'):
    """Read the extracted DWD product files of the given stations.

    Every ``produkt*.txt`` file in ``path`` whose station field equals one of
    ``city_codes`` is loaded. Each dataframe gets a parsed ``date`` column,
    loses the columns ``eor``, ``MESS_DATUM`` and the first ``QN*`` column,
    and is restricted to 2020-01 (inclusive) up to 2022-04 (exclusive).

    File names are expected to look like
    ``produkt_<metric>_<resolution>_<start>_<end>_<station>.txt``;
    files with fewer fields are skipped.

    Args:
        city_codes (list): station ids as they appear in the file names
            (zero-padded strings, e.g. '00691')
        path (str): directory that contains the extracted files

    Returns:
        dictionary: one dataframe per file, keyed by metric code + station id
            + first digit of the start date (1 or 2)
    """
    wanted_stations = set(city_codes)
    weather_metrics = {}
    # sorted() makes the order of the dictionary independent of the file system
    for filename in sorted(glob.glob(os.path.join(path, 'produkt*.txt'))):
        # parse the name fields instead of slicing the path at fixed offsets
        stem = os.path.splitext(os.path.basename(filename))[0]
        fields = stem.split('_')
        if len(fields) < 6:
            continue
        metric, start, station = fields[1], fields[-3], fields[-1]
        # compare the station field only; a substring test on the whole name
        # would also hit date fields (e.g. '01231' in '20201231')
        if station not in wanted_stations:
            continue

        df = pd.read_csv(filename, sep=';')
        df['date'] = pd.to_datetime(df['MESS_DATUM'], format='%Y%m%d%H')
        quality_columns = [col for col in df.columns if col.startswith('QN')]
        df = df.drop(columns=['eor', 'MESS_DATUM'] + quality_columns[:1])
        df = df.query("date < '2022-04' and date >= '2020-01'")
        weather_metrics[f'{metric}{station}{start[0]}'] = df
    return weather_metrics

# Concatenate historical and current data for each metric
def concat_city_data(metrics_dict):
    """Concatenate all dataframes that belong to the same metric.

    The metric of a key is everything except its last six characters
    (station id and period digit). Metrics are processed in the order in
    which they first appear in ``metrics_dict``, so the result is the same on
    every run.

    Args:
        metrics_dict (dictionary): dictionary of dataframes per metric, city code and period (recent - 2 or historic - 1)

    Returns:
        list: list of dataframes (one per metric), containing historical and current data for all cities
    """
    # dict.fromkeys de-duplicates while keeping insertion order (a set would
    # make the order, and thereby the base of the later merge, vary per run)
    metrics = list(dict.fromkeys(key[:-6] for key in metrics_dict))

    concatenated_metrics = []
    for metric in metrics:
        # exact comparison of the metric part; a substring test could pick up
        # keys of other metrics
        metric_dfs = [df for key, df in metrics_dict.items() if key[:-6] == metric]
        combined = pd.concat(metric_dfs, axis=0, ignore_index=True)
        # historical and recent files overlap, keep each row only once
        concatenated_metrics.append(combined.drop_duplicates(keep='first'))
    return concatenated_metrics


# Merge all meteorological data into one dataframe for all cities and metrics
def merge_city_data(df_list):
    """Merge the per-metric dataframes into one dataframe.

    The first dataframe is the base; every further dataframe is left-joined
    on ``date`` and ``STATIONS_ID``, so only rows present in the first
    dataframe are kept. The result is sorted by ``date`` in ascending order.
    The dataframes in ``df_list`` are not modified.

    Args:
        df_list (list): list of dataframes for different metrics

    Returns:
        dataframe: one dataframe with all metrics

    Raises:
        IndexError: if ``df_list`` is empty
    """
    df_merged = df_list[0]
    for df in df_list[1:]:
        df_merged = df_merged.merge(df, on=['date', 'STATIONS_ID'], how='left')
    # sort into a new object: an in-place sort would reorder the caller's
    # dataframe when the list holds a single element
    return df_merged.sort_values('date', ascending=True)


def label_cities(stations_id):
    """Create label from stations_id

    Args:
        stations_id (int or string): station id of the city, either numeric
            (e.g. 691) or as a zero-padded string (e.g. '00691')

    Returns:
        string: corresponding label for given stations_id, or None if the id is unknown
    """
    labels = {
        1420: 'Frankfurt',
        691: 'Bremen',
        # add new cities here
    }
    if isinstance(stations_id, str):
        # zero-padded ids such as '01420' denote the same station as 1420
        try:
            stations_id = int(stations_id)
        except ValueError:
            return None
    try:
        return labels.get(stations_id)
    except TypeError:
        # unhashable input cannot be a station id
        return None



In [None]:
# build one dataframe for all stations in the global STATIONS_ID list:
# read the files, concatenate them per metric, then merge the metrics
dwd_all_cities = merge_city_data(
    concat_city_data(
        read_city_data(STATIONS_ID)
    )
)

In [None]:
# add city column (e.g. Frankfurt, Bremen); mapping the id column directly
# avoids a slow row-wise apply and also works on an empty dataframe
dwd_all_cities['city'] = dwd_all_cities['STATIONS_ID'].map(label_cities)
# drop column STATIONS_ID because it is no longer needed
# drop PrecipitationIndicator (RS_IND); errors='ignore' keeps this cell
# working when that column is absent -- presumably the case when
# 'precipitation' is not part of DATA
dwd_all_cities = dwd_all_cities.drop(columns=['STATIONS_ID', 'RS_IND'], errors='ignore')

In [None]:
# give the columns readable names
# (some of the original names carry leading spaces)
dwd_all_cities = dwd_all_cities.rename(
    columns={
        'RF_TU': 'humidity',
        'TT_TU': 'temperature',
        '  R1': 'precip',
        '   F': 'wind_speed',
        '   D': 'wind_direction',
        '   P': 'pressure_sealevel',
        '  P0': 'pressure',
    }
)

In [None]:
# show the combined dataframe as the output of this cell
dwd_all_cities

In [None]:
# write the result to a csv file whose name carries today's date
import datetime
day = datetime.date.today()
dwd_all_cities.to_csv(f'../data/processed_deutscher_wetterdienst_{day}.csv')