# Get weather data

In [1]:
import pandas as pd
import requests
import os

In [2]:
path = '../data/external/'

## Necessary methods to download and save data as data frames.

In [3]:
def saveDwdData(url, path, f_name):
    
    print('Download ' + f_name + ' from ' + url + ' and save to ' + path)
    
    file = requests.get(url)
    open(path + f_name, 'wb').write(file.content)
    file.close()

In [4]:
def getDwdData(url, path, f_name):
    
    saveDwdData(url, path, f_name)
    
    print()
    
    df = pd.read_csv(path + f_name, sep=';')
    print('Created data frame of ' + f_name)
    
    os.remove(path + f_name)
    print('Zip file removed: ' + path + f_name)
    
    return df

### Dictionary with weather data urls of https://opendata.dwd.de/

In [5]:
urls = {
        'air_temp':'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/air_temperature/historical/10minutenwerte_TU_00691_20100101_20191231_hist.zip',
        'air_temp_extr':'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/extreme_temperature/historical/10minutenwerte_extrema_temp_00691_20100101_20191231_hist.zip',
        'wind':'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/wind/historical/10minutenwerte_wind_00691_20100101_20191231_hist.zip',
        'wind_extr':'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/extreme_wind/historical/10minutenwerte_extrema_wind_00691_20100101_20191231_hist.zip',
        'precipitation':'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/precipitation/historical/10minutenwerte_nieder_00691_20100101_20191231_hist.zip',
        'solar':'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/solar/historical/10minutenwerte_SOLAR_00691_20100101_20191231_hist.zip'       
       }

### Create data frames.

In [6]:
air_temp = getDwdData(urls['air_temp'], path,'air_temp.zip')

air_temp_extr = getDwdData(urls['air_temp_extr'], path,'air_temp_extr.zip')

wind = getDwdData(urls['wind'], path,'wind.zip')

wind_extr = getDwdData(urls['wind_extr'], path,'wind_extr.zip')

precipitation = getDwdData(urls['precipitation'], path,'precipitation.zip')

solar = getDwdData(urls['solar'], path,'solar.zip')

Download air_temp.zip from https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/air_temperature/historical/10minutenwerte_TU_00691_20100101_20191231_hist.zip and save to ../data/external/

Created data frame of air_temp.zip
Zip file removed: ../data/external/air_temp.zip
Download air_temp_extr.zip from https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/extreme_temperature/historical/10minutenwerte_extrema_temp_00691_20100101_20191231_hist.zip and save to ../data/external/

Created data frame of air_temp_extr.zip
Zip file removed: ../data/external/air_temp_extr.zip
Download wind.zip from https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/wind/historical/10minutenwerte_wind_00691_20100101_20191231_hist.zip and save to ../data/external/

Created data frame of wind.zip
Zip file removed: ../data/external/wind.zip
Download wind_extr.zip from https://opendata.dwd.de/climate_environment

### Change to code if wanna save as csv

air_temp.to_csv(path + 'air_temp.csv', index=False)

air_temp_extr.to_csv(path + 'air_temp_extr.csv', index=False)

wind.to_csv(path + 'to_csv.csv', index=False)

wind_extr.to_csv(path + 'wind_extr.csv', index=False)

precipitation.to_csv(path + 'precipitation.csv', index=False)

solar.to_csv(path + 'solar.csv', index=False)

# Clean data

## Method to filter data set for a specific year.

In [7]:
def filterForYear(df, year):
    
    df.rename(columns={"MESS_DATUM": "timestamp"}, inplace=True)
    
    df['timestamp'] = pd.to_datetime(df['timestamp'].astype(str))
    
    df.set_index('timestamp', inplace=True)
    
    df = df[df.index.year == year]
    
    return df

## Clean air_temp

In [8]:
print('Cleaning air_temp')

air_temp = filterForYear(air_temp, 2019)

# Drop unnecessary columns
air_temp.drop(columns={'STATIONS_ID','  QN', 'PP_10','eor'}, inplace=True)

# Assign interpretable column names
air_temp.rename(columns={"TT_10": "temp_2m", "TM5_10": "temp_5cm", "RF_10":"humidity_2m","TD_10":"dew_point_2m"}, inplace=True)

# Check for correlation to keep only on of the features with strong correlation
air_temp.corr().style.background_gradient(cmap='coolwarm')

Cleaning air_temp


Unnamed: 0,temp_2m,temp_5cm,humidity_2m,dew_point_2m
temp_2m,1.0,0.897511,0.888592,0.962784
temp_5cm,0.897511,1.0,0.824193,0.899505
humidity_2m,0.888592,0.824193,1.0,0.948762
dew_point_2m,0.962784,0.899505,0.948762,1.0


In [9]:
air_temp.drop(columns={'temp_5cm'}, inplace=True)

## Null values

In [10]:
air_temp.where(air_temp==-999).count()

temp_2m         137
humidity_2m     147
dew_point_2m    147
dtype: int64

## Clean air_temp_extr

In [11]:
print('Cleaning air_temp_extr')

air_temp_extr = filterForYear(air_temp_extr, 2019)

# Drop unnecessary columns
air_temp_extr.drop(columns=['STATIONS_ID', '  QN','eor'], inplace=True)

# Assign interpretable column names
air_temp_extr.rename(columns={'TX_10':'max_at_2m', 'TX5_10':'max_at_5cm','TN_10':'min_at_2m','TN5_10':'min_at_5cm'}, inplace=True)

# Check for correlation to keep only on of the features with strong correlation
air_temp_extr.corr().style.background_gradient(cmap='coolwarm')

Cleaning air_temp_extr


  xa[xa < 0] = -1


Unnamed: 0,max_at_2m,max_at_5cm,min_at_2m,min_at_5cm
max_at_2m,1.0,,0.999986,0.98142
max_at_5cm,,,,
min_at_2m,0.999986,,1.0,0.981395
min_at_5cm,0.98142,,0.981395,1.0


In [12]:
air_temp_extr.drop(columns={'min_at_2m','min_at_5cm'}, inplace=True)

## Null values

In [13]:
air_temp_extr.where(air_temp_extr==-999).count()

max_at_2m       157
max_at_5cm    52560
dtype: int64

In [14]:
air_temp_extr.drop(columns={'max_at_5cm'}, inplace=True)

## Clean wind

In [15]:
print('Cleaning wind')

wind = filterForYear(wind, 2019)

# Drop unnecessary columns
wind.drop(columns={'STATIONS_ID','  QN','eor'}, inplace=True)

# Assign interpretable column names
wind.rename(columns={'FF_10':'mean_speed_h/s','DD_10':'direction_degree'}, inplace=True)

# Check for correlation to keep only on of the features with strong correlation
wind.corr().style.background_gradient(cmap='coolwarm')

Cleaning wind


Unnamed: 0,mean_speed_h/s,direction_degree
mean_speed_h/s,1.0,0.659731
direction_degree,0.659731,1.0


## Null values

In [16]:
wind.where(wind==-999).count()

mean_speed_h/s      193
direction_degree    193
dtype: int64

## Clean wind_extr

In [17]:
print('Cleaning wind_extr')

wind_extr = filterForYear(wind_extr, 2019)

# Drop unnecessary columns
wind_extr.drop(columns={'STATIONS_ID','  QN','eor'}, inplace=True)

# Assign interpretable column names
wind_extr.rename(columns={'FX_10':'max_m/s','FNX_10':'min_mean_m/s','FMX_10':'max_mean_m/s','DX_10':'direction_degree'}, inplace=True)

# Check for correlation to keep only on of the features with strong correlation
wind_extr.corr().style.background_gradient(cmap='coolwarm')

Cleaning wind_extr


Unnamed: 0,max_m/s,min_mean_m/s,max_mean_m/s,direction_degree
max_m/s,1.0,0.984742,0.999845,0.696295
min_mean_m/s,0.984742,1.0,0.985088,0.681196
max_mean_m/s,0.999845,0.985088,1.0,0.694522
direction_degree,0.696295,0.681196,0.694522,1.0


## Null values

In [18]:
wind_extr.where(wind_extr==-999).count()

max_m/s             236
min_mean_m/s        239
max_mean_m/s        236
direction_degree    236
dtype: int64

## Clean precipitation

In [19]:
print('Cleaning precipitation')

precipitation = filterForYear(precipitation, 2019)

# Drop unnecessary columns
precipitation.drop(columns={'STATIONS_ID','  QN','eor','RWS_IND_10'}, inplace=True)

# Assign interpretable column names
precipitation.rename(columns={'RWS_DAU_10':'min','RWS_10':'mm'}, inplace=True)

# Check for correlation to keep only on of the features with strong correlation
precipitation.corr().style.background_gradient(cmap='coolwarm')

Cleaning precipitation


Unnamed: 0,min,mm
min,1.0,-0.0414883
mm,-0.0414883,1.0


## Null values

In [20]:
precipitation.where(precipitation==-999).count()

min     0
mm     54
dtype: int64

## Clean solar

In [21]:
print('Cleaning solar')

solar = filterForYear(solar, 2019)

# Drop unnecessary columns
solar.drop(columns={'STATIONS_ID','  QN','eor'}, inplace=True)

# Assign interpretable column names
solar.rename(columns={'DS_10':'diffuse_radiation','GS_10':'incoming_radiation','SD_10':'duration_h','LS_10':'longwave_downward_radiation'}, inplace=True)

# Check for correlation to keep only on of the features with strong correlation
solar.corr().style.background_gradient(cmap='coolwarm')

Cleaning solar


Unnamed: 0,diffuse_radiation,incoming_radiation,duration_h,longwave_downward_radiation
diffuse_radiation,1.0,0.989635,0.0682492,
incoming_radiation,0.989635,1.0,0.182328,
duration_h,0.0682492,0.182328,1.0,
longwave_downward_radiation,,,,


In [22]:
solar.drop(columns={'longwave_downward_radiation'}, inplace=True)

## Null values

In [23]:
solar.where(solar==-999).count()

diffuse_radiation     192
incoming_radiation    192
duration_h              0
dtype: int64

# Create one data frame for all weather data

In [24]:
print('Merging all weather data...')

all_weather = pd.merge(air_temp, air_temp_extr, on='timestamp')

all_weather = pd.merge(all_weather, wind, on='timestamp')

all_weather = pd.merge(all_weather, wind_extr, on='timestamp')

all_weather = pd.merge(all_weather, precipitation, on='timestamp')

all_weather.where(all_weather==-999).count()

Merging all weather data...


temp_2m               137
humidity_2m           147
dew_point_2m          147
max_at_2m             157
mean_speed_h/s        193
direction_degree_x    193
max_m/s               236
min_mean_m/s          239
max_mean_m/s          236
direction_degree_y    236
min                     0
mm                     54
dtype: int64

# Export weather data

In [25]:
all_weather.to_csv(path + 'weather.gz', compression='gzip')

In [26]:
print('Getting and cleaning of weather data successful!')
print('Data saved as ' + path + 'weather.gz')

Getting and cleaning of weather data successful!
Data saved as ../data/external/weather.gz


In [27]:
print('To import the data use the following command:')
print("pd.read_csv(path + 'weather.gz', index_col='timestamp')")

To import the data use the following command:
pd.read_csv(path + 'weather.gz', index_col='timestamp')
