In [1]:
import pandas as pd
import requests
import os

In [2]:
# Directory used for the temporary DWD downloads and for the generated weather.gz.
# Keep the trailing '/': file names are appended with plain string concatenation
# (path + f_name), not os.path.join.
path = '../data/external/'

In [3]:
def saveDwdData(url, path, f_name):
    """Download `url` and write the response body to `path + f_name`.

    Parameters
    ----------
    url : str
        Address of the file to download.
    path : str
        Target directory; must end with a path separator because the file
        name is appended by string concatenation.
    f_name : str
        Name of the file to create inside `path`.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Without this check an
        error page would be stored on disk as if it were the archive.
    """
    print('Download ' + f_name + ' from ' + url + ' and save to ' + path)

    response = requests.get(url)
    try:
        response.raise_for_status()
        # The context manager guarantees the file is flushed and closed
        # before the caller tries to read it back.
        with open(path + f_name, 'wb') as out_file:
            out_file.write(response.content)
    finally:
        # Release the HTTP connection even when the write or status check fails.
        response.close()

In [4]:
def getDwdData(url, path, f_name):
    """Download a DWD archive, load it into a DataFrame and delete the archive.

    The file is fetched with `saveDwdData`, parsed as a ';'-separated CSV
    (pandas infers the compression from the file name) and then removed.

    Parameters
    ----------
    url : str
        Address of the archive.
    path : str
        Directory (with trailing separator) used for the temporary download.
    f_name : str
        Temporary file name inside `path`.

    Returns
    -------
    pandas.DataFrame
        The parsed content of the downloaded file.
    """
    saveDwdData(url, path, f_name)

    print()

    archive = path + f_name
    try:
        df = pd.read_csv(archive, sep=';')
        print('Created data frame of ' + f_name)
    finally:
        # Clean up the temporary download even if parsing fails, so a broken
        # run does not leave stale archives in the data directory.
        os.remove(archive)
        print('Zip file removed: ' + archive)

    return df

In [5]:

def filterForYear(df, year):
    """Return the rows of `df` whose measurement time falls in `year`.

    The "MESS_DATUM" column is renamed to "timestamp", parsed to datetimes
    (via its string representation) and used as the index of the result.

    The input frame is left untouched, so the function can be called
    repeatedly on the same frame (e.g. for different years).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a "MESS_DATUM" column.
    year : int
        Calendar year to keep.

    Returns
    -------
    pandas.DataFrame
        Independent copy indexed by "timestamp", restricted to `year`.
    """
    renamed = df.rename(columns={"MESS_DATUM": "timestamp"})

    stamped = renamed.assign(
        timestamp=pd.to_datetime(renamed['timestamp'].astype(str))
    )

    by_time = stamped.set_index('timestamp')

    # .copy() detaches the result from the boolean-mask slice so callers can
    # modify it without SettingWithCopyWarning.
    return by_time[by_time.index.year == year].copy()

In [6]:
def _cleanDwdFrame(df, year, raw_drop, renames, renamed_drop=()):
    """Filter one raw DWD frame to `year` and tidy its columns.

    Order of operations: filter to `year`, drop the `raw_drop` columns,
    apply `renames`, drop the `renamed_drop` columns, replace -999 with NaN
    (presumably the DWD missing-value marker -- TODO confirm) and drop every
    row that still contains a NaN.
    """
    return (
        filterForYear(df, year)
        .drop(columns=list(raw_drop))
        .rename(columns=renames)
        .drop(columns=list(renamed_drop))
        .replace(-999, float('NaN'))
        .dropna()
    )


def prepWeather(year=2019):
    """Download, clean and merge the 10-minute DWD data of station 00691.

    Six historical archives (2010-2019) are downloaded into the module-level
    `path`, each one is reduced to `year` and cleaned, and five of them are
    inner-joined on "timestamp". The result is written to
    `path + 'weather.gz'` (gzip-compressed CSV).

    Parameters
    ----------
    year : int, default 2019
        Calendar year to keep. The archives cover 2010-2019.

    Returns
    -------
    None
        The merged frame is only written to disk.
    """
    urls = {
        'air_temp':'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/air_temperature/historical/10minutenwerte_TU_00691_20100101_20191231_hist.zip',
        'air_temp_extr':'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/extreme_temperature/historical/10minutenwerte_extrema_temp_00691_20100101_20191231_hist.zip',
        'wind':'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/wind/historical/10minutenwerte_wind_00691_20100101_20191231_hist.zip',
        'wind_extr':'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/extreme_wind/historical/10minutenwerte_extrema_wind_00691_20100101_20191231_hist.zip',
        'precipitation':'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/precipitation/historical/10minutenwerte_nieder_00691_20100101_20191231_hist.zip',
        'solar':'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/solar/historical/10minutenwerte_SOLAR_00691_20100101_20191231_hist.zip'
       }

    # Columns every raw file carries that are not needed afterwards.
    common_drop = ['STATIONS_ID', '  QN', 'eor']

    # Per data set: (raw columns to drop, rename map, renamed columns to drop).
    # Column names are part of the saved file's schema and are kept as they were.
    # NOTE(review): 'mean_speed_h/s' looks like a typo for 'm/s' -- confirm before renaming.
    specs = {
        'air_temp': (
            common_drop + ['PP_10'],
            {"TT_10": "temp_2m", "TM5_10": "temp_5cm", "RF_10": "humidity_2m", "TD_10": "dew_point_2m"},
            ['temp_5cm'],
        ),
        'air_temp_extr': (
            common_drop,
            {'TX_10': 'max_at_2m', 'TX5_10': 'max_at_5cm', 'TN_10': 'min_at_2m', 'TN5_10': 'min_at_5cm'},
            ['min_at_2m', 'min_at_5cm', 'max_at_5cm'],
        ),
        'wind': (
            common_drop,
            {'FF_10': 'mean_speed_h/s', 'DD_10': 'direction_degree'},
            [],
        ),
        'wind_extr': (
            common_drop,
            {'FX_10': 'max_m/s', 'FNX_10': 'min_mean_m/s', 'FMX_10': 'max_mean_m/s', 'DX_10': 'direction_degree'},
            [],
        ),
        'precipitation': (
            common_drop + ['RWS_IND_10'],
            {'RWS_DAU_10': 'min', 'RWS_10': 'mm'},
            [],
        ),
        'solar': (
            common_drop,
            {'DS_10': 'diffuse_radiation', 'GS_10': 'incoming_radiation', 'SD_10': 'duration_h', 'LS_10': 'longwave_downward_radiation'},
            ['longwave_downward_radiation'],
        ),
    }

    # Download everything first, then clean, so the progress messages appear
    # in the same order as before.
    raw_frames = {}
    for name, url in urls.items():
        raw_frames[name] = getDwdData(url, path, name + '.zip')

    cleaned = {}
    for name, (raw_drop, renames, renamed_drop) in specs.items():
        print('Cleaning ' + name)
        cleaned[name] = _cleanDwdFrame(raw_frames[name], year, raw_drop, renames, renamed_drop)

    print('Merging all weather data...')

    # NOTE(review): 'solar' is downloaded and cleaned but was never part of
    # the merge; it is still left out here so the saved file keeps its
    # columns and row count. Confirm whether that omission is intentional.
    # 'direction_degree' exists in both wind frames, so the merge emits
    # 'direction_degree_x' (wind) and 'direction_degree_y' (wind_extr).
    all_weather = cleaned['air_temp']
    for name in ('air_temp_extr', 'wind', 'wind_extr', 'precipitation'):
        all_weather = pd.merge(all_weather, cleaned[name], on='timestamp')

    # Sanity check: per-column count of -999 values left after cleaning.
    print(all_weather.where(all_weather==-999).count())

    all_weather.to_csv(path + 'weather.gz', compression='gzip')

    print('Getting and cleaning of weather data successful!')
    print('Data saved as ' + path + 'weather.gz')
    print('To import the data use the following command:')
    print("pd.read_csv(path + 'weather.gz', index_col='timestamp')")

In [7]:
# Run the full download/clean/merge pipeline. Needs network access to
# opendata.dwd.de and writes weather.gz into `path`.
prepWeather()

Download air_temp.zip from https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/air_temperature/historical/10minutenwerte_TU_00691_20100101_20191231_hist.zip and save to ../data/external/

Created data frame of air_temp.zip
Zip file removed: ../data/external/air_temp.zip
Download air_temp_extr.zip from https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/extreme_temperature/historical/10minutenwerte_extrema_temp_00691_20100101_20191231_hist.zip and save to ../data/external/

Created data frame of air_temp_extr.zip
Zip file removed: ../data/external/air_temp_extr.zip
Download wind.zip from https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/wind/historical/10minutenwerte_wind_00691_20100101_20191231_hist.zip and save to ../data/external/

Created data frame of wind.zip
Zip file removed: ../data/external/wind.zip
Download wind_extr.zip from https://opendata.dwd.de/climate_environment

In [8]:
# Reload the file written by prepWeather(), using the saved 'timestamp' column as index.
w = pd.read_csv(path + 'weather.gz', index_col='timestamp')

In [13]:
# Count missing values per column of the reloaded weather frame.
w.isna().sum()

temp_2m               0
humidity_2m           0
dew_point_2m          0
max_at_2m             0
mean_speed_h/s        0
direction_degree_x    0
max_m/s               0
min_mean_m/s          0
max_mean_m/s          0
direction_degree_y    0
min                   0
mm                    0
dtype: int64

In [14]:
def getTripsFrame():
    """Load the cleaned trips CSV and parse its time columns.

    Reads '../data/processed/trips_cleaned.csv' with the first column as
    index and converts 'start_time' and 'end_time' to datetimes.

    Returns
    -------
    pandas.DataFrame
        The trips frame with datetime 'start_time' and 'end_time' columns.
    """
    trips_frame = pd.read_csv('../data/processed/trips_cleaned.csv', index_col=0)

    for time_col in ('start_time', 'end_time'):
        trips_frame[time_col] = pd.to_datetime(trips_frame[time_col])

    return trips_frame

In [15]:


def mergeWeatherTrips(trips, weather):
    """Attach the weather observation of the matching 10-minute slot to each trip.

    Each trip's 'start_time' is rounded down to a multiple of 10 minutes and
    left-joined against the weather 'timestamp' column, so trips without a
    matching observation are kept with NaN weather values. The merged frame
    is written to '../data/processed/trips_weather.csv'.

    Neither input frame is modified.

    Parameters
    ----------
    trips : pandas.DataFrame
        Trips with a 'start_time' column (datetime or parseable strings).
    weather : pandas.DataFrame
        Weather data with 'timestamp' as a regular column (not the index).

    Returns
    -------
    pandas.DataFrame
        The merged frame that was written to disk.
    """
    # Parse on a derived frame so the caller's weather frame keeps its dtype.
    weather_parsed = weather.assign(timestamp=pd.to_datetime(weather['timestamp']))

    # Floor (round down) the start time to the 10-minute grid of the weather
    # data. '10min' is used because the 'T' alias is deprecated in recent pandas.
    floored_start = pd.to_datetime(trips['start_time']).dt.floor('10min')

    # The helper join key lives only on the temporary frame and is dropped
    # again after the merge, so it never leaks into `trips`.
    data = (
        trips.assign(sTime_floored=floored_start)
        .merge(right=weather_parsed, left_on='sTime_floored', right_on='timestamp', how='left')
        .drop(columns=['sTime_floored'])
    )
    print("Trip and weather data merged")
    data.to_csv('../data/processed/trips_weather.csv')

    return data

In [17]:
# Load both inputs and write the merged trips/weather CSV.
trips = getTripsFrame()
# No index_col here: mergeWeatherTrips reads 'timestamp' as a regular column.
weather = pd.read_csv('../data/external/weather.gz', compression='gzip')
mergeWeatherTrips(trips, weather)

Trip and weather data merged
