# Download and process BAST traffic count data
Source: https://www.bast.de/BASt_2017/DE/Verkehrstechnik/Fachthemen/v2-verkehrszaehlung/zaehl_node.html

In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from os import listdir
import os
from os.path import isfile, join

import requests
from zipfile import ZipFile


### Functions for download 
Functions for downloading and unzipping the BAST hourly data

In [67]:
def download_url(url, save_path, chunk_size=8192, timeout=60):
    """Stream the resource at ``url`` into the local file ``save_path``.

    Adapted from
    https://stackoverflow.com/questions/9419162/download-returned-zip-file-from-url/14260592

    Parameters
    ----------
    url : str
        Address of the resource to download.
    save_path : str
        File the response body is written to (opened in binary mode,
        an existing file is overwritten).
    chunk_size : int, optional
        Number of bytes requested from the response per iteration.
    timeout : float, optional
        Seconds passed to ``requests.get`` as connect/read timeout, so a
        stalled server cannot block the run forever.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.RequestException
        For connection problems and timeouts.
    """
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Check the status before opening the target file, so an HTTP error
        # page is never stored under save_path as if it were the real data.
        r.raise_for_status()
        with open(save_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)

def DownloadHourData_BAST(CounterNumber, year, FolderSave):
    """Download the hourly archive of one counting station and year and unpack it.

    The archive is fetched from
    ``https://www.bast.de/videos/<year>/zst<CounterNumber>.zip``, stored as
    ``zst<CounterNumber>_<year>.zip`` inside ``FolderSave``, extracted into
    ``FolderSave`` and deleted again.

    Parameters
    ----------
    CounterNumber : int or str
        Number of the counting station.
    year : int or str
        Year of the data to download.
    FolderSave : str
        Existing directory that receives the extracted file(s).

    Notes
    -----
    A file that cannot be unpacked (for example because the server did not
    deliver a zip archive) is reported on stdout and otherwise ignored.
    Errors raised by ``download_url`` are passed on to the caller.
    """
    # Only ZipFile is imported at file level, so fetch the exception locally.
    from zipfile import BadZipFile

    BAST_Link = 'https://www.bast.de/videos/' + str(year) + '/zst' + str(CounterNumber) + '.zip'
    FileName = os.path.join(FolderSave, 'zst' + str(CounterNumber) + '_' + str(year) + '.zip')

    try:
        # download zip
        download_url(BAST_Link, FileName)

        # unzip the file; the with-block closes the archive even on failure
        try:
            with ZipFile(FileName, 'r') as zf:
                zf.extractall(FolderSave)
        except (BadZipFile, OSError) as err:
            print('Could not unpack ' + FileName + ': ' + str(err))
    finally:
        # remove the zip file (also a partial one left by a failed download)
        # without going through a shell
        if os.path.isfile(FileName):
            os.remove(FileName)


### Function to process one counting station in one year
Reads and processes the downloaded BAST file

In [68]:
def Process_Year_And_CountingStation(path, DZ_Nr, year):
    """Read one hourly BAST file and return the daily vehicle totals.

    Parameters
    ----------
    path : str
        Path of the ';'-separated file. The columns 'Datum', 'KFZ_R1' and
        'KFZ_R2' are used.
    DZ_Nr : int or str
        Counting station number; ``str(DZ_Nr)`` becomes the name of the
        result column.
    year : int
        Year the file belongs to; only its century is used to complete the
        two-digit year stored in 'Datum'.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'date' (one row per day found in the file) with a single
        column holding the sum of both counting directions over the day.
    """
    #read file
    df = pd.read_csv(path, delimiter=';')

    # 'Datum' is parsed as a number, which drops its leading zeros
    # (e.g. 070101 -> 70101). Pad back to six digits instead of guessing the
    # number of lost zeros from the year, so every year of a century works.
    datum = df['Datum'].astype(str).str.zfill(6)
    century = str(int(year) // 100)
    dates = pd.to_datetime(century + datum, format='%Y%m%d')

    # Build a fresh frame (rather than writing into a column slice of df)
    # holding the date and the sum of both counting directions.
    hourly = pd.DataFrame({
        'date': dates,
        str(DZ_Nr): df['KFZ_R1'] + df['KFZ_R2'],
    })

    #reduce to daily data
    return hourly.groupby('date').sum()


### Function to process download request 
DZ_Nr_unique: array with the numbers of the counting stations that should be processed <br>
years: years that should be downloaded <br>
path: path to the temp directory where the downloaded data is temporarily stored <br>
keepOrigData: bool that controls whether the downloaded raw data is kept (True) or deleted (False) <br>

In [97]:
def Process(DZ_Nr_unique, years, path, keepOrigData):
    """Download and aggregate the daily counts of several stations and years.

    Parameters
    ----------
    DZ_Nr_unique : sequence
        Counting station numbers to process.
    years : sequence of int
        Years to download. The argument itself is not modified.
    path : str
        Temporary directory for the downloaded files; created if missing.
        It is handed unchanged to ``DownloadHourData_BAST``.
    keepOrigData : bool
        True keeps the extracted hourly files, False deletes each file after
        it has been processed.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'date' with one row per day from January 1st of the first
        year to December 31st of the last year, and one column per station
        for which at least one year could be processed. Days without data
        are NaN.

    Notes
    -----
    A station/year combination that cannot be downloaded or processed is
    reported on stdout and skipped.
    """
    # np.sort returns a sorted copy, so the caller's array stays untouched
    years = np.sort(years)
    print(years)

    #create temporary directory (without going through a shell)
    if path and not os.path.isdir(path):
        os.makedirs(path)

    #total number of counting stations that should be processed
    total_count = len(DZ_Nr_unique)

    #some prints
    print('All counter stations: ', DZ_Nr_unique)
    print('In total # = ', total_count)

    #empty dataframe covering every day of the requested years
    all_days = pd.date_range(start=str(years[0]) + '-01-01',
                             end=str(years[-1]) + '-12-31',
                             name='date')
    df_all = pd.DataFrame(index=all_days)

    #loop over all counting stations
    for count, DZ_Nr in enumerate(DZ_Nr_unique, start=1):
        print('Processing ' , count, ' of ', total_count)

        #loop over years
        for year in years:

            #extracted file in temp dir
            csv_path = os.path.join(path, 'zst' + str(DZ_Nr) + '_' + str(year) + '.csv')
            try:
                #download data
                DownloadHourData_BAST(DZ_Nr, year, path)
                #process downloaded data
                df = Process_Year_And_CountingStation(csv_path, DZ_Nr, year)

                #store data in dataframe for all data
                station = df.columns[0]
                if station in df_all.columns:
                    # a single .loc call writes into df_all itself; chained
                    # indexing may write into a temporary copy instead
                    df_all.loc[df.index, station] = df[station]
                else:
                    df_all = pd.concat([df_all, df], axis=1)

                #remove downloaded file if keepOrigData not True
                if not keepOrigData:
                    os.remove(csv_path)
            except Exception as err:
                # Many station/year combinations simply do not exist, so keep
                # going, but say what was skipped instead of hiding it.
                print('Skipping station ', DZ_Nr, ' year ', year, ': ', err)

    return df_all

## Function to download all available BAST data from the website
keepOrigData: bool that controls whether the downloaded raw data is kept (True) or deleted (False) <br>
Uses the "Zeitreihen" file from the Zeitreihen_URL variable to get all counting station numbers and years that are available

In [70]:
def DownloadAllData(keepOrigData):
    """Download and process every station/year listed in the BAST overview file.

    The overview ("Zeitreihen") file is downloaded to ``./temp/Zeitreihe.csv``;
    its 'DZ_Nr' and 'Jahr' columns supply the counting station numbers and
    years that are handed to ``Process``.

    Parameters
    ----------
    keepOrigData : bool
        True keeps the downloaded hourly raw data, False deletes it.

    Returns
    -------
    pandas.DataFrame
        Whatever ``Process`` returns for the stations and years found.
    """
    #download information of all available data
    fileJahresdaten = 'Zeitreihe.csv'
    Zeitreihen_URL = 'https://www.bast.de/BASt_2017/DE/Verkehrstechnik/Fachthemen/v2-verkehrszaehlung/Daten/2018_1/Jawe2018.csv?view=renderTcDataExportCSVAlleJahre&cms_strTyp=A'

    #create temp dir if it does not exist (no shell call, failures raise)
    path = './temp/'
    os.makedirs(path, exist_ok=True)

    #download time series file
    overview_file = path + fileJahresdaten
    download_url(Zeitreihen_URL, overview_file)
    df_Jahresdaten = pd.read_csv(overview_file, encoding='latin-1', delimiter=';')

    #remove unused columns
    df_Jahresdaten = df_Jahresdaten[['DZ_Nr', 'Jahr']]

    #create unique arrays for counting station numbers and years
    DZ_Nr_unique = np.array(df_Jahresdaten['DZ_Nr'].drop_duplicates())
    years = np.array(df_Jahresdaten['Jahr'].drop_duplicates(), 'int16')

    #process these arrays
    return Process(DZ_Nr_unique, years, path, keepOrigData)

## Function to download data that is given by the arrays
DZ_Nr_arr: numpy array with the counting station numbers that should be processed <br>
years: numpy array with the years that should be processed <br>
keepOrigData: bool that controls whether the downloaded raw data is kept (True) or deleted (False) <br>

In [71]:
def DownloadPartOfData(DZ_Nr_arr, years, keepOrigData):
    """Run ``Process`` for the given stations and years using the 'temp/' directory.

    Parameters
    ----------
    DZ_Nr_arr : numpy array
        Counting station numbers to process.
    years : numpy array
        Years to process.
    keepOrigData : bool
        Passed on to ``Process``: True keeps the downloaded raw data,
        False deletes it.

    Returns
    -------
    The result of ``Process``.
    """
    return Process(DZ_Nr_arr, years, 'temp/', keepOrigData)

## Function to process daily data to monthly data

In [100]:
def ProcessDailyToMonthly(filename, min_count=28):
    """Aggregate a CSV file of daily values to monthly sums.

    Parameters
    ----------
    filename : str
        CSV file with a 'date' column; all other columns are summed.
    min_count : int, optional
        Minimum number of non-missing daily values a column needs within a
        month; with fewer values the monthly sum of that column is NaN.

    Returns
    -------
    pandas.DataFrame
        Indexed by the monthly period ('date'), one row per month.
    """
    df_daily = pd.read_csv(filename)
    # 'M' is the documented alias for monthly periods
    df_daily['date'] = pd.to_datetime(df_daily['date']).dt.to_period('M')
    return df_daily.groupby('date').sum(min_count=min_count)

## Create one feature value from the data 
Because we do not want each counting station to be a separate feature, and because not all counting stations are available for the whole time span, the values of all counting stations are averaged into one feature.

In [73]:
def ProcessToOneFeature(df_feat):
    """Average all columns of ``df_feat`` row by row into a single feature.

    Parameters
    ----------
    df_feat : pandas.DataFrame
        One column per counting station. The frame is not modified, so
        calling the function again gives the same result.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df_feat`` with the single column
        'M_BAST_AverTotalVehicPerCountingStation' holding the row mean,
        missing values being ignored.
    """
    row_mean = df_feat.mean(axis=1, skipna=True)
    return row_mean.to_frame(name='M_BAST_AverTotalVehicPerCountingStation')

## Main function that downloads all data and processes it to daily and monthly values

In [1]:
if __name__ == "__main__":
    # Example for downloading only part of the data:
    #
    #     DZ_Nr_arr = np.array([9140])  # counting station number
    #     years = np.array([2007])      # year we are interested in
    #     keepOrigData = False          # hourly data should be deleted
    #     df_daily = DownloadPartOfData(DZ_Nr_arr, years, keepOrigData)
    #     df_daily.to_csv('BAST_CountingStation_daily.csv')

    # Must be defined before the call below; previously it only appeared in
    # the example above, so the script stopped with a NameError.
    keepOrigData = False  # hourly raw data is deleted after processing

    raw_dir = '../data/mobility/raw_data/ZaehlstellenBAST/'
    daily_csv = raw_dir + 'BAST_CountingStations_daily.csv'
    monthly_csv = raw_dir + 'BAST_CountingStations_monthly.csv'

    #downloading all data
    df_daily = DownloadAllData(keepOrigData)
    df_daily.to_csv(daily_csv)

    #daily -> monthly values (re-read from the file that was just written)
    df_monthly = ProcessDailyToMonthly(daily_csv)
    df_monthly.to_csv(monthly_csv)

    #monthly values of all stations -> one averaged feature
    df_feat = ProcessToOneFeature(df_monthly)
    df_feat.to_csv('../data/mobility/BAST_CountingStations_Feature_monthly.csv')
    