In [24]:
import json
import re
from io import BytesIO
from pathlib import Path
from urllib.parse import quote

import pandas as pd
import pycurl
import requests

In [15]:
# Functions to fetch dataset metadata from the opendata API and collect the resource URLs per year

def get_metadata(url, filter):
    """
    Get metadata from the opendata API.

    Sends a GET request to ``{url}package_search?q=title:{filter}`` and
    parses the response body as JSON.

    Parameters:
    url (str): The base URL of the API. ``package_search?...`` is appended
        to it verbatim, so it is expected to end with "/".
    filter (str): The title search term. It is percent-encoded before it is
        placed in the query string.

    Returns:
    dict: The decoded JSON object if it is a dict with a truthy 'success' field.
    None: If the response is empty, is not valid JSON, or does not report
        success. A diagnostic message is printed in each of these cases.

    Raises:
    pycurl.error: If the transfer itself fails (the handle is closed first).
    """
    # Percent-encode the search term so that spaces or reserved characters
    # (&, #, +, ...) cannot corrupt the query string.
    query = quote(str(filter), safe='')

    buffer = BytesIO()
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, f'{url}package_search?q=title:{query}')
        c.setopt(c.WRITEDATA, buffer)
        c.perform()
    finally:
        # Release the curl handle even when setopt()/perform() raises.
        c.close()

    response = buffer.getvalue().decode('utf-8')

    if not response.strip():
        print("The response is empty.")
        return None

    try:
        data = json.loads(response)
    except json.JSONDecodeError as e:
        print("Failed to decode JSON:", e)
        print("Response content:", response)
        return None

    # Valid JSON is not necessarily an object; only a dict can carry 'success'.
    if isinstance(data, dict) and data.get('success'):
        return data

    print("API did not return a successful response:", data)
    return None

def save_json(data, filename):
    """
    Write ``data`` to ``filename`` as JSON.

    Parameters:
    data: Any JSON-serializable object.
    filename: Path of the file to create or overwrite (opened in text mode).

    The output is pretty-printed with an indentation of four spaces.
    """
    with open(filename, 'w') as handle:
        json.dump(obj=data, fp=handle, indent=4)

def filter_result_by_year(data, year):
    """
    Find the first dataset whose name mentions ``year``.

    Parameters:
    data (dict): API response; the datasets are read from data['result']['results'].
    year (int or str): The year to look for; it is compared as ``str(year)``.

    Returns:
    dict: The first entry whose 'name' contains the year as a substring.
    None: If no entry matches.
    """
    needle = str(year)
    candidates = (
        entry for entry in data['result']['results'] if needle in entry['name']
    )
    return next(candidates, None)
        
def get_resource_url_for_year(data, year):
    """
    Collect the download URLs of the dataset that matches ``year``.

    The dataset is looked up with ``filter_result_by_year``. Each of its
    resources contributes its 'download_url', sorted into one of two lists.

    Parameters:
    data (dict): API response as returned by ``get_metadata``.
    year (int or str): The year to look for in the dataset names.

    Returns:
    dict: ``{'data': [...], 'metadata': [...]}``. URLs containing
        "Haltepunkt" or "Haltestelle" go to 'metadata', all others to 'data'.
        Both lists are empty when no dataset matches the year.
    """
    combined_data = {
        'data': [],
        'metadata': []
    }

    result = filter_result_by_year(data, year)
    if result is None:
        # No dataset name contains the year: report "no resources" instead of
        # failing with a TypeError on result['resources'].
        return combined_data

    for resource in result['resources']:
        url = resource['download_url']
        is_metadata = "Haltepunkt" in url or "Haltestelle" in url
        combined_data['metadata' if is_metadata else 'data'].append(url)

    return combined_data

def pack_all_urls_into_one_dict(url, filter):
    """
    Query the API and group all resource URLs by year.

    Parameters:
    url (str): The base URL of the API, passed on to ``get_metadata``.
    filter (str): The title search term, passed on to ``get_metadata``.

    Returns:
    dict: Maps each year (a 4-digit string taken from the dataset name) to the
        ``{'data': [...], 'metadata': [...]}`` dict built by
        ``get_resource_url_for_year``, sorted by year. Empty when the
        metadata request did not yield a usable response.
    """
    data = get_metadata(url, filter)
    if data is None:
        # get_metadata already printed the reason; nothing to pack.
        return {}

    all_urls = {}

    for dataset in data['result']['results']:
        # The year is taken to be the first run of four digits in the name.
        match = re.search(r'\d{4}', dataset['name'])
        if match is None:
            # Dataset name carries no year: it cannot be keyed, so skip it.
            continue
        year = match.group(0)

        # NOTE(review): the lookup below searches by year again, so two
        # datasets sharing a year would both resolve to the first match.
        all_urls[year] = get_resource_url_for_year(data, year)

    return dict(sorted(all_urls.items()))


In [17]:
# Functions to download a file from a URL and to save the downloaded bytes to disk

def download_csv(url, timeout=60):
    """
    Download the content behind ``url``.

    Parameters:
    url (str): The URL to download.
    timeout (float or tuple): Seconds to wait for the server, passed to
        ``requests.get``. Without it a stalled connection would block forever.

    Returns:
    bytes: The raw response body if the request succeeds.
    None: If the request fails or returns an HTTP error status (the error
        is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content
    except requests.exceptions.RequestException as e:
        print("Failed to download the data:", e)
        return None

def save_csv(content, filename):
    """
    Write downloaded bytes to ``filename``.

    Parameters:
    content (bytes or None): The data to write. ``None`` (what ``download_csv``
        returns on failure) is reported and skipped, so no empty file is left
        behind.
    filename (str or Path): Path of the file to create or overwrite.

    Returns:
    None. Success or failure is reported via ``print``.
    """
    if content is None:
        # Opening the file first would create an empty file and then fail
        # with a TypeError on write(None).
        print("Failed to save the data: no content to write.")
        return

    try:
        with open(filename, 'wb') as csv_file:
            csv_file.write(content)
        print(f"Data saved to {filename}")
    except IOError as e:
        print("Failed to save the data:", e)


In [None]:
# Define the function that downloads every file listed in a URL dictionary
def downloading_data(data_dict, data_path):
    """
    Download every URL in ``data_dict`` into one sub-directory per key.

    Parameters:
    data_dict (dict): Maps a key (e.g. a year) to a dict with the lists
        'data' and 'metadata', each holding download URLs.
    data_path (str or Path): Directory under which ``<data_path>/<key>`` is
        created; missing parent directories are created as well.

    Raises:
    FileExistsError: If the directory for a key already exists.

    Each file is named after the last path component of its URL. URLs whose
    download fails are skipped.
    """
    # Iterate over the argument, not over a notebook global.
    for key, value in data_dict.items():

        save_path = Path(data_path) / f"{key}"

        if save_path.exists():
            raise FileExistsError("The directory already exists.")
        save_path.mkdir(parents=True)

        # Data and metadata files are handled identically and share a folder.
        for group in ('data', 'metadata'):
            for url in value[group]:
                data = download_csv(url)
                if data is None:
                    # download_csv already printed the error.
                    continue
                file_path = save_path / url.split('/')[-1]
                save_csv(data, file_path)


In [18]:
# Query the API for datasets matching the title search term and collect
# their resource URLs, keyed by the year found in each dataset name.
# NOTE(review): `filter` shadows the Python builtin of the same name.
url = "https://ckan.opendata.swiss/api/3/action/"
filter = "Fahrzeiten"
# Passed to pd.read_csv(storage_options=...) in a later cell.
storage_options = {'User-Agent': 'Mozilla/5.0'}

download_url = pack_all_urls_into_one_dict(url, filter)

In [26]:
# Path setup
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
project_path = Path('/cfs/earth/scratch/kraftjul/DaAn_Projektarbeit')
data_path = project_path / 'data'

In [27]:
# Smoke test: download the first 2019 data file and store it as data/test.csv.
test_url = download_url['2019']['data'][0]
test_path = data_path / 'test.csv'

data = download_csv(test_url)
save_csv(data, test_path)

Data saved to /cfs/earth/scratch/kraftjul/DaAn_Projektarbeit/data/test.csv


In [29]:
# File name of the test URL, i.e. the text after the last "/".
test_url.split('/')[-1]

'Fahrzeiten_SOLL_IST_20190324_20190330.csv'

In [37]:
# First metadata URL listed for 2019.
download_url['2019']['metadata'][0]

'https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2019/download/Haltepunkt.csv'

In [38]:
# Small subset of download_url: the first two 2019 data URLs and the first 2019 metadata URL.
test_dict = {'2019' : {'data': [download_url['2019']['data'][0], download_url['2019']['data'][1]], 'metadata': [download_url['2019']['metadata'][0]]}}

In [39]:
# Display the subset to check its structure.
test_dict

{'2019': {'data': ['https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2019/download/Fahrzeiten_SOLL_IST_20190324_20190330.csv',
   'https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2019/download/Fahrzeiten_SOLL_IST_20191117_20191123.csv'],
  'metadata': ['https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2019/download/Haltepunkt.csv']}}

In [44]:
def main(data_dict, data_path):
    """
    Download every URL in ``data_dict`` into one sub-directory per key.

    Parameters:
    data_dict (dict): Maps a key (e.g. a year) to a dict with the lists
        'data' and 'metadata', each holding download URLs.
    data_path (str or Path): Directory under which ``<data_path>/<key>`` is
        created; missing parent directories are created as well.

    Raises:
    FileExistsError: If the directory for a key already exists.

    Each file is named after the last path component of its URL. URLs whose
    download fails are skipped.
    """
    # Iterate over the argument, not over a notebook global.
    for key, value in data_dict.items():

        save_path = Path(data_path) / f"{key}"

        if save_path.exists():
            raise FileExistsError("The directory already exists.")
        save_path.mkdir(parents=True)

        # Data and metadata files are handled identically and share a folder.
        for group in ('data', 'metadata'):
            for url in value[group]:
                data = download_csv(url)
                if data is None:
                    # download_csv already printed the error.
                    continue
                file_path = save_path / url.split('/')[-1]
                save_csv(data, file_path)


In [6]:
# List the 2019 data URLs whose text contains "2020".
# NOTE(review): the loop variable reuses the global name `url`, overwriting
# the API base URL assigned in an earlier cell.
for url in download_url['2019']['data']:
    if "2020" in url:
        print(url)


https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2019/download/Fahrzeiten_SOLL_IST_20191229_20200104.csv


In [7]:
# First metadata URL listed for 2019.
download_url['2019']['metadata'][0]

'https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2019/download/Haltepunkt.csv'

In [10]:
# Read the first 2019 data file straight from its URL; for http(s) URLs
# pandas forwards storage_options as request headers (here the User-Agent).
df = pd.read_csv(download_url['2019']['data'][0], storage_options=storage_options)

In [11]:
# Show the loaded frame (pandas truncates the rendering of rows and columns).
df

Unnamed: 0,linie,richtung,betriebsdatum,fahrzeug,kurs,seq_von,halt_diva_von,halt_punkt_diva_von,halt_kurz_von1,datum_von,...,fahrweg_id,fw_no,fw_typ,fw_kurz,fw_lang,umlauf_von,halt_id_von,halt_id_nach,halt_punkt_id_von,halt_punkt_id_nach
0,2,1,24.03.19,3069,6,1,6030,0,DEP4,24.03.19,...,99439,15,2,15,DEP4 - KALK,189382,2251,1906,47032,10563
1,2,1,24.03.19,3016,3,1,478,0,BEZI,24.03.19,...,107009,11,1,11,BEZI - BTIE für Ausfahrt,187205,1306,1502,44135,44530
2,2,1,24.03.19,3016,3,5,601,0,KANB,24.03.19,...,107009,11,1,11,BEZI - BTIE für Ausfahrt,187205,2228,2590,49163,46896
3,2,1,24.03.19,3016,3,4,1861,0,PARA,24.03.19,...,107009,11,1,11,BEZI - BTIE für Ausfahrt,187205,1528,2228,44152,49163
4,2,1,24.03.19,3016,3,3,2396,0,SIHS,24.03.19,...,107009,11,1,11,BEZI - BTIE für Ausfahrt,187205,2657,1528,10574,44152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1426573,919,2,30.03.19,11650,1,2,2826,51,WALT,30.03.19,...,107421,2,1,2,DOZE - BKUE,186711,2096,2859,42663,44546
1426574,919,2,30.03.19,11650,1,1,705,51,DOZE,30.03.19,...,107421,2,1,2,DOZE - BKUE,186711,2320,2096,47498,42663
1426575,919,1,30.03.19,11650,1,12,2826,50,WALT,30.03.19,...,106054,1,1,1,BKUE - DOZE,186711,2096,2320,46040,47498
1426576,919,1,30.03.19,11650,1,11,1075,50,GOES,30.03.19,...,106054,1,1,1,BKUE - DOZE,186711,2859,2096,42575,46040


In [27]:
# First metadata URL listed for 2018.
# NOTE(review): this reassigns the global `url` that earlier held the API base URL.
url = download_url['2018']['metadata'][0]

In [29]:
# NOTE(review): the output shows stop columns (halt_id, halt_diva, ...), but no
# visible cell reassigns df; presumably a removed cell read `url` into df.
# Confirm and restore that cell so the notebook survives Restart & Run All.
df

Unnamed: 0,halt_id,halt_diva,halt_kurz,halt_lang,halt_ist_aktiv
0,143,2570,BirmSte,"Birmensdorf ZH, Sternen/WSL",True
1,309,3356,WaldBir,"Waldegg, Birmensdorferstrasse",True
2,373,6232,FRAF07,"Zürich Flughafen, Fracht",True
3,539,2655,TBAH01,"Thalwil, Bahnhof",True
4,588,3027,FLUG07,"Zürich Flughafen, Bahnhof",True
...,...,...,...,...,...
725,3198,6999,TWÄR,"Binz bei Maur, Twäracher",True
726,3206,7000,SMEU,"Schlieren, Meuchwis",True
727,3217,7022,BAWO,"Zürich, Bahnhof Wollishofen",True
728,3222,7021,SRAI,"Spreitenbach, Raiacker",True


In [12]:
# Print each year followed by the metadata URLs collected for it
# (a year with an empty 'metadata' list prints only the year).
for year in download_url:
    print(year)
    for url in download_url[year]['metadata']:
        print(url)
    

2015
2016
https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2016/download/Haltepunkt.csv
https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2016/download/Haltestelle.csv
2017
https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2017/download/Haltepunkt.csv
https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2017/download/Haltestelle.csv
2018
https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2018/download/Haltepunkt.csv
https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2018/download/Haltestelle.csv
2019
https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2019/download/Haltepunkt.csv
https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2019/download/Haltestelle.csv
2020
https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2020/download/Haltestelle.csv
https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2020/download/Haltepunkt.csv
2021
https://data.stadt-zuerich.ch/dataset/vbz_fahrzeiten_ogd_2021/download/Haltepunkt.csv
https://data.s