# Crawler

## Imports and Settings

In [17]:
# Imports for requests
import re
import requests
from datetime import datetime
from urllib.request import urlretrieve
from urllib.parse import quote

# Used for debugging / exporting
import pickle
import json

In [18]:
# Variables and Settings
# Topic records: every topic found, plus the subset matching the topic filter
topicsAll, topicsFiltered = [], []
# Location records: every location found, plus the subset matching the name filter
locationsAll, locationsFiltered = [], []
# Base URL that every request path is appended to
dom = 'https://service.berlin.de/'

## Functions

In [19]:
# Creates HTTP requests and saves the files for later usage
def getData():
    """Download the three overview pages and store them as local HTML files.

    For each page the full URL is built from the base URL ``dom`` and the
    percent-quoted path, printed, and then fetched with ``urlretrieve``.
    The results are written to 'location.html', 'topics.html' and
    'location_filtered.html' in the current working directory.
    """
    # (path below the base URL, local file name), fetched in this order
    downloads = (
        ('standorte/', 'location.html'),
        ('dienstleistungen/', 'topics.html'),
        ('standorte/buergeraemter/', 'location_filtered.html'),
    )
    for path, target in downloads:
        url = dom + quote(path)
        print(url)
        urlretrieve(url, target)

In [20]:
# Extracts locations and ids from the location file
def parseLocationData():
    """Extract location ids, names and groups from 'location.html'.

    Scanning starts at the line equal to
    ``<div class="tab-pane" id="orte_normal">`` and stops at the line equal to
    ``<div class="tab-pane active" id="orte_grouped">``. Inside that range,
    every line containing a link to ``/standort/<id>`` produces one record
    ``{'id': ..., 'name': ..., 'group': ...}``: the name is the (stripped)
    line after the link, and the group is the content of the ``span5`` div on
    the line after that, or '' when that line holds no such div.

    Each record is appended to the module-level ``locationsAll``; records
    whose name contains 'Bürgeramt' are also appended to
    ``locationsFiltered``. Nothing is returned.
    """
    # regex to identify the parts
    idRex = re.compile(r'<a href="/standort/([0-9]*)')
    groupRex = re.compile(r'<div class="span5">(.*)</div>')
    nameRex = re.compile('Bürgeramt')
    # markers that limit the range of lines that are checked
    start = '<div class="tab-pane" id="orte_normal">'
    end = '<div class="tab-pane active" id="orte_grouped">'

    inside = False
    # the context manager closes the file even when parsing raises
    with open('location.html', encoding='utf8') as f:
        line = f.readline()
        while line:
            line = line.strip()
            if line == end:
                break

            if line == start or inside:
                inside = True
                idMatch = idRex.search(line)
                if idMatch is not None:
                    # name and group are on the two lines following the link
                    name = f.readline().strip()
                    groupMatch = groupRex.search(f.readline().strip())
                    record = {
                        'id': idMatch.group(1),
                        'name': name,
                        'group': groupMatch.group(1) if groupMatch is not None else '',
                    }
                    # add only bürgeramts to the filtered list
                    if nameRex.search(name) is not None:
                        locationsFiltered.append(record)
                    locationsAll.append(record)

            line = f.readline()

In [21]:
# Extracts topics and ids from the topics file
def parseTopics():
    """Extract topic ids and names from 'topics.html'.

    Scanning starts at the first line matching
    ``<div class="azlist"><div class="ort-group">`` and stops at the first
    line matching the ``span3 column-right html5-aside`` div. Inside that
    range, every line containing a link to ``/dienstleistung/<id>`` produces
    one record ``{'id': ..., 'name': ...}`` where the name is the (stripped)
    line after the link.

    Each record is appended to the module-level ``topicsAll``; records whose
    name matches 'Reisepass', 'Ausweis' or 'Wohnung' (case-insensitive) are
    also appended to ``topicsFiltered``. Nothing is returned.
    """
    # regex for id
    idRex = re.compile(r'<a href="/dienstleistung/([0-9]*)')
    # regex to filter topics
    topicRex = re.compile('Reisepass|Ausweis|Wohnung', re.IGNORECASE)
    # start / end regex
    startRex = re.compile(r'<div class="azlist"><div class="ort-group">')
    endRex = re.compile(r'<div class="span3 column-right html5-aside" role="complementary">')

    inside = False
    # the original never closed this file; the context manager always does
    with open('topics.html', encoding='utf8') as f:
        line = f.readline()
        while line:
            line = line.strip()
            if endRex.search(line) is not None:
                break

            if inside or startRex.search(line) is not None:
                inside = True
                idMatch = idRex.search(line)
                if idMatch is not None:
                    # the topic name is on the line following the link
                    name = f.readline().strip()
                    record = {
                        'id': idMatch.group(1),
                        'name': name,
                    }
                    if topicRex.search(name) is not None:
                        topicsFiltered.append(record)
                    topicsAll.append(record)

            line = f.readline()

In [22]:
# Helper function that checks the html file for available dates
def checkFile(file):
    """Scan a calendar HTML file for the next-page link and bookable days.

    ``file`` is read line by line with ``readline()`` until it returns an
    empty value.

    Returns a tuple ``(nextLink, avail)``:
      * ``nextLink`` is 'terminvereinbarung/termin/day/<ts>/' taken from the
        line that follows a ``<th class="next">`` line, or '' if no such
        link was found (the last one found wins).
      * ``avail`` is a list of dicts with the keys 'timestamp' (string),
        'link' ('terminvereinbarung/termin/time/<ts>/') and 'date'
        (``datetime.fromtimestamp`` of the timestamp), one per line that
        matches the "buchbar" pattern.
    """
    # patterns for the calendar's "next" header cell, its link, and bookable days
    markerRex = re.compile('<th class="next">')
    hrefRex = re.compile('href="/terminvereinbarung/termin/day/([0-9]*)/">')
    bookableRex = re.compile('class="buchbar".*href="/terminvereinbarung/termin/time/([0-9]*)/"')

    nextLink = ''
    avail = []
    while True:
        raw = file.readline()
        if not raw:
            break
        text = raw.strip()

        if markerRex.search(text) is not None:
            # the link itself sits on the line after the marker; that line
            # then also takes part in the availability check below
            text = file.readline().strip()
            href = hrefRex.search(text)
            if href is not None:
                nextLink = 'terminvereinbarung/termin/day/{ts}/'.format(ts=href.group(1))

        hit = bookableRex.search(text)
        if hit is None:
            continue
        stamp = hit.group(1)
        avail.append({
            'timestamp' : stamp,
            'link' : 'terminvereinbarung/termin/time/{ts}/'.format(ts=stamp),
            'date' : datetime.fromtimestamp(int(stamp))
        })

    return (nextLink, avail)

In [23]:
# Function that uses the given topic and location to find available dates
# multiple locations can be chained together separated by ','
def getDates(topic, location):
    """Collect the bookable days for a topic at one or more locations.

    Starting from the appointment entry URL, each calendar page is fetched
    with a shared ``requests.Session``, written to 'temp_dates.html' (kept
    for debugging) and scanned with ``checkFile``. The "next" link returned
    by ``checkFile`` is followed until a page has none, a page repeats, or a
    request does not return status 200.

    Args:
        topic: id of the topic, sent as the ``anliegen[]`` query parameter.
        location: id of the location, sent as the ``dienstleister`` query
            parameter; several ids can be chained together separated by ','.

    Returns:
        A list of the dicts produced by ``checkFile`` ('timestamp', 'link',
        'date'), with at most one entry per timestamp.

    Side effects:
        Prints every requested URL, overwrites 'temp_dates.html' per page and
        pickles the session to 'session.pkl' for debugging.
    """
    # use sessions because the link will be redirected and the data is stored in a session
    sess = requests.Session()
    available = []
    seenTimestamps = set()
    visited = set()
    # the location is the "dienstleister", the topic is the "anliegen"
    nextPath = ('terminvereinbarung/termin/tag.php?termin=1'
                '&dienstleister={loc}&anliegen[]={top}&herkunft=1').format(loc=location, top=topic)
    # stop on a repeated link so a self-referencing calendar cannot loop forever
    while nextPath != '' and nextPath not in visited:
        visited.add(nextPath)
        url = dom + nextPath
        print(url)
        r = sess.get(url)
        if r.status_code != 200:
            # without a page there is no next link to follow; retrying the
            # same URL would never terminate, so return what was collected
            break

        # keep files for debugging; explicit encoding so non-ASCII HTML can
        # always be written, and 'with' closes the file on errors as well
        with open('temp_dates.html', 'w+', encoding='utf8') as f:
            f.write(r.text)
            f.seek(0)
            # extract next and available dates from the file
            nextPath, found = checkFile(f)

        # add results and filter duplicates by timestamp
        for entry in found:
            stamp = entry.get('timestamp')
            if stamp not in seenTimestamps:
                seenTimestamps.add(stamp)
                available.append(entry)

    #DEBUG: Session is saved for debugging
    with open('session.pkl', 'wb') as f:
        pickle.dump(sess, f)

    return available

### Usage

In [24]:
# Download the overview pages and save them as local html files
getData()
# Fill the location and topic lists from the saved html files
# (must run after getData(), which creates the files being parsed)
parseLocationData()
parseTopics()

https://service.berlin.de/standorte/
https://service.berlin.de/dienstleistungen/
https://service.berlin.de/standorte/buergeraemter/


In [25]:
# Function that gets the available days for a combination of location and topic
# dates = getDates(topicId, locationId)

In [26]:
# Create json files for filtered values
# 'with' closes each file, so the JSON is flushed to disk right away
with open('topicsFiltered.json', 'w') as f:
    json.dump(topicsFiltered, f)
with open('locationsFiltered.json', 'w') as f:
    json.dump(locationsFiltered, f)

## Example from Slides

In [13]:
# Test topic: getting an Ausweis (ID card) or passport
# topic id = 120703
# Test location: Bürgeramt Kreuzberg
# Bürgeramt id = 327346
# Retrieve the days on which appointment times are available
dates = getDates(120703, 327346)
print (dates)
# Example of a resulting list of dates:
# [{'timestamp': '1646262000', 'link': 'terminvereinbarung/termin/time/1646262000/', 'date': datetime.datetime(2022, 3, 3, 0, 0)}]

https://service.berlin.de/terminvereinbarung/termin/tag.php?termin=1&dienstleister=327346&anliegen[]=120703&herkunft=1
https://service.berlin.de/terminvereinbarung/termin/day/1643670000/
https://service.berlin.de/terminvereinbarung/termin/day/1646089200/
[]


## Debugging


In [27]:
# Use pickle to save topics, locations and dates
# Topics and locations are each stored as an (all, filtered) tuple
with open('topics.pkl', 'wb') as f:  
    pickle.dump((topicsAll, topicsFiltered), f)
with open('locations.pkl', 'wb') as f:  
    pickle.dump((locationsAll, locationsFiltered), f)
with open('dates.pkl', 'wb') as f:  
    pickle.dump(dates, f)

In [28]:
# Load topics and locations from their pickle files
# Each file holds an (all, filtered) tuple
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself
with open('topics.pkl', 'rb') as f:  
    (topicsAll, topicsFiltered) = pickle.load(f)
with open('locations.pkl', 'rb') as f:  
    (locationsAll, locationsFiltered) = pickle.load(f)

In [29]:
# Code that tried to get the html file with the date but got blocked with captcha
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself
with open('dates.pkl', 'rb') as f:
    avail = pickle.load(f)
with open('session.pkl', 'rb') as f:
    sess = pickle.load(f)

# Request the time-selection page of every available day with the saved session
for i in avail :
    url = dom + i.get('link')
    r = sess.get(url)
    print(r.status_code)
    print(r.text)
    if r.status_code == 200 :
        # explicit encoding so non-ASCII HTML can always be written;
        # 'with' closes the file even if the write fails
        with open('temp_times.html', 'w+', encoding='utf8') as f:
            f.write(r.text)