# Downloading Mobi data

## Accessing URLs

In [None]:
from bs4 import BeautifulSoup # for html parsing
import requests # for accessing html

In [None]:
# connect to Mobi site
# The timeout keeps the notebook from hanging indefinitely if the site does
# not respond, and raise_for_status() stops here on an HTTP error instead of
# letting the following cells parse an error page.
response = requests.get('https://www.mobibikes.ca/en/system-data', timeout=30)
response.raise_for_status()
print(response.text)

The data urls all include 'google.com'.  We can use this to filter:

In [None]:
# parse the page and keep only the elements whose href mentions google.com
def _mentions_google(href):
    # the `href and ...` guard skips missing/empty values before the substring test
    return href and 'google.com' in href

soup = BeautifulSoup(response.text, 'html.parser')
data_elements = soup.find_all(href=_mentions_google)
data_elements

In [None]:
# map each link's text label (a date) to the address it points to;
# when two links share a label, the later one wins
urls = {
    link.get_text(): link['href']
    for link in data_elements
}
urls

There is a little bit of cleanup to do:
* July 2019 is split into two entries (`'Ju'` and `'ly 2019'`) with the same URL.
* The penultimate entry of `urls` (`'\xa0'`) is a repeat of 2017 data (I think?).
* The final entry of `urls` (`'https://...'`) is a repeat of April 2022.  (Tbh I don't know why this was here.)

In [None]:
# rename 'Ju' to 'July 2019'
# (guarded so that re-running this cell after the rename is a no-op rather
# than a KeyError)
if 'Ju' in urls:
    urls['July 2019'] = urls.pop('Ju')

# delete duplicates; pop() with a default tolerates keys that are already gone
for key in ['ly 2019', '\xa0', 'https://drive.google.com/file/d/1c_rWhMcwXRnNt06psxmt_UN8IjbJF91E/view?usp=sharing']:
    urls.pop(key, None)

urls

## Downloading the data

In [None]:
# construct URL for downloading (rather than viewing) data

def get_download_url(date):
    """Return the direct-download URL for one entry of ``urls``.

    The link stored in ``urls`` is rebuilt into the
    ``<host>/uc?export=download&id=<file ID>`` form on the same Google host.

    Parameters
    ----------
    date : str
        A key of the module-level ``urls`` dictionary.

    Returns
    -------
    str or None
        The download URL, or ``None`` (after printing a message) when the
        link is on neither drive.google.com nor docs.google.com.

    Raises
    ------
    KeyError
        If ``date`` is not a key of ``urls``.
    """
    url = urls[date]

    # Decide on the host first, so an unsupported link is reported instead
    # of failing while its path is being split.
    if 'drive.google.com' in url:
        host = 'https://drive.google.com'
    elif 'docs.google.com' in url:
        host = 'https://docs.google.com'
    else:
        print('Unable to construct URL for ' + date + ' data')
        return None

    # The file ID is taken to be the second-to-last '/'-separated piece,
    # e.g. '.../file/d/<file ID>/view?usp=sharing'.
    file_ID = url.split(sep='/')[-2]
    return host + '/uc?export=download&id=' + file_ID

In [None]:
# download data for 'dates', a list of keys for the urls dictionary

import os

def download(dates):
    """Download the data for one or more keys of ``urls`` into CSV files.

    Each date is saved in the current working directory as ``<date>.csv``.
    A date whose file already exists is skipped, as is a date for which no
    download URL can be built or whose request does not return status 200;
    a message is printed in each case.

    Parameters
    ----------
    dates : str or iterable of str
        A single key of ``urls`` (e.g. ``'April 2024'``) or a list/tuple/
        other iterable of keys.

    Raises
    ------
    KeyError
        If a date is not a key of ``urls`` (raised by ``get_download_url``).
    requests.exceptions.RequestException
        If the connection fails or times out.
    """
    # Only a bare string is wrapped: checking for `list` would treat a tuple
    # or other iterable of keys as one (invalid) date.
    if isinstance(dates, str):
        dates = [dates]

    for date in dates:
        filename = date + '.csv'
        if os.path.exists(filename):
            print('File ' + filename + ' already exists.  No download initiated')
            continue

        url = get_download_url(date)
        if url is None:
            # get_download_url has already printed why; there is nothing
            # to request.
            continue

        # stream=True lets iter_content write the body chunk by chunk
        # instead of holding the whole file in memory, and the `with` block
        # releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=60) as response:
            code = response.status_code
            if code != 200:
                print('Unable to use URL for ' + date + ' data (error code ' + str(code) + '): ' + url)
                continue

            with open(filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        print('Data for ' + date + ' written to file ' + filename)
        

To download data, call the `download` function.  The argument `dates` should be a key of the dictionary `urls` (e.g. `'April 2024'`) or a list of keys (e.g. `['April 2024', 'June 2021']`).  To download all of the data, you can use `dates = list(urls.keys())`.

In [None]:
# `download` requires its `dates` argument; pass every key to fetch all of
# the data (replace with a single key or a shorter list to fetch less)
download(list(urls.keys()))