# Downloading Mobi data

## Accessing URLs

In [1]:
from bs4 import BeautifulSoup # for html parsing
import requests # for accessing html

In [2]:
# connect to Mobi site; the timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() stops here on an HTTP error instead of
# letting an error page be parsed as if it were the data page
response = requests.get('https://www.mobibikes.ca/en/system-data', timeout=30)
response.raise_for_status()
print(response.text)

<!DOCTYPE html>
<html lang="en" dir="ltr">
<head>
  <link rel="profile" href="http://www.w3.org/1999/xhtml/vocab" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="Generator" content="Drupal 7 (http://drupal.org)" />
<link rel="canonical" href="/en/system-data" />
<link rel="shortlink" href="/en/node/271" />
<link rel="shortcut icon" href="https://www.mobibikes.ca/sites/all/themes/smoove_bootstrap/favicon.ico" type="image/vnd.microsoft.icon" />
<meta name="description" content="Vancouver Bike Share - Mobi by ShawGo. Community public bike share program, servicing Vancouver." />
  <meta name="apple-itunes-app" content="app-id=47RKCYMP58.com.choosit.smoove">
  <!-- Start SmartBanner configuration -->
  <meta name="smartbanner:title" content="Mobi by Shaw Go">
  <meta name="smartbanner:author" content="Download the Mobi Bike Share app">
  <meta name="smartbanner:price" content="Fre

The data urls all include 'google.com'.  We can use this to filter:

In [3]:
# parse the page, then collect every element whose href attribute mentions
# 'google.com' (elements without an href are passed in as None and are skipped)
soup = BeautifulSoup(response.text, 'html.parser')
data_elements = soup.find_all(href=lambda link: link and 'google.com' in link)
data_elements

[<a href="https://drive.google.com/file/d/1F7GmkgwoWPMh_iaeu619cKzhxLGtM1G1/view?usp=sharing">April 2024</a>,
 <a href="https://drive.google.com/file/d/1FFWoEhYeWz_2-ykei9hJ32oe4DOZxn4x/view?usp=sharing">March 2024</a>,
 <a href="https://docs.google.com/spreadsheets/d/19hgt6uX63S8NGGJqJtLLhjToapnyz_gy/edit?usp=sharing&amp;ouid=100992631487215186312&amp;rtpof=true&amp;sd=true">February 2024</a>,
 <a href="https://drive.google.com/file/d/16wW2fgkyaXoQewe29GxWrrpJFtDUqikJ/view?usp=sharing">January 2024</a>,
 <a href="https://drive.google.com/file/d/13jW3rph1VyC13EzBxGOMZdFP3xgTb_m-/view?usp=drive_link">December 2023</a>,
 <a href="https://drive.google.com/file/d/10medxsRW5v0-hROBfgBIybT7T47Snbcy/view?usp=sharing">November 2023</a>,
 <a href="https://drive.google.com/file/d/1gKUKesn99zxt76qz8a-1o1AeW8WSIJuK/view?usp=drive_link">October 2023</a>,
 <a href="https://drive.google.com/file/d/17SowN86MrVJXpI7ou5Y2qrvCgXJSTlAI/view?usp=drive_link">September 2023</a>,
 <a href="https://drive.googl

In [4]:
# map each link's visible text (a date label) to the URL it points at;
# if two links share the same text, the later one wins
urls = dict((link.get_text(), link['href']) for link in data_elements)
urls

{'April 2024': 'https://drive.google.com/file/d/1F7GmkgwoWPMh_iaeu619cKzhxLGtM1G1/view?usp=sharing',
 'March 2024': 'https://drive.google.com/file/d/1FFWoEhYeWz_2-ykei9hJ32oe4DOZxn4x/view?usp=sharing',
 'February 2024': 'https://docs.google.com/spreadsheets/d/19hgt6uX63S8NGGJqJtLLhjToapnyz_gy/edit?usp=sharing&ouid=100992631487215186312&rtpof=true&sd=true',
 'January 2024': 'https://drive.google.com/file/d/16wW2fgkyaXoQewe29GxWrrpJFtDUqikJ/view?usp=sharing',
 'December 2023': 'https://drive.google.com/file/d/13jW3rph1VyC13EzBxGOMZdFP3xgTb_m-/view?usp=drive_link',
 'November 2023': 'https://drive.google.com/file/d/10medxsRW5v0-hROBfgBIybT7T47Snbcy/view?usp=sharing',
 'October 2023': 'https://drive.google.com/file/d/1gKUKesn99zxt76qz8a-1o1AeW8WSIJuK/view?usp=drive_link',
 'September 2023': 'https://drive.google.com/file/d/17SowN86MrVJXpI7ou5Y2qrvCgXJSTlAI/view?usp=drive_link',
 'August 2023': 'https://drive.google.com/file/d/17oWflFFNCWvwwHA_wgPs-umZr5NMBapO/view?usp=drive_link',
 'July 2

There is a little bit of cleanup to do:
* July 2019 is split into two entries (`'Ju'` and `'ly 2019'`) with the same URL.
* The penultimate entry of `urls` (`'\xa0'`) is a repeat of 2017 data (I think?).
* The final entry of `urls` (`'https://...'`) is a repeat of April 2022.  (Tbh I don't know why this was here.)

In [7]:
# rename 'Ju' to 'July 2019'
# (guarded so that re-running this cell after the rename is a no-op rather than a KeyError)
if 'Ju' in urls:
    urls['July 2019'] = urls.pop('Ju')

# delete duplicates
# (pop with a default so that keys already removed by an earlier run are ignored)
for key in ['ly 2019', '\xa0', 'https://drive.google.com/file/d/1c_rWhMcwXRnNt06psxmt_UN8IjbJF91E/view?usp=sharing']:
    urls.pop(key, None)

urls

{'April 2024': 'https://drive.google.com/file/d/1F7GmkgwoWPMh_iaeu619cKzhxLGtM1G1/view?usp=sharing',
 'March 2024': 'https://drive.google.com/file/d/1FFWoEhYeWz_2-ykei9hJ32oe4DOZxn4x/view?usp=sharing',
 'February 2024': 'https://docs.google.com/spreadsheets/d/19hgt6uX63S8NGGJqJtLLhjToapnyz_gy/edit?usp=sharing&ouid=100992631487215186312&rtpof=true&sd=true',
 'January 2024': 'https://drive.google.com/file/d/16wW2fgkyaXoQewe29GxWrrpJFtDUqikJ/view?usp=sharing',
 'December 2023': 'https://drive.google.com/file/d/13jW3rph1VyC13EzBxGOMZdFP3xgTb_m-/view?usp=drive_link',
 'November 2023': 'https://drive.google.com/file/d/10medxsRW5v0-hROBfgBIybT7T47Snbcy/view?usp=sharing',
 'October 2023': 'https://drive.google.com/file/d/1gKUKesn99zxt76qz8a-1o1AeW8WSIJuK/view?usp=drive_link',
 'September 2023': 'https://drive.google.com/file/d/17SowN86MrVJXpI7ou5Y2qrvCgXJSTlAI/view?usp=drive_link',
 'August 2023': 'https://drive.google.com/file/d/17oWflFFNCWvwwHA_wgPs-umZr5NMBapO/view?usp=drive_link',
 'July 2

## Downloading the data

In [8]:
# construct URL for downloading (rather than viewing) data

def get_download_url(date):
    """Return a direct-download URL for the data file labelled `date`.

    Parameters
    ----------
    date : str
        A key of the module-level `urls` dictionary (e.g. 'April 2024').

    Returns
    -------
    str or None
        A 'uc?export=download' URL on the same Google host as the stored
        link, or None (after printing a message) if the stored link is not a
        drive.google.com / docs.google.com link or contains no file ID.

    Raises
    ------
    KeyError
        If `date` is not a key of `urls`.
    """
    url = urls[date]

    # The file ID is the path segment that follows '/d/'.  Looking it up
    # relative to that marker (instead of by position from the end) also
    # handles links that have no trailing '/view' or '/edit' segment.
    path_parts = url.split('?')[0].split('/')
    file_ID = None
    if 'd' in path_parts[:-1]:
        file_ID = path_parts[path_parts.index('d') + 1]

    if file_ID:
        if 'drive.google.com' in url:
            return 'https://drive.google.com/uc?export=download&id=' + file_ID
        elif 'docs.google.com' in url:
            return 'https://docs.google.com/uc?export=download&id=' + file_ID

    print('Unable to constuct URL for ' + date + ' data')
    return None

In [9]:
# download data for 'dates', a list of keys for the urls dictionary

import os

def download(dates):
    """Download the data file for each label in `dates` to '<label>.csv'.

    Files are written relative to the current working directory.  A label
    whose '<label>.csv' file already exists is skipped, so the function can be
    re-run without fetching the same data again.

    Parameters
    ----------
    dates : str or iterable of str
        A single key of the `urls` dictionary, or any iterable of keys
        (list, tuple, `urls.keys()`, ...).
    """
    # A bare string is one label; anything else is iterated as a collection of labels.
    if isinstance(dates, str):
        dates = [dates]

    for date in dates:
        filename = date + '.csv'
        if os.path.exists(filename):
            print('File ' + date + '.csv already exists.  No download initiated')
            continue

        url = get_download_url(date)
        if url is None:
            # get_download_url has already printed why no URL could be built
            continue

        # stream=True lets the body be written chunk by chunk instead of being
        # loaded into memory first; the with-block closes the connection even
        # if writing the file fails part-way.
        with requests.get(url, stream=True, timeout=30) as response:
            code = response.status_code
            if code != 200:
                print('Unable to use URL for ' + date + ' data (error code ' + str(code) + '): ' + url)
                continue
            with open(filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        print('Data for ' + date + ' written to file ' + date + '.csv')
        

To download data, call the `download` function.  The argument `dates` should be a key of the dictionary `urls` (e.g. `'April 2024'`) or a list of keys (e.g. `['April 2024', 'June 2021']`).  To download all of the data, you can use `dates = list(urls.keys())`.

In [None]:
# download every available month; months whose .csv file already exists are skipped
download(list(urls.keys()))