# 1 - Data collection

Enable progress bars (requires restarting jupyter lab)

In [None]:
%%bash
# Install the ipywidgets package into the current Python environment.
pip install ipywidgets
# Enable the widgets extension for the classic notebook front end.
jupyter nbextension enable --py widgetsnbextension
# Install the widgets manager extension for JupyterLab.
jupyter labextension install @jupyter-widgets/jupyterlab-manager

Setup

In [None]:
import io
from zipfile import ZipFile
import requests as rq
from datetime import date
import pandas as pd
import re

# ISO date (YYYY-MM-DD) of this run; used to stamp the download folders and
# the output CSV written further down.
today = date.today().isoformat()

In [None]:
from IPython.display import Audio
# Path to the sound clip that is played with Audio(...) after the full-text
# download cell.
sonar = '../sonar.ogg'

## 1.1 Download pipeline system data

In [None]:
# Download both hazardous-liquid annual-report archives from phmsa.dot.gov and
# unpack each one into its own date-stamped folder under ../data.
pipelines_2010_present = rq.get('https://www.phmsa.dot.gov/sites/phmsa.dot.gov/files/data_statistics/pipeline/annual_hazardous_liquid_2010_present.zip')
pipelines_2004_2009 = rq.get('https://www.phmsa.dot.gov/sites/phmsa.dot.gov/files/data_statistics/pipeline/annual_hazardous_liquid_2004_2009.zip')

# Fail on an HTTP error status right away; otherwise the body of an error page
# would be handed to ZipFile and surface as an unrelated-looking BadZipFile.
pipelines_2010_present.raise_for_status()
pipelines_2004_2009.raise_for_status()

# Open the archives straight from memory. From here on the two names refer to
# ZipFile objects, not to the HTTP responses.
pipelines_2010_present = ZipFile(io.BytesIO(pipelines_2010_present.content))
pipelines_2004_2009 = ZipFile(io.BytesIO(pipelines_2004_2009.content))

pipelines_2010_present.extractall(f'../data/pipelines_2010_present_{today}')
pipelines_2004_2009.extractall(f'../data/pipelines_2004_2009_{today}')

In [None]:
# Load one workbook from the 2010-present archive. The folder name is built
# from `today` so it matches the directory the download cell writes to; a
# hard-coded date only exists on a machine where the download ran that day.
pipelines_2010_present = pd.read_excel(
    f'../data/pipelines_2010_present_{today}/annual_hazardous_liquid_2010.xlsx',
    header=2,  # column names are taken from the third row of the sheet
)

pipelines_2010_present.head()

In [None]:
# Download two gas annual-report archives from phmsa.dot.gov and unpack each
# one into its own date-stamped folder under ../data.
# NOTE(review): the 2010-present URL is the liquefied-natural-gas archive while
# the 2001-2009 URL is gas transmission/gathering -- confirm the two are meant
# to be paired.
gas_2010_present = rq.get('https://www.phmsa.dot.gov/sites/phmsa.dot.gov/files/data_statistics/pipeline/annual_liquefied_natural_gas_2010_present.zip')
gas_2001_2009 = rq.get('https://www.phmsa.dot.gov/sites/phmsa.dot.gov/files/data_statistics/pipeline/annual_gas_transmission_gathering_2001_2009.zip')

# Fail on an HTTP error status right away; otherwise the body of an error page
# would be handed to ZipFile and surface as an unrelated-looking BadZipFile.
gas_2010_present.raise_for_status()
gas_2001_2009.raise_for_status()

# Open the archives straight from memory. From here on the two names refer to
# ZipFile objects, not to the HTTP responses.
gas_2010_present = ZipFile(io.BytesIO(gas_2010_present.content))
gas_2001_2009 = ZipFile(io.BytesIO(gas_2001_2009.content))

gas_2010_present.extractall(f'../data/gas_2010_present_{today}')
gas_2001_2009.extractall(f'../data/gas_2001_2009_{today}')

In [None]:
# Load one workbook from the 2010-present gas archive. The folder name is built
# from `today` so it matches the directory the download cell writes to; a
# hard-coded date only exists on a machine where the download ran that day.
gas_2010_present = pd.read_excel(
    f'../data/gas_2010_present_{today}/annual_liquefied_natural_gas_2015.xlsx',
    header=2,  # column names are taken from the third row of the sheet
)

gas_2010_present.head()

## 1.2 Download incidents data

In [None]:
# Download the flagged-incidents archive from phmsa.dot.gov and unpack it into
# a date-stamped folder under ../data.
incidents_file = rq.get('https://www.phmsa.dot.gov/sites/phmsa.dot.gov/files/data_statistics/pipeline/PHMSA_Pipeline_Safety_Flagged_Incidents.zip')

# Fail on an HTTP error status right away; otherwise the body of an error page
# would be handed to ZipFile and surface as an unrelated-looking BadZipFile.
incidents_file.raise_for_status()

# Open the archive from memory; the name now refers to the ZipFile.
incidents_file = ZipFile(io.BytesIO(incidents_file.content))

incidents_file.extractall(f'../data/incidents_{today}')

In [None]:
# Load one workbook from the incidents archive. The folder name is built from
# `today` so it matches the directory the download cell writes to; a
# hard-coded date only exists on a machine where the download ran that day.
incidents = pd.read_excel(
    f'../data/incidents_{today}/hl2010toPresent.xlsx',
    sheet_name=1,  # second sheet of the workbook (sheet positions are zero-based)
)

incidents.head()

## 1.3 Download FERC notices

### Download list of all notices

In [None]:
# Documents endpoint of the Federal Register API (JSON responses).
api = 'https://www.federalregister.gov/api/v1/documents.json'

# Base query sent with every request: which fields to return, the page size,
# the sort order, and the agency / document-type filters.
params = {
    'fields[]': [
        'document_number',
        'publication_date',
        'title',
        'raw_text_url',
    ],
    'per_page': 1000,
    'order': 'oldest',
    'conditions[agencies][]': 'federal-energy-regulatory-commission',
    'conditions[type][]': 'NOTICE',
}

Iterate over all pages of one search (for one year) and download the metadata of all documents (before downloading the full text of each document).

In [None]:
import requests as rq

def get_documents(api, params, year):
    """Fetch every result page of one search, restricted to one publication year.

    Parameters
    ----------
    api : str
        URL of the documents endpoint.
    params : dict
        Base query parameters. The dict is copied before the year and page
        keys are added, so the caller's object is left untouched.
    year : int
        Value sent as ``conditions[publication_date][year]``.

    Returns
    -------
    list
        The ``results`` entries of all pages, concatenated in page order.

    Raises
    ------
    requests.HTTPError
        If any page request comes back with an HTTP error status.
    """
    # Work on a copy: the same base dict is reused for every year, and writing
    # the year/page keys into it would leak state between calls.
    query = dict(params)
    query['conditions[publication_date][year]'] = year

    notices = []
    page = 1
    max_page = 1  # provisional; replaced by the real page count after the first response
    while page <= max_page:
        query['page'] = page
        with rq.get(api, params=query) as response:
            # Surface HTTP errors here rather than as a confusing JSON/KeyError below.
            response.raise_for_status()
            overview = response.json()

        max_page = overview['total_pages']
        print(f'Loaded year {year}, page {page} of {max_page}.')

        # extend() appends in place instead of rebuilding the list on every page.
        notices.extend(overview['results'])
        page += 1

    return notices

Collect all data from 2005-2019, except full-text

In [None]:
# Gather the document metadata of every year from 2005 through 2019 into one
# flat list.
notices = []
for year in range(2005, 2020):
    yearly_notices = get_documents(api=api, params=params, year=year)
    notices.extend(yearly_notices)

In [None]:
import pandas as pd
# Turn the collected records into a DataFrame. From here on `notices` refers to
# the DataFrame, not to the list built in the previous cell.
notices = pd.DataFrame(notices)

# Preview the first rows.
notices.head()

### Parallelize download of sites

In [None]:
import re
import tqdm
from requests import Session
from multiprocessing import Pool
from tqdm import tqdm_notebook as tqdm

def download_site(url):
    """Return the body text of `url`, fetched through the notebook-level `session`.

    The response is closed before returning, whether or not reading the text
    succeeds.
    """
    response = session.get(url)
    try:
        body = response.text
    finally:
        response.close()
    return body
    
def download_all_sites(sites, preprocessing=None, processes=8):
    """Download many URLs concurrently and return the results in input order.

    Parameters
    ----------
    sites : sequence of str
        URLs to fetch with `download_site`. Must support ``len()`` so the
        progress bar knows the total.
    preprocessing : callable, optional
        Applied to each downloaded text before it is put into the result list.
    processes : int
        Number of concurrent workers.

    Returns
    -------
    list
        One entry per URL, in the same order as `sites`.
    """
    # Threads instead of worker processes: the work is network-bound, and
    # functions defined in a notebook cell cannot be located by worker
    # processes under the "spawn" start method. Threads also see the
    # notebook-level `session` directly instead of a forked copy of it.
    # NOTE(review): `session` is now shared between threads -- fine for plain
    # GETs in practice, but confirm if cookies/auth are ever added.
    # Imported locally so this cell does not depend on edits to the import cell.
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=processes) as executor:
        # executor.map yields results in input order as they become available.
        results = tqdm(executor.map(download_site, sites), total=len(sites))
        if preprocessing:
            results = map(preprocessing, results)
        # Consume the iterator while the workers are still alive.
        return list(results)

We will need this function to clean the individual documents we download

In [None]:
def clean_notice(notice):
    """Extract the text enclosed in ``<pre>...</pre>`` from a downloaded notice.

    Returns a list of the captured strings (empty when no such tags are
    present). The match is greedy and spans newlines, so it runs from the
    first ``<pre>`` to the last ``</pre>`` in the input.
    """
    pre_block = re.compile('<pre>(.*)</pre>', flags=re.DOTALL)
    return pre_block.findall(notice)

In [None]:
# Module-level session: download_site() reads `session` as a global.
session = Session()

# Fetch the raw text of every notice with 7 workers, pass each page through
# clean_notice, and store the result in a new 'full-text' column.
notices['full-text'] = download_all_sites(list(notices['raw_text_url']), preprocessing=clean_notice, processes=7)

In [None]:
# Autoplay the sound clip -- presumably an audible cue that the long download
# in the previous cell has finished.
Audio(sonar, autoplay=True)

In [None]:
# Show one randomly chosen row as a spot check of the downloaded data.
notices.sample()

In [None]:
# Write the notices to a date-stamped CSV. The DataFrame index is written as
# the first column (to_csv default), and each 'full-text' cell holds the list
# returned by clean_notice, so it is stored as that list's string form.
notices.to_csv(f'../data/ferc_notices_{today}.csv')