# 1 - Data collection

Enable progress bars (requires restarting jupyter lab)

In [None]:
%%bash
# Install the ipywidgets package into the current environment.
pip install ipywidgets
# Enable the widgets extension for the classic notebook front end.
jupyter nbextension enable --py widgetsnbextension
# Install the widgets manager extension for JupyterLab.
jupyter labextension install @jupyter-widgets/jupyterlab-manager

Setup

In [1]:
import io
from zipfile import ZipFile
import requests as rq
from datetime import date
import pandas as pd
import re

# ISO-formatted date (YYYY-MM-DD) of this run; used to stamp the download
# folders and the output CSV file name.
today = date.today().isoformat()

In [2]:
from IPython.display import Audio
# Path to a sound file that a later cell plays with Audio(sonar, autoplay=True)
# (presumably an audible cue that the long download has finished).
sonar = '../sonar.ogg'

## 1.1 Download pipeline system data

In [3]:
# Download both PHMSA hazardous-liquid annual-report archives and unpack each
# one into its own date-stamped folder under ../data.
pipelines_2010_present_response = rq.get('https://www.phmsa.dot.gov/sites/phmsa.dot.gov/files/data_statistics/pipeline/annual_hazardous_liquid_2010_present.zip')
pipelines_2004_2009_response = rq.get('https://www.phmsa.dot.gov/sites/phmsa.dot.gov/files/data_statistics/pipeline/annual_hazardous_liquid_2004_2009.zip')

# Fail loudly on an HTTP error instead of handing an error page to ZipFile,
# which would otherwise surface as a confusing BadZipFile exception.
pipelines_2010_present_response.raise_for_status()
pipelines_2004_2009_response.raise_for_status()

# The archives are opened from memory; nothing is written until extractall().
pipelines_2010_present = ZipFile(io.BytesIO(pipelines_2010_present_response.content))
pipelines_2004_2009 = ZipFile(io.BytesIO(pipelines_2004_2009_response.content))

pipelines_2010_present.extractall(f'../data/pipelines_2010_present_{today}')
pipelines_2004_2009.extractall(f'../data/pipelines_2004_2009_{today}')

In [4]:
# Read the 2010 annual report from the folder the extraction cell writes to
# (../data/pipelines_2010_present_{today}); the previously hard-coded
# 'pipeline_sytem_2019-08-01' folder is never created by this notebook.
# header=2 takes the column names from the third row of the sheet.
pipelines_2010_present = pd.read_excel(f'../data/pipelines_2010_present_{today}/annual_hazardous_liquid_2010.xlsx', header=2)

pipelines_2010_present.head()

Unnamed: 0,DATAFILE_AS_OF,REPORT_YEAR,REPORT_NUMBER,SUPPLEMENTAL_NUMBER,OPERATOR_ID,PARTA2NAMEOFCOMP,PARTA4STREET,PARTA4CITY,PARTA4STATE,PARTA4ZIP,...,PARTE2010HF,PARTE2010LF,PARTE2010TOTAL,PARTETOTAL,PARTETOTALHF,PARTETOTALLF,REPORT_SUBMISSION_TYPE,REPORT_DATE,FILING_DATE,FORM_REV
0,2019-07-01 10:31:50,2010,20110334,12436,300,"PLAINS PIPELINE, L.P.",333 CLAY STREET,HOUSTON,TX,77002,...,0.0,0.0,0.0,1994.32,734.12,1260.2,SUPPLEMENTAL,2011-08-15 18:24:50,2013-08-08 15:08:42,7000-1.1 (Rev. 06-2011)
1,2019-07-01 10:31:50,2010,20110333,10345,300,"PLAINS PIPELINE, L.P.",333 CLAY STREET,HOUSTON,TX,77002,...,0.0,0.0,0.0,256.65,6.91,249.74,INITIAL,2011-08-15 18:24:11,2011-08-15 18:24:11,7000-1.1 (Rev. 06-2011)
2,2019-07-01 10:31:50,2010,20110192,10194,395,AMOCO OIL CO,150 W. Warrenville Rd,OAKBROOK TERRACE,IL,60563,...,0.0,0.0,0.0,220.0,93.0,127.0,INITIAL,2011-08-13 16:22:01,2011-08-13 16:22:01,7000-1.1 (Rev. 06-2011)
3,2019-07-01 10:31:50,2010,20110079,10080,401,HESS CORPORATION,1501 MCKINNEY STREET,BLOOMINGTON,TX,77002,...,0.0,0.0,0.0,31.5,31.5,0.0,INITIAL,2011-08-10 12:09:38,2011-08-10 12:09:38,7000-1.1 (Rev. 06-2011)
4,2019-07-01 10:31:50,2010,20110373,10391,402,"MARKWEST EAST TEXAS PNG UTILITY, LLC","1515 Arapahoe St, Tower 2, Suite 700",Denver,CO,80202,...,0.0,0.0,0.0,49.0,49.0,0.0,INITIAL,2011-08-16 15:29:35,2011-08-16 15:29:35,7000-1.1 (Rev. 06-2011)


## 1.2 Download incidents data

In [5]:
# Download the PHMSA flagged-incidents archive and unpack it into a
# date-stamped folder under ../data.
incidents_response = rq.get('https://www.phmsa.dot.gov/sites/phmsa.dot.gov/files/data_statistics/pipeline/PHMSA_Pipeline_Safety_Flagged_Incidents.zip')

# Fail loudly on an HTTP error instead of handing an error page to ZipFile.
incidents_response.raise_for_status()

# The archive is opened from memory; nothing is written until extractall().
incidents_file = ZipFile(io.BytesIO(incidents_response.content))

incidents_file.extractall(f'../data/incidents_{today}')

In [6]:
# Read the incidents workbook from the folder the extraction cell writes to
# (../data/incidents_{today}) rather than a folder with a hard-coded date.
# sheet_name=1 selects the second sheet of the workbook (sheets are 0-indexed).
incidents = pd.read_excel(f'../data/incidents_{today}/hl2010toPresent.xlsx', sheet_name=1)

incidents.head()

Unnamed: 0,DATAFILE_AS_OF,SIGNIFICANT,SERIOUS,IPE,IA_IPE,OM_IPE,REPORT_NUMBER,SUPPLEMENTAL_NUMBER,REPORT_RECEIVED_DATE,REPORT_TYPE,...,PREPARER_TITLE,PREPARER_EMAIL,PREPARER_TELEPHONE,PREPARER_FAX,PREPARED_DATE,AUTHORIZER_NAME,AUTHORIZER_TITLE,AUTHORIZER_TELEPHONE,AUTHORIZER_EMAIL,NARRATIVE
0,2019-07-31 08:37:40,NO,NO,NO,NO,NO,20100001,15751,2010-03-10 13:05:46,SUPPLEMENTAL FINAL,...,SENIOR COMPLIANCE COORDINATOR,ken.lybarger@magellanlp.com,918-574-7315,918-574-7246,2011-04-15,KENNETH L. LYBARGER,SENIOR COMPLIANCE COORDINATOR,918-574-7315,ken.lybarger@magellanlp.com,
1,2019-07-31 08:37:40,NO,NO,NO,NO,NO,20100002,19837,2010-03-16 17:43:54,SUPPLEMENTAL FINAL,...,SR. PIPELINE TECH II,gabe.mccown@chaparralenergy.com,405-426-4361,,2014-10-28,DAVE YEAGER,PIPELINE MANAGER,405-255-6581,dave.yeager@chaparralenergy.com,"AT 11:50 AM ON THE MORNING OF MARCH 1, 2010 A ..."
2,2019-07-31 08:37:40,YES,NO,NO,NO,NO,20100003,18021,2010-03-17 10:38:15,SUPPLEMENTAL FINAL,...,DIRECTOR - REGULATORY COMPLIANCE,danny.nichols@energytransfer.com,326-681-006,,2013-05-10,DANNY NICHOLS,DIRECTOR - REGULATORY COMPLIANCE,326-681-006,danny.nichols@energytransfer.com,LEAK DISCOVERED BY CONTRACTOR WORKING FOR ANOT...
3,2019-07-31 08:37:40,NO,NO,NO,NO,NO,20100004,19086,2010-03-18 13:18:48,SUPPLEMENTAL FINAL,...,COMPLIANCE MANAGER,jreinbold@buckeye.com,610-904-4185,610-904-4545,2014-03-06,JOHN REINBOLD,COMPLIANCE MANAGER,610-904-4185,jreinbold@buckeye.com,"AT 6:50 AM ON FEBRUARY 19, 2010, BUCKEYE PERSO..."
4,2019-07-31 08:37:40,NO,NO,NO,NO,NO,20100005,17663,2010-03-18 14:26:48,SUPPLEMENTAL FINAL,...,ENVIRONMENTAL REMEDIATION AND COMPLIANCE SPECI...,hltuggle@paalp.com,713-646-4324,713-646-4310,2013-03-22,WARREN FUSILIER,DIRECTOR - ENVIRONMENTAL AND REMEDIATION-COMPL...,713-646-4515,wdfusilier@paalp.com,INTERNAL CORROSION ON 10-INCH PIPELINE RESULTE...


## 1.3 Download FERC notices

### Download list of all notices

In [7]:
# Federal Register documents search endpoint (JSON).
api = 'https://www.federalregister.gov/api/v1/documents.json'

# Base query sent with every request; get_documents() adds the year and page.
params = {
    # Fields requested for each document.
    'fields[]': [
        'document_number',
        'publication_date',
        'title',
        'raw_text_url',
    ],
    'per_page': 1000,
    'order': 'oldest',
    # Restrict the search to FERC documents of type NOTICE.
    'conditions[agencies][]': 'federal-energy-regulatory-commission',
    'conditions[type][]': 'NOTICE',
}

Iterate over all pages of one search (for one year) and download the metadata of all documents (before downloading the full text of each document).

In [8]:
import requests as rq

def get_documents(api, params, year):
    """Fetch every result page of one search for a single publication year.

    Parameters
    ----------
    api : str
        URL of the documents search endpoint.
    params : dict
        Base query parameters. The dict is copied, so the caller's object is
        left untouched.
    year : int
        Value sent as ``conditions[publication_date][year]``.

    Returns
    -------
    list
        The ``results`` entries of all pages, concatenated in page order.

    Raises
    ------
    requests.HTTPError
        If a page request comes back with an HTTP error status.
    """
    # Work on a copy: writing the year and page keys into the caller's dict
    # would leak state from one call into the next.
    query = dict(params)
    query['conditions[publication_date][year]'] = year

    notices = []
    page = 1
    # Provisional value so the first page is always requested; replaced by the
    # page count reported in each response.
    max_page = 1
    while page <= max_page:
        query['page'] = page
        with rq.get(api, params=query) as response:
            response.raise_for_status()
            overview = response.json()

        # A response lacking these keys is treated as "no (more) results"
        # instead of raising KeyError.
        max_page = overview.get('total_pages', 0)
        print(f'Loaded year {year}, page {page} of {max_page}.')

        # extend() appends in place; repeated list concatenation would copy
        # the accumulated list on every page.
        notices.extend(overview.get('results', []))
        page += 1

    return notices

Collect all data from 2005-2019, except full-text

In [9]:
# Gather the notice metadata year by year, 2005 through 2019
# (range() excludes its end value, 2020).
notices = []
for publication_year in range(2005, 2020):
    yearly_notices = get_documents(api=api, params=params, year=publication_year)
    notices.extend(yearly_notices)

Loaded year 2005, page 1 of 3.
Loaded year 2005, page 2 of 3.
Loaded year 2005, page 3 of 3.
Loaded year 2006, page 1 of 3.
Loaded year 2006, page 2 of 3.
Loaded year 2006, page 3 of 3.
Loaded year 2007, page 1 of 3.
Loaded year 2007, page 2 of 3.
Loaded year 2007, page 3 of 3.
Loaded year 2008, page 1 of 2.
Loaded year 2008, page 2 of 2.
Loaded year 2009, page 1 of 2.
Loaded year 2009, page 2 of 2.
Loaded year 2010, page 1 of 2.
Loaded year 2010, page 2 of 2.
Loaded year 2011, page 1 of 2.
Loaded year 2011, page 2 of 2.
Loaded year 2012, page 1 of 3.
Loaded year 2012, page 2 of 3.
Loaded year 2012, page 3 of 3.
Loaded year 2013, page 1 of 2.
Loaded year 2013, page 2 of 2.
Loaded year 2014, page 1 of 2.
Loaded year 2014, page 2 of 2.
Loaded year 2015, page 1 of 2.
Loaded year 2015, page 2 of 2.
Loaded year 2016, page 1 of 2.
Loaded year 2016, page 2 of 2.
Loaded year 2017, page 1 of 2.
Loaded year 2017, page 2 of 2.
Loaded year 2018, page 1 of 2.
Loaded year 2018, page 2 of 2.
Loaded y

In [10]:
import pandas as pd
# Convert the collected list of result records into a DataFrame; the name
# `notices` is reused for the table from here on.
notices = pd.DataFrame(notices)

# Preview the first rows.
notices.head()

Unnamed: 0,document_number,publication_date,title,raw_text_url
0,E4-3915,2005-01-04,"Young Gas Storage Company, Ltd.; Notice of Pro...",https://www.federalregister.gov/documents/full...
1,E4-3918,2005-01-04,Sea Robin Pipeline Company; Notice of Flowthro...,https://www.federalregister.gov/documents/full...
2,05-18,2005-01-04,Survey on Operator Training Practices; Order R...,https://www.federalregister.gov/documents/full...
3,E4-3919,2005-01-04,Dauphin Island Gathering Partners; Notice of P...,https://www.federalregister.gov/documents/full...
4,E4-3920,2005-01-04,Transcontinental Gas Pipe Line Corporation; No...,https://www.federalregister.gov/documents/full...


### Parallelize download of sites

In [11]:
import re
import tqdm
from requests import Session
from multiprocessing import Pool
from tqdm import tqdm_notebook as tqdm

def download_site(url):
    """Fetch ``url`` through the global ``session`` and return the body text.

    ``session`` is not a parameter; it must exist as a module-level name
    before this function is called. The response is closed before returning.
    """
    reply = session.get(url)
    try:
        return reply.text
    finally:
        # Release the connection, as leaving a ``with`` block would.
        reply.close()
    
def download_all_sites(sites, preprocessing=None, processes=8):
    """Download all ``sites`` with a process pool and return the results as a list.

    Parameters
    ----------
    sites : sequence
        URLs passed one by one to ``download_site``; ``len(sites)`` is used
        as the progress-bar total.
    preprocessing : callable, optional
        If truthy, applied to every downloaded text; the list then holds the
        return values of this callable.
    processes : int
        Number of worker processes in the pool.

    Returns
    -------
    list
        One entry per site, in the order of ``sites`` (``imap`` preserves order).
    """
    with Pool(processes=processes) as workers:
        # imap yields lazily, so the progress bar advances as pages arrive.
        pages = tqdm(workers.imap(download_site, sites), total=len(sites))
        if not preprocessing:
            return list(pages)
        # Consume the iterator inside the ``with`` block, while the pool is alive.
        return [preprocessing(page) for page in pages]

We will need this function to clean the individual documents we download

In [12]:
def clean_notice(notice):
    """Return the contents of the ``<pre>...</pre>`` section of ``notice`` as a list.

    The match is greedy and spans newlines, so it runs from the first
    ``<pre>`` to the last ``</pre>``. The result is a list with one string,
    or an empty list when no such section exists.
    """
    pre_section = re.compile('<pre>(.*)</pre>', flags=re.DOTALL)
    return pre_section.findall(notice)

In [13]:
# HTTP session that download_site() looks up as a global; it has to exist
# before the worker pool is started.
session = Session()

# Download the raw text of every notice and keep only its <pre> section.
# clean_notice returns a list, so each cell of 'full-text' holds a list.
notice_urls = list(notices['raw_text_url'])
notices['full-text'] = download_all_sites(notice_urls, preprocessing=clean_notice, processes=7)

HBox(children=(IntProgress(value=0, max=27890), HTML(value='')))




In [14]:
# Play the sound file automatically when this cell runs (presumably an
# audible cue that the long download above has finished).
Audio(sonar, autoplay=True)

In [15]:
# Inspect the last rows to check that the 'full-text' column was filled in.
notices.tail()

Unnamed: 0,document_number,publication_date,title,raw_text_url,full-text
27885,2019-16277,2019-07-31,Combined Notice of Filings #1,https://www.federalregister.gov/documents/full...,"[[Federal Register Volume 84, Number 147 (Wedn..."
27886,2019-16274,2019-07-31,Notice Soliciting Scoping Comments: Turners Fa...,https://www.federalregister.gov/documents/full...,"[[Federal Register Volume 84, Number 147 (Wedn..."
27887,2019-16276,2019-07-31,"Gulfstream Natural Gas System, L.L.C.; Notice ...",https://www.federalregister.gov/documents/full...,"[[Federal Register Volume 84, Number 147 (Wedn..."
27888,2019-16278,2019-07-31,Records Governing Off-the-Record Communication...,https://www.federalregister.gov/documents/full...,"[[Federal Register Volume 84, Number 147 (Wedn..."
27889,2019-16275,2019-07-31,"Emmons-Logan Wind, LLC; Supplemental Notice Th...",https://www.federalregister.gov/documents/full...,"[[Federal Register Volume 84, Number 147 (Wedn..."


In [16]:
# Save the notices table, including the downloaded text, to a date-stamped CSV.
# No index argument is passed, so pandas also writes the row index as the first column.
notices.to_csv(f'../data/ferc_notices_{today}.csv')