# 7. (Functions to) identify all data issues

In [1]:
# Preprocessed inputs (feather files) read in the "Load data" section.
pipeline_2010_selected_file = '../preprocessed_data/pipelines_2010_selected_2019-08-11.feather'
incidents_selected_file = '../preprocessed_data/incidents_selected_2019-08-11.feather'
largest_observations_file = '../preprocessed_data/largest_companies_2019-08-11.feather'
sample_file = '../preprocessed_data/sample_2019-08-17.feather'

# Raw inputs.
# NOTE(review): `incidents_raw` holds a path like the `*_file` names but lacks the suffix, and it is
# not read anywhere in the visible part of this notebook — confirm it is still needed.
pipelines_2010_raw_file = '../data/pipelines_2010_2019-08-11.feather'
incidents_raw = '../data/incidents_2019-08-11.feather'

In [2]:
# NOTE(review): not referenced in the visible cells (analyze_sample() carries its own 50) —
# confirm whether the two are meant to be linked.
sample_len = 50

## Setup

In [3]:
import pandas as pd
import numpy as np
from datetime import date
from functools import partial

# ISO-formatted run date; used to stamp the names of the files this notebook writes.
today = date.today().isoformat()

In [4]:
import wrds

# NOTE(review): the user name is hard-coded and `db` is not used in the visible part of this
# notebook — confirm the connection is required here.
db = wrds.Connection(wrds_username='juujian')

Loading library list...
Done


# Load data

In [5]:
# Selected pipeline records. The preview draws two random rows without a seed,
# so the displayed rows change between runs.
pipelines_2010 = pd.read_feather(pipeline_2010_selected_file)
pipelines_2010.sample(2)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES
343,32009,2017,EXXONMOBIL OIL CORPORATION-TERMINALS,Refined and/or Petroleum Product (non-HVL),0.21
3394,3445,2018,DIXIE PIPELINE COMPANY LLC,HVL,656.216


In [6]:
# Selected incident records; unseeded two-row preview.
incidents = pd.read_feather(incidents_selected_file)
incidents.sample(2)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE
163,32109,2010-07-14 21:15:00,ONEOK NGL PIPELINE LP,NO,True,36.77451,-97.75519
3816,30829,2019-06-27 09:50:00,ENTERPRISE CRUDE PIPELINE LLC,NO,True,35.9548,-96.7591


In [7]:
# Analysis sample; this frame is the default `df` of extract_value, compare_values and
# find_namesakes below. Unseeded two-row preview.
sample = pd.read_feather(sample_file)
sample.sample(2)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,PARENT,INCIDENTS,SIGNIFICANT_INCIDENTS
658,32099,2017,ENERGY TRANSFER COMPANY,hvl,1218.28,Energy Transfer,4.0,4.0
303,22610,2011,"MAGELLAN PIPELINE COMPANY, LP",hvl,218.0,Magellan Midstream Partners,1.0,0.0


In [8]:
# Operators in the order in which analyze_sample() walks them.
# Presumably sorted by TOTAL_MILES descending, as the preview suggests — TODO confirm.
largest_observations_ordered = pd.read_feather(largest_observations_file)
largest_observations_ordered.head(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,TOTAL_MILES
0,31618,2018.0,ENTERPRISE PRODUCTS OPERATING LLC,8325.499
1,32109,2018.0,"ONEOK NGL PIPELINE, LLC",4756.61
2,22610,2014.0,"MAGELLAN PIPELINE COMPANY, LP",4505.5
3,2552,2015.0,COLONIAL PIPELINE CO,4500.92
4,31684,2017.0,PHILLIPS 66 PIPELINE LLC,4474.1


In [9]:
# Raw pipeline data; default `df` of find_info and find_latest_info below.
pipelines_2010_raw = pd.read_feather(pipelines_2010_raw_file)

## 7.1 Define functions for analysis

### 7.1.1 Functions to compare with raw data

In [10]:
def find_info(OPERATOR_ID, info_col: str, title: str, df = pipelines_2010_raw, id_col = 'OPERATOR_ID',
              year_col = 'REPORT_YEAR', fuzzy=False):
    """List the distinct values an operator reported in `info_col`, with the years they cover.

    Parameters
    ----------
    OPERATOR_ID : value convertible with int(); compared against `df[id_col]` as an integer.
    info_col : column whose distinct values are collected.
    title : key under which each value is stored in the returned dicts.
    df : frame to search (defaults to the raw pipeline data).
    id_col, year_col : names of the operator-id and report-year columns.
    fuzzy : if True and exactly two values are found whose lower-cased fuzzywuzzy
        ratio is >= 95, they are treated as one value.

    Returns
    -------
    list of dict
        One ``{title: value, 'start_year': ..., 'end_year': ...}`` per distinct
        value, sorted by value. Empty if the operator has no rows.
    """
    # Restrict to this operator first: the year range must describe when *this*
    # operator used the value, not when any operator did (several operators can
    # share the same name or address).
    operator_rows = df.loc[df[id_col] == int(OPERATOR_ID)]

    # Missing values are dropped; they carry no information and cannot be sorted
    # together with strings.
    values = sorted(operator_rows[info_col].dropna().unique().tolist())

    result = []
    for value in values:
        years = operator_rows.loc[operator_rows[info_col] == value, year_col]
        result.append({title: value, 'start_year': years.min(), 'end_year': years.max()})

    if fuzzy and len(result) == 2:
        # Imported lazily so the dependency is only needed when fuzzy matching is requested.
        from fuzzywuzzy import fuzz

        first, second = result
        if fuzz.ratio(first[title].lower(), second[title].lower()) >= 95:
            # Near-identical spellings count as one value; keep the first spelling
            # and let its year range cover both.
            result = [{title: first[title],
                       'start_year': min(first['start_year'], second['start_year']),
                       'end_year': max(first['end_year'], second['end_year'])}]

    return result

# find_info specialised for the street-address and company-name columns of the raw data.
# Only the name lookup enables fuzzy merging of near-identical spellings.
find_address = partial(find_info, info_col='PARTA4STREET', title='address')
find_names = partial(find_info, info_col='PARTA2NAMEOFCOMP', title='name', fuzzy=True)
find_names('4906')

[{'name': 'EXXONMOBIL PIPELINE CO', 'start_year': 2010, 'end_year': 2018}]

In [11]:
import operator

def find_latest_info(OPERATOR_ID, info_col: str, df = pipelines_2010_raw,
                     id_col = 'OPERATOR_ID', year_col = 'REPORT_YEAR'):
    """Return the value of `info_col` that the operator reported most recently.

    Parameters
    ----------
    OPERATOR_ID : operator to look up (passed through to find_info).
    info_col : column whose latest value is wanted.
    df : frame to search (defaults to the raw pipeline data).
    id_col, year_col : names of the operator-id and report-year columns.

    Returns
    -------
    The value whose 'end_year' is largest; on ties the first one in find_info's order.

    Raises
    ------
    ValueError
        If find_info returns no values for the operator.
    """
    # `df` must be forwarded, otherwise a caller-supplied frame is silently
    # ignored in favour of find_info's default.
    data_points = find_info(OPERATOR_ID=OPERATOR_ID, info_col=info_col, title='value', df=df,
                            id_col=id_col, year_col=year_col, fuzzy=False)
    if not data_points:
        raise ValueError(f'No {info_col} values found for OPERATOR_ID {OPERATOR_ID}.')
    return max(data_points, key=operator.itemgetter('end_year'))['value']

# Most recent company name reported by an operator.
find_latest_name = partial(find_latest_info, info_col='PARTA2NAMEOFCOMP')
find_latest_name('12470')

'MID - VALLEY PIPELINE CO'

### 7.1.2 Compare parents

In [12]:
def extract_value(OPERATOR_ID, col, df = sample, id_col = 'OPERATOR_ID'):
    """Return the single value found in `col` for the rows of one operator.

    Raises LookupError when the operator's rows yield no value or more than
    one distinct value in `col`.
    """
    operator_rows = df.loc[df[id_col] == OPERATOR_ID]
    distinct = operator_rows[col].unique()
    n_distinct = len(distinct)

    if n_distinct == 0:
        raise LookupError(f'OPERATOR_ID or {col} not found.')
    if n_distinct > 1:
        raise LookupError(f'More than one value found for {col}.')
    return distinct[0]
        
# Parent company recorded for an operator in the sample.
extract_parent = partial(extract_value, col='PARENT')
extract_parent('300')

'Plains All American Pipeline'

In [13]:
def compare_values(OPERATOR_ID, col, df = sample, id_col = 'OPERATOR_ID'):
    """Find the other operators that share this operator's value in `col`.

    Returns a list of ``(operator_id, latest_name)`` tuples, excluding
    OPERATOR_ID itself; the names come from find_latest_name.
    """
    shared_value = extract_value(OPERATOR_ID, col=col, df=df, id_col=id_col)
    peer_ids = df.loc[df[col] == shared_value][id_col].unique().tolist()
    # The operator itself always carries its own value; drop it from the peers.
    peer_ids.remove(OPERATOR_ID)
    return [(peer_id, find_latest_name(peer_id)) for peer_id in peer_ids]
    
# Operators in the sample that have the same parent company as the given operator.
compare_parents = partial(compare_values, col='PARENT')
compare_parents('22830')

[('32147', 'MARATHON PIPE LINE LLC'),
 ('38933', 'TESORO LOGISTICS OPERATIONS, LLC'),
 ('39029', 'TESORO LOGISTICS NORTHWEST PIPELINE LLC')]

## 7.2 Create regular expressions

In [14]:
# Distinct operator ids present in the sample.
ids = sample['OPERATOR_ID'].unique()

In [15]:
# Collect one [OPERATOR_ID, name] pair for every name each sampled operator has used.
entry = []
for id_ in ids:
    entry.extend([id_, name_record['name']] for name_record in find_names(id_))

In [16]:
# Show the collected [OPERATOR_ID, name] pairs.
entry

[['300', 'PLAINS PIPELINE, L.P.'],
 ['395', 'AMOCO OIL CO'],
 ['1845', 'BUCKEYE PARTNERS, LP'],
 ['2552', 'COLONIAL PIPELINE CO'],
 ['2731', 'CHEVRON PIPE LINE CO'],
 ['3445', 'DIXIE PIPELINE'],
 ['3445', 'DIXIE PIPELINE COMPANY LLC'],
 ['4805', 'EXPLORER PIPELINE CO'],
 ['4906', 'EXXONMOBIL PIPELINE CO'],
 ['10012', 'NUSTAR PIPELINE OPERATING PARTNERSHIP L.P.'],
 ['11169', 'ENBRIDGE ENERGY, LIMITED PARTNERSHIP'],
 ['12470', 'MID - VALLEY PIPELINE CO'],
 ['12628', 'EXXONMOBIL PIPELINE CO'],
 ['12628', 'MOBIL  PIPE  LINE COMPANY'],
 ['12628', 'MOBIL PIPELINE CO'],
 ['15156', 'SINCLAIR TRANSPORTATION COMPANY'],
 ['15674', 'PLANTATION PIPE LINE CO'],
 ['18092', 'SFPP, LP'],
 ['18718', 'SUNOCO PIPELINE L.P.'],
 ['22430', 'WEST SHORE PIPELINE CO'],
 ['22610', 'MAGELLAN PIPELINE COMPANY, LP'],
 ['22830', 'WOLVERINE PIPELINE CO'],
 ['22855', 'FLINT HILLS RESOURCES, LC'],
 ['22855', 'KOCH PIPELINE COMPANY, L.P.'],
 ['25146', 'EQUISTAR CHEMICALS, L.P.'],
 ['26125', 'CALNEV PIPELINE CO'],
 ['261

In [17]:
from collections import namedtuple

# One record per (operator, historical name): the operator id, the reported name, and a
# regular expression meant to recognise that company's names. Patterns are applied with
# case-insensitive `str.match`, i.e. anchored at the start of the name.
company_re = namedtuple('company', 'OPERATOR_ID NAME re_')

# The Magellan pattern previously ended in `,*` (a typo for `.*`) and the BP pattern lacked
# the raw-string prefix used by every other entry; both now follow the common form.
company_res = [company_re('10012', 'NUSTAR PIPELINE OPERATING PARTNERSHIP L.P.', r'.*nustar.*'),
               company_re('11169', 'ENBRIDGE ENERGY, LIMITED PARTNERSHIP', r'.*enbridge.*'),
               company_re('12470', 'MID - VALLEY PIPELINE CO', r'.*mid\s?-\s?valley.*'),
               company_re('12628', 'EXXONMOBIL PIPELINE CO', r'.*exxonmobil.*'),
               company_re('12628', 'MOBIL  PIPE  LINE COMPANY', r'.*mobil.*'),
               company_re('15156', 'SINCLAIR TRANSPORTATION COMPANY', r'.*sinclair.*'),
               company_re('15674', 'PLANTATION PIPE LINE CO', r'.*plantation.*'),
               company_re('18092', 'SFPP, LP', r'.*sfpp.*'),
               company_re('1845', 'BUCKEYE PARTNERS, LP', r'.*buckeye.*'),
               company_re('18718', 'SUNOCO PIPELINE L.P.', r'.*sunoco.*'),
               company_re('22430', 'WEST SHORE PIPELINE CO', r'.*west shore.*'),
               company_re('22610', 'MAGELLAN PIPELINE COMPANY, LP', r'.*magellan.*'),
               company_re('22830', 'WOLVERINE PIPELINE CO', r'.*wolverine.*'),
               company_re('22855', 'FLINT HILLS RESOURCES, LC', r'.*flint hills.*'),
               company_re('22855', 'KOCH PIPELINE COMPANY, L.P.', r'.*koch.*'),
               company_re('25146', 'EQUISTAR CHEMICALS, L.P.', r'.*equistar.*'),
               company_re('2552', 'COLONIAL PIPELINE CO', r'.*colonial.*'),
               company_re('26125', 'CALNEV PIPELINE CO', r'.*calnev.*'),
               company_re('26149', 'ALYESKA PIPELINE SERVICE CO', r'.*alyeska.*'),
               company_re('2731', 'CHEVRON PIPE LINE CO', r'.*chevron.*'),
               company_re('300', 'PLAINS PIPELINE, L.P.', r'.*plains.*'),
               company_re('30781', 'OLYMPIC PIPE LINE COMPANY', r'.*olympic.*'),
               company_re('30826', 'WILLIAMS FIELD SERVICES', r'.*williams.*'),
               company_re('30829', 'ENTERPRISE CRUDE PIPELINE LLC', r'^enterprise.*'),
               company_re('31130', 'DCP MIDSTREAM', r'.*dcp.*'),
               company_re('31174', 'SHELL PIPELINE CO., L.P.', r'.*shell.*'),
               company_re('31189', 'BP PIPELINE (NORTH AMERICA) INC.', r'.*bp.*'),
               company_re('31371', 'BUCKEYE DEVELOPMENT & LOGISTICS, LLC', r'.*buckeye.*'),
               company_re('31454', 'NUSTAR LOGISTICS, L.P.', r'.*nustar.*'),
               company_re('31618', 'ENTERPRISE PRODUCTS OPERATING LLC', r'^enterprise.*'),
               company_re('31666', 'ROCKY MOUNTAIN PIPELINE SYSTEM, LLC', r'.*rocky mountain.*'),
               company_re('31684', 'CONOCOPHILLIPS', r'.*conocophillips.*'),
               company_re('31684', 'PHILLIPS 66 PIPELINE LLC', r'.*phillips.*'),
               company_re('31720', 'EXPRESS HOLDINGS (USA), LLC', r'.*express.*'),
               company_re('31720', 'KINDER MORGAN PIPELINES (USA) INC', r'.*kinder morgan.*'),
               company_re('32011', 'HOLLY ENERGY PARTNERS - OPERATING, L.P.', r'.*holly.*'),
               company_re('32080', 'CCPS TRANSPORTATION, LLC', r'.*ccps.*'),
               company_re('32099', 'ENERGY TRANSFER COMPANY', r'.*energy transfer.*'),
               company_re('32103', 'CRIMSON PIPELINE L.P.', r'.*crimson.*'),
               company_re('32109', 'ONEOK NGL PIPELINE LP', r'.*oneok.*'),
               company_re('32147', 'MARATHON PIPE LINE LLC', r'.*marathon.*'),
               company_re('32258', 'KINDER MORGAN COCHIN LLC', r'.*kinder morgan.*'),
               company_re('32334', 'TC OIL PIPELINE OPERATIONS INC', r'.*tc.*'),
               company_re('32502', 'ENBRIDGE PIPELINES (SOUTHERN LIGHTS) L.L.C.', r'.*enbridge.*'),
               company_re('3445', 'DIXIE PIPELINE', r'.*dixie.*'),
               company_re('38933', 'TESORO LOGISTICS OPERATIONS, LLC', r'.*tesoro.*'),
               company_re('39029', 'TESORO LOGISTICS NORTHWEST PIPELINE LLC', r'.*tesoro.*'),
               company_re('39043', 'TALLGRASS PONY EXPRESS PIPELINE, LLC', r'.*tallgrass.*'),
               company_re('39205', 'DAPL-ETCO OPERATIONS MANAGEMENT, LLC', r'.*dapl-etco.*'),
               company_re('39398', 'INEOS USA LLC', r'.*ineos.*'),
               company_re('395', 'AMOCO OIL CO', r'.*amoco.*'),
               company_re('39596', 'PERMIAN EXPRESS PARTNERS LLC', r'.*permian.*'),
               company_re('4805', 'EXPLORER PIPELINE CO', r'.*explorer.*'),
               company_re('4906', 'EXXONMOBIL PIPELINE CO', r'.*exxonmobil.*')
               ]

# Turn the list of records into a frame (columns OPERATOR_ID, NAME, re_).
company_res = pd.DataFrame(company_res)
# Every operator in the sample must have at least one regular expression.
assert sample['OPERATOR_ID'].isin(company_res['OPERATOR_ID']).all()
company_res.sample(3)

Unnamed: 0,OPERATOR_ID,NAME,re_
17,26125,CALNEV PIPELINE CO,.*calnev.*
18,26149,ALYESKA PIPELINE SERVICE CO,.*alyeska.*
52,4805,EXPLORER PIPELINE CO,.*explorer.*


In [18]:
def extract_values(OPERATOR_ID, col, df = company_res, id_col = 'OPERATOR_ID'):
    """Return every distinct value of `col` for one operator as a list.

    OPERATOR_ID is converted to a string before comparison with `df[id_col]`.
    An operator without rows yields an empty list.
    """
    operator_key = str(OPERATOR_ID)
    operator_rows = df.loc[df[id_col] == operator_key]
    distinct = operator_rows[col].unique()
    return distinct.tolist()

# All regular expressions registered for an operator (one per historical name).
extract_res = partial(extract_values, col='re_')
extract_res('31684')

['.*conocophillips.*', '.*phillips.*']

In [19]:
# Persist the regular-expression table, stamped with the run date.
company_res.to_feather(f'../preprocessed_data/company_res_{today}.feather')

### 7.2.1 Check regular expression validity

In [20]:
# Report every pattern that fails to match any company name in the sample.
for expression in company_res['re_']:
    matches = sample.loc[sample['NAME'].str.match(expression, case=False)]
    if matches.empty:
        print(f'Regular expression {expression} does not match anything!')

### 7.2.2 Function to find namesakes

In [21]:
def find_namesakes(re_, df = sample, col = 'NAME', id_col = 'OPERATOR_ID'):
    """Return the distinct (id, name) pairs whose name matches any of the given patterns.

    Parameters
    ----------
    re_ : a single pattern string or an iterable of pattern strings. Patterns are
        applied with case-insensitive `str.match` (anchored at the start of the name).
    df : frame to search (defaults to the sample).
    col : name column the patterns are matched against.
    id_col : operator-id column returned alongside the name.

    Returns
    -------
    pandas.DataFrame
        Columns ``[id_col, col]`` with duplicate rows removed; rows keep their
        index from `df`. Empty when nothing matches or no pattern is given.
    """
    expressions = [re_] if isinstance(re_, str) else list(re_)

    # `na=False`: rows without a name never match (and would otherwise break the boolean mask).
    hits = [df.loc[df[col].str.match(expression, case=False, na=False), [id_col, col]]
            for expression in expressions]

    if not hits:
        return pd.DataFrame(columns=[id_col, col])

    # A row matched by several patterns (e.g. a broad and a narrow one for the same
    # company) must be reported only once.
    return pd.concat(hits).drop_duplicates()
    
# Example: every (OPERATOR_ID, NAME) pair in the sample whose name contains "exxonmobil".
find_namesakes(r'.*exxonmobil.*')

Unnamed: 0,OPERATOR_ID,NAME
118,4906,EXXONMOBIL PIPELINE CO
193,12628,EXXONMOBIL PIPELINE CO


## 7.3 Create main loop

In [22]:
from IPython.core.debugger import set_trace

def analyze_sample(n_largest = 50):
    """Collect human-readable notes on potential data issues for the largest operators.

    Walks the first `n_largest` operators of `largest_observations_ordered` and, for
    each one, reports:

    * name changes (more than one name returned by find_names),
    * other operators with the same parent company (reported once per family),
    * possible namesakes: operators whose name matches one of this operator's
      regular expressions but that are neither the operator itself nor one of
      its siblings (each distinct set of namesake ids is reported once).

    Parameters
    ----------
    n_largest : number of rows of `largest_observations_ordered` to inspect.

    Returns
    -------
    list of str
        One message per operator for which at least one issue was found.
    """
    parents_handled = []
    namesakes_handled = []
    messages = []

    # Iterate over the values directly; Series.iteritems() no longer exists in pandas >= 2.0.
    for id_ in largest_observations_ordered['OPERATOR_ID'].iloc[:n_largest]:
        current_name = find_latest_name(id_)
        message = ''

        names = find_names(id_)
        if len(names) > 1:
            message += f"\n{current_name} (OPERATOR_ID {id_}) has changed its name:\n"
            for name in names:
                message += f"\n\tWas named {name['name']} from {name['start_year']} to {name['end_year']}.\n"

        same_parent = compare_parents(id_)
        same_parent_ids = [sibling[0] for sibling in same_parent]
        same_parent_names = [sibling[1] for sibling in same_parent]

        # Skip the family if it was already reported from one of the siblings.
        if same_parent and id_ not in parents_handled:
            message += f"\n{current_name} (OPERATOR_ID {id_}) has the same parent company as:\n"
            for sibling in same_parent:
                message += f"\n\t{sibling[1]} (OPERATOR_ID {sibling[0]})\n"
            parents_handled.extend(same_parent_ids)

        re_ = extract_res(id_)
        if re_:  # operators without a registered pattern cannot have namesakes looked up
            namesakes = find_namesakes(re_).drop_duplicates()
            # Siblings are already covered by the parent report: exclude them by name
            # and by id (the id filter previously compared ids against the name list).
            namesakes = namesakes.loc[~namesakes['NAME'].isin(same_parent_names)]
            namesakes = namesakes.loc[~namesakes['OPERATOR_ID'].isin(same_parent_ids)]
            namesakes = namesakes.loc[namesakes['OPERATOR_ID'] != id_]

            namesake_ids = set(namesakes['OPERATOR_ID'])
            if namesake_ids and namesake_ids not in namesakes_handled:
                namesakes_handled.append(namesake_ids)

                message += f"\n{current_name} (OPERATOR_ID {id_}) may have a namesake or namesakes:\n"
                for _, row in namesakes.iterrows():
                    message += f"\n\t{row['NAME']} (OPERATOR_ID {row['OPERATOR_ID']})\n"

        if message:
            messages.append(message)

    return messages

In [23]:
import pickle

# Run the analysis and store the resulting list of messages as a pickle stamped with the run date.
issues_to_address = analyze_sample()
with open(f'../preprocessed_data/issues_to_address_{today}.pickle', 'wb') as file:
    pickle.dump(issues_to_address, file)