# 7. (Functions to) identify all data issues

In [1]:
# Preprocessed inputs (feather files under ../preprocessed_data).
pipeline_2010_selected_file = '../preprocessed_data/pipelines_2010_selected_2019-08-22.feather'
incidents_selected_file = '../preprocessed_data/incidents_selected_2019-08-22.feather'
largest_observations_file = '../preprocessed_data/largest_companies_2019-09-01.feather'
sample_file = '../preprocessed_data/sample_2019-09-02.feather'

# Raw inputs (feather files under ../data).
pipelines_2010_raw_file = '../data/pipelines_2010_2019-08-11.feather'
# NOTE(review): unlike its siblings this name lacks the `_file` suffix, and it is
# not read anywhere in the cells shown here.
incidents_raw = '../data/incidents_2019-08-11.feather'
# CSV that is read back into `company_res` (columns OPERATOR_ID, NAME, RES).
# NOTE(review): "regular_impressions" presumably means "regular expressions" — confirm before renaming.
regular_impressions_file = '../input/company_names_res_2019-09-01.csv'

In [2]:
# Number of rows taken from the top of `largest_observations_ordered` by analyze_sample().
sample_len = 150

## Setup

In [3]:
import pandas as pd
import numpy as np
from datetime import date
from functools import partial

# ISO-formatted run date (YYYY-MM-DD); interpolated into the names of the files this notebook writes.
today = date.today().isoformat()

In [4]:
import wrds

# Open a WRDS connection.
# NOTE(review): `db` is not referenced in the cells shown here, and the username is
# hard-coded — consider reading it from configuration instead.
db = wrds.Connection(wrds_username='juujian')

Loading library list...
Done


## Load data

In [5]:
# Load the selected pipeline table and preview two random rows.
pipelines_2010 = pd.read_feather(pipeline_2010_selected_file)
pipelines_2010.sample(2)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,PRE_1940_MILES,1940_MILES,1950_MILES,1960_MILES,1970_MILES,1980_MILES,1990_MILES,2000_MILES,2010_MILES,PERC_OFFSHORE,AVG_AGE
2945,31563,2010,"WHITECAP PIPE LINE COMPANY, L.L.C.",Crude Oil,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1755,15915,2012,"PIPELINES OF PUERTO RICO INCD, THE",Refined and/or Petroleum Product (non-HVL),9.5,0.0,0.0,0.0,0.0,8.5,0.0,0.0,0.0,0.0,0.0,0.0,55.0


In [6]:
# Load the selected incidents table and preview two random rows.
incidents = pd.read_feather(incidents_selected_file)
incidents.sample(2)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,COMMODITY,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE,ONSHORE
311,22610,2010-12-05 20:30:00,"MAGELLAN PIPELINE COMPANY, LP",REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,False,False,39.13805,-94.6025,True
3546,32551,2018-11-02 15:00:00,"BKEP PIPELINE, LLC",CRUDE OIL,False,True,34.77353,-97.44631,True


In [7]:
# Load the analysis sample; it is the default table for extract_value, compare_values
# and find_namesakes below. Preview two random rows.
sample = pd.read_feather(sample_file)
sample.sample(2)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,PERC_OFFSHORE,AVG_AGE,PARENT,INCIDENTS,SIGNIFICANT_INCIDENTS
1659,39205,2017,"DAPL-ETCO OPERATIONS MANAGEMENT, LLC",crude,795.9,0.0,0.0,0.0,0.0,0.0,0.13,0.0,0.0,0.05,129.17,0.0,5.044066,Energy Transfer,7.0,1.0
1642,39105,2016,VALERO PARTNERS OPERATING CO. LLC,non-hvl,57.315,0.0,0.0,0.0,0.0,2.848,25.322,10.732,0.0,16.589,5.857,0.0,31.783758,Valero Energy Corporation,0.0,0.0


In [8]:
# Load the table of largest observations; analyze_sample() iterates over its
# OPERATOR_ID column in stored order. Preview the first five rows.
largest_observations_ordered = pd.read_feather(largest_observations_file)
largest_observations_ordered.head(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,TOTAL_MILES
0,31618,2018.0,ENTERPRISE PRODUCTS OPERATING LLC,8311.369
1,32109,2018.0,"ONEOK NGL PIPELINE, LLC",4756.61
2,22610,2014.0,"MAGELLAN PIPELINE COMPANY, LP",4505.5
3,2552,2015.0,COLONIAL PIPELINE CO,4500.92
4,31684,2017.0,PHILLIPS 66 PIPELINE LLC,4474.1


In [9]:
# Load the raw pipeline table; it is the default lookup table for find_info / find_latest_info.
pipelines_2010_raw = pd.read_feather(pipelines_2010_raw_file)

## 7.1 Define functions for analysis

### 7.1.1 Functions to compare with raw data

In [10]:
def find_info(OPERATOR_ID, info_col: str, title: str, df = pipelines_2010_raw, id_col = 'OPERATOR_ID',
              year_col = 'REPORT_YEAR', fuzzy=False):
    """List the distinct values an operator reported in ``info_col`` and the years each was reported.

    Parameters
    ----------
    OPERATOR_ID : int or str
        Operator identifier; converted with ``int()`` before being compared to ``df[id_col]``.
    info_col : str
        Column whose distinct (non-null) values are collected.
    title : str
        Key under which each value is stored in the returned dictionaries.
    df : pandas.DataFrame
        Table to search.
    id_col : str
        Column of ``df`` holding operator identifiers.
    year_col : str
        Column of ``df`` holding the report year.
    fuzzy : bool
        When True and exactly two values were found whose lower-cased
        ``fuzz.ratio`` is at least 95, only the first value is kept.

    Returns
    -------
    list of dict
        One ``{title: value, 'start_year': ..., 'end_year': ...}`` per distinct
        value, in sorted value order; empty if the operator has no rows.
    """
    # Restrict to this operator once, so the year ranges below are not polluted by
    # other operators that happen to report the same value (e.g. an identical name).
    operator_rows = df[df[id_col] == int(OPERATOR_ID)]

    # Nulls are dropped: they can never equal themselves in the filter below and
    # would only produce entries with NaN year ranges.
    values = np.unique(operator_rows[info_col].dropna()).tolist()

    result = []
    for value in values:
        years = operator_rows.loc[operator_rows[info_col] == value, year_col]
        result.append({title: value, 'start_year': years.min(), 'end_year': years.max()})

    if fuzzy and len(result) == 2:
        # Imported lazily so the third-party dependency is only needed for fuzzy lookups.
        from fuzzywuzzy import fuzz

        # NOTE(review): the surviving entry keeps only its own year range; the years
        # of the discarded near-duplicate are not merged in.
        if fuzz.ratio(result[0][title].lower(), result[1][title].lower()) >= 95:
            result = [result[0]]

    return result

# Convenience wrappers around find_info for the street-address and company-name columns.
find_address = partial(find_info, info_col='PARTA4STREET', title='address')
# Name lookups use fuzzy=True, so two near-identical spellings collapse into one entry.
find_names = partial(find_info, info_col='PARTA2NAMEOFCOMP', title='name', fuzzy=True)
# Example lookup.
find_names('4906')

[{'name': 'EXXONMOBIL PIPELINE CO', 'start_year': 2010, 'end_year': 2018}]

In [11]:
import operator

def find_latest_info(OPERATOR_ID, info_col: str, df = pipelines_2010_raw,
                     id_col = 'OPERATOR_ID', year_col = 'REPORT_YEAR'):
    """Return the ``info_col`` value with the greatest ``end_year`` for an operator.

    Parameters
    ----------
    OPERATOR_ID : int or str
        Operator identifier, passed through to ``find_info``.
    info_col : str
        Column whose most recently reported value is wanted.
    df : pandas.DataFrame
        Table to search.
    id_col, year_col : str
        Identifier and report-year columns of ``df``.

    Returns
    -------
    The value whose ``end_year`` is largest (the first such value on ties).

    Raises
    ------
    ValueError
        If ``find_info`` returns no values for the operator.
    """
    # `df` must be forwarded explicitly; otherwise find_info silently falls back to
    # its own default table and a caller-supplied frame is ignored.
    data_points = find_info(OPERATOR_ID=OPERATOR_ID, info_col=info_col, title='value',
                            df=df, id_col=id_col, year_col=year_col, fuzzy=False)
    if not data_points:
        # max() on an empty list also raises ValueError; this just gives a usable message.
        raise ValueError(f'No {info_col} values found for OPERATOR_ID {OPERATOR_ID}.')
    latest_info = max(data_points, key=operator.itemgetter('end_year'))['value']
    return latest_info

# Convenience wrapper: the most recently reported company name of an operator.
find_latest_name = partial(find_latest_info, info_col='PARTA2NAMEOFCOMP')
# Example lookup.
find_latest_name('12470')

'MID - VALLEY PIPELINE CO'

### 7.1.2 Compare parents

In [12]:
def extract_value(OPERATOR_ID, col, df = sample, id_col = 'OPERATOR_ID'):
    """Return the single distinct ``col`` value recorded for an operator.

    Rows of ``df`` whose ``id_col`` equals ``OPERATOR_ID`` are selected and the
    distinct values of ``col`` among them are inspected.

    Raises
    ------
    LookupError
        If no distinct value is found, or if more than one is found.
    """
    is_operator = df[id_col] == OPERATOR_ID
    distinct = df.loc[is_operator][col].unique()
    how_many = len(distinct)

    # Guard clauses first; the happy path is the single remaining case.
    if how_many == 0:
        raise LookupError(f'OPERATOR_ID or {col} not found.')
    if how_many > 1:
        raise LookupError(f'More than one value found for {col}.')
    return distinct[0]
        
# Convenience wrapper: the PARENT value of an operator.
extract_parent = partial(extract_value, col='PARENT')
# Example lookup.
extract_parent('300')

'Plains GP Holding'

In [13]:
def compare_values(OPERATOR_ID, col, df = sample, id_col = 'OPERATOR_ID'):
    """Find the other operators that share this operator's ``col`` value.

    Parameters
    ----------
    OPERATOR_ID
        Operator identifier, compared directly against ``df[id_col]``.
    col : str
        Column whose value is compared across operators.
    df : pandas.DataFrame
        Table to search.
    id_col : str
        Column of ``df`` holding operator identifiers.

    Returns
    -------
    list of tuple
        ``(operator_id, latest_name)`` for every other operator with the same
        value; empty when there is none or when the operator's value is null.
        Names come from ``find_latest_name``, which uses its own default table
        rather than ``df``.

    Raises
    ------
    LookupError
        Propagated from ``extract_value`` when the operator has no value or
        more than one distinct value in ``col``.
    """
    value = extract_value(OPERATOR_ID, col=col, df=df, id_col=id_col)

    # A null value never compares equal to anything (not even itself), so the
    # equality filter below would find no rows; report "no peers" instead of failing.
    if pd.isna(value):
        return []

    rows = df.loc[df[col] == value]
    # Filter instead of list.remove(): remove() raises ValueError if the id is absent.
    ids_with_same_values = [id_ for id_ in rows[id_col].unique().tolist() if id_ != OPERATOR_ID]
    name_with_same_values = [find_latest_name(id_) for id_ in ids_with_same_values]
    return list(zip(ids_with_same_values, name_with_same_values))
    
# Convenience wrapper: other operators that share this operator's PARENT.
compare_parents = partial(compare_values, col='PARENT')
# Example lookup.
compare_parents('22830')

[('15774', 'NORTH DAKOTA PIPELINE COMPANY LLC'),
 ('26026', 'MARKWEST RANGER PIPELINE COMPANY, L.L.C.'),
 ('31570', 'TESORO HIGH PLAINS PIPELINE COMPANY LLC'),
 ('31574', 'WESTERN REFINING LOGISTICS, LP'),
 ('32147', 'MARATHON PIPE LINE LLC'),
 ('38933', 'TESORO LOGISTICS OPERATIONS, LLC'),
 ('39013', 'TESORO SOCAL PIPELINE COMPANY LLC'),
 ('39029', 'TESORO LOGISTICS NORTHWEST PIPELINE LLC'),
 ('39347', 'ILLINOIS EXTENSION PIPELINE COMPANY, L.L.C.')]

## 7.2 Create regular expressions

In [14]:
# Distinct operator ids present in the sample; the name export below loops over them.
ids = sample['OPERATOR_ID'].unique()

In [15]:
# Collect one [OPERATOR_ID, name] pair for every name returned by find_names for each sampled operator.
entry = []
for id_ in ids:
    entry.extend([id_, record['name']] for record in find_names(id_))

# Write the pairs to a dated CSV.
pd.DataFrame(entry, columns=['OPERATOR_ID', 'NAME']).to_csv(
    f'../input/company_names_{today}.csv',
    index=False,
)

For this step, we manually edit the exported company names file, adding a regular expression for each name in a `RES` column, and then import the resulting .csv back into Python.

In [16]:
# Load the name-to-regular-expression table (columns OPERATOR_ID, NAME, RES) and preview five random rows.
company_res = pd.read_csv(regular_impressions_file)
company_res.sample(5)

Unnamed: 0,OPERATOR_ID,NAME,RES
57,26026,"MARKWEST ENERGY APPALACHIA, LLC",.*markwest.*
153,39205,"DAPL-ETCO OPERATIONS MANAGEMENT, LLC",.*dapl.*
24,11551,"LION OIL TRADING & TRANSPORTATION, INC",.*lion\soil.*
162,39535,TORRANCE PIPELINE COMPANY LLC,.*torrance.*
138,32543,"DENBURY GREEN PIPELINE-TEXAS, LLC",.*denbury.*


In [17]:
# Inspect the column dtypes as read from the CSV.
company_res.dtypes

OPERATOR_ID     int64
NAME           object
RES            object
dtype: object

In [18]:
# Cast OPERATOR_ID to str: extract_values() compares this column against str(OPERATOR_ID).
company_res['OPERATOR_ID'] = company_res['OPERATOR_ID'].astype('str')
company_res.dtypes

OPERATOR_ID    object
NAME           object
RES            object
dtype: object

In [19]:
def extract_values(OPERATOR_ID, col, df = company_res, id_col = 'OPERATOR_ID'):
    """Return all distinct ``col`` values recorded for an operator as a list.

    ``OPERATOR_ID`` is converted to ``str`` before being compared with
    ``df[id_col]``. The list is empty when no row matches.
    """
    wanted_id = str(OPERATOR_ID)
    is_operator = df[id_col] == wanted_id
    distinct = df.loc[is_operator][col].unique()
    return distinct.tolist()

# Convenience wrapper: all regular expressions (RES) registered for an operator.
extract_res = partial(extract_values, col='RES')
# Example lookup.
extract_res('31684')

['.*conocophillips.*', '.*phillips\\s66.*']

In [20]:
# Persist the pattern table as a dated feather file.
company_res.to_feather(f'../preprocessed_data/company_res_{today}.feather')

### 7.2.1 Check regular expression validity

In [21]:
# Sanity check: report every pattern that matches no NAME in the sample.
# Iterating the Series directly avoids binding the global `_`, which IPython
# uses for the last cell output.
for expression in company_res['RES']:
    # na=False: a missing NAME counts as "no match" instead of putting NaN in the boolean mask.
    matches = sample[sample['NAME'].str.match(pat=expression, case=False, na=False)]
    if len(matches) == 0:
        print(f'Regular expression {expression} does not match anything!')

### 7.2.2 Function to find namesakes

In [22]:
def find_namesakes(re_, df = sample, col = 'NAME', id_col = 'OPERATOR_ID'):
    """Return the distinct (id, name) pairs whose name matches any of the given patterns.

    Parameters
    ----------
    re_ : str or list of str
        One regular expression or a list of them. Matching is case-insensitive
        and anchored at the start of the string (``Series.str.match``).
    df : pandas.DataFrame
        Table to search.
    col : str
        Column of ``df`` holding the names to match.
    id_col : str
        Column of ``df`` holding operator identifiers.

    Returns
    -------
    pandas.DataFrame
        Columns ``[id_col, col]`` with duplicate rows removed and the original
        row index retained. Empty (but with both columns) when ``re_`` is an
        empty list or nothing matches.
    """
    if isinstance(re_, str):
        re_ = [re_]

    columns = [id_col, col]

    # One frame per pattern, always searching the caller's df/col/id_col.
    # na=False: rows with a missing name are treated as non-matching.
    frames = [
        df.loc[df[col].str.match(expression, case=False, na=False), columns]
        for expression in re_
    ]

    if not frames:
        # No patterns at all: return an empty frame rather than None so callers
        # can keep filtering on the expected columns.
        return df[columns].iloc[:0]

    # pd.concat replaces DataFrame.append (removed in pandas 2.0); de-duplicating after
    # concatenation also removes rows matched by more than one pattern.
    return pd.concat(frames).drop_duplicates()
    
# Example: sampled operators whose name matches the pattern (case-insensitive).
find_namesakes(r'.*exxonmobil.*')

Unnamed: 0,OPERATOR_ID,NAME
235,4906,EXXONMOBIL PIPELINE CO
370,12624,EXXONMOBIL OIL CORPORATION
382,12628,EXXONMOBIL PIPELINE CO
412,12634,EXXONMOBIL OIL CORPORATION
832,26134,EXXONMOBIL OIL CORP - WEST COAST


## 7.3 Create main loop

In [23]:
from IPython.core.debugger import set_trace

def analyze_sample():
    """Build human-readable issue reports for the first ``sample_len`` operators.

    Walks ``largest_observations_ordered['OPERATOR_ID']`` in stored order and,
    for each operator, composes one message covering up to three findings:

    * the operator reported more than one name (``find_names``);
    * other operators share its parent (``compare_parents``), unless this
      operator was already listed as a sibling of an earlier one;
    * other operators match its name patterns (``extract_res`` /
      ``find_namesakes``) without being itself or a same-parent sibling, unless
      exactly the same set of namesake ids was already reported.

    Returns
    -------
    list of str
        One message per operator with at least one finding, in processing order.
    """
    parents_handled = []
    namesakes_handled = []
    messages = []

    # Plain iteration replaces Series.iteritems(), which was removed in pandas 2.0.
    for id_ in largest_observations_ordered['OPERATOR_ID'].iloc[:sample_len]:
        current_name = find_latest_name(id_)
        message = ''

        # --- name changes -------------------------------------------------
        names = find_names(id_)
        if len(names) > 1:
            message += f"\n{current_name} (OPERATOR_ID {id_}) has changed its name:\n"
            for name in names:
                message += f"\n\tWas named {name['name']} from {name['start_year']} to {name['end_year']}.\n"

        # --- operators with the same parent -------------------------------
        same_parent = compare_parents(id_)
        same_parent_ids = [sibling[0] for sibling in same_parent]
        same_parent_names = [sibling[1] for sibling in same_parent]

        if same_parent and id_ not in parents_handled:
            message += f"\n{current_name} (OPERATOR_ID {id_}) has the same parent company as:\n"
            for sibling in same_parent:
                message += f"\n\t{sibling[1]} (OPERATOR_ID {sibling[0]})\n"
            # Siblings are marked so the same family is not listed again from their side.
            parents_handled.extend(same_parent_ids)

        # --- namesakes outside the parent family --------------------------
        namesakes = find_namesakes(extract_res(id_))
        if namesakes is None:
            # Defensive: a missing result (no patterns for this id) means no namesakes.
            namesakes = pd.DataFrame(columns=['OPERATOR_ID', 'NAME'])

        # Exclude siblings by name AND by id. The id filter must use the sibling ids;
        # comparing OPERATOR_ID against sibling names never excluded anything.
        is_sibling = (namesakes['NAME'].isin(same_parent_names)
                      | namesakes['OPERATOR_ID'].isin(same_parent_ids))
        is_self = namesakes['OPERATOR_ID'] == id_
        namesakes = namesakes.loc[~(is_sibling | is_self)]

        namesake_ids = set(namesakes['OPERATOR_ID'])
        if len(namesakes) > 0 and namesake_ids not in namesakes_handled:
            namesakes_handled.append(namesake_ids)

            message += f"\n{current_name} (OPERATOR_ID {id_}) may have a namesake or namesakes:\n"
            for _, row in namesakes.iterrows():
                message += f"\n\t{row['NAME']} (OPERATOR_ID {row['OPERATOR_ID']})\n"

        if message:
            messages.append(message)

    return messages

In [24]:
import pickle

# Run the analysis and pickle the resulting list of messages to a dated file.
issues_to_address = analyze_sample()
with open(f'../preprocessed_data/issues_to_address_{today}.pickle', 'wb') as file:
    pickle.dump(issues_to_address, file)