<a href="https://colab.research.google.com/github/joanneim/code_samples/blob/main/apipullex_get_company_legalstructure.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

About the OpenCorporates API:

OpenCorporates is a global database of official company records.

In this notebook, we use the OpenCorporates API to identify each entity's legal structure, together with the name, jurisdiction, and legal structure of its ultimate parent company.




In [None]:
# Dependencies
%%capture
from __future__ import print_function
import json
import urllib
import urllib.request
import time
from google.colab import drive
import requests, re, bs4, json
from bs4 import BeautifulSoup
import requests, lxml
import math

!pip install "openpyxl==3.0"
import pandas as pd

In [None]:
# Dataset

# Name of the input workbook and base name of the output file
INPUT_FILE_NAME, OUTPUT_FILE_NAME = "corpwatch_crsp_unmatched.xlsx", "parentconames_to_oc_parent_2022"

# Make Google Drive available under /content/drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:

# Full path of the input workbook on the mounted drive
# (an earlier run used ".../data_in/list_to_oc.xlsx" instead)

file_path = "/content/drive/My Drive/OwnershipPatterns/data_in/{}".format(INPUT_FILE_NAME)


In [None]:
def get_data():
  '''Load the input workbook into the global ``df`` and add empty result columns.

  Reads the Excel file at ``file_path`` (the ``stn_name`` column is converted
  with ``str``), stores the list of result variables in the global
  ``varlist_i`` and adds one column ``opencorp_<v>`` filled with empty strings
  for every ``v`` in that list. Each column is inserted at position 0, so the
  new columns end up in front of the original ones, in reverse list order.
  '''

  global df, varlist_i

  # Excel spreadsheet of parent company names that was generated from STATA code
  df = pd.read_excel(file_path, converters={'stn_name': str})

  # The same six attributes are collected for the entity itself, its parent,
  # its ultimate beneficial owner (ub) and its ultimate controlling company (uc)
  attributes = ['name', 'id', 'jcode', 'inc_date', 'diss_date', 'type']
  prefixes = ['', 'parent_', 'ub_', 'uc_']
  varlist_i = [prefix + attribute for prefix in prefixes for attribute in attributes]

  # Columns whose entries will be populated by the API
  n_rows = df.shape[0]
  for v in varlist_i:
    df.insert(loc=0, column='opencorp_' + v, value=[""] * n_rows)


In [None]:

# OpenCorporates API configuration.
# We will be retrieving company records, parent companies and jurisdiction (country) codes.
  # 1. API documentation is available here: http://api.opencorporates.com/documentation/API-Reference
  # 2. The Investigator’s Handbook includes useful resources and tutorials: https://blog.opencorporates.com/2017/10/31/the-investigators-handbook-a-guide-to-using-opencorporates/
  # 3. OpenCorporates Licence requires attribution when our data has been used. You can view the licence in full here: https://opencorporates.com/legal/licence
import os
import warnings

# Set up token and URL
# The token is a credential, so it is read from the environment instead of being
# stored in the notebook. Set OPENCORPORATES_API_TOKEN before running this cell.
api_token = os.environ.get("OPENCORPORATES_API_TOKEN", "")

if not api_token:
  warnings.warn("OPENCORPORATES_API_TOKEN is not set; OpenCorporates requests will be sent without a valid API token.")

search_url = 'https://api.opencorporates.com/v0.4.8/'


In [None]:
def _empty(x):

  ''' evaluates whether input is empty '''

  try:
    status = (x==None)
  except:
    pass

  if status==False:
    try:
      status = (pd.isnull(x))
    except:
      pass

  if status==False:
    try:
      status = len(x)==0
    except:
      pass

  if status==False:
    try:
      status = x==""
    except:
      pass

  if status==False:
    try:
      status = (pd.isna(x))
    except:
      pass

  if status==False:
    return False
  else:
    return True


In [None]:
# Example search: look up one company name, restricted to jurisdiction us_de,
# and print the raw JSON response
data_params = dict(
    q="3cm holdings llc",
    jurisdiction_code='us_de',
    api_token=api_token,
    order='score',
    normalise_company_name=True)

r = requests.get(search_url + '/companies/search', params=data_params)

d = r.json()
print(d)

{'api_version': '0.4.8', 'results': {'companies': [{'company': {'name': '3CM HOLDINGS, LLC', 'company_number': '5133012', 'jurisdiction_code': 'us_de', 'incorporation_date': '2012-04-02', 'dissolution_date': None, 'company_type': 'Limited Liability Company', 'registry_url': 'https://icis.corp.delaware.gov/Ecorp/EntitySearch/NameSearch.aspx', 'branch': None, 'branch_status': None, 'inactive': None, 'current_status': None, 'created_at': '2015-09-14T11:01:09+00:00', 'updated_at': '2020-02-22T06:09:21+00:00', 'retrieved_at': '2019-08-26T14:05:15+00:00', 'opencorporates_url': 'https://opencorporates.com/companies/us_de/5133012', 'previous_names': [{'company_name': 'HIGH POINT INFRASTRUCTURE PARTNERS, LLC'}], 'source': {'publisher': 'Delaware Department of State: Division of Corporations', 'url': 'http://www.corp.delaware.gov/', 'retrieved_at': '2019-08-26T14:05:15+00:00'}, 'registered_address': None, 'registered_address_in_full': None, 'industry_codes': [], 'restricted_for_marketing': None,

In [None]:
def trycatch(d):
  '''Return True when a search response contains an empty company list.

  Returns False when the list holds at least one company, and also when
  ``d['results']['companies']`` cannot be read or measured at all.
  '''
  try:
    hits = d['results']['companies']
    no_hits = (len(hits) == 0)
  except:
    no_hits = False
  return no_hits

In [None]:
# (API field, dataframe column suffix) pairs copied from the matched company
_CHILD_COLUMNS = (
    ('name', 'name'),
    ('company_number', 'id'),
    ('jurisdiction_code', 'jcode'),
    ('incorporation_date', 'inc_date'),
    ('dissolution_date', 'diss_date'),
    ('company_type', 'type'))


def _search_companies(query, jurisdiction_code=None):
  '''Run one company search and return its list of hits ([] when there are none).'''

  data_params = {
      'q': query,
      'api_token': api_token,
      'order': 'score',
      'normalise_company_name': True}

  if jurisdiction_code is not None:
    data_params['jurisdiction_code'] = jurisdiction_code

  # rstrip avoids a doubled slash when search_url already ends with '/'
  r = requests.get(search_url.rstrip('/') + '/companies/search', params=data_params)

  d = r.json()

  # A response without a 'results' / 'companies' entry is treated as "no hits"
  try:
    return d['results']['companies'] or []
  except (KeyError, TypeError):
    return []


def _find_child_hits(i):
  '''Return the hits of the first search variant that finds anything for row i.'''

  for column in ('stn_orig_parent_name', 'stn_name'):

    name = df.loc[i, column]

    # Missing or blank names cannot be searched for (and cannot take a "*" suffix)
    if not isinstance(name, str) or not name.strip():
      continue

    # The search call is quite loose. We are more restrictive by adding a *
    # which specifies an exact ordering of words
    for query in (name, name + "*"):

      # Delaware first, then without any jurisdiction restriction
      for jurisdiction_code in ("us_de", None):

        hits = _search_companies(query, jurisdiction_code)

        if hits:
          return hits

  return []


def _populate_de(i):
  '''Subroutine for finding the ultimate parent company of an entity.

  Searches OpenCorporates for the company in row ``i`` of the global ``df``
  and writes the match into the ``opencorp_*`` columns of that row, then
  fetches the matched company's full record and hands it to ``find_parent``.

  Search variants are tried in this order and the first one that returns at
  least one hit is used; later variants run only when the earlier ones found
  nothing:

    for ``stn_orig_parent_name``, then ``stn_name``:
      name            in us_de, then in any jurisdiction
      name + "*"      in us_de, then in any jurisdiction

  Among the hits, the first company whose ``current_status`` is None or
  'Active' is recorded. If the row has (or already had) an ``opencorp_id``,
  the parent lookup is performed for it.

  Parameters:
    i: row label of ``df`` to populate.

  Returns:
    None. Exceptions raised by the HTTP requests themselves propagate.
  '''

  # Populate the child information with first child with active/or none status
  for item in _find_child_hits(i):

    try:
      company = item['company']
      status = company['current_status']
      # Read every field before writing so a partial record never reaches df
      values = [company[api_key] for api_key, _ in _CHILD_COLUMNS]
    except (KeyError, TypeError):
      # Malformed hit: skip it and look at the next one
      continue

    if status is None or status == 'Active':

      for (_, suffix), value in zip(_CHILD_COLUMNS, values):
        df.loc[i, 'opencorp_' + suffix] = value

      break

  # Look for parent of child
  if _empty(df.loc[i, 'opencorp_id']) or _empty(df.loc[i, 'opencorp_jcode']):
    return

  child_jurisdiction_code = df.loc[i, 'opencorp_jcode']
  child_id = str(df.loc[i, 'opencorp_id'])

  # Fetch the full company record of the child
  r = requests.get(search_url.rstrip('/') + '/companies/' + child_jurisdiction_code + '/' + child_id,
                   params={'api_token': api_token})
  d = r.json()

  # NOTE: called with one argument, find_parent writes to the row given by the
  # notebook-level variable `i`; callers must keep that variable equal to the
  # argument passed here (the driver cells do).
  find_parent(d)

  return


In [None]:
from pandas.core.dtypes.dtypes import PandasExtensionDtype
# (API field, dataframe column suffix) pairs copied from a related company's profile
_RELATED_FIELDS = (
    ('name', 'name'),
    ('company_number', 'id'),
    ('jurisdiction_code', 'jcode'),
    ('incorporation_date', 'inc_date'),
    ('dissolution_date', 'diss_date'),
    ('company_type', 'type'))


def _record_related_company(i, entity, prefix):
  '''Store one related company in row i under the opencorp_<prefix>_* columns.'''

  try:
    name = entity['name']
  except (KeyError, IndexError, TypeError):
    # No usable entity (None, empty, or without a name): nothing to record
    return

  # The name is stored first so that it is kept even when the profile lookup
  # below fails
  df.loc[i, 'opencorp_' + prefix + '_name'] = name

  # Find the related company's own profile. It is kept in its own variable so
  # that the caller's record of the child company is never replaced.
  try:
    r = requests.get(search_url.rstrip('/') + '/companies/' + entity['jurisdiction_code'] + '/' + entity['company_number'],
                     params={'api_token': api_token})
    profile = r.json()['results']['company']
    values = [profile[api_key] for api_key, _ in _RELATED_FIELDS]
  except (KeyError, TypeError, ValueError, requests.RequestException):
    # Best effort: keep the name recorded above and leave the other columns as they are
    return

  for (_, suffix), value in zip(_RELATED_FIELDS, values):
    df.loc[i, 'opencorp_' + prefix + '_' + suffix] = value


def find_parent(d, i=None):
  '''Record the parent and ultimate owners of a company in the global ``df``.

  Reads the company record in ``d['results']['company']`` and fills, for row
  ``i`` of ``df``:

    opencorp_parent_*  from 'controlling_entity'; if no parent name is
                       recorded after that, from 'home_company' (the entry
                       used for foreign entities)
    opencorp_uc_*      from the first entry of 'ultimate_controlling_company'
    opencorp_ub_*      from the first entry of 'ultimate_beneficial_owners'

  For each related company the name is recorded from ``d`` and the remaining
  columns from that company's own profile, fetched with a further request.
  All four lookups read the same record ``d``. Missing entries and failed
  profile lookups are skipped.

  Parameters:
    d: parsed JSON response of a single-company request.
    i: row label of ``df`` to write to. When omitted, the notebook-level
       variable ``i`` is used, which keeps ``find_parent(d)`` working.

  Returns:
    None.

  Raises:
    NameError: if ``i`` is omitted and no notebook-level ``i`` exists.
  '''

  # A response without a company record has nothing to offer
  try:
    company = d['results']['company']
  except (KeyError, TypeError):
    return

  if not isinstance(company, dict):
    return

  if i is None:
    try:
      i = globals()['i']
    except KeyError:
      raise NameError("name 'i' is not defined")

  # Controlling entity
  _record_related_company(i, company.get('controlling_entity'), 'parent')

  # Home company, only when no parent name is recorded
  if _empty(df.loc[i, 'opencorp_parent_name']):
    _record_related_company(i, company.get('home_company'), 'parent')

  # Ultimate controlling company, then ultimate beneficial owner
  for api_key, prefix in (('ultimate_controlling_company', 'uc'),
                          ('ultimate_beneficial_owners', 'ub')):

    # Only the first listed entry is used; absent, None or empty values are skipped
    try:
      first = company[api_key][0]
    except (KeyError, IndexError, TypeError):
      continue

    _record_related_company(i, first, prefix)

In [None]:
# Get Data
# Instead of rebuilding df with get_data(), load the pickled dataframe from data_out/
df = pd.read_pickle("/content/drive/My Drive/OwnershipPatterns/data_out/{}".format(OUTPUT_FILE_NAME))

In [None]:
# Example search: look up one company name, restricted to jurisdiction us_de,
# and print the raw JSON response
data_params = dict(
    q="wtg gas processing",
    jurisdiction_code='us_de',
    api_token=api_token,
    order='score',
    normalise_company_name=True)

r = requests.get(search_url + '/companies/search', params=data_params)

d = r.json()
print(d)

{'api_version': '0.4.8', 'results': {'companies': [], 'page': 1, 'per_page': 30, 'total_pages': 0, 'total_count': 0}}


In [None]:
# Spot-check a single row: run the lookup for row 2958 and print that row.
# `i` is deliberately assigned at notebook level: find_parent (called from
# _populate_de) reads the row index from the global `i` when it is not passed one.
i = 2958
_populate_de(i)
print(df.iloc[i])

In [None]:
# Reload df from the pickled dataframe in data_out/
df = pd.read_pickle("".join(["/content/drive/My Drive/OwnershipPatterns/data_out/", OUTPUT_FILE_NAME]))

In [None]:
# Rows processed in this run: START_ROW (inclusive) up to STOP_ROW (exclusive)
START_ROW, STOP_ROW = 2437, 2909

# df is pickled whenever i is a multiple of CHECKPOINT_EVERY, and the current
# row is printed for inspection whenever i is a multiple of INSPECT_EVERY
CHECKPOINT_EVERY = 500
INSPECT_EVERY = 100

OUTPUT_PATH = "/content/drive/My Drive/OwnershipPatterns/data_out/" + OUTPUT_FILE_NAME

try:

  # The loop variable must remain the notebook-level name `i`: find_parent
  # reads the row index from the global `i` when it is not passed one.
  for i in range(START_ROW, STOP_ROW):

    # Print iteration no.

    print(i)

    # Populate

    _populate_de(i)

    # Periodic checkpoint

    if i % CHECKPOINT_EVERY == 0:

      df.to_pickle(OUTPUT_PATH)

    # Print to inspect the result

    if i % INSPECT_EVERY == 0:

      print(df.iloc[i])

finally:

  # Always persist what has been retrieved so far, so that a failed request
  # part-way through the run does not discard the rows already populated

  df.to_pickle(OUTPUT_PATH)

# Final export

df.to_excel(OUTPUT_PATH + '.xlsx', sheet_name='Sheet_name_1')