In [None]:
# script: edgar_cik_company_lookup
# author: Jennifer Lammers Zimmer, jamm@umich.edu
# last updated: 01/26/2023
# This script uses the Edgar Python package to query the SEC database to find
# CIKs for companies in a CSV file. A lookup on names is performed first to find
# the official Edgar name and then Edgar is queried again to find the CIKs with the 
# official names. The final output is written to a CSV file and includes official 
# company name and CIK.

#There are two parts to this file, PART A and PART B.
#
# Use PART A to look up SEC listed company names
# Use PART A first if you need to verify company names with official SEC listing or 
# find company names that don't have a match. PART A will help you create a file for 
# review and will minimize errors in the CIK look up in PART B.

# Use PART B to look up company CIKs
# You can use PART B after you have cleaned up the output from PART A, or you can dive right in
# with your own file list and use PART B. STEP 3.1, which will perform a fuzzy company name
# lookup and CIK lookup at the same time.

# Edgar is a Python Package to query the SEC Edgar Database
# https://pypi.org/project/edgar/
# 
# 


In [None]:
# Imports
# Install and Call Edgar Package
# note: pip install only needed if running on Colabs or you don't have Edgar installed
# in your local environment already. 

!pip install Edgar

from edgar import Edgar
# Single shared Edgar client; the lookup cells further down call its methods
# (find_company_name, get_cik_by_company_name, match_company_by_company_name).
edgar = Edgar()
import csv

PART A. Find correct company names at the SEC

In [None]:
# PART A. STEP 1.
# CREATE LIST OF COMPANY NAMES FROM CSV
# Read in companies from a csv file to create a list of lists. Each list will contain one company name.
# If running on Colabs, you will need to upload the file to the runtime to use it
# or pull it from a Google Drive:
# This section of code gets the data from my google drive
#
#from google.colab import drive
#drive.mount('/content/drive')
#f = open('/content/drive/MyDrive/Colab Notebooks/filename.csv', encoding="utf8")
#frs_text = f.read()
#f.close()

# Expected file layout is one column with header 
# and each company name on its own row. E.g.:
#       Companyname
#       Company1
#       Company2
#       Company3

# Load the input CSV into `rows` (a list of lists, one inner list per CSV row).
# The first row is treated as the header and kept separately in `header`.
# newline='' is what the csv module asks for, so that quoted fields containing
# line breaks are parsed correctly.
with open("Board_SEC_list_formerlegalnames.csv", 'r', newline='') as file:
    csvreader = csv.reader(file)
    # Default of None: a completely empty file no longer raises StopIteration.
    header = next(csvreader, None)
    # Everything after the header is data.
    rows = list(csvreader)
print(header)
print(rows)

In [None]:
# # PART A. STEP 2
# CREATE LIST OF COMPANY NAMES AS STRINGS
# read through the list of lists companies (rows - this is from the csv read above), 
# take the first list in the rows list (subitem) and get the contents (item)
# and append that company (item) to the new company_list as a string and not a list.
# 
# Flatten `rows` (one inner list per CSV row) into a single flat list of
# strings, keeping the original order.
company_list = []
for sublist in rows:
    company_list.extend(sublist)
print(company_list)
    

In [None]:
# # PART A. OPTIONAL FOR TESTING
# check to make sure the file got entered into the list
# len(company_list)

In [None]:
## PART A. OPTIONAL - take a look at the list to make sure it looks like we think it should.
# change the number in the brackets to see different entries. Create a range with a colon, e.g. [23:30]
# NOTE: Python starts counting (indexing) with 0 so the first item in the list would be company_list[0].

#print(company_list[700:])

In [None]:
# # PART A. OPTIONAL - FOR TESTING
# # PART A. STEP 2.1
# CREATE A TEST COMPANY LIST
# create a smaller subset to test with
# change the input variable name in the FIND COMPANY NAME BY LOOKUP AT SEC 'for' statement (company_list)
# to match the variable name set here (clist2) if using the test version

#clist2 = company_list[700:]
#print(clist2)

In [None]:
# # PART A. STEP 3.
# FIND EXACT COMPANY NAME BY LOOKUP AT SEC
# use the .find_company_name method to search for exact company names in Edgar using the 
# list created in STEP 2 or STEP 2.1. 
# Writes out the original company name then its match on Edgar to a CSV file, name_matches.csv, for review.
# Returns a list, 'lookup_list', with the official company names from Edgar
# for use with STEP 6. GET CIKs or TEST COMPANY LIST functions.

# More information on the classes and methods for Edgar can be found at
# https://pypi.org/project/edgar/
#
# The github repository for the code in the package is at 
# https://github.com/joeyism/py-edgar

# Parameters you can change:
#  field_names - you can relabel if you wish
#  company_list - can be set to clist2 if using a smaller test set

#lookup_list = [] # for testing
# Header for the review file: the name as supplied, then its Edgar match.
# A row can be longer than the header when Edgar returns several candidates;
# each extra candidate lands in an additional, unlabelled column.
field_names1 = ['my_company','SEC_company_match']

# Append mode lets several runs (e.g. batches of companies) accumulate in one file.
# newline='' keeps the csv module's own line endings from being translated,
# which otherwise shows up as a blank line after every row on Windows.
with open('name_matches.csv', 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Write the header only when the file is new/empty, so re-running this cell
    # does not bury extra header rows in the middle of the data.
    if csvfile.tell() == 0:
        writer.writerow(field_names1)
    for company in company_list:
        possible_companies = edgar.find_company_name(company)
        #print(company) ##for testing
        #print(len(possible_companies)) ##for testing
        # One output row per input name: the name itself followed by every
        # candidate returned, or the marker "not found" when there were none.
        if len(possible_companies) >= 1:
            compar = [company, *possible_companies]
        else:
            compar = [company, "not found"]
        #print(compar) ## use for testing to view output
        writer.writerow(compar)

        #lookup_list.append(possible_companies) #for testing

# The `with` statement has already closed the file; no explicit close() is needed.


In [None]:

###################################
# 
# At this point, you will want to stop and review the 'name_matches.csv' file output from
# Step 3. There will be lots of forms of a company and some items not found. You will need to 
# decide which version of the company name you want to use in the next section and
# you may want to look up the ones 'not found' on the SEC Edgar website to see if 
# there is actually a match you want to use.
#
###################################

Part B. Find company CIKs from a list of companies in a CSV file.

In [None]:
## PART B. STEP 1. - IF USING EXTERNAL FILE AND SKIPPING SEC COMPANY MATCHING IN PART A
# OR USING CLEANED OUTPUT FROM PART A. STEP 3.

# CREATE COMPANY LIST FROM KNOWN SEC MATCHES FILE
# Use this block of code to ingest a file with known SEC name matches for further processing with 
# the GET CIK WITH EXACT MATCH.
# This is good if there were questionable entries from the FIND COMPANY NAME results
# that needed to be cleaned prior to finding the CIKs OR if a user already 
# has the verified names.
# Load the verified-names CSV into `lookup_list` (a list of lists, one inner
# list per CSV row). The first row is treated as the header and kept in `header`.
# newline='' is what the csv module asks for, so that quoted fields containing
# line breaks are parsed correctly.
with open("SEC_companies.csv", 'r', newline='') as file:
    csvreader = csv.reader(file)
    # Default of None: a completely empty file no longer raises StopIteration.
    header = next(csvreader, None)
    # Everything after the header is data.
    lookup_list = list(csvreader)
print(header)
print(lookup_list)

In [None]:
# PART B. OPTIONAL FILE IMPORT TESTING BLOCK
# take a look at the list to see the format
# print(lookup_list[:20])

# check the number of company names in the list
# print(len(lookup_list))

In [None]:
# PART B. STEP 1.2 - OPTIONAL TEST COMPANY LIST  
# create a smaller list for testing. 
# NOTE: If there is a large list of companies, you may want to break it up into 
# several smaller lists and run the script several times. This will execute faster. 
# Also probably won't piss off the SEC as much. ;)

#lookup2 = lookup_list[:2]

#take a look at the list to see the format
#print(lookup2)

In [None]:
# PART B. STEP 2 
# CREATE UNPACKED LIST OF COMPANIES FOR LOOKUP

# unpack the list of lists to create a list of strings for the search at Edgar
# NOTE: change the variable name to point to list you are using, eg, if you 
# are using the full list from FIND COMPANY NAME, use 'lookup_list'. 
# If you are using TEST COMPANY LIST use 'lookup2'.

# Unpack the list of lists into one flat list of strings, in file order,
# ready to be looked up one at a time.
lookup3 = []
for sublist in lookup_list:
    lookup3.extend(sublist)
print(lookup3)

In [None]:
##########################################
# ########### STOP ######################
# For the next part, select *either* STEP 3 or STEP 3.1.
# Decision is based upon input given in PART B STEP 1.
#
# If you started at PART B with your own file with company names and did not do PART A for 
# the EXACT SEC match, then skip to STEP 3.1.
#
# If you used cleaned output from PART A, continue to STEP 3.
#########################################

In [None]:
## # PART B. STEP 3.  
# GET CIK WITH EXACT MATCH
#
#  Use this code if you used the CREATE COMPANY LIST FROM KNOWN SEC MATCHES FILE
#  in PART B. STEP 1. 
#  Uses the output from PART B. STEP 2 
#

# Header for the output file: the company name used for the lookup and its CIK.
field_names2 = ['SEC_company','CIK']

# Create (or extend) the CSV file for the output.
# Append mode lets several batches accumulate in one file; newline='' keeps the
# csv module's line endings from turning into blank rows on Windows.
with open('name_cik_matches.csv', 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Write the header only when the file is new/empty, so re-running this cell
    # does not insert another header row in the middle of the data.
    if csvfile.tell() == 0:
        writer.writerow(field_names2)

    for company in lookup3:
        if company == 'not found':
            # Placeholder carried over from PART A: nothing to look up, leave the CIK blank.
            cik = ''
        else:
            # NOTE(review): presumably this raises if `company` is not an exact
            # Edgar name, which would stop the whole loop; confirm against the
            # py-edgar source before running a long list.
            cik = edgar.get_cik_by_company_name(company)
            #print(company) ##for testing
        compar = [company, cik]
        #print(compar) ## use for testing to view output
        writer.writerow(compar)

# The `with` statement has already closed the file; no explicit close() is needed.



In [None]:
#######################################
# Congrats! 
# Your file is now ready to be used with part 2 of this script, Part_2_sec_form_d_lookup.ipynb.
# #############################################

In [None]:
# ## OPTIONAL PART B. STEP 3.1.  
# GET CIKs WITH FUZZY MATCH
# create a list of field names for the CSV header row
# Use this if you don't want to do the SEC EXACT MATCHING in PART A. Your results
# may not be as good and you may have errors. 
# Uses output from PART B. STEP 1 and STEP 2 as input.

# For each item in the list of official company names, use the  
# .match_company_by_company_name method of Edgar to query the Edgar database.
# Returns a Dictionary with 3 values: matched company name, cik, match score.
# The 'top' argument sets how many candidate matches are returned for each name in the list.
# It is set to 3 in the code below; use 1 if your list already holds the official names from Edgar.
# NOTE: There are some issues with name matching for names that have commas. It looks like the fuzzy
# matching will break up strings if it finds a match on the first part of the word
# and then searches the rest as a separate entity. 

# Parameters that you can change:
#    field_names - you can set your own, but there must be 3
#    top - the number of possible matches for the name you gave

# Column names for the output file; they are passed to DictWriter as its
# fieldnames, so they must line up with the keys of the dictionaries written below.
field_names = ['company_name', 'cik', 'score']

# Create (or extend) the CSV file for the output.
# Append mode lets several batches accumulate in one file; newline='' keeps the
# csv module's line endings from turning into blank rows on Windows.
with open('name_matches_fuzzy.csv', 'a', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    # Write the header only when the file is new/empty, so re-running this cell
    # does not insert another header row in the middle of the data.
    if csvfile.tell() == 0:
        writer.writeheader()

    for comp in lookup3:
        #print(comp)
        # Ask for the three best fuzzy matches for this name and write each
        # one as its own row.
        possible_match = edgar.match_company_by_company_name(comp, top=3)
        writer.writerows(possible_match)

# The `with` statement has already closed the file; no explicit close() is needed.




In [None]:
#######################################
# Congrats! 
# Your file is now ready to be used with part 2 of this script, Part_2_sec_form_d_lookup.ipynb.
# You may want to look over the output first before plunging in
# #############################################