In [1]:
from requests_html import HTMLSession
import pandas as pd
import time

In [2]:
# Query settings for the SEC listing of the most recently uploaded 13F-HR filings.
# `page` is substituted into the q1 query parameter; q2 and q3 stay fixed
# (q3 selects the form type).

url = 'https://www.sec.gov/cgi-bin/current?'

page = 1
params = f"q1={page}&q2=0&q3=13F-HR"

In [3]:
# start a session and download the 1st page as HTML

session = HTMLSession()

# NOTE(review): SEC's fair-access guidance asks automated clients to identify
# themselves through the User-Agent header; if requests are rejected, set
# session.headers['User-Agent'] to your own contact string -- TODO confirm.

# Bound the wait so a stalled connection cannot hang the notebook forever.
r = session.get(url + params, timeout=30)

# Fail right here on an HTTP error (4xx/5xx) instead of letting the parsing
# cell below break with an unrelated-looking error on a missing <pre> element.
r.raise_for_status()

In [4]:
# parse the downloaded page into one string per filing

# the listing sits inside the first <pre> element of the page
pre_block = r.html.find('pre', first=True)
text = pre_block.text

# the filing date is taken as the 8th whitespace-separated token
# of the first 100 characters
header_tokens = text[:100].split()
date = header_tokens[7]

# strip every occurrence of the date, keep what follows the last '\n '
# separator, and cut that on double spaces to get the individual entries
undated_text = text.replace(date, '')
listing = undated_text.split('\n ')[-1]
filings = listing.split('  ')
filings

['13F-HR 1862931 Advisory Services & Investments, LLC',
 '13F-HR/A 1453072 Alyeska Investment Group, L.P.',
 '13F-HR 1769288 Atwater Malick LLC',
 '13F-HR 1654599 BEACON INVESTMENT ADVISORY SERVICES, INC.',
 '13F-HR/A 1714678 Beaton Management Co. Inc.',
 '13F-HR/A 1691982 Bowie Capital Management, LLC',
 '13F-HR 1849561 CHILDRESS CAPITAL ADVISORS, LLC',
 '13F-HR/A 1423053 CITADEL ADVISORS LLC',
 '13F-HR/A 1423053 CITADEL ADVISORS LLC',
 '13F-HR 1080628 COLONY GROUP LLC',
 '13F-HR 1863523 CSM Advisors, LLC',
 '13F-HR/A 1389403 Chou Associates Management Inc.',
 '13F-HR/A 1732687 Factorial Partners, LLC',
 '13F-HR 1844719 First Round Capital Management III, LLC',
 '13F-HR 1766504 GREENLEA LANE CAPITAL MANAGEMENT, LLC',
 '13F-HR/A 1604350 Glendon Capital Management LP',
 '13F-HR/A 1164688 OAK HILL ADVISORS LP',
 '13F-HR 1665518 PHYSICIANS FINANCIAL SERVICES, INC.',
 '13F-HR 1566887 Ratan Capital Management LP',
 '13F-HR 1853019 Rollins Financial',
 '13F-HR 1325261 STONNINGTON GROUP, LLC'

In [5]:
# sanity-check the parsing on a single entry

# rebuild the entry list from the raw text and keep only the first entry
undated_text = text.replace(date, '')
entry = undated_text.split('\n ')[-1].split('  ')[0]

# whitespace-separated fields: filing type, CIK, then the words of the name
fields = entry.split()
filtype = fields[0]
cik = fields[1]
name = ' '.join(fields[2:])

# show the three parsed values
name, cik, filtype

('Advisory Services & Investments, LLC', '1862931', '13F-HR')

In [6]:
# the single-entry check worked, so build the columns for every filing

# split each entry once: [filing type, CIK, name word, name word, ...]
tokenised = [filing.split() for filing in filings]

firms = {
    # CIK is left-padded with zeros to 10 characters
    'CIK': [parts[1].zfill(10) for parts in tokenised],
    # everything after the CIK is the firm name
    'Name': [' '.join(parts[2:]) for parts in tokenised],
    'Type': [parts[0] for parts in tokenised],
}

In [7]:
# wrap the parsed columns in a DataFrame and preview its first rows

df = pd.DataFrame(data=firms)
df.head(5)

Unnamed: 0,CIK,Name,Type
0,1862931,"Advisory Services & Investments, LLC",13F-HR
1,1453072,"Alyeska Investment Group, L.P.",13F-HR/A
2,1769288,Atwater Malick LLC,13F-HR
3,1654599,"BEACON INVESTMENT ADVISORY SERVICES, INC.",13F-HR
4,1714678,Beaton Management Co. Inc.,13F-HR/A


In [None]:
# The 1st page is already in `df`; fetch the remaining listing pages
# (q1 = 2 .. 84) the same way. The intent is to cover roughly a quarter of
# filings so that the CIK code of every investor filing a 13F is collected.
#
# NOTE: this cell extends `df` with the new pages, so running it twice adds
# them twice; re-run the cell that builds `df` first to start clean.

url = 'https://www.sec.gov/cgi-bin/current?'

# last value of the q1 parameter that is requested (inclusive)
LAST_PAGE = 84

# Collect one DataFrame per page and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all previously collected rows on every iteration.
page_frames = [df]

try:
    for page in range(2, LAST_PAGE + 1):

        params = f"q1={page}&q2=0&q3=13F-HR"
        r = session.get(url + params)
        # short pause between requests so the server is not hammered
        # (presumably chosen to stay under SEC's request-rate limit -- confirm)
        time.sleep(0.11)

        try:
            text = r.html.find('pre', first=True).text
            date = text[:100].split()[7]
            filings = text.replace(date, '').split('\n ')[-1].split('  ')

            firms = {'CIK': [filing.split()[1].zfill(10) for filing in filings],
                     'Name': [' '.join(filing.split()[2:]) for filing in filings],
                     'Type': [filing.split()[0] for filing in filings]}
        except (AttributeError, IndexError) as err:
            # AttributeError: the page has no <pre> element (find returned None).
            # IndexError: the header or an entry has fewer fields than expected.
            # Report the page rather than `date`, which may still hold the value
            # of the previous page when parsing fails early.
            print(f'No filings parsed for page {page}: {err!r}')
            continue

        page_frames.append(pd.DataFrame(firms))
finally:
    # Keep whatever was collected even if a request fails or the run is
    # interrupted part-way through.
    df = pd.concat(page_frames, ignore_index=True)

In [None]:
# number of distinct CIK values collected so far

df['CIK'].nunique(dropna=False)

In [None]:
# keep the first row per CIK, discard the filing type, and store the
# result as a csv for further analysis

unique_firms = df.drop_duplicates(subset='CIK')
cik_list = unique_firms.drop(columns='Type')
cik_list.to_csv('../datasets/cikList.csv')