In [9]:
import pandas as pd
from sec_edgar_downloader import Downloader
import time
import os

### We must scrape the 13F filings from the SEC's website using the CIK codes.

In [10]:
# load the CIK list, left-pad every CIK with zeros to a 10-character string,
# discard the leftover index column from the csv, and preview the result

codes = (
    pd.read_csv('cikList.csv')
    .assign(CIK=lambda frame: frame['CIK'].astype(str).map(lambda raw: raw.zfill(10)))
    .drop(columns='Unnamed: 0')
)
codes.head()

Unnamed: 0,CIK,Name
0,1846160,AAFMAA Wealth Management & Trust LLC
1,1842554,"ACT Advisors, LLC."
2,885118,AMERICAN ASSETS INC
3,1812198,Aaron Wealth Advisors LLC
4,1699506,"Ackerman Capital Advisors, LLC"


### In order to have a manageable dataset, we subsample 1000 CIK codes and proceed to download filings for these firms

In [6]:
# sample 1000 CIK's at random
# we subsample because the full list is larger than we need

# number of firms to keep, and a fixed seed so that rerunning the notebook
# draws exactly the same firms (otherwise the saved CIK list and the downloaded
# filings can silently get out of sync between runs)
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42

# never ask for more rows than exist: sample() raises if n exceeds the number of rows
cikCodes = codes['CIK'].sample(n=min(SAMPLE_SIZE, len(codes)), random_state=SAMPLE_SEED)
cikCodes

3124    0001845930
3246    0001388312
1449    0001094749
2477    0001788700
3932    0001618366
           ...    
3208    0001630413
5321    0001508512
2946    0001331693
445     0001761871
662     0001162781
Name: CIK, Length: 1000, dtype: object

In [21]:
# write the sampled CIK's (with their original row index) to disk, so there is
# a record of which firms the downloaded filings belong to

cikCodes.to_csv(path_or_buf='../datasets/downloadedMDA.csv')

In [11]:
# print the current working directory: the relative paths used in this notebook
# (for reading the CIK list and for saving files) are resolved against it

!pwd

/home/jradhima/kul_course


### We will use sec-edgar-downloader, a Python package that lets us download the 13F filings

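Here, we download the filings for the 4th quarter of 2017. These can be filed up to 45 days after the end of the quarter, so we search for filings dated after 2018-01-01 and before 2018-03-25; the end date lies beyond the 45-day deadline, so filings submitted somewhat late are also included.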

The downloader sometimes fails to fetch the filings. Sometimes the failure is transient and has no clear cause, but sometimes it happens because the investor we are looking for was not operating during that period (we took the CIK codes from the latest period, in 2021, so some investors did not exist in 2017). To deal with the transient failures, we repeatedly loop over all the CIK codes we failed to fetch data for, until every one has been fetched or we run out of retry attempts.

In [17]:
# we will download filings for the 4th quarter of 2017
# filings will be stored in the directory 'sec2017' (relative to the working directory)

# form type to fetch
FILING_TYPE = '13F-HR'

# filing-date window for Q4 2017; since this is roughly one quarter we should
# only get 1 filing per firm. The SAME window is used for the first pass and
# for every retry, so retries can never pull filings from a different period.
FILED_AFTER = "2018-01-01"
FILED_BEFORE = "2018-03-25"

# seconds to sleep after each request so we don't have problems with the SEC
REQUEST_PAUSE = 0.11

# 50 is an arbitrary threshold, depending on our patience and how much we want
# to get all possible data; it is the total number of retry passes we are willing to try
MAX_RETRY_PASSES = 50

loader = Downloader('sec2017')


def _download_pass(downloader, codes_to_try):
    """Try once to download the filing for every CIK in `codes_to_try`.

    Parameters
    ----------
    downloader : object exposing ``get(filing_type, cik, after=..., before=...)``
        The downloader used to fetch the filings.
    codes_to_try : iterable of str
        CIK codes to request. It is only read, never modified, so it is safe
        to pass the list of previous failures.

    Returns
    -------
    (list, list)
        The codes whose request completed without raising, and the codes whose
        request raised an exception, each in the order they were tried.
    """
    downloaded, failed = [], []

    for code in codes_to_try:
        print(f"Trying CIK Number: {code}")

        try:
            # get filings filed between FILED_AFTER and FILED_BEFORE
            num = downloader.get(FILING_TYPE, code, after=FILED_AFTER, before=FILED_BEFORE)
        except Exception:
            # `except Exception` (not a bare `except`) so that interrupting the
            # kernel still stops this long-running loop
            print(f"Something went wrong with company {code} \n")
            failed.append(code)
        else:
            # NOTE: a call that returns 0 files still counts as a success here,
            # because no error was raised (there may simply be no filing in the window)
            print(f"Downloaded {num} files from company {code} \n")
            downloaded.append(code)

        time.sleep(REQUEST_PAUSE)

    return downloaded, failed


# first pass over our sampled codes
# `success`: this is more or less useless, we may want to know which companies
#            we got through without an error
# `fail`:    needed afterwards, we iterate again over the CIK's that failed because
#            the downloader sometimes fails to download anything for no clear reason
success, fail = _download_pass(loader, cikCodes)

# by now we have run once over all CIK's and have downloaded filings for some
# we will try again for all CIK's that raised an error, until none are left

# we need a counter to not get stuck in an infinite loop, some CIK's might not
# have filing info or other issues may apply
count = 0

# while there are companies we don't have data for, scrape and scrape again
while fail and count < MAX_RETRY_PASSES:

    print(f'Failed to get data for {len(fail)} companies.\nTrying again.\n')
    count += 1

    # each pass returns a fresh list of failures instead of removing items from
    # `fail` while iterating over it (which would skip codes)
    recovered, fail = _download_pass(loader, fail)
    success.extend(recovered)

Trying CIK Number: 0001845930
Downloaded 0 files from company 0001845930 

Trying CIK Number: 0001388312
Downloaded 1 files from company 0001388312 

Trying CIK Number: 0001094749
Downloaded 1 files from company 0001094749 

Trying CIK Number: 0001788700
Downloaded 0 files from company 0001788700 

Trying CIK Number: 0001618366
Downloaded 1 files from company 0001618366 

Trying CIK Number: 0001665446
Skipping filing detail download for '0001665446-18-000001' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001665446/000166544618000001/primary_doc.xml.
Downloaded 1 files from company 0001665446 

Trying CIK Number: 0001418342
Downloaded 0 files from company 0001418342 

Trying CIK Number: 0001307617
Downloaded 0 files from company 0001307617 

Trying CIK Number: 0001347504
Downloaded 1 files from company 0001347504 

Trying CIK Number: 0001035674
Downloaded 1 files from company 0001035674 

Trying CIK Number: 0001777015
Downloaded 0 fi

Downloaded 1 files from company 0001696494 

Trying CIK Number: 0001324022
Downloaded 1 files from company 0001324022 

Trying CIK Number: 0000814133
Downloaded 1 files from company 0000814133 

Trying CIK Number: 0001623883
Downloaded 1 files from company 0001623883 

Trying CIK Number: 0001731296
Skipping filing detail download for '0001140361-18-007342' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001731296/000114036118007342/primary_doc.xml.
Downloaded 1 files from company 0001731296 

Trying CIK Number: 0001776382
Downloaded 0 files from company 0001776382 

Trying CIK Number: 0001803156
Downloaded 0 files from company 0001803156 

Trying CIK Number: 0001724246
Downloaded 0 files from company 0001724246 

Trying CIK Number: 0001619390
Downloaded 1 files from company 0001619390 

Trying CIK Number: 0001841827
Downloaded 0 files from company 0001841827 

Trying CIK Number: 0001839498
Downloaded 0 files from company 0001839498 



Downloaded 0 files from company 0001802946 

Trying CIK Number: 0000098561
Downloaded 1 files from company 0000098561 

Trying CIK Number: 0001649647
Skipping filing detail download for '0000947871-18-000146' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001649647/000094787118000146/primary_doc.xml.
Downloaded 1 files from company 0001649647 

Trying CIK Number: 0001793904
Downloaded 0 files from company 0001793904 

Trying CIK Number: 0000820289
Skipping filing detail download for '0000820289-18-000001' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0000820289/000082028918000001/primary_doc.xml.
Downloaded 1 files from company 0000820289 

Trying CIK Number: 0001320652
Downloaded 0 files from company 0001320652 

Trying CIK Number: 0000749044
Downloaded 0 files from company 0000749044 

Trying CIK Number: 0001569139
Downloaded 1 files from company 0001569139 

Trying CIK Number: 0

Downloaded 0 files from company 0001798860 

Trying CIK Number: 0001784093
Downloaded 0 files from company 0001784093 

Trying CIK Number: 0001109448
Downloaded 1 files from company 0001109448 

Trying CIK Number: 0001520710
Downloaded 1 files from company 0001520710 

Trying CIK Number: 0000872259
Downloaded 1 files from company 0000872259 

Trying CIK Number: 0001794467
Downloaded 0 files from company 0001794467 

Trying CIK Number: 0001697162
Skipping full submission download for '0001140361-18-008136' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001697162/000114036118008136/0001140361-18-008136.txt.
Downloaded 1 files from company 0001697162 

Trying CIK Number: 0001138897
Downloaded 1 files from company 0001138897 

Trying CIK Number: 0001540531
Skipping full submission download for '0000905718-18-000209' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001540531/0000905718180

Downloaded 1 files from company 0001540951 

Trying CIK Number: 0001650142
Downloaded 1 files from company 0001650142 

Trying CIK Number: 0001740842
Downloaded 0 files from company 0001740842 

Trying CIK Number: 0001582732
Downloaded 1 files from company 0001582732 

Trying CIK Number: 0001568303
Skipping full submission download for '0001568303-18-000003' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001568303/000156830318000003/0001568303-18-000003.txt.
Skipping filing detail download for '0001568303-18-000003' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001568303/000156830318000003/primary_doc.xml.
Downloaded 1 files from company 0001568303 

Trying CIK Number: 0001426960
Downloaded 0 files from company 0001426960 

Trying CIK Number: 0001512162
Downloaded 1 files from company 0001512162 

Trying CIK Number: 0000894205
Downloaded 1 files from company 0000894205 

Trying CI

Skipping full submission download for '0001144204-18-008526' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001729985/000114420418008526/0001144204-18-008526.txt.
Downloaded 1 files from company 0001729985 

Trying CIK Number: 0001024716
Skipping full submission download for '0000913849-18-000107' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001024716/000091384918000107/0000913849-18-000107.txt.
Skipping filing detail download for '0000913849-18-000107' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001024716/000091384918000107/primary_doc.xml.
Downloaded 1 files from company 0001024716 

Trying CIK Number: 0001082491
Downloaded 1 files from company 0001082491 

Trying CIK Number: 0001732074
Downloaded 1 files from company 0001732074 

Trying CIK Number: 0001140315
Skipping filing detail download for '0001144204-18-007311' due 

Downloaded 0 files from company 0001540361 

Trying CIK Number: 0001787027
Downloaded 0 files from company 0001787027 

Trying CIK Number: 0001812178
Downloaded 0 files from company 0001812178 

Trying CIK Number: 0001760437
Downloaded 0 files from company 0001760437 

Trying CIK Number: 0001566801
Downloaded 0 files from company 0001566801 

Trying CIK Number: 0000923338
Downloaded 1 files from company 0000923338 

Trying CIK Number: 0000928047
Downloaded 1 files from company 0000928047 

Trying CIK Number: 0001082215
Downloaded 1 files from company 0001082215 

Trying CIK Number: 0001665633
Skipping full submission download for '0001665633-18-000001' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001665633/000166563318000001/0001665633-18-000001.txt.
Downloaded 1 files from company 0001665633 

Trying CIK Number: 0001767432
Downloaded 0 files from company 0001767432 

Trying CIK Number: 0001740837
Downloaded 0 files from company 00

Downloaded 0 files from company 0001766504 

Trying CIK Number: 0001682677
Downloaded 1 files from company 0001682677 

Trying CIK Number: 0001799425
Downloaded 0 files from company 0001799425 

Trying CIK Number: 0001599511
Downloaded 1 files from company 0001599511 

Trying CIK Number: 0001812093
Downloaded 0 files from company 0001812093 

Trying CIK Number: 0001589943
Downloaded 1 files from company 0001589943 

Trying CIK Number: 0001309469
Downloaded 1 files from company 0001309469 

Trying CIK Number: 0001009209
Skipping full submission download for '0001580642-18-000870' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001009209/000158064218000870/0001580642-18-000870.txt.
Downloaded 1 files from company 0001009209 

Trying CIK Number: 0001577919
Skipping full submission download for '0000950123-18-002195' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001577919/0000950123180

Downloaded 1 files from company 0001462284 

Trying CIK Number: 0001799367
Downloaded 0 files from company 0001799367 

Trying CIK Number: 0001593051
Skipping filing detail download for '0001140361-18-005785' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001593051/000114036118005785/primary_doc.xml.
Downloaded 1 files from company 0001593051 

Trying CIK Number: 0001035912
Downloaded 1 files from company 0001035912 

Trying CIK Number: 0001509974
Skipping full submission download for '0001398344-18-001586' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001509974/000139834418001586/0001398344-18-001586.txt.
Downloaded 1 files from company 0001509974 

Trying CIK Number: 0001579111
Skipping full submission download for '0001085146-18-000580' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001579111/000108514618000580/0001085146-18-

Downloaded 1 files from company 0001632802 

Trying CIK Number: 0001018674
Downloaded 1 files from company 0001018674 

Trying CIK Number: 0001051359
Skipping filing detail download for '0001051359-18-000001' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001051359/000105135918000001/primary_doc.xml.
Downloaded 1 files from company 0001051359 

Trying CIK Number: 0001838660
Downloaded 0 files from company 0001838660 

Trying CIK Number: 0000778963
Skipping full submission download for '0000778963-18-000001' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0000778963/000077896318000001/0000778963-18-000001.txt.
Downloaded 1 files from company 0000778963 

Trying CIK Number: 0001299351
Downloaded 0 files from company 0001299351 

Trying CIK Number: 0001710539
Downloaded 1 files from company 0001710539 

Trying CIK Number: 0001286534
Downloaded 1 files from company 0001286534 

Trying CI

Downloaded 1 files from company 0001731717 

Trying CIK Number: 0001783464
Downloaded 0 files from company 0001783464 

Trying CIK Number: 0001719281
Downloaded 1 files from company 0001719281 

Trying CIK Number: 0001113000
Downloaded 1 files from company 0001113000 

Trying CIK Number: 0001107261
Downloaded 1 files from company 0001107261 

Trying CIK Number: 0001019231
Downloaded 1 files from company 0001019231 

Trying CIK Number: 0001753219
Downloaded 0 files from company 0001753219 

Trying CIK Number: 0001080171
Downloaded 1 files from company 0001080171 

Trying CIK Number: 0001583672
Downloaded 1 files from company 0001583672 

Trying CIK Number: 0001056549
Downloaded 1 files from company 0001056549 

Trying CIK Number: 0001036325
Downloaded 1 files from company 0001036325 

Trying CIK Number: 0001584639
Downloaded 1 files from company 0001584639 

Trying CIK Number: 0001671657
Downloaded 1 files from company 0001671657 

Trying CIK Number: 0001488542
Skipping full submission 

Downloaded 1 files from company 0001102578 

Trying CIK Number: 0001066816
Downloaded 1 files from company 0001066816 

Trying CIK Number: 0001706164
Downloaded 1 files from company 0001706164 

Trying CIK Number: 0001125243
Skipping full submission download for '0001125243-18-000001' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001125243/000112524318000001/0001125243-18-000001.txt.
Downloaded 1 files from company 0001125243 

Trying CIK Number: 0001767049
Downloaded 0 files from company 0001767049 

Trying CIK Number: 0001670627
Downloaded 1 files from company 0001670627 

Trying CIK Number: 0001783773
Downloaded 0 files from company 0001783773 

Trying CIK Number: 0001232621
Downloaded 1 files from company 0001232621 

Trying CIK Number: 0001512173
Downloaded 1 files from company 0001512173 

Trying CIK Number: 0001638520
Downloaded 0 files from company 0001638520 

Trying CIK Number: 0001046192
Downloaded 1 files from company 00

Downloaded 0 files from company 0001767841 

Trying CIK Number: 0001728001
Skipping full submission download for '0001140361-18-007635' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001728001/000114036118007635/0001140361-18-007635.txt.
Downloaded 1 files from company 0001728001 

Trying CIK Number: 0001320168
Downloaded 1 files from company 0001320168 

Trying CIK Number: 0001589282
Skipping filing detail download for '0001589282-18-000002' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001589282/000158928218000002/primary_doc.xml.
Downloaded 1 files from company 0001589282 

Trying CIK Number: 0000051762
Downloaded 1 files from company 0000051762 

Trying CIK Number: 0001799500
Downloaded 0 files from company 0001799500 

Trying CIK Number: 0001844877
Downloaded 0 files from company 0001844877 

Trying CIK Number: 0001850858
Downloaded 0 files from company 0001850858 

Trying CI

Downloaded 1 files from company 0001635925 

Trying CIK Number: 0001631134
Downloaded 0 files from company 0001631134 

Trying CIK Number: 0001802534
Downloaded 0 files from company 0001802534 

Trying CIK Number: 0001214822
Downloaded 1 files from company 0001214822 

Trying CIK Number: 0001641438
Skipping filing detail download for '0001641438-18-000002' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001641438/000164143818000002/primary_doc.xml.
Downloaded 1 files from company 0001641438 

Trying CIK Number: 0001646821
Downloaded 1 files from company 0001646821 

Trying CIK Number: 0001387869
Skipping full submission download for '0001387131-18-000498' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001387869/000138713118000498/0001387131-18-000498.txt.
Downloaded 1 files from company 0001387869 

Trying CIK Number: 0001463217
Downloaded 1 files from company 0001463217 

Trying CI

Skipping filing detail download for '0001597878-18-000001' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001597878/000159787818000001/primary_doc.xml.
Downloaded 1 files from company 0001597878 

Trying CIK Number: 0000885118
Downloaded 1 files from company 0000885118 

Trying CIK Number: 0001666335
Downloaded 0 files from company 0001666335 

Trying CIK Number: 0001764387
Downloaded 0 files from company 0001764387 

Trying CIK Number: 0001641440
Skipping full submission download for '0001606587-18-000195' due to network error: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/0001641440/000160658718000195/0001606587-18-000195.txt.
Downloaded 1 files from company 0001641440 

Trying CIK Number: 0001697539
Downloaded 1 files from company 0001697539 

Trying CIK Number: 0001659851
Downloaded 0 files from company 0001659851 

Trying CIK Number: 0001454937
Skipping full submission download for '0001454937-18-0

In [19]:
# count the entries in the download directory to see how many companies we got

downloaded_dirs = os.listdir('sec2017/sec-edgar-filings')
len(downloaded_dirs)

# at this point the data already sits in its directories, so nothing further is needed
# unless we want to download filings for another period

# for another year or timeframe, go back up and rerun the notebook, changing values where needed

646