In [1]:
import edgar
import os
from pathlib2 import Path
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm

## generate df with all companies and URLs

In [2]:
def get_project_dir():
    try:
        project_dir = Path.cwd() / '/' / 'My Drive' / 'Jotham' / 'Personal Docs' / 'ML for finance' / 'SEC Sentiment Analysis' / 'sec-sentiment'
        os.chdir(project_dir)
    except BaseException as e:
        project_dir = Path.cwd() / '/' / 'Volumes' / 'GoogleDrive' / 'My Drive' / 'Jotham' / 'Personal Docs' / 'ML for finance' / 'SEC Sentiment Analysis' / 'sec-sentiment'
        os.chdir(project_dir)
    return project_dir

In [3]:
os.chdir(os.path.join(get_project_dir(), 'sec-filings-index'))

In [4]:
# filing_year = 2013   # choose year to get filings from
# edgar.download_index(os.getcwd(), filing_year)

In [5]:
# Get list of all DFs 
table_list = []

for i in os.listdir():
    if i.endswith('.tsv'):
        table_list.append(pd.read_csv(i, sep='|', header=None, encoding='latin-1', parse_dates=[3], dtype={0: int}))

In [6]:
# append all dfs into a single df

df = pd.DataFrame(columns=[0,1,2,3,4,5])   # downloaded file has 6 columns

for i in range(len(table_list)):
        df = pd.concat([df, table_list[i]], ignore_index=True, axis=0)

df.columns= ['cik', 'company_name', 'filing_type', 'filing_date', 'url', 'url2']

## Check if dataframe correctly generated

In [7]:
count_list = []
for i in range(len(table_list)):
    count_list.append(len(table_list[i]))

if df.shape[0] == sum(count_list):
    print('df tallies with individual files. Total rows = {}'.format(df.shape[0]))
else:
    print('ERROR. df does not tally!!')

df tallies with individual files. Total rows = 6061186


## download data

In [8]:
def download_filings(cik_num_list, from_date='2014-01-01'):
    """Function to filter the appropriate filings and download them in the folder"""
    
    # filter df with company CIK,filing type (10-K and 10-Q) and date  
    df_filtered = df [(df['cik'].isin(cik_num_list)) & 
                      ((df['filing_type']=='10-K') | (df['filing_type'] == '10-Q')) & 
                      (df['filing_date'] > from_date)]
    
    company_names = df_filtered['company_name'].unique().tolist()
    
    # check if folders for each company already exists    
    sec_filings_dir = os.path.join(get_project_dir(), 'sec-filings-downloaded')  # dir to download SEC filingsa
    os.chdir(sec_filings_dir)

    for company in company_names:
        company_dir = os.path.join(sec_filings_dir, company)

        if not os.path.exists(company_dir):
            os.makedirs(company_dir)
            print('\n created dir: {}'.format(company))
        else:
            print('\n{} directory exists'.format(company))
            
        os.chdir(company_dir)
        
        # create company specific df to iterate over    
        df_filtered_co = df_filtered[df_filtered['company_name'] == company]  # get df with the company only
        df_filtered_co['filing_date'] = df_filtered_co['filing_date'].astype(str)   # convert to 'object' to name file

        for i in range(len(df_filtered_co)):
            url_prefix = 'https://www.sec.gov/Archives/'
            row = df_filtered_co.iloc[i,:]
            url = url_prefix + row['url']
            response = requests.get(url, stream=True)
            
            filing_name = row['filing_date'] + str('_') + row['filing_type']
            if os.path.isfile(filing_name):
                print('{} file already exists'.format(filing_name))
            else:
                print('Downloading: {}'.format(filing_name))
                with open('{}'.format(filing_name), 'wb') as handle:
                    for data in tqdm(response.iter_content()):
                        handle.write(data)
                        
    

In [11]:
download_filings([1000045, 1000229, 1000230])


NICHOLAS FINANCIAL INC directory exists


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


2018-08-14_10-Q file already exists
2019-02-14_10-Q file already exists
2018-02-09_10-Q file already exists
2018-11-14_10-Q file already exists
2017-08-09_10-Q file already exists
2018-06-27_10-K file already exists
2017-11-09_10-Q file already exists
2017-06-14_10-K file already exists
2016-11-09_10-Q file already exists
2017-02-09_10-Q file already exists
2016-08-09_10-Q file already exists
2016-06-14_10-K file already exists
2015-08-10_10-Q file already exists
2015-11-09_10-Q file already exists
2016-02-09_10-Q file already exists
2015-06-15_10-K file already exists
2015-02-09_10-Q file already exists
2014-08-11_10-Q file already exists
2014-11-10_10-Q file already exists
2014-06-16_10-K file already exists
2014-02-10_10-Q file already exists

CORE LABORATORIES N V directory exists
2018-07-27_10-Q file already exists
2019-02-11_10-K file already exists
Downloading: 2018-02-12_10-K


908958it [00:06, 130814.01it/s]


Downloading: 2018-10-25_10-Q


407635it [00:03, 124641.16it/s]


Downloading: 2017-07-26_10-Q


959519it [00:06, 144148.35it/s]


Downloading: 2018-04-27_10-Q


370597it [00:02, 126365.01it/s]


Downloading: 2017-10-25_10-Q


369063it [00:05, 67399.77it/s]


Downloading: 2017-04-21_10-Q


332111it [00:07, 45100.43it/s] 


Downloading: 2016-10-21_10-Q


972655it [00:41, 23399.76it/s]


Downloading: 2017-02-10_10-K


939804it [00:13, 68090.46it/s]


Downloading: 2016-07-22_10-Q


970152it [00:05, 183919.02it/s]


Downloading: 2016-04-22_10-Q


920922it [00:05, 172573.69it/s]


Downloading: 2015-07-27_10-Q


955715it [00:06, 159174.90it/s]


Downloading: 2015-10-23_10-Q


958585it [00:05, 163622.90it/s]


Downloading: 2016-02-12_10-K


2147768it [00:31, 67839.74it/s]


Downloading: 2015-04-29_10-Q


1044046it [00:06, 158703.42it/s]


Downloading: 2015-02-17_10-K


2475140it [00:33, 73912.18it/s]


Downloading: 2014-07-25_10-Q


456614it [00:02, 160348.52it/s]


Downloading: 2014-11-04_10-Q


469797it [00:03, 147537.59it/s]


Downloading: 2014-04-25_10-Q


964979it [00:05, 163029.94it/s]


Downloading: 2014-02-13_10-K


2185937it [00:27, 79065.07it/s]



OPTICAL CABLE CORP directory exists
Downloading: 2018-09-11_10-Q


175847it [00:01, 143282.63it/s]


Downloading: 2018-03-13_10-Q


163228it [00:01, 148494.80it/s]


Downloading: 2018-12-19_10-K


1356360it [00:08, 165298.82it/s]


Downloading: 2017-09-12_10-Q


183658it [00:01, 134052.33it/s]


Downloading: 2018-06-11_10-Q


183227it [00:01, 132840.54it/s]


Downloading: 2017-12-20_10-K


707192it [00:04, 147053.94it/s]


Downloading: 2017-06-13_10-Q


188605it [00:06, 28133.84it/s]


Downloading: 2016-12-20_10-K


662329it [00:04, 146756.50it/s]


Downloading: 2017-03-08_10-Q


179353it [00:10, 16338.27it/s]


Downloading: 2016-09-13_10-Q


192002it [00:01, 103801.67it/s]


Downloading: 2016-06-07_10-Q


194482it [00:05, 32851.31it/s]


Downloading: 2015-09-11_10-Q


200584it [00:01, 119514.62it/s]


Downloading: 2016-01-28_10-K


586403it [00:11, 50032.00it/s]


Downloading: 2016-03-14_10-Q


180356it [00:01, 129553.49it/s]


Downloading: 2015-06-12_10-Q


260052it [00:02, 92613.03it/s] 


Downloading: 2015-03-10_10-Q


247231it [00:01, 129932.62it/s]


Downloading: 2014-09-10_10-Q


264875it [00:03, 72702.52it/s]


Downloading: 2014-12-19_10-K


1280940it [00:18, 68957.52it/s]


Downloading: 2014-06-11_10-Q


258902it [00:04, 58803.02it/s]


Downloading: 2014-03-17_10-Q


268249it [00:04, 64252.13it/s]
