This module combines the quarterly index files for SEC filings into a single file.
- The identifiers are extracted quarter by quarter using the local code: pull_edgar_links_1

In [None]:
# !pip install sec-api

import pandas as pd
from tqdm import tqdm
import warnings
import re
import os
import numpy as np
import boto3
import awswrangler as wr
import io

# Suppress all warnings for this session and enable tqdm's pandas integration.
warnings.filterwarnings('ignore')
tqdm.pandas()

# AWS setup: S3 resource handle
s3_resource = boto3.resource('s3')

# S3 locations
bucket = '[your-bucket-name]'  # Replace with your bucket name
output_prefix = 's3://{}/'.format(bucket)

In [None]:
# extract index files

# Load every quarterly index file in the requested year range from S3 and
# stack them into one DataFrame (edgar_dat_all).

yrstart = int(input('start year: '))
yrend = int(input('end year: '))

yrlist = range(yrstart, yrend+1)
qtrlist = range(1,5)

# Collect the per-quarter frames in a list and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies the accumulated data on every iteration.
edgar_frames = []

# Years are processed newest first; quarters ascend within each year.
for yr in reversed(yrlist):
    for qtr in qtrlist:

        print('extract text for year {} quarter {}'.format(str(yr), str(qtr)))

        # read the pipe-delimited index file for this quarter
        loadpath = f's3://{bucket}/edgar_masterhtml1/{yr}-QTR{qtr}.txt'
        edgar_dat = wr.s3.read_csv(path=loadpath,
                        sep='|',
                        lineterminator='\n',
                        header=0)
        # replace missing values with empty strings
        edgar_dat = edgar_dat.where(pd.notnull(edgar_dat), '')

        edgar_frames.append(edgar_dat)

# combine; pd.concat raises on an empty list, so keep the original
# "empty DataFrame" result when the year range selects nothing
if edgar_frames:
    edgar_dat_all = pd.concat(edgar_frames, ignore_index=True)
else:
    edgar_dat_all = pd.DataFrame()


In [None]:
# Write the combined index to S3 as Parquet.
# NOTE: the object key ends in ".gzip" although wr.s3.to_parquet is used to
# write it; the key is kept as-is because the read step loads this same key.
savepath = 's3://' + bucket + '/misc/edgar_masterhtml1_combined.gzip'
wr.s3.to_parquet(df=edgar_dat_all, path=savepath)

In [None]:
# Read the combined index file back from S3 and parse it as Parquet.

s3_client = boto3.client('s3')
file = 'misc/edgar_masterhtml1_combined.gzip'
obj = s3_client.get_object(Bucket=bucket, Key=file)

# Pull the whole object into an in-memory buffer so pandas can read it.
with io.BytesIO(obj['Body'].read()) as parquet_buffer:
    df = pd.read_parquet(parquet_buffer)

# Show the first rows as a quick sanity check.
df.head()

In [None]:
# Export the combined index as a pipe-delimited CSV file on S3.
# (This writes CSV via wr.s3.to_csv, not a Stata file.)
# NOTE(review): no `index` argument is passed, so whether the DataFrame index
# is written as a column depends on the library default -- confirm.
savepath = f's3://{bucket}/misc/edgar_masterhtml1_combined.csv'
wr.s3.to_csv(
    df=df,
    path=savepath,
    sep='|'
)