In [1]:
## Notebook for reading and then exporting all SEC data.
## Creates two DataFrames: cusip_symbol_df and ftd_df.
## cusip_symbol_df contains a list of all symbols and their CUSIPs; ftd_df contains all FTD data
## from January 2020 until October 2021.

In [2]:
import pandas as pd

import glob

from pathlib import Path
import csv

In [3]:
## Load pickle for exports and imports of data  
import pickle 
def load_obj(path):
    """Read a pickled object back from disk.

    Parameters
    ----------
    path : str or path-like
        Location of the pickle file; it is opened in binary read mode.

    Returns
    -------
    object
        Whatever object ``pickle.load`` reconstructs from the file.

    Notes
    -----
    Unpickling can execute arbitrary code, so only load files you created
    yourself or otherwise trust.
    """
    with open(path, 'rb') as pickle_file:
        restored = pickle.load(pickle_file)
    return restored
    
def save_obj(obj, path):
    """Pickle ``obj`` to ``path`` using the highest available protocol.

    Parameters
    ----------
    obj : object
        Any picklable object.
    path : str or path-like
        Destination file; it is opened in binary write mode, so an existing
        file at that location is overwritten.
    """
    with open(path, 'wb') as pickle_file:
        pickle.dump(obj, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [4]:
## Read all SEC data and create a big DataFrame 

## Header line of the SEC files, kept as one string. read_ftd_file looks for a column with
## exactly this name (every record arrives as a single pipe-delimited field) and splits it on "|".
header = "SETTLEMENT DATE|CUSIP|SYMBOL|QUANTITY (FAILS)|DESCRIPTION|PRICE"

## Make FTD read Function 
def read_ftd_file(csv_file_path):
    """Load one SEC FTD file into a tidy DataFrame.

    The file is read with ``pd.read_csv`` and is expected to yield a column
    whose name equals the module-level ``header`` string, i.e. each record is
    one pipe-delimited string. That column is split on ``|`` into six fields.

    Parameters
    ----------
    csv_file_path : str or path-like
        Location of the file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` (converted with ``pd.to_datetime``), ``CUSIP``,
        ``SYMBOL`` and ``QUANTITY_FAILS`` (these three stay as the split
        strings), sorted by ``SYMBOL`` then ``Date`` with a fresh 0..n-1 index.
    """
    raw = pd.read_csv(Path(csv_file_path), index_col=None)

    ## The last two rows are discarded -- presumably trailer/summary records; confirm against the files
    records = raw.iloc[:-2, :]

    ## Give the packed column a short name, then break it apart on the pipe delimiter
    packed = records.rename(columns={header: "Header"}).Header
    fields = packed.str.split("|", expand=True)

    field_names = ['Date', 'CUSIP', 'SYMBOL', 'QUANTITY_FAILS', 'DESCRIPTION', 'PRICE']
    fields = fields.rename(columns=dict(enumerate(field_names)))

    ## Description column has commas in data and is not relevant anyways
    ## Price column has errors and missing data. Source daily price from IEX later
    fields = fields.drop(columns=['DESCRIPTION', 'PRICE'])
    fields['Date'] = pd.to_datetime(fields['Date'])

    ordered = fields.sort_values(by=["SYMBOL", "Date"], ascending=[True, True])
    return ordered.reset_index(drop=True)

In [5]:
## Set Path variables for SEC data files 
path = r'C:\Users\watso\OneDrive\Projects\FintechProject2\SEC_Files_CSV' # USE YOUR PATH 
## glob returns matches in no guaranteed order; sort so the combined row order is reproducible
all_files = sorted(glob.glob(path + "/*.csv"))

## Parse every file first and concatenate once at the end. The previous loop compared
## the file name to 0 (never true) and re-concatenated the growing frame on every
## iteration, which copies all earlier rows again for each new file.
ftd_frames = []
for filename in all_files:
    ftd_frames.append(read_ftd_file(filename))

## pd.concat raises on an empty list, so fall back to an empty frame when no file matched.
## Each per-file frame keeps its own 0..n-1 row labels (no ignore_index), so labels
## repeat across files in the combined frame.
ftd_df = pd.concat(ftd_frames, axis='rows') if ftd_frames else pd.DataFrame()

ftd_df

Unnamed: 0,Date,CUSIP,SYMBOL,QUANTITY_FAILS
0,2020-01-02,G48833126,3126REORGPAY,36
1,2020-01-03,G48833126,3126REORGPAY,36
2,2020-01-06,G48833126,3126REORGPAY,36
3,2020-01-07,G48833126,3126REORGPAY,36
4,2020-01-10,00846U101,A,126
...,...,...,...,...
62280,2021-10-20,98986M103,ZYXI,266
62281,2021-10-26,98986M103,ZYXI,215
62282,2021-10-27,98986M103,ZYXI,95
62283,2021-10-28,98986M103,ZYXI,2104


In [6]:
## Print row count, column dtypes and memory usage of the combined frame.
## QUANTITY_FAILS shows up as object because read_ftd_file leaves the split fields as strings.
ftd_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2411268 entries, 0 to 62284
Data columns (total 4 columns):
 #   Column          Dtype         
---  ------          -----         
 0   Date            datetime64[ns]
 1   CUSIP           object        
 2   SYMBOL          object        
 3   QUANTITY_FAILS  object        
dtypes: datetime64[ns](1), object(3)
memory usage: 92.0+ MB


In [7]:
## Distinct (SYMBOL, CUSIP) pairs found in the FTD data, ordered by symbol
## NOTE(review): the recorded output includes a row with a blank SYMBOL -- decide whether to filter it
cusip_symbol_df = (
    ftd_df[['SYMBOL', 'CUSIP']]
    .drop_duplicates()
    .sort_values(by='SYMBOL')
    .reset_index(drop=True)
)
cusip_symbol_df

Unnamed: 0,SYMBOL,CUSIP
0,,Q0819E102
1,1208PS,466391208
2,3126REORGPAY,G48833126
3,4101REORGPYMT,599724101
4,4207REORGPYMNT,723664207
...,...,...
21778,ZYXI,98986M103
21779,ZZHGY,98955F105
21780,ZZLL,98880P202
21781,ZZZOD,98959W203


In [8]:
## Export DFs: each frame is written once as CSV and once as a pickle
csv_exports = (
    (ftd_df, '../Resources/ftd_all_data.csv'),
    (cusip_symbol_df, '../Resources/symbol_all_list.csv'),
)
pickle_exports = (
    (ftd_df, '../Resources/ftd_all_data.pkl'),
    (cusip_symbol_df, '../Resources/symbol_all_list.pkl'),
)

## CSV files first, then pickles (same write order as before); to_csv keeps the row index column
for export_frame, export_path in csv_exports:
    export_frame.to_csv(export_path)

for export_frame, export_path in pickle_exports:
    save_obj(export_frame, export_path)