In [1]:
import pandas as pd

import glob

from pathlib import Path
import csv

In [2]:
## Pickle helpers for exporting data to disk and importing it back
import pickle 
def load_obj(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in binary read mode and is closed again before the
    function returns.

    Note: unpickling can execute arbitrary code, so only use this on
    pickle files that come from a trusted source.
    """
    with open(path, 'rb') as pickle_file:
        restored = pickle.load(pickle_file)
    return restored
    
def save_obj(obj, path):
    """Pickle ``obj`` into the file at ``path``.

    The file is opened in binary write mode (an existing file is
    overwritten) and the object is serialised with the highest pickle
    protocol the running interpreter supports.
    """
    with open(path, 'wb') as pickle_file:
        writer = pickle.Pickler(pickle_file, pickle.HIGHEST_PROTOCOL)
        writer.dump(obj)

In [3]:
def read_ftd_file(file_path):
    """Read one pipe-delimited fails-to-deliver text file into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the ``|``-separated text file. The file must contain
        the columns ``SETTLEMENT DATE``, ``CUSIP``, ``SYMBOL``,
        ``QUANTITY (FAILS)``, ``DESCRIPTION`` and ``PRICE``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Date`` (datetime), with the columns ``CUSIP``,
        ``SYMBOL`` and ``QUANTITY_FAILS``, sorted by ``SYMBOL`` and then
        ``Date``.

    Raises
    ------
    KeyError
        If ``DESCRIPTION`` or ``PRICE`` is missing from the file.
    """
    raw = pd.read_table(
        Path(file_path),
        sep="|",
        index_col=None,
        # Identifiers and dates are read as text so the result does not
        # depend on dtype inference: an all-numeric CUSIP column would
        # otherwise be parsed as a number and lose its leading zeros.
        dtype={'SETTLEMENT DATE': str, 'CUSIP': str, 'SYMBOL': str},
    )

    # Build the result as a chain of new frames instead of mutating the
    # iloc slice in place, which is what triggers SettingWithCopy warnings.
    return (
        raw.iloc[:-2]  ## Drop last two rows as they are useless to keep
        .drop(columns=['DESCRIPTION', 'PRICE'])
        .rename(columns={'SETTLEMENT DATE': 'Date',
                         'QUANTITY (FAILS)': 'QUANTITY_FAILS'})
        # Overwriting the existing column keeps its position in the frame.
        .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
        .sort_values(['SYMBOL', 'Date'], ascending=True)
        .set_index('Date')
    )

In [4]:
## Read every .txt file in the folder and combine them into one DataFrame
## Requires import glob to do
path = r'C:\Users\watso\OneDrive\Projects\FintechProject2_TWbranch\Fintech_Project_2\SEC_Text' # USE YOUR PATH
## glob.glob returns matches in arbitrary order; sorting makes the load order
## (and therefore the row order of the combined frame) reproducible
all_files = sorted(glob.glob(path + "/*.txt"))

loop_count = 0
error_count = 0
error_list = []
ftd_frames = []  ## one cleaned frame per file, concatenated once after the loop

for filename in all_files:
    ftd_df2 = read_ftd_file(filename)
    ftd_frames.append(ftd_df2)
    loop_count += 1
    ## Check only the file that was just read, so error_list names exactly the
    ## files that contain nulls (checking the running total would also flag
    ## every file read after the first bad one)
    if ftd_df2.isnull().values.any():
        error_list.append(filename)
        error_count += 1

## A single concat avoids re-copying the accumulated rows on every iteration;
## fall back to an empty frame when the folder holds no .txt files
ftd_df = pd.concat(ftd_frames, axis='rows') if ftd_frames else pd.DataFrame()

## Original author's note: only 3 null values out of 7 million rows,
## so the rows containing them are dropped

## reset_index moves the Date index back into an ordinary column
ftd_df = ftd_df.dropna().reset_index()
ftd_df

Unnamed: 0,Date,CUSIP,SYMBOL,QUANTITY_FAILS
0,2016-01-04,427093117,3117DVIPAY,32148.0
1,2016-01-05,427093117,3117DVIPAY,32148.0
2,2016-01-06,427093117,3117DVIPAY,32148.0
3,2016-01-07,427093117,3117DVIPAY,32148.0
4,2016-01-08,427093117,3117DVIPAY,32148.0
...,...,...,...,...
7022993,2021-10-20,98986M103,ZYXI,266.0
7022994,2021-10-26,98986M103,ZYXI,215.0
7022995,2021-10-27,98986M103,ZYXI,95.0
7022996,2021-10-28,98986M103,ZYXI,2104.0


In [5]:
## Inspection only: `filename` is the loop variable left over from the loading
## loop, so this shows the last file path that was read
filename

'C:\\Users\\watso\\OneDrive\\Projects\\FintechProject2_TWbranch\\Fintech_Project_2\\SEC_Text\\cnsfails202110b.txt'

In [6]:
## Summarise the combined frame: row count, column dtypes and memory usage
ftd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7022998 entries, 0 to 7022997
Data columns (total 4 columns):
 #   Column          Dtype         
---  ------          -----         
 0   Date            datetime64[ns]
 1   CUSIP           object        
 2   SYMBOL          object        
 3   QUANTITY_FAILS  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 214.3+ MB


In [7]:
## Sanity check: evaluates to True if any cell of ftd_df is null
ftd_df.isnull().values.any()

False

In [8]:
## Lookup table of the distinct SYMBOL/CUSIP pairs found in ftd_df,
## ordered by SYMBOL and renumbered with a fresh 0..n-1 index
cusip_symbol_df = (
    ftd_df[['SYMBOL', 'CUSIP']]
    .drop_duplicates()
    .sort_values('SYMBOL')
    .reset_index(drop=True)
)
cusip_symbol_df

Unnamed: 0,SYMBOL,CUSIP
0,0034RIGHTS,Y21990034
1,0122PIK,812350122
2,0297RTS,G72990297
3,0329REORG,G33990329
4,0888RTSPYMNT,529900888
...,...,...
33911,ZZLL,98880P103
33912,ZZLL,98880P202
33913,ZZLLD,98880P202
33914,ZZZOD,98959W203


In [9]:
## Sanity check: evaluates to True if any cell of cusip_symbol_df is null
cusip_symbol_df.isnull().values.any()

False

In [10]:
## Print QUANTITY_FAILS for the first (up to five) rows of ftd_df that contain
## a null in any column; an empty Series means no such rows exist
print(ftd_df[ftd_df.isnull().any(axis=1)]['QUANTITY_FAILS'].head())

Series([], Name: QUANTITY_FAILS, dtype: float64)


In [11]:
## Print SYMBOL for the first (up to five) rows of ftd_df that contain
## a null in any column; an empty Series means no such rows exist
print(ftd_df[ftd_df.isnull().any(axis=1)]['SYMBOL'].head())

Series([], Name: SYMBOL, dtype: object)


In [12]:
## Export DFs
## Each frame is written twice: once as CSV and once as a pickle
csv_targets = [
    (ftd_df, '../Resources/ftd_all_data.csv'),
    (cusip_symbol_df, '../Resources/symbol_all_list.csv'),
]
for frame, target in csv_targets:
    frame.to_csv(target)

pkl_targets = [
    (ftd_df, '../Resources/ftd_all_data.pkl'),
    (cusip_symbol_df, '../Resources/symbol_all_list.pkl'),
]
for frame, target in pkl_targets:
    save_obj(frame, target)