In [1]:
import os

### Parsing the XML files and saving the data into csv files

In order to parse the downloaded data, we use a Python class created by Pepe Tan and distributed under the MIT Licence. It reads the text of the filing and saves the information in a dataframe.

In [2]:
# we found this neat python class somebody made, it's perfect for our needs
# in order to parse the XML data and save it in a DataFrame

"""
Author: Pepe Tan
Date: 2020-10-06
MIT License
"""


import pandas as pd
from bs4 import BeautifulSoup
#from ticker_class import Ticker
from datetime import datetime



class Filing13F:
    """
    Common stock portfolio information of an institutional investor,
    parsed from a 13F-HR filing text file from the SEC Edgar database.

    Attributes set by ``parse_file``:
        filepath: path of the file that was parsed.
        company: text of the <name> tag inside <filingManager>.
        CIK: text of the first <cik> tag (Central Index Key).
        formtype: text of the first <type> tag.
        fileNumber: text of the <form13FFileNumber> tag.
        period_of_report_date: ``datetime.date`` parsed from <periodOfReport>.
        filing_date: ``datetime.date`` parsed from <signatureDate>.
        data: ``pandas.DataFrame`` with one row per holding that has no
            put/call marker, with columns 'filed name', 'cusip', 'value',
            'amount', 'put_or_call', 'owner', 'cik' and 'report_date'.
    """

    # If True prints out results in console
    debug = False

    def __init__(self, filepath=''):
        """
        Initialize the object; parse immediately when a path is given.

        Parameters:
            filepath: path of a 13F-HR text file. When empty (the default)
                nothing is parsed and ``parse_file`` must be called later.
        """
        self.filepath = filepath  # Path of file

        # Directly call parse_file() when filepath is provided with __init__
        if self.filepath:
            self.parse_file(self.filepath)

    def parse_file(self, filepath=''):
        """
        Parse the relevant information from a 13F-HR text file.

        Fills the header attributes (company, CIK, ...) and ``self.data``.

        Parameters:
            filepath: path of the 13F-HR text file to read.

        Raises:
            OSError: if the file cannot be opened.
            AttributeError: if an expected header or holding tag is missing
                (``find`` returns None for it).
            ValueError: if a date is not in month-day-year form or a
                value/amount is not an integer.
        """
        self.filepath = filepath  # Path of file

        if self.debug:
            print(self.filepath)

        # Read the document inside a context manager so the file handle is
        # closed as soon as BeautifulSoup has consumed it.
        with open(filepath) as doc:
            # OBS! XML parser will not work with SEC txt format
            soup = BeautifulSoup(doc, 'html.parser')

        # Print document structure and tags in console
        if self.debug:
            print(soup.prettify())

            for tag in soup.find_all(True):
                print(tag.name)

        ## --- Parse content using tag strings from txt document: <tag> content </tag>
        # OBS html.parser uses tags in lowercase

        # Name of filing company
        self.company = soup.find('filingmanager').find('name').string
        # Company identifier: Central Index Key
        self.CIK = soup.find('cik').string
        # Form type: 13F-HR
        self.formtype = soup.find('type').string
        # 13F-HR file number
        self.fileNumber = soup.find('form13ffilenumber').string
        # Reporting date (e.g. 03-31-2020)
        self.period_of_report_date = datetime.strptime(
            soup.find('periodofreport').string, '%m-%d-%Y').date()
        # Filing date (up to 45 days after reporting date)
        self.filing_date = datetime.strptime(
            soup.find('signaturedate').string, '%m-%d-%Y').date()

        ## --- Parse stock list: Each stock is marked with an infoTable parent tag
        stocklist = soup.find_all('infotable')  # List of parent tag objects

        # One list per output column; kept as a dict of lists so that a filing
        # without any infoTable still yields a frame with all columns.
        name = []     # Company name (plus title of class when not "COM")
        cusip = []    # CUSIP identifier
        value = []    # Total value of holdings
        amount = []   # Amount of stocks
        poc = []      # Put/Call marker, 'No' when the tag is absent

        for s in stocklist:
            # Company name & Title of class (e.g. COM, Class A, etc)
            n = s.find("nameofissuer").string.replace('.', '')  # Remove dots
            c = s.find("titleofclass").string
            name.append(n if c == "COM" else n + " (" + c + ")")

            cusip.append(s.find("cusip").string)
            value.append(int(s.find("value").string))
            amount.append(int(s.find("shrsorprnamt").find("sshprnamt").string))

            # Put/Call options
            put_or_call = s.find("putcall")
            poc.append(put_or_call.string if put_or_call else 'No')

        data = pd.DataFrame({"filed name": name, "cusip": cusip, "value": value,
                             "amount": amount, "put_or_call": poc})
        data['owner'] = self.company
        data['cik'] = self.CIK
        data['report_date'] = self.period_of_report_date

        # Keep only rows without a put/call option; the original row labels
        # are preserved (no re-indexing).
        self.data = data[data['put_or_call'] == 'No'].copy()

In [3]:
!pwd

/home/jradhima/kul_course/financial-network-analysis/notebooks


### Traverse over all subdirectories, find the 13F submissions, parse and save them in a list

In [4]:
# Parse every downloaded 13F submission below the download folder and collect
# one DataFrame per filing in `dfs`.
dfs = []
count = 0  # number of filings parsed so far

MAX_FILINGS = 750        # stop after this many filings have been parsed
PROGRESS_EVERY = 50      # print a progress message every N parsed filings
SUBMISSION_NAME = 'full-submission.txt'

# traverse everything starting from folder 'sec-edgar-filings'
# NOTE(review): os.walk does not guarantee a traversal order, so which
# filings fall inside the MAX_FILINGS cap may differ between machines.
for dirpath, dirnames, filenames in os.walk('../../sec2021/sec-edgar-filings/'):
    # only directories that hold a submission file are of interest
    if SUBMISSION_NAME not in filenames:
        continue

    filepath = os.path.join(dirpath, SUBMISSION_NAME)
    cleaned_path = filepath + 'new'

    # some xml files have a 'ns1:' prefix on their tags, which the parser's
    # tag lookups do not expect; strip it and write a cleaned copy next to
    # the original file
    with open(filepath) as f:
        newText = f.read().replace('ns1:', '')

    with open(cleaned_path, "w") as f:
        f.write(newText)

    # create a Filing object, parse the cleaned copy, and keep its data
    filing = Filing13F()
    filing.parse_file(cleaned_path)
    dfs.append(filing.data)

    count += 1
    if count % PROGRESS_EVERY == 0:
        print(f"Already parsed {count} filings!")
    if count >= MAX_FILINGS:
        break

Already parsed 50 filings!
Already parsed 50 filings!
Already parsed 50 filings!
Already parsed 100 filings!
Already parsed 100 filings!
Already parsed 100 filings!
Already parsed 150 filings!
Already parsed 150 filings!
Already parsed 150 filings!
Already parsed 200 filings!
Already parsed 200 filings!
Already parsed 200 filings!
Already parsed 200 filings!
Already parsed 200 filings!
Already parsed 200 filings!
Already parsed 250 filings!
Already parsed 250 filings!
Already parsed 250 filings!
Already parsed 300 filings!
Already parsed 300 filings!
Already parsed 300 filings!
Already parsed 350 filings!
Already parsed 350 filings!
Already parsed 350 filings!
Already parsed 350 filings!
Already parsed 350 filings!
Already parsed 350 filings!
Already parsed 350 filings!
Already parsed 350 filings!
Already parsed 350 filings!
Already parsed 400 filings!
Already parsed 400 filings!
Already parsed 400 filings!
Already parsed 450 filings!
Already parsed 450 filings!
Already parsed 450 fili

In [5]:
# check how many investors' filings we parsed

len(dfs)

750

### Look at the data

Looking at the data, we see that the pipeline works as we need it to. We get all the information we want and can proceed with saving the data and starting the analysis.

In [6]:
dfs[0].head()

Unnamed: 0,filed name,cusip,value,amount,put_or_call,owner,cik,report_date
0,89BIO INC,282559103,1072,44007,No,"SILVERARC CAPITAL MANAGEMENT, LLC",1816307,2020-12-31
1,ACTINIUM PHARMACEUTICALS INC,00507W206,702,90000,No,"SILVERARC CAPITAL MANAGEMENT, LLC",1816307,2020-12-31
2,ADAMAS PHARMACEUTICALS INC,00548A106,1909,440955,No,"SILVERARC CAPITAL MANAGEMENT, LLC",1816307,2020-12-31
3,AFFIMED N V,N01045108,2328,400000,No,"SILVERARC CAPITAL MANAGEMENT, LLC",1816307,2020-12-31
4,AKERO THERAPEUTICS INC,00973Y108,908,35179,No,"SILVERARC CAPITAL MANAGEMENT, LLC",1816307,2020-12-31


In [7]:
# Stick them all together: stack the per-filing frames in `dfs` into one long
# DataFrame. ignore_index=True discards each frame's own row labels and
# renumbers the rows of the result from 0.
# NOTE(review): the outputs recorded in this notebook show value/amount as
# integers in dfs[0] but as floats in df - presumably at least one frame
# contributes a non-integer dtype (e.g. a filing with no holdings); verify.

df = pd.concat(dfs, ignore_index=True)

In [8]:
df.shape

(335978, 8)

In [9]:
df.head()

Unnamed: 0,filed name,cusip,value,amount,put_or_call,owner,cik,report_date
0,89BIO INC,282559103,1072.0,44007.0,No,"SILVERARC CAPITAL MANAGEMENT, LLC",1816307,2020-12-31
1,ACTINIUM PHARMACEUTICALS INC,00507W206,702.0,90000.0,No,"SILVERARC CAPITAL MANAGEMENT, LLC",1816307,2020-12-31
2,ADAMAS PHARMACEUTICALS INC,00548A106,1909.0,440955.0,No,"SILVERARC CAPITAL MANAGEMENT, LLC",1816307,2020-12-31
3,AFFIMED N V,N01045108,2328.0,400000.0,No,"SILVERARC CAPITAL MANAGEMENT, LLC",1816307,2020-12-31
4,AKERO THERAPEUTICS INC,00973Y108,908.0,35179.0,No,"SILVERARC CAPITAL MANAGEMENT, LLC",1816307,2020-12-31


In [10]:
# Save the combined holdings to csv for further analysis.
# NOTE(review): index=False is not passed, so the row index is written as an
# extra unnamed leading column - confirm that downstream readers expect it.

df.to_csv('../datasets/filingsEnd2020.csv')