In [1]:
import requests
import zipfile
import pandas as pd
import pypdf
import io
import re

# Filing year to analyse: selects both the yearly disclosure archive and the per-document PDF links.
YEAR = 2023

In [2]:
def getDisclosuresLink(year: int) -> tuple:
    """
    Builds the download link to the financial disclosures archive for a given year.

    Args:
        year: Filing year, e.g. 2023.

    Returns:
        A ``(link, yearFD)`` tuple of two strings: ``link`` is the URL of the yearly
        zip archive and ``yearFD`` is the ``"<year>FD"`` stem that the archive is
        named after.
    """
    yearFD = f"{year}FD"
    baseLink = "https://disclosures-clerk.house.gov/public_disc/financial-pdfs/"
    return (f"{baseLink}{yearFD}.zip", yearFD)

def getFilingsDataFrame(zipStream: io.BytesIO, yearFD: str) -> pd.DataFrame:
    """
    Lists the type-"P" filings recorded in a yearly disclosure archive.

    Opens the zip held in ``zipStream``, reads the tab-separated member named
    ``<yearFD>.txt`` and keeps only the rows whose "FilingType" equals "P"
    (presumably the periodic transaction reports - confirm against the data source).

    Returns:
        A dataframe with the columns ["Last", "First", "FilingType", "DocID"];
        the row labels are those of the rows kept from the index file.
    """
    wantedColumns = ["Last", "First", "FilingType", "DocID"]
    indexName = yearFD + ".txt"

    with zipfile.ZipFile(zipStream, 'r') as archive, archive.open(indexName) as indexFile:
        filings = pd.read_csv(indexFile, sep='\t')

    isTypeP = filings["FilingType"] == "P"
    return filings.loc[isTypeP, wantedColumns]

def getDocumentLink(year: int, documentId: int) -> str:
    """
    Builds the link to the pdf of one specific disclosure.

    Args:
        year: Filing year; it forms the folder part of the link, so it has to match
            the year the document was filed under.
        documentId: Identifier of the disclosure; it forms the pdf file name.
    """
    baseLink = "https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/"
    return f"{baseLink}{year}/{documentId}.pdf"

def createTransactionDf(matches: list, documentText: str) -> pd.DataFrame:
    """
    Creates a dataframe with columns ["Sale or Purchase", "Ticker", "Asset Type", "Amount", "Date"].

    Uses the collection of stock ticker matches to split a document into transactions:
    each transaction is the text from the start of one ticker match up to the start of
    the next one (the last transaction runs to the end of the document).

    Args:
        matches: The ``re.Match`` objects of the tickers found in ``documentText``, in
            document order. Any iterable is accepted; it is materialised into a list.
        documentText: The full text the matches were found in.

    Returns:
        One row per ticker match, as produced by ``parseTransaction``. An empty
        collection of matches gives an empty dataframe that still has the five columns.
    """
    columns = ["Sale or Purchase", "Ticker", "Asset Type", "Amount", "Date"]
    matches = list(matches)

    # The start of the next transaction is the end of the current one
    starts = [match.start() for match in matches]
    ends = starts[1:] + [len(documentText)]

    cleanTransactions = []
    for match, start, end in zip(matches, starts, ends):
        # Get rid of all commas to better parse numbers
        transaction = documentText[start:end].replace(',', '')
        cleanTransactions.append(parseTransaction(transaction, match.group()))
    return pd.DataFrame(cleanTransactions, columns=columns)

def parseTransaction(transaction: str, ticker: str = "") -> list:
    """
    Parse a transaction string into a list of values ["Sale or Purchase", "Ticker", "Asset Type", "Amount", "Date"].

    The text is scanned for a closing square brace: the two characters in front of it are
    taken as the asset code ('ST' or 'OP' are the ones acted on) and the first upper-case
    character after it is taken as the sale/purchase marker.

    Args:
        transaction: Text of a single transaction. Numbers are expected without
            thousands separators, since the amount patterns only match plain digits.
        ticker: Ticker symbol to place in the result unchanged.

    Returns:
        ``[saleOrPurchase, ticker, assetType, amount, date]`` where
        - saleOrPurchase is the marker character, or None when the text has no ']' or no
          upper-case character after it (this includes empty text);
        - assetType is "stock", "call" or "put";
        - amount is the digit string in front of "shares"/"stocks" (code 'ST') or in front
          of "call"/"put" (code 'OP'), or None when nothing matched;
        - date is the first MM/DD/YYYY-shaped date in the text, or None.
    """
    saleOrPurchase = None
    assetCode = ""
    amount = None
    assetType = "stock"

    # The sale/purchase marker is always found after the square braces
    foundSquareBraces = False
    for i, char in enumerate(transaction):
        if char == ']':
            foundSquareBraces = True
            # max() keeps the slice from wrapping around when ']' is among the first two characters
            assetCode = transaction[max(i - 2, 0):i]
            continue
        if foundSquareBraces and char.isupper():
            saleOrPurchase = char
            break

    # Get date, amount, and asset type
    date = getFirstMatch(r"\b\d{1,2}/\d{1,2}/\d{4}\b", transaction) # Parse date in the format MM/DD/YYYY
    if assetCode == 'ST':
        amount = getFirstMatch(r"\d+(?=\sshares|\sstocks)", transaction) # Parse amount of shares - "#### shares" or "#### stocks"
    elif assetCode == 'OP':
        amount = getFirstMatch(r"\d+(?=\scall|\scalls)", transaction)
        assetType = "call"
        if amount is None:
            # NOTE(review): an option with neither a call nor a put amount is still labelled "put"
            amount = getFirstMatch(r"\d+(?=\sput|\sputs)", transaction)
            assetType = "put"

    return [saleOrPurchase, ticker, assetType, amount, date]

def getFirstMatch(regex: str, text: str) -> "str | None":
    """
    Gets the first match of a regex in a string. None if no match is found.

    Args:
        regex: Pattern to look for.
        text: Text to search.

    Returns:
        The first element of ``re.findall(regex, text)``, or None when there is no
        match. For a pattern without capturing groups this is the matched text; with
        capturing groups ``re.findall`` yields the group contents instead.
    """
    matches = re.findall(regex, text)
    return matches[0] if matches else None

def processDocument(documentText: str) -> pd.DataFrame:
    """
    Process a document and return a dataframe with the transactions found in it.

    A ticker is a run of one or more upper-case letters that is directly enclosed in
    parentheses, e.g. "(AAPL)". Every ticker found starts a new transaction; the
    splitting and parsing are delegated to ``createTransactionDf``.

    Args:
        documentText: The text of the whole document.
    """
    # Parse tickers [A-Z] but do not include the parentheses as part of the match.
    # `+` rather than `*`: an empty pair of parentheses "()" must not count as a (blank) ticker.
    tickerList = list(re.finditer(r"(?<=\()[A-Z]+(?=\))", documentText))
    return createTransactionDf(tickerList, documentText)


In [3]:
# Seconds to wait on the server before giving up; without it a stalled connection hangs the cell.
REQUEST_TIMEOUT_SECONDS = 30

# Download the yearly archive that indexes every filing
dataLink, yearFD = getDisclosuresLink(YEAR)
response = requests.get(dataLink, timeout=REQUEST_TIMEOUT_SECONDS)

# Raise instead of calling exit(): in a notebook exit() shuts the kernel down
if response.status_code != 200:
    raise RuntimeError("Error retrieving data: " + str(response.status_code))

df = getFilingsDataFrame(io.BytesIO(response.content), yearFD)
df = df[df["Last"] == "Pelosi"] # Filter to only get Pelosi's transactions

# Download every listed pdf, extract its text and parse the transactions out of it
documentFrames = []
for _, data in df.iterrows():
    documentLink = getDocumentLink(YEAR, data['DocID'])
    pdfResponse = requests.get(documentLink, timeout=REQUEST_TIMEOUT_SECONDS)

    # Report the status of the pdf request itself, not of the archive request
    if pdfResponse.status_code != 200:
        raise RuntimeError("Error retrieving data: " + str(pdfResponse.status_code))

    pdf = pypdf.PdfReader(io.BytesIO(pdfResponse.content))
    print("Processing document: " + documentLink)

    # `or ""` guards against a page for which no text could be extracted
    documentText = "".join(page.extract_text() or "" for page in pdf.pages)
    documentFrames.append(processDocument(documentText))

# Concatenate once at the end instead of growing the dataframe inside the loop
if documentFrames:
    transactions = pd.concat(documentFrames, axis=0, ignore_index=True)
else:
    transactions = pd.DataFrame(columns=["Sale or Purchase", "Ticker", "Asset Type", "Amount", "Date"])

Processing document: https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2023/20022320.pdf
Processing document: https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2023/20022664.pdf
Processing document: https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2023/20023080.pdf
Processing document: https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2023/20023192.pdf
Processing document: https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2023/20024186.pdf
Processing document: https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2023/20022260.pdf


In [5]:
# Display the combined table of parsed transactions (one row per ticker match across all documents).
transactions

Unnamed: 0,Sale or Purchase,Ticker,Asset Type,Amount,Date
0,S,RBLX,call,100,01/20/2023
1,P,AAPL,stock,10000,03/17/2023
2,S,AAPL,stock,2900,05/08/2023
3,P,AAPL,stock,5000,06/15/2023
4,P,MSFT,stock,5000,06/15/2023
5,P,NVDA,call,50,11/22/2023
6,S,GOOGL,stock,10000,12/28/2022
7,S,GOOGL,stock,10000,12/20/2022
8,S,GOOGL,stock,10000,12/21/2022
9,S,NFLX,stock,1000,12/29/2022
