In [1]:
import pandas as pd
import numpy as np
import PyPDF2


def read_pdf(file_path):
    """Reads the content of a PDF file and returns it as a string

    The text of every page is extracted in page order and concatenated
    without any separator. Both PyPDF2 APIs are supported: the current
    one (``PdfReader`` / ``pages`` / ``extract_text``) is used when the
    installed PyPDF2 provides it, otherwise the legacy one
    (``PdfFileReader`` / ``numPages`` / ``getPage`` / ``extractText``).

    Parameters:
    ------------
    file_path: str
        path to the PDF file to be read
    Returns:
    ------------
    text: str
        content of the PDF file
    """

    # Extraction happens inside the ``with`` block because the reader
    # pulls page data from the still-open file handle.
    with open(file_path, 'rb') as file:
        if hasattr(PyPDF2, 'PdfReader'):
            # Current API: the legacy names raise on PyPDF2 3.x.
            pdf_reader = PyPDF2.PdfReader(file)
            page_texts = [page.extract_text() for page in pdf_reader.pages]
        else:
            # Legacy API, for PyPDF2 releases that predate ``PdfReader``.
            pdf_reader = PyPDF2.PdfFileReader(file)
            page_texts = [
                pdf_reader.getPage(page_num).extractText()
                for page_num in range(pdf_reader.numPages)
            ]

    # Join once instead of growing a string with ``+=`` page by page.
    return ''.join(page_texts)

In [2]:

# NOTE(review): absolute, machine-specific path -- point it at your own copy
# of the XML index before running this cell on another machine.
xml_file_path = r'C:\Users\thoma\Downloads\2022FD\2022FD.xml'
import pandas as pd
from pathlib import Path

# Load the whole XML index into a DataFrame.
df = pd.read_xml(xml_file_path)

# Distinct last names across all rows (computed before any filtering).
unique_people = df["Last"].unique()

# Keep only the rows whose filing type is "P", drop the columns
# Prefix, Suffix and StateDst, and renumber the remaining rows from 0.
df_subset = (
    df[df['FilingType'] == 'P']
    .drop(columns=['Prefix', 'Suffix', "StateDst"])
    .reset_index(drop=True)
)

# Convert the year column to datetime.
df_subset['Year'] = pd.to_datetime(df_subset['Year'], format='%Y')

# Add a "corresponding_url" column holding the URL of each row's PDF file.
# Built with vectorized string concatenation rather than a row-wise apply,
# so it also yields a plain (empty) column when no row matches the filter.
df_subset["corresponding_url"] = (
    "https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/"
    + df_subset['Year'].dt.year.astype(str)
    + "/"
    + df_subset['DocID'].astype(str)
    + ".pdf"
)


In [3]:
df_subset

Unnamed: 0,Last,First,FilingType,Year,FilingDate,DocID,corresponding_url
0,Aderholt,Robert B.,P,2022-01-01,9/23/2022,20021740,https://disclosures-clerk.house.gov/public_dis...
1,Aderholt,Robert B.,P,2022-01-01,12/13/2022,20022132,https://disclosures-clerk.house.gov/public_dis...
2,Allen,Richard W.,P,2022-01-01,2/13/2022,20020448,https://disclosures-clerk.house.gov/public_dis...
3,Allen,Richard W.,P,2022-01-01,3/9/2022,20020585,https://disclosures-clerk.house.gov/public_dis...
4,Allen,Richard W.,P,2022-01-01,9/10/2022,20021681,https://disclosures-clerk.house.gov/public_dis...
...,...,...,...,...,...,...,...
619,Wittman,Robert J.,P,2022-01-01,9/10/2022,20021679,https://disclosures-clerk.house.gov/public_dis...
620,Wittman,Robert J.,P,2022-01-01,10/5/2022,20021807,https://disclosures-clerk.house.gov/public_dis...
621,Wittman,Robert J.,P,2022-01-01,12/8/2022,20022101,https://disclosures-clerk.house.gov/public_dis...
622,Womack,Steve,P,2022-01-01,11/29/2022,20022049,https://disclosures-clerk.house.gov/public_dis...


In [4]:
import requests
import os
import time

# Seconds to wait on the server before giving up; without a timeout a
# stalled connection blocks the loop indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Request every PDF URL in df_subset and report whether it is reachable.
# requests exceptions (connection errors, timeouts) are not caught here and
# stop the loop.
for index, row in df_subset.iterrows():

    url = row["corresponding_url"]

    name = row["Last"]

    # FilingDate is "month/day/year"; reorder it to "year-month-day".
    date_parts = row["FilingDate"].split("/")
    filing_data = date_parts[2] + "-" + date_parts[0] + "-" + date_parts[1]
    # NOTE(review): `name` and `filing_data` are not used yet -- presumably
    # meant for the output file name. The response body is never written to
    # disk in this cell; confirm whether saving the PDF is still to be added.

    print(url)
    print()

    # The context manager closes the response for every status code, not
    # only for 200, so no connection is left open on failed requests.
    with requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS) as response:
        if response.status_code == 200:
            print("Downloading...")
        else:
            # Report failures instead of skipping them silently.
            print(f"Skipping {url}: HTTP {response.status_code}")


    



https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2022/20021740.pdf

Downloading...
<Response [200]>
https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2022/20022132.pdf

Downloading...
<Response [200]>
https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2022/20020448.pdf

Downloading...
<Response [200]>
https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2022/20020585.pdf

Downloading...
<Response [200]>
https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2022/20021681.pdf

Downloading...
<Response [200]>
https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2022/20020708.pdf

Downloading...
<Response [200]>
https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2022/20020367.pdf

Downloading...
<Response [200]>
https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2022/20020474.pdf

Downloading...
<Response [200]>
https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2022/20021134.pdf

Downloading...
<Response [200]>
https://disclosures-clerk.ho

KeyboardInterrupt: 

: 

In [None]:
# Show the distinct values of the "Last" column of the unfiltered frame.
pd.unique(df["Last"])


array(['Hollier', 'Aadland', 'Acevedo-Arreguin', ..., 'Zinke', 'Zoll',
       'Zumbluskas'], dtype=object)