In [1]:
import pdfplumber
import pandas as pd

In [2]:
import requests

file_path = "gwinnett_county_permits.pdf"
url = "https://www.gwinnettcounty.com/static/departments/planning/pdf/2025_activity/gwinnett-county-building-permits-12302024-01032025.pdf"

response = requests.get(url) 

if response.status_code == 200: 
    with open(file_path, "wb") as f:
        f.write(response.content)
        print("success downloading")
else:
    print("failed")

success downloading


In [3]:
# Need to format downloaded pdf to be useable for pdfplumber
with open(file_path, "rb") as f:
    content = f.read() 

start_index = content.find(b'%PDF-1.7')

pdf_content = content[start_index:] 

with open(file_path, "wb") as f: 
    f.write(pdf_content)
    

In [4]:
with pdfplumber.open(file_path) as pdf: 
    pages = pdf.pages

    full_text = "" 
    for i in range(len(pages)): 
        full_text += pages[i].extract_text()

    with open("test.txt", "w", encoding="utf-8") as f: 
        f.write(full_text)
        import os 
        print(os.getcwd())
        print("Success")
        
    

C:\dsektop\data-cleaning-permits\gwinnett_county_data_scrape
Success


Relevant headers to extract: 
- Issue Date
- Case Number
- Project Name
- Census Code
- ST Address, City
- Comm/res
- Use of CONST
- Contractor
- Sewer/Septic (There are two labels for this in the data) 
- Use (Especially important to find if multiresidential)
- Type of Work
- Estimated Cost

In [9]:
# Approach:
# 1 Separate and store each case into a list of cases by using separator "CASE NUMBER" 
# 2 Initialize a list of the column headers. Then parse and add data to relevant columns
# 3 Turn the list into df 

# 1 
cases_list = full_text.split("CASE NUMBER")
cases_list = cases_list[1:]
# print("\n\n".join(cases_list))

# 2
# Parse data row by row 
rows = []
for i in range(len(cases_list)):
    case = cases_list[i]
    
    # case number 
    stop = case.index(" PARCEL") 
    case_num = case[:stop].strip() 
    # issue date
    start = case.index("ISSUED ON ") 
    stop = case.index("PROJECT") 
    issue_date = case[start + len("ISSUED ON "):stop].strip() 
    # project name 
    project_name = case[case.index("PROJECT:") + len("PROJECT: "):case.index("CENSUS")].strip()

    # FIX case number if formatting is weird
    case_num_append = case[case.index("PROJECT:") + len(project_name)+4:case.index("CENSUS")].strip() 
    if case_num_append.isdigit():
        case_num += case_num_append.strip()
        
    # census code
    census_code = case[case.index("CENSUS CODE:") + len("CENSUS CODE:"):case.index("CONTRACTOR")].strip()
    # address and city
    address, city = case[case.index("CITY:") + len("CITY:"):case.index("TENANT:")].strip().split(",")
    address, city = address.strip(), city.strip()
    # comm/res
    comm_or_res = case[case.index("COMM/RES:") + len("COMM/RES:"):case.index("USE")].strip()
    # use of construction 
    construction_purpose = case[case.index("USE OF CONST.:") + len("USE OF CONST.:"):].strip()
    unwanted = "TOTAL PERMITS ISSUED:"
    if unwanted in construction_purpose: 
        construction_purpose = construction_purpose[:construction_purpose.index(unwanted)]
    # contractor 
    contractor = case[case.index("CONTRACTOR:") + len("CONTRACTOR:") : case.index("ZONING")].strip()
    # sewer/septic address
    first_instance = case.index("Sewer/Septic:") 
    sewer_septic_address = case[first_instance + len("Sewer/Septic:") : case.index("HEATED")].strip()
    # whether sewer or septic
    include_strings = ["sewer", "septic"]
    sewer_or_septic = case[case.index("Sewer/Septic", first_instance+1) + len("Sewer/Septic:"):case.index("COMM")].strip()
    if not (sewer_or_septic.lower() in include_strings):
        sewer_or_septic = ""
    # use
    use = case[case.index("USE:") + len("USE:") : case.index("City")].strip()
    # type of work
    type_of_work = case[case.index("TYPE OF WORK:") + len("TYPE OF WORK:") : case.index("ONLY)")].strip()
    # estimated cost
    estimated_cost = case[case.index("ESTIMATED COST:") + len("ESTIMATED COST:") : case.index("BLOCK:")].strip()


    # Add everything to a dict and append it to the list 
    case_dict = {
        'case_number': case_num,
        'issue_date': issue_date,
        'census_code': census_code,
        'address': address,
        'city': city,
        'comm_or_res': comm_or_res,
        'construction_purpose': construction_purpose,
        'sewer_septic_address': sewer_septic_address,
        'sewer_or_septic': sewer_or_septic,
        'use': use,
        'estimated_cost': estimated_cost
    }

    rows.append(case_dict)

# 3
df = pd.DataFrame(rows)
df.head()

Unnamed: 0,case_number,issue_date,census_code,address,city,comm_or_res,construction_purpose,sewer_septic_address,sewer_or_septic,use,estimated_cost
0,BLD2024-01744,1/3/2025,"434 Alterations(Res), Deck, Porch, Remodel",338 RYSTON WAY,LAWRENCEVILLE,Residential,,,Sewer,,"$30,000.00"
1,BLD2024-03690,1/2/2025,"O/S Fire Repair, Sign, Above Ground Pool, Mech",2001 ELMPARK LN,SNELLVILLE,Residential,RESIDENTIAL REPAIRS,"STONE MOUNTAIN, GA 30087",,Single Family House,"$60,000.00"
2,BLD2024-03706,1/2/2025,"434 Alterations(Res), Deck, Porch, Remodel",2815 THE TERRACES WAY,DACULA,Residential,,"ACWORTH, GA 30101",Sewer,,"$30,000.00"
3,BLD2024-03744,1/2/2025,"434 Alterations(Res), Deck, Porch, Remodel",1865 TYLER TRCE,LAWRENCEVILLE,Residential,,"ACWORTH, GA 30101",Sewer,,"$45,000.00"
4,BLD2024-03821,1/3/2025,"434 Alterations(Res), Deck, Porch, Remodel",2358 BEAUCHAMP CT,BUFORD,Residential,,,Sewer,,"$36,000.00"


Unfortunately, most data in the pdf does not have "NO. OF UNITS", therefore we cannot extract specifically permits for residential buildings that have specific number of units

In [6]:
# Next Steps:
# 1 Filter relevant data by 'townhouse'

df_townhouses = df[df['use'].str.lower() == 'townhouse']
df_townhouses.head()

Unnamed: 0,case_number,issue_date,census_code,address,city,comm_or_res,construction_purpose,sewer_septic_address,sewer_or_septic,use,estimated_cost
74,RESBLD2024-04390,1/2/2025,"102 Single Family - Attached, Townhouse",591 PORCHLIGHT DR,LOGANVILLE,Residential,NEW TOWNHOUSE\n,"ATLANTA, GA 30326",Sewer,Townhouse,"$361,988.95"
75,RESBLD2024-04391,1/2/2025,"102 Single Family - Attached, Townhouse",581 PORCHLIGHT DR,LOGANVILLE,Residential,NEW TOWNHOUSE,"ATLANTA, GA 30326",Sewer,Townhouse,"$362,982.97"
76,RESBLD2024-04392,1/2/2025,"102 Single Family - Attached, Townhouse",571 PORCHLIGHT DR,LOGANVILLE,Residential,NEW TOWNHOUSE,"ATLANTA, GA 30326",Sewer,Townhouse,"$367,290.39"
77,RESBLD2024-04393,1/2/2025,"102 Single Family - Attached, Townhouse",561 PORCHLIGHT DR,LOGANVILLE,Residential,NEW TOWNHOUSE\n,"ATLANTA, GA 30326",Sewer,Townhouse,"$367,290.39"
78,RESBLD2024-04394,1/2/2025,101 Single Family - Detatched,551 PORCHLIGHT DR,LOGANVILLE,Residential,NEW TOWNHOUSE,"ATLANTA, GA 30326",Sewer,Townhouse,"$362,982.97"


In [7]:
df.to_csv("cleaned_gwinnett_permits.csv")
df_townhouses.to_csv("townhouses.csv")