In [14]:
# Cell 1: Import required libraries
!pip install PyPDF2

import pandas as pd
import re
import PyPDF2
import warnings
warnings.filterwarnings('ignore')




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [15]:
# Cell 2: Define helper functions
def read_pdf(pdf_path):
    """Read text content from PDF file"""
    text = ""
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        num_pages = len(pdf_reader.pages)
        print(f"Processing {num_pages} pages...")
        
        for page_num in range(num_pages):
            page = pdf_reader.pages[page_num]
            text += page.extract_text() + "\n"
            
    return text

def clean_text(text):
    """Clean up common text issues"""
    return text.strip().replace('\n', ' ').replace('  ', ' ')

In [16]:
#Cell 3: Define the main extraction function
def extract_providers(text):
    providers = []
    current_provider = None
    lines = text.split('\n')
    
    for i, line in enumerate(lines):
        # Skip empty lines and page headers
        if not line.strip() or 'Primary care providers (PCPs)' in line or 'Total Number of Practitioners:' in line:
            continue
            
        # New provider entry typically starts with ALL CAPS name and credentials
        if re.match(r'^[A-Z][A-Z\s,\.]+(?:MD|DO|NP|PA|FNP)', line):
            if current_provider:
                providers.append(current_provider)
            
            current_provider = {
                'Provider_Name': line.strip(),
                'Practice_Name': '',
                'Specialty': 'FAMILY MEDICINE',
                'Phone': '',
                'Hours': '',
                'Street_Address': '',
                'City': '',
                'State': 'MA',
                'Zip': '',
                'Type_of_Provider': line.split(',')[-1].strip() if ',' in line else '',
                'Languages': [],
                'Hospital_Affiliations': [],
                'Provider_Affiliations': '',
                'Accepting_New_Patients': '',
                'Special_Services': [],
                'Symbols': []
            }
            continue
            
        if not current_provider:
            continue

        # Extract various fields...
        if not current_provider['Practice_Name'] and not re.match(r'\d', line) and not 'Hospital Affiliations:' in line:
            if re.match(r'^[A-Z][a-z]', line):
                current_provider['Practice_Name'] = clean_text(line)
                continue

        phone_match = re.search(r'\d{3}-\d{3}-\d{4}', line)
        if phone_match and not current_provider['Phone']:
            current_provider['Phone'] = phone_match.group()
            continue

        if re.search(r'(?:M|T|W|Th|F|Sa|Su).*(?:a|p)', line) and not re.search(r'\d{3}-\d{3}-\d{4}', line):
            current_provider['Hours'] = clean_text(line)
            continue

        if re.search(r'\d+.*(?:St|Dr|Rd|Ave|Way|Pkwy|Ln)', line) and not current_provider['Street_Address']:
            current_provider['Street_Address'] = clean_text(line)
            continue

        city_zip_match = re.search(r'([A-Za-z\s]+),\s*MA\s*(\d{5})', line)
        if city_zip_match and not current_provider['Zip']:
            current_provider['City'] = city_zip_match.group(1).strip()
            current_provider['Zip'] = city_zip_match.group(2)
            continue

        if 'Languages Spoken By The Provider:' in line:
            languages = line.split('Languages Spoken By The Provider:')[1].strip()
            current_provider['Languages'].append(languages)
            continue

        if 'Hospital Affiliations:' in line:
            affiliations = line.split('Hospital Affiliations:')[1].strip()
            current_provider['Hospital_Affiliations'].append(affiliations)
            continue

        if 'Provider Affiliations:' in line:
            current_provider['Provider_Affiliations'] = line.split('Provider Affiliations:')[1].strip()
            continue

        if 'Accepting New Patients?' in line:
            current_provider['Accepting_New_Patients'] = line.split('Accepting New Patients?')[1].strip()
            continue

        if re.match(r'^[A-Z][a-z].*(?:Issues|Disorders|Services)', line):
            current_provider['Special_Services'].append(clean_text(line))
            continue

        if re.match(r'^[a-z, ]+$', line):
            current_provider['Symbols'].append(line.strip())

    # Add the last provider
    if current_provider:
        providers.append(current_provider)

    # Convert to DataFrame and clean up
    df = pd.DataFrame(providers)
    
    # Join list fields with semicolons
    list_columns = ['Languages', 'Hospital_Affiliations', 'Special_Services', 'Symbols']
    for col in list_columns:
        df[col] = df[col].apply(lambda x: '; '.join(x) if x else '')

    return df


In [17]:
# Cell 4: Run the extraction
# Replace 'your_file.pdf' with your actual PDF filename
pdf_path = 'c:\\Users\\jlott\\Downloads\\PCP2.pdf'
text_content = read_pdf(pdf_path)
df = extract_providers(text_content)

Processing 8 pages...


In [18]:
# Cell 5: Display basic information about the extracted data
print(f"Total number of providers extracted: {len(df)}")
print("\nProvider types:")
print(df['Type_of_Provider'].value_counts())

Total number of providers extracted: 46

Provider types:
Type_of_Provider
MD              20
DO               5
NP               4
NANCY MD         2
SAMANTHA NP      1
NISHA FNP        1
JESSEN DO        1
RABIA MD         1
PATRICIA MD      1
FNP              1
ERICA NP         1
VIMAL MD         1
JESSICA MD       1
KRISTIN FNP      1
STEPHANIE NP     1
PA               1
KELLY MD         1
ARASH MD         1
RACHEL MD        1
Name: count, dtype: int64


In [20]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [24]:
# Cell 6: Save to Excel (optional)
output_file = 'c:\\Users\\jlott\\Downloads\\provider_directory.xlsx'
df.to_excel(output_file, index=False)
print(f"\nData saved to {output_file}")


Data saved to c:\Users\jlott\Downloads\provider_directory.xlsx


In [22]:
# Cell 7: Display sample of the data
df.head()

Unnamed: 0,Provider_Name,Practice_Name,Specialty,Phone,Hours,Street_Address,City,State,Zip,Type_of_Provider,Languages,Hospital_Affiliations,Provider_Affiliations,Accepting_New_Patients,Special_Services,Symbols
0,"NORTELUS, SAMANTHA NP",Manet Community Health Ctr,FAMILY MEDICINE,617-376-3000,Provider Affiliations: Manet,,North Quincy,MA,2171,SAMANTHA NP,,,,Limited,,"b, h, ml"
1,"OKALLO, JOSHUA L., MD",Manet Community Health Center,FAMILY MEDICINE,617-376-3000,Provider Affiliations: Manet,,North Quincy,MA,2171,MD,,,,Yes,,
2,"SHINER, ROBERT M., MD",Manet Community Health Ctr,FAMILY MEDICINE,617-376-3000,Provider Affiliations: Manet,,North Quincy,MA,2171,MD,,,,Yes,,
3,"TANIZZI, MEREDITH J., NP",Atrius Health Inc,FAMILY MEDICINE,781-682-0630,M T W Th F 8a-5p,90 Libbey Industrial Pkwy Ste 106,Weymouth,MA,2189,NP,,,Atrius Health Inc,Limited,,
4,"LVEY, AMANDA A., DO",Southcoast Physicians Group Inc,FAMILY MEDICINE,508-973-2216,Tobey Hospital - A Southcoast,,North Datmouth,MA,2740,DO,,Charlton,Southcoast,Limited,"Co-Occuring Disorders, Disabilites",


In [23]:
# Cell 8: Analysis cells (run these separately as needed)
# Number of providers by city
print("Providers by City:")
print(df['City'].value_counts())

# Languages offered
print("\nLanguages Available:")
languages = df['Languages'].str.split(';').explode().str.strip()
print(languages.value_counts())

# New patient acceptance status
print("\nNew Patient Acceptance Status:")
print(df['Accepting_New_Patients'].value_counts())

Providers by City:
City
                  11
Buzzards Bay       5
Hyannis            5
Orleans            4
North Quincy       3
Bourne             3
Harwich Port       3
Provincetown       3
South Yarmouth     2
Wellfleet          2
Weymouth           1
North Datmouth     1
Norwood            1
Harwich            1
Sandwich           1
Name: count, dtype: int64

Languages Available:
Languages
    46
Name: count, dtype: int64

New Patient Acceptance Status:
Accepting_New_Patients
Yes        27
No         14
Limited     5
Name: count, dtype: int64
