# Imports 

In [2]:
import json
import openai
from langchain.chat_models import ChatOpenAI
from langchain.chains import create_extraction_chain
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
import pandas as pd

GPT_MODEL = "gpt-3.5-turbo-0613"

# Utils

In [3]:
async def run_scrape(url):
    data = ""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)

        page = await browser.new_page()
        await page.goto(url)

        page_source = await page.content()
        soup = BeautifulSoup(page_source, "html.parser")
        
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        data = ' '.join(chunk for chunk in chunks if chunk)

        await browser.close()
    return data

def extract_doctor_info(schema, llm, scrape_results):
    extraction_chain = create_extraction_chain(schema, llm)
    return extraction_chain.run(scrape_results)

In [4]:
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613", openai_api_key=openai_key)

schema = {
    "properties": {
        "name": {"type": "string"},
        "clinic_location_name": {"type": "string"},
        "address": {"type": "string"}
    },
    "required": ["name", "clinic_location_name", "address"],
}

In [54]:
url = "https://www.forresthealth.org/doctors/profile/?id=226"

In [60]:
extract_doctor_info(schema, llm, output)

[{'name': 'Katherine P. Alexis, MD',
  'clinic_location_name': 'Oak Grove Family Clinic',
  'address': '5192 Old Highway 11 Hattiesburg, Mississippi 39402'}]

In [5]:
output = 'David Yowell - Forrest Health\nSkip to Content\nJavaScript has been disabled within your browser, the content or the functionality of this web page can be\nlimited or unavailable.\nSearch\nCall\nMenu\nAbout\nGiving\nBlog\nContact\nSpirit of Women\nSearch Site\n555-555-5555\nMain Navigation\nClasses & Events\nFind a Doctor\nOur Locations\nOur Services\nPatients & Visitors\nYou are here:\nForrest Health System\nFind a Doctor\nPhysician Profile\nAAA\nNew Search\nDavid R. Yowell, PhD\nProfile\nProfile\nGender:Male\nSpecialties\nPsychology\nService Line\nBehavioral Health\nOffice Information\nLocations\nHattiesburg Clinic - Psychology and Counseling\n102 Medical Park\nHattiesburg, Mississippi 39401'

In [30]:
ralph = "Ralph Sumner Abraham, IV, MD - Forrest Health Skip to Content JavaScript has been disabled within your browser, the content or the functionality of this web page can be limited or unavailable. Search Call Menu About Giving Blog Contact Spirit of Women Search Site 555-555-5555 Main Navigation Classes & Events Find a Doctor Our Locations Our Services Patients & Visitors You are here: Forrest Health System Find a Doctor Physician Profile AAA New Search Ralph Sumner Abraham IV, MD Profile Office Information Doctor's Contact Information Office Phone: 601-288-2150 Locations Relias Healthcare 201 W. Main Street Tupelo, Mississippi 38804 Main: 662-432-4106 Forrest General Hospital 6051 US Hwy. 49 Hattiesburg, Mississippi 39401 Main: 601-288-7000 Profile Office Information Share your experience and write a review! Careers & Volunteers Volunteers Careers Student Programs Medical Professionals Medical Staff Services Commitment to C.A.R.E FGH Family Medicine Residency Program Media and Newsroom Newsroom Blog Events Media Assets Vendors Vendor Registration Food Truck Vendor Form Community Involvement Annual Report Sponsorship Form Community Health Needs Assessment Spirit of Women Newsletter NOTICE OF PRIVACY PRACTICES · PUBLIC RECORDS REQUEST · FINANCIAL ASSISTANCE · NON-DISCRIMINATION NOTICE· NO SURPRISE BILLING RIGHTS · BCBS TRANSPARENCY IN COVERAGE · GOOD FAITH ESTIMATE NOTICE REQUEST FROM LAW ENFORCEMENT FOR RELEASE OF PROTECTED HEALTH INFORMATION · HATTIESBURG CLINIC ACO 6051 US HIGHWAY 49, HATTIESBURG MS 39401 · 601-288-7000 · © FORREST HEALTH · ALL RIGHTS RESERVED · Employment Policy: It is the policy of Forrest Health to recruit and select candidates for employment without regard to race, color, sex (including sexual orientation and gender identity), religion, national origin, age, disability or other status protected by applicable federal or state statutes. A Board of Trustees appointed by the Forrest County Board of Supervisors is charged with the oversight of Forrest Health. The system is completely self supporting and does not operate on local taxes.Forrest Health facilities are approved by the U.S. Department of Health and Human Services for participation in Medicare and Medicaid Programs."

In [6]:
output

'David Yowell - Forrest Health\nSkip to Content\nJavaScript has been disabled within your browser, the content or the functionality of this web page can be\nlimited or unavailable.\nSearch\nCall\nMenu\nAbout\nGiving\nBlog\nContact\nSpirit of Women\nSearch Site\n555-555-5555\nMain Navigation\nClasses & Events\nFind a Doctor\nOur Locations\nOur Services\nPatients & Visitors\nYou are here:\nForrest Health System\nFind a Doctor\nPhysician Profile\nAAA\nNew Search\nDavid R. Yowell, PhD\nProfile\nProfile\nGender:Male\nSpecialties\nPsychology\nService Line\nBehavioral Health\nOffice Information\nLocations\nHattiesburg Clinic - Psychology and Counseling\n102 Medical Park\nHattiesburg, Mississippi 39401'

In [9]:
import os
import glob
import ast

directory_path = './doctors'  # Replace with the actual directory path

# Find all .py files in the directory
file_pattern = os.path.join(directory_path, '*.py')
py_files = glob.glob(file_pattern)

# Loop over each .py file
for file_path in py_files:
    with open(file_path, 'r') as file:
        # Read the content of the file
        content = file.read()

        # Extract the data as a Python dictionary
        output_urls = []
        try:
            module = ast.parse(content)
            assign = module.body[0]
            if isinstance(assign, ast.Assign) and len(assign.targets) == 1 and isinstance(assign.targets[0], ast.Name) and assign.targets[0].id == 'data':
                data_node = assign.value
                if isinstance(data_node, ast.Str):
                    data_dict = ast.literal_eval(data_node.s)
                    # Process the data_dict
                    if isinstance(data_dict, list):
                        for item in data_dict:
                            if isinstance(item, dict) and 'url' in item:
                                url = item['url']
                                print(url)
                                output_urls.append(url)
                                # Process the URL as needed
                                # ...
                else:
                    print(f"Invalid format in file: {file_path}")
            else:
                print(f"No 'data' variable found in file: {file_path}")
        except SyntaxError:
            print(f"Invalid syntax in file: {file_path}")


https://www.forresthealth.org/doctors/profile/?id=301
https://www.forresthealth.org/doctors/profile/?id=525
https://www.forresthealth.org/doctors/profile/?id=318
https://www.forresthealth.org/doctors/profile/?id=1049
https://www.forresthealth.org/doctors/profile/?id=527
https://www.forresthealth.org/doctors/profile/?id=518
https://www.forresthealth.org/doctors/profile/?id=16
https://www.forresthealth.org/doctors/profile/?id=1039
https://www.forresthealth.org/doctors/profile/?id=35
https://www.forresthealth.org/doctors/profile/?id=311
https://www.forresthealth.org/doctors/profile/?id=1009
https://www.forresthealth.org/doctors/profile/?id=595
https://www.forresthealth.org/doctors/profile/?id=691
https://www.forresthealth.org/doctors/profile/?id=658
https://www.forresthealth.org/doctors/profile/?id=519
https://www.forresthealth.org/doctors/profile/?id=226
https://www.forresthealth.org/doctors/profile/?id=666
https://www.forresthealth.org/doctors/profile/?id=523
https://www.forresthealth.o

In [10]:
output_urls

['https://www.forresthealth.org/doctors/profile/?id=1009',
 'https://www.forresthealth.org/doctors/profile/?id=595',
 'https://www.forresthealth.org/doctors/profile/?id=691',
 'https://www.forresthealth.org/doctors/profile/?id=658',
 'https://www.forresthealth.org/doctors/profile/?id=519',
 'https://www.forresthealth.org/doctors/profile/?id=226',
 'https://www.forresthealth.org/doctors/profile/?id=666',
 'https://www.forresthealth.org/doctors/profile/?id=523',
 'https://www.forresthealth.org/doctors/profile/?id=1005',
 'https://www.forresthealth.org/doctors/profile/?id=451']

In [26]:
urls = [
    "https://www.forresthealth.org/doctors/profile/?id=278",
    "https://www.forresthealth.org/doctors/profile/?id=226",
    "https://www.forresthealth.org/doctors/profile/?id=1009"
]

In [27]:
import pandas as pd
import asyncio

# Create an empty list to store the data
data_list = []


for doctor_profile in urls:
    result = await run_scrape(doctor_profile)
    data_list.append({'url': doctor_profile, 'data': result})


In [28]:
df = pd.DataFrame(data_list)

In [29]:
df

Unnamed: 0,url,data
0,https://www.forresthealth.org/doctors/profile/...,David Yowell - Forrest Health Skip to Content ...
1,https://www.forresthealth.org/doctors/profile/...,Katherine Alexis - Forrest Health Skip to Cont...
2,https://www.forresthealth.org/doctors/profile/...,"Ralph Sumner Abraham, IV, MD - Forrest Health ..."


In [30]:
df.iloc[0]['data']

'David Yowell - Forrest Health Skip to Content JavaScript has been disabled within your browser, the content or the functionality of this web page can be limited or unavailable. Search Call Menu About Giving Blog Contact Spirit of Women Search Site 555-555-5555 Main Navigation Classes & Events Find a Doctor Our Locations Our Services Patients & Visitors You are here: Forrest Health System Find a Doctor Physician Profile AAA New Search David R. Yowell, PhD Profile Profile Gender:Male Specialties Psychology Service Line Behavioral Health Office Information Locations Hattiesburg Clinic - Psychology and Counseling 102 Medical Park Hattiesburg, Mississippi 39401 Profile Office Information Share your experience and write a review! Careers & Volunteers Volunteers Careers Student Programs Medical Professionals Medical Staff Services Commitment to C.A.R.E FGH Family Medicine Residency Program Media and Newsroom Newsroom Blog Events Media Assets Vendors Vendor Registration Food Truck Vendor Form

In [31]:
def shorten_blurb(text, sentence):
    # idx start
    start_index = text.find(sentence)

    cleaned = text[:start_index]

    return cleaned

In [32]:
df['data'] = df['data'].apply(lambda x: shorten_blurb(x, 'Share your experience and write a review!'))

In [33]:
df.iloc[0]['data']

'David Yowell - Forrest Health Skip to Content JavaScript has been disabled within your browser, the content or the functionality of this web page can be limited or unavailable. Search Call Menu About Giving Blog Contact Spirit of Women Search Site 555-555-5555 Main Navigation Classes & Events Find a Doctor Our Locations Our Services Patients & Visitors You are here: Forrest Health System Find a Doctor Physician Profile AAA New Search David R. Yowell, PhD Profile Profile Gender:Male Specialties Psychology Service Line Behavioral Health Office Information Locations Hattiesburg Clinic - Psychology and Counseling 102 Medical Park Hattiesburg, Mississippi 39401 Profile Office Information '

In [34]:
df['doctor_data'] = df['data'].apply(lambda x: extract_doctor_info(schema, llm, x))

In [36]:
df

Unnamed: 0,url,data,doctor_data
0,https://www.forresthealth.org/doctors/profile/...,David Yowell - Forrest Health Skip to Content ...,"[{'name': 'David Yowell', 'clinic_location_nam..."
1,https://www.forresthealth.org/doctors/profile/...,Katherine Alexis - Forrest Health Skip to Cont...,"[{'name': 'Katherine P. Alexis, MD', 'clinic_l..."
2,https://www.forresthealth.org/doctors/profile/...,"Ralph Sumner Abraham, IV, MD - Forrest Health ...","[{'name': 'Ralph Sumner Abraham, IV, MD', 'cli..."


In [39]:
from pprint import pprint
pprint(df.iloc[2]['doctor_data'])

[{'address': '201 W. Main Street Tupelo, Mississippi 38804',
  'clinic_location_name': 'Relias Healthcare',
  'name': 'Ralph Sumner Abraham, IV, MD'},
 {'address': '6051 US Hwy. 49 Hattiesburg, Mississippi 39401',
  'clinic_location_name': 'Forrest General Hospital',
  'name': 'Ralph Sumner Abraham IV, MD'}]


In [43]:
# Function to process each list within the column
final_list = []
def process_list(lst):
    for item in lst:
        # Do something with each item in the list
        final_list.append(item)
        
df['doctor_data'].apply(lambda x: process_list(x))

output_df = pd.DataFrame(final_list)

df_filtered = output_df[output_df['address'].str.contains('Hattiesburg')]

In [44]:
df['doctor_data'].apply(lambda x: process_list(x))

0    None
1    None
2    None
Name: doctor_data, dtype: object

In [45]:
final_list

[{'name': 'David Yowell',
  'clinic_location_name': 'Hattiesburg Clinic - Psychology and Counseling',
  'address': '102 Medical Park Hattiesburg, Mississippi 39401'},
 {'name': 'Katherine P. Alexis, MD',
  'clinic_location_name': 'Oak Grove Family Clinic',
  'address': '5192 Old Highway 11 Hattiesburg, Mississippi 39402'},
 {'name': 'Ralph Sumner Abraham, IV, MD',
  'clinic_location_name': 'Relias Healthcare',
  'address': '201 W. Main Street Tupelo, Mississippi 38804'},
 {'name': 'Ralph Sumner Abraham IV, MD',
  'clinic_location_name': 'Forrest General Hospital',
  'address': '6051 US Hwy. 49 Hattiesburg, Mississippi 39401'}]

In [47]:
output_df = pd.DataFrame(final_list)

In [51]:
output_df

Unnamed: 0,name,clinic_location_name,address
0,David Yowell,Hattiesburg Clinic - Psychology and Counseling,"102 Medical Park Hattiesburg, Mississippi 39401"
1,"Katherine P. Alexis, MD",Oak Grove Family Clinic,"5192 Old Highway 11 Hattiesburg, Mississippi 3..."
2,"Ralph Sumner Abraham, IV, MD",Relias Healthcare,"201 W. Main Street Tupelo, Mississippi 38804"
3,"Ralph Sumner Abraham IV, MD",Forrest General Hospital,"6051 US Hwy. 49 Hattiesburg, Mississippi 39401"


In [52]:
df_filtered = output_df[output_df['address'].str.contains('Hattiesburg')]

In [53]:
df_filtered

Unnamed: 0,name,clinic_location_name,address
0,David Yowell,Hattiesburg Clinic - Psychology and Counseling,"102 Medical Park Hattiesburg, Mississippi 39401"
1,"Katherine P. Alexis, MD",Oak Grove Family Clinic,"5192 Old Highway 11 Hattiesburg, Mississippi 3..."
3,"Ralph Sumner Abraham IV, MD",Forrest General Hospital,"6051 US Hwy. 49 Hattiesburg, Mississippi 39401"


In [54]:
df_filtered.to_csv('output.csv', index=False)