In [1]:
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz

In [2]:
# Page to scrape: the CILA quality scorecard data published on dhs.state.il.us.
url = 'https://www.dhs.state.il.us/OneNetLibrary/27896/documents/Scorecards/CILA-Quality-Scorecard-Data-2019-02-28.html'

# Without a timeout, requests waits indefinitely if the server never answers,
# which would hang the notebook on "Run All".
response = requests.get(url, timeout=30)

# 200 response means it can access the site
print(response.status_code)

# Stop here on a 4xx/5xx answer instead of parsing an error page in the
# cells below.
response.raise_for_status()

200


### Retrieving distinct service provider names

In [3]:
# Parse the downloaded page into a navigable tree with Python's built-in
# HTML parser.
agency_soup = BeautifulSoup(markup=response.text, features="html.parser")

In [4]:
# Locate the table whose HTML id is "AgencyTable".
table = agency_soup.find('table', id="AgencyTable")

# find() returns None when nothing matches. Fail here with a clear message
# rather than with an AttributeError on `table.find_all` in the next cell.
if table is None:
    raise ValueError('No <table id="AgencyTable"> found in the downloaded page')

In [5]:
# Collect the <tr> elements of every <tbody> in the agency table.
# Building one list (instead of reassigning `rows` inside a loop) keeps the
# rows of all <tbody> sections rather than only the last one, and leaves
# `rows` defined as an empty list when the table has no <tbody> at all.
rows = [
    tr
    for tbody in table.find_all('tbody')
    for tr in tbody.find_all('tr')
]

In [6]:
# The agency name is the text of the first <th> cell in each row.
name_data = []
for row in rows:
    header_cells = row.find_all('th')
    if not header_cells:
        # A row without any <th> has no name to read; indexing [0] on it
        # would raise IndexError.
        continue
    # strip=True drops surrounding whitespace/newlines, matching how the
    # CILA table cells are read later in this notebook.
    name_data.append(header_cells[0].get_text(strip=True))

# The single column is deliberately left with the default label 0: it is
# written to the CSV header as "0", which the validation step reads back via
# row["0"]. Passing columns=[0] keeps that header even when no names were found.
df = pd.DataFrame(name_data, columns=[0])
df.to_csv("idd_providers_illinois.csv", index=False)

### Retrieving all certified service provider listings (not de-duplicated) with additional info

In [7]:
# Second, independent parse of the same response text, used for the
# detailed CILA table below.
detailed_agency_soup = BeautifulSoup(markup=response.text, features="html.parser")

In [8]:
# Locate the table whose HTML id is "CilaTable".
cila_table = detailed_agency_soup.find('table', id="CilaTable")

# find() returns None when nothing matches. Fail here with a clear message
# rather than with an AttributeError on `cila_table.find_all` in the next cell.
if cila_table is None:
    raise ValueError('No <table id="CilaTable"> found in the downloaded page')

In [9]:
# Collect the <tr> elements of every <tbody> in the CILA table.
# `rows` is always rebuilt from `cila_table` here: with the earlier
# loop-and-reassign form, a table without any <tbody> would have silently
# left `rows` holding the agency-table rows from the previous section.
rows = [
    tr
    for tbody in cila_table.find_all('tbody')
    for tr in tbody.find_all('tr')
]

In [10]:
# Output field -> position of the corresponding <td> within a table row.
CILA_COLUMNS = {
    "Name": 0,
    "Zipcode": 1,
    "CoEd": 3,
    "Wheelchair Access": 6,
    "24HR Support": 7,
}
# A row must have at least this many <td> cells for every lookup to succeed.
MIN_CILA_CELLS = max(CILA_COLUMNS.values()) + 1

cila_data = []
for row in rows[1:]:  # Skip the header
    cells = row.find_all("td")
    if len(cells) < MIN_CILA_CELLS:
        # Short or malformed rows would raise IndexError below; skip them.
        continue
    cila_data.append(
        {field: cells[position].get_text(strip=True)
         for field, position in CILA_COLUMNS.items()}
    )

# Convert to DataFrame; naming the columns explicitly keeps the CSV header
# intact even if no row was collected.
detailed_df = pd.DataFrame(cila_data, columns=list(CILA_COLUMNS))
# NOTE(review): the file name says "distinct", but rows are written as
# scraped with no de-duplication — confirm the name is intended.
detailed_df.to_csv("distinct_idd_providers_illinois.csv", index=False)

### Validating Providers

In [12]:
# Load the provider names written earlier in this notebook; the single
# column is labelled "0" in the CSV header.
providers = pd.read_csv("idd_providers_illinois.csv")

# Work on the first three providers only, which keeps the number of API
# calls small. .copy() makes this an independent frame: head() returns a
# slice of the original, and the .loc assignments that add columns further
# down would otherwise trigger pandas' SettingWithCopyWarning.
providers = providers.head(3).copy()
def search_provider_online(provider_name, api_key=None, engine_id=None, timeout=10):
    """Search the Google Custom Search JSON API for a provider name.

    Args:
        provider_name: Text to search for.
        api_key: Google API key. When omitted, it is read from the
            CUSTOM_SEARCH_API_KEY environment variable.
        engine_id: Custom Search Engine id (the ``cx`` parameter). When
            omitted, it is read from CUSTOM_SEARCH_ENGINE_ID.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The list under the ``items`` key of the JSON response, or an empty
        list when credentials are missing, the request fails, or the
        response has no ``items``.
    """
    api_key = api_key or os.environ.get("CUSTOM_SEARCH_API_KEY")
    engine_id = engine_id or os.environ.get("CUSTOM_SEARCH_ENGINE_ID")
    if not api_key or not engine_id:
        print("Error during Google Search API request: "
              "CUSTOM_SEARCH_API_KEY / CUSTOM_SEARCH_ENGINE_ID not set")
        return []

    search_url = "https://www.googleapis.com/customsearch/v1"
    # Passing the query through `params` lets requests URL-encode it, so
    # names containing '&', '#', spaces, etc. no longer corrupt the query.
    params = {"key": api_key, "cx": engine_id, "q": provider_name}
    try:
        response = requests.get(search_url, params=params, timeout=timeout)
        response.raise_for_status()  # Check for HTTP errors
        return response.json().get("items", [])
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers an invalid JSON body. The exception text is not
        # printed because it can include the request URL, and that URL
        # carries the API key as a query parameter.
        status = getattr(getattr(e, "response", None), "status_code", None)
        print(f"Error during Google Search API request: "
              f"{type(e).__name__} (HTTP status: {status})")
        return []

def get_provider_details(provider_name, api_key=None, timeout=10):
    """Look up a provider with the Yelp business search endpoint.

    The search is restricted to ``location=Illinois``.

    Args:
        provider_name: Search term sent to Yelp.
        api_key: Yelp API key. When omitted, it is read from the
            YELP_API_KEY environment variable.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON response, or an empty dict when the key is
        missing or the request fails.
    """
    api_key = api_key or os.environ.get("YELP_API_KEY")
    if not api_key:
        print("Error during Yelp API request: YELP_API_KEY not set")
        return {}

    url = "https://api.yelp.com/v3/businesses/search"
    # Let requests URL-encode the search term instead of pasting it into
    # the URL by hand.
    params = {"term": provider_name, "location": "Illinois"}
    # The key has to be interpolated into the header; a plain
    # "Bearer {YELP_API_KEY}" string would send the braces literally.
    headers = {"Authorization": f"Bearer {api_key}"}
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        print(f"Error during Yelp API request: {e}")
        return {}

# Process each provider: confirm it shows up in web search results, then
# enrich verified providers with contact details from Yelp.

# A search-result title counts as a match when its fuzzy partial ratio
# against the provider name is above this score (0-100).
MATCH_THRESHOLD = 80
# Pause between providers to stay under the APIs' rate limits.
REQUEST_DELAY_SECONDS = 2

for index, row in providers.iterrows():
    provider_name = row["0"]

    # Blank CSV cells come back from read_csv as NaN (a float), which has
    # no .lower(); skip anything that is not a non-empty string.
    if not isinstance(provider_name, str) or not provider_name.strip():
        continue

    # Step 1: Verify existence using fuzzy matching
    search_results = search_provider_online(provider_name)
    name_lower = provider_name.lower()
    is_active = any(
        # .get() tolerates result items that come without a "title" key.
        fuzz.partial_ratio(name_lower, str(result.get("title", "")).lower()) > MATCH_THRESHOLD
        for result in search_results
    )

    # Step 2: Enrich information if active
    if is_active:
        details = get_provider_details(provider_name)
        # `or []` / `or {}` also cover keys that are present but null.
        businesses = details.get("businesses") or []
        if businesses:
            business = businesses[0]  # Take the first result
            location = business.get("location") or {}
            providers.loc[index, "Address"] = location.get("address1", "")
            providers.loc[index, "Phone"] = business.get("phone", "")
            providers.loc[index, "Yelp URL"] = business.get("url", "")

    # Delay to avoid rate limits
    time.sleep(REQUEST_DELAY_SECONDS)

# Save updated providers data
providers.to_csv("verified_and_enriched_providers.csv", index=False)