In [1]:
import os
import requests
import time
import pandas as pd
import re
from dotenv import load_dotenv
from fuzzywuzzy import fuzz

In [2]:
# Load variables from .env, then read both API credentials from the environment
load_dotenv()
GOOGLE_API_KEY = os.environ.get("GOOGLE_PLACES_API_KEY")
YELP_API_KEY = os.environ.get("YELP_API_KEY")

# Fail fast when a credential is absent; the Google key is checked first
if GOOGLE_API_KEY is None:
    raise ValueError("Google API key not found in the environment")
if YELP_API_KEY is None:
    raise ValueError("YELP API key not found in the environment")

In [3]:
# Folder holding the scraped provider CSVs
parent_folder = 'webscraping outputs'

# Full relative path of every provider CSV; later cells index into this order
files = [
    f'{parent_folder}/{csv_name}'
    for csv_name in (
        'Illinois_Adult_Day_Health_Providers.csv',
        'Illinois_Behavior_Intervention_Providers.csv',
        'Illinois_CILA_Providers.csv',
        'Illinois_CLF_Providers.csv',
        'Illinois_Community_Day_Providers.csv',
        'Illinois_Home_Vehicle_Mod_Providers.csv',
        'Illinois_ICF_Providers.csv',
    )
]

In [4]:
# Load every CSV listed in `files` into a DataFrame; dfs[i] holds the data of files[i]
dfs = [pd.read_csv(csv_path) for csv_path in files]

In [5]:
def _to_e164_phone(phone_number, default_country_code='1'):
    """Convert a loosely formatted phone number to ``+<country code><number>``.

    Args:
        phone_number: Phone number as read from the data (string, number or
            a missing value such as None / NaN).
        default_country_code: Country code assumed when the number has exactly
            10 digits and no explicit ``+`` prefix.

    Returns:
        The number in international format, or None when the value is missing
        or does not look like a complete phone number.
    """
    if phone_number is None:
        return None
    if isinstance(phone_number, float):
        # NaN (e.g. a missing CSV cell) and other non-integral floats are not phone numbers
        if not phone_number.is_integer():
            return None
        phone_number = int(phone_number)

    raw = str(phone_number).strip()
    digits = re.sub(r'\D', '', raw)
    if raw.startswith('+') and digits:
        return f'+{digits}'
    if len(digits) == 10:
        return f'+{default_country_code}{digits}'
    if len(digits) == 11 and digits.startswith(default_country_code):
        return f'+{digits}'
    return None


def find_provider_by_address_or_phone_google(address, phone_number):
    """Look a provider up with Google's Find Place endpoint, by address then by phone.

    Args:
        address: Street address used as a free-text query.
        phone_number: Provider phone number; only used when the address
            lookup does not return status 'OK'.

    Returns:
        The decoded JSON response (dict). The address response is returned when
        its status is 'OK'. Otherwise the phone response is returned when its
        status is 'OK'. If neither lookup succeeds (or no usable phone number
        is available) the unsuccessful address response is returned.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    base_url = 'https://maps.googleapis.com/maps/api/place/findplacefromtext/json'
    timeout_seconds = 10  # without a timeout a stalled connection blocks forever
    # NOTE(review): 'formatted_phone_number' is a Place Details field; Find Place
    # is documented to reject unsupported field names, so both lookups request
    # the same fields - confirm against the Find Place field list.
    fields = 'place_id,name,formatted_address'

    # Search by address
    address_params = {
        'input': address,
        'inputtype': 'textquery',
        'fields': fields,
        'key': GOOGLE_API_KEY
    }
    address_response = requests.get(
        base_url, params=address_params, timeout=timeout_seconds
    ).json()

    # If a result is found by address, return it
    if address_response.get('status') == 'OK':
        return address_response

    # The phone lookup expects international format ("+" followed by country code)
    e164_phone = _to_e164_phone(phone_number)
    if e164_phone is None:
        return address_response

    # If no result is found by address, search by phone number
    phone_params = {
        'input': e164_phone,
        'inputtype': 'phonenumber',
        'fields': fields,
        'key': GOOGLE_API_KEY
    }
    phone_response = requests.get(
        base_url, params=phone_params, timeout=timeout_seconds
    ).json()

    # Prefer a successful phone lookup; otherwise keep reporting the address result
    if phone_response.get('status') == 'OK':
        return phone_response
    return address_response

def find_provider_by_name_google(provider_name):
    """Look a provider up by name with Google's Find Place endpoint.

    Args:
        provider_name: Provider name used as a free-text query.

    Returns:
        The decoded JSON response (dict) as returned by the API, whatever its
        'status' is.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    base_url = 'https://maps.googleapis.com/maps/api/place/findplacefromtext/json'
    name_params = {
        'input': provider_name,
        'inputtype': 'textquery',
        'fields': 'place_id,name,formatted_address',
        'key': GOOGLE_API_KEY
    }
    # A timeout keeps one stalled connection from blocking the whole run
    name_response = requests.get(base_url, params=name_params, timeout=10)
    return name_response.json()

def find_provider_by_name_yelp(provider_name, location):
    """Search Yelp's business search endpoint for a provider by name.

    Args:
        provider_name: Provider name used as the search term.
        location: Location string like "City, State". When it is missing or
            blank the search falls back to the whole of "Illinois".

    Returns:
        The decoded JSON response (dict) on HTTP 200, otherwise None (the
        error status and body are printed).

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    url = "https://api.yelp.com/v3/businesses/search"
    headers = {"Authorization": f"Bearer {YELP_API_KEY}"}

    # Use the caller's location; it was previously ignored in favour of a fixed state
    has_location = isinstance(location, str) and location.strip() != ""
    params = {
        "term": provider_name,
        "location": location.strip() if has_location else "Illinois",
        "limit": 1  # Only the top hit is inspected; adjust as needed
    }

    # A timeout keeps one stalled connection from blocking the whole run
    response = requests.get(url, headers=headers, params=params, timeout=10)
    if response.status_code == 200:
        return response.json()  # Returns a dictionary of results

    print(f"Yelp API Error: {response.status_code} - {response.text}")
    return None

In [6]:
# Normalize phone numbers
def normalize_phone(phone):
    """Normalize a phone number by removing every non-digit character.

    Args:
        phone: Phone number as a string, a number, or a missing value
            (None, empty string, NaN).

    Returns:
        A string containing only the digits of ``phone``; '' for missing or
        falsy values.
    """
    if isinstance(phone, float):
        # NaN is the only float that differs from itself; treat it as missing
        if phone != phone:
            return ''
        # A number stored as float (e.g. 8472920870.0) must not gain a trailing "0"
        if phone.is_integer():
            phone = int(phone)
    if not isinstance(phone, str):
        # Falsy values (None, 0) keep mapping to ''; other values are stringified
        phone = str(phone) if phone else ''
    return re.sub(r'\D', '', phone)

# Function to compare names with fuzzy matching
def verify_name_match(provider_name, candidate_name, threshold=50):
    """Decide whether two business names are similar enough to count as a match.

    Both names are lower-cased and stripped, then scored with
    ``fuzz.partial_ratio``.

    Args:
        provider_name: Name from the provider data.
        candidate_name: Name returned by the lookup service.
        threshold: Minimum similarity score that counts as a match.

    Returns:
        True when the score is at least ``threshold``. False when it is lower,
        when ``candidate_name`` is empty, or when either name is not a string
        (e.g. None or NaN for a missing value).
    """
    # Non-string values have no .lower(); a missing name can never match
    if not isinstance(provider_name, str) or not isinstance(candidate_name, str):
        return False
    if not candidate_name:  # Handle empty candidate_name
        return False
    similarity = fuzz.partial_ratio(provider_name.lower().strip(), candidate_name.lower().strip())
    return similarity >= threshold  # Match if similarity reaches the threshold

# Main function to verify the provider
def _candidate_matches(provider_name, phone_number, candidate_name, candidate_phone):
    """Return True when a candidate agrees with the provider on name or on phone."""
    if verify_name_match(provider_name, candidate_name):
        return True
    provider_digits = normalize_phone(phone_number)
    # Two missing phone numbers both normalize to ''; that must not count as a match
    return bool(provider_digits) and provider_digits == normalize_phone(candidate_phone)


def verify_provider(provider_name, address, phone_number, city, state):
    """Check whether a provider can be found on Google Places or, failing that, Yelp.

    Args:
        provider_name: Name of the provider to verify.
        address: Street address passed to the Google lookup.
        phone_number: Provider phone number, used for lookup and matching.
        city: City, combined with ``state`` into the Yelp location string.
        state: State, combined with ``city`` into the Yelp location string.

    Returns:
        ``(True, match)`` where ``match`` is the first Google candidate or Yelp
        business whose name or phone number matches, or ``(False, None)`` when
        neither service yields a match.
    """
    # First attempt: Google Places API
    google_result = find_provider_by_address_or_phone_google(address, phone_number)
    if google_result and google_result.get('status') == 'OK':
        for candidate in google_result.get('candidates', []):
            # `or ''` also covers keys that are present with a null value
            candidate_name = (candidate.get('name') or '').strip()
            candidate_phone = (candidate.get('formatted_phone_number') or '').strip()
            if _candidate_matches(provider_name, phone_number, candidate_name, candidate_phone):
                return True, candidate

    # Second attempt: Yelp API as fallback
    location = f"{city}, {state}"
    yelp_result = find_provider_by_name_yelp(provider_name, location)
    if yelp_result:
        for business in yelp_result.get("businesses", []):
            candidate_name = (business.get('name') or '').strip()
            candidate_phone = (business.get('display_phone') or '').strip()
            if _candidate_matches(provider_name, phone_number, candidate_name, candidate_phone):
                return True, business  # Return Yelp result if match found

    # No match found in either API
    return False, None

In [7]:
# # Illinois_Adult_Day_Health_Providers.csv
# adult_day_health_df = dfs[0][dfs[0]['State'] == 'IL']
# verified_providers = []

# for index, row in adult_day_health_df.iterrows():
#     provider_name = row['Provider Name']
#     provider_address = row['Street Address']
#     provider_phone_number = row['Phone Number']

#     exists, data = verify_provider(provider_name, provider_address, provider_phone_number)

#     if exists:
#         verified_providers.append({
#             'Provider Name': provider_name,
#             'Address': provider_address,
#             'Phone Number': provider_phone_number,
#             'verified': exists,
#         })
#         print(f"Place Verified: {provider_name}")
#     else:
#         print(f"Place Not Found: {provider_name}")

#     # Sleep to respect rate limits
#     time.sleep(1)

# # Save results
# verified_df = pd.DataFrame(verified_providers)
# verified_df.to_csv('Verified_Illinois_Adult_Day_Health_Providers.csv', index=False)

In [8]:
# Verify the CLF providers: dfs[3] is the frame read from Illinois_CLF_Providers.csv
# (the fourth entry of `files`), restricted to rows whose State is 'IL'.
clf_df = dfs[3][dfs[3]['State'] == 'IL']
adult_day_health_df = clf_df  # legacy name this cell used to assign; it never held Adult Day Health data
output_path = 'Verified_CLF_Providers.csv'
verified_providers = []

for index, row in clf_df.iterrows():
    # Extract provider details
    provider_name = row['Provider Name']
    provider_address = row['Street Address']
    provider_phone_number = row['Phone Number']
    city = row['City']
    state = row['State']

    print(f"\nProcessing: {provider_name}, {provider_address}, {provider_phone_number}")

    # Verify provider using Google API and fallback to Yelp API
    exists, data = verify_provider(provider_name, provider_address, provider_phone_number, city, state)

    if exists:
        verified_providers.append({
            'Provider Name': provider_name,
            'Address': provider_address,
            'Phone Number': provider_phone_number,
            'Verified': exists,
            'Matched Data': data  # Store the matched result for debugging
        })
        print(f"✅ Place Verified: {provider_name}")
    else:
        print(f"❌ Place Not Found: {provider_name}")

    # Sleep to respect rate limits
    time.sleep(1)

# Convert results to DataFrame and save to CSV
verified_df = pd.DataFrame(verified_providers)
verified_df.to_csv(output_path, index=False)

# Report the file that was actually written
print(f"\nVerification completed. Results saved to '{output_path}'.")



Processing: AVENUES TO INDEPENDENCE, 515 Busse Hwy, 847-292 0870


NameError: name 'find_provider_yelp' is not defined

In [None]:
# # Illinois_CLF_Providers.csv
# clf_df = dfs[3][dfs[3]['State'] == 'IL']
# verified_providers = []

# for index, row in clf_df.iterrows():
#     provider_name = row['Provider Name']
#     provider_address = row['Street Address']
#     provider_phone_number = row['Phone Number']

#     exists, data = verify_provider(provider_name, provider_address, provider_phone_number)

#     if exists:
#         verified_providers.append({
#             'Provider Name': provider_name,
#             'Address': provider_address,
#             'Phone Number': provider_phone_number,
#             'verified': exists,
#         })
#         print(f"Place Verified: {provider_name}")
#     else:
#         print(f"Place Not Found: {provider_name}")

#     # Sleep to respect rate limits
#     time.sleep(1)

# # Save results
# verified_df = pd.DataFrame(verified_providers)
# verified_df.to_csv('Verified_CLF_Providers.csv', index=False)