In [1]:
import time
import googlemaps
import pandas as pd
import requests
from bs4 import BeautifulSoup
from itertools import product

def miles_to_meters(miles):
    return miles * 1_609.344

def extract_email_from_website(url):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            emails = set()
            for mailto in soup.select('a[href^=mailto]'):
                emails.add(mailto.get('href').replace('mailto:', ''))
            if emails:
                return ', '.join(emails)
        return None
    except Exception as e:
        return None

def create_non_overlapping_grid(center_lat, center_lng, radius_miles):
    step_miles = 1.5 * radius_miles  # Slightly smaller step size for more grid points
    step_lat = step_miles / 69  # Approx. 69 miles per degree of latitude
    step_lng = step_miles / (69 * abs(center_lat))  # Adjust for latitude compression

    lat_range = [center_lat + i * step_lat for i in range(-2, 3)]  # More grid points
    lng_range = [center_lng + i * step_lng for i in range(-2, 3)]

    return list(product(lat_range, lng_range))

def map_category(place_types):
    type_map = {
        'computer_store': 'Computer Store',
        'electronics_store': 'Electronics Store',
        'computer_repair': 'Computer Repair Service',
        'point_of_interest': 'Point of Interest',
        'establishment': 'Establishment'
    }
    for place_type in place_types:
        if place_type in type_map:
            return type_map[place_type]
    return 'Other'

API_KEY = 'AIzaSyCgIqmQtlJAoWP18glXkyIiX6jFpijDuA4'
map_client = googlemaps.Client(API_KEY)

# Grid centers to cover the entire United States
grid_centers = [
    (37.0902, -95.7129),  # Central USA
]

radius_miles = 100  # Larger search radius to cover more area
all_detailed_businesses = []
seen_place_ids = set()  # To avoid duplicates

search_queries = [
    'small private cardiology',
    'small private fertility',
]

for center_lat, center_lng in grid_centers:
    grid = create_non_overlapping_grid(center_lat, center_lng, radius_miles)
    
    for search_string in search_queries:
        business_list = []
        
        for (lat, lng) in grid:
            while True:
                response = map_client.places_nearby(
                    location=(lat, lng),
                    keyword=search_string,
                    radius=miles_to_meters(radius_miles)
                )
                
                business_list.extend(response.get('results', []))
                next_page_token = response.get('next_page_token')
                
                if not next_page_token:
                    break
                
                time.sleep(3)
                response = map_client.places_nearby(
                    location=(lat, lng),
                    keyword=search_string,
                    radius=miles_to_meters(radius_miles),
                    page_token=next_page_token
                )
                business_list.extend(response.get('results', []))
            
        # Fetch detailed information for each business and avoid duplicates
        for business in business_list:
            place_id = business.get('place_id')
            
            if place_id in seen_place_ids:
                continue
            seen_place_ids.add(place_id)
            
            details = map_client.place(place_id=place_id)
            result = details.get('result', {})
            
            title = result.get('name')
            placeURL = 'https://www.google.com/maps/place/?q=place_id:' + place_id
            website = result.get('website')
            rating = result.get('rating')
            reviewCount = result.get('user_ratings_total')
            category = map_category(result.get('types', [])) if 'types' in result else 'Other'
            address = result.get('formatted_address')
            phoneNumber = result.get('formatted_phone_number')
            email = extract_email_from_website(website) if website else None
            
            business_info = {
                'title': title,
                'placeURL': placeURL,
                'website': website,
                'rating': rating,
                'reviewCount': reviewCount,
                'category': search_string.split(' ')[-1],  # Extract category from search string
                'address': address,
                'phoneNumber': phoneNumber,
                'searchQuery': search_string,
                'email': email
            }
            
            all_detailed_businesses.append(business_info)

# Convert the results to a DataFrame and save as a CSV
df = pd.DataFrame(all_detailed_businesses)
print('Done')

TransportError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [None]:
''' 
For small HVAC/roofing/plumbing also maybe add in cable, window cleaning, ladder scaffolding
For small medical spa/self care maybe also add in laser hair removal clinic, facial clinic, nail salon
For small laundromat maybe also add in dry cleaning 

Radius of Search: The total search radius is set to 20 miles from the center. This means that the code will collect businesses within a radius of 20 miles from the initial center point.

Step Size: The grid is divided into smaller cells with a 5-mile step size. This means that the search area is divided into a grid where each cell is roughly 5 miles by 5 miles, and the Google Places API is queried at each of these grid points.
'''

' \nFor small HVAC/roofing/plumbing also maybe add in cable, window cleaning, ladder scaffolding\nFor small medical spa/self care maybe also add in laser hair removal clinic, facial clinic, nail salon\nFor small laundromat maybe also add in dry cleaning \n\nRadius of Search: The total search radius is set to 20 miles from the center. This means that the code will collect businesses within a radius of 20 miles from the initial center point.\n\nStep Size: The grid is divided into smaller cells with a 5-mile step size. This means that the search area is divided into a grid where each cell is roughly 5 miles by 5 miles, and the Google Places API is queried at each of these grid points.\n'

In [None]:
df.head()

Unnamed: 0,title,placeURL,website,rating,reviewCount,category,address,phoneNumber,searchQuery,email
0,"Peter Banitt, MD, FACC",https://www.google.com/maps/place/?q=place_id:...,https://www.oregonclinic.com/our-team/peter-ba...,4.7,49.0,cardiology,"19260 SW 65th Ave Suite 420, Tualatin, OR 9706...",(503) 692-0405,small private cardiology,
1,"William M. Davies, MD, FACC",https://www.google.com/maps/place/?q=place_id:...,https://www.providence.org/doctors/cardiology/...,5.0,1.0,cardiology,"1510 Division St Suite 200, Oregon City, OR 97...",(503) 962-1000,small private cardiology,
2,"Cameron Ramsay, MD",https://www.google.com/maps/place/?q=place_id:...,https://www.legacyhealth.org/providers/cameron...,3.0,2.0,cardiology,"Medical Plaza Office Bldg 1, 19250 SW 65th Ave...",(503) 413-7162,small private cardiology,
3,Salem Health Heart & Vascular Center,https://www.google.com/maps/place/?q=place_id:...,http://www.salemhealth.org/services/heart,4.6,22.0,cardiology,"665 Winter St SE, Salem, OR 97301, USA",(503) 814-1700,small private cardiology,
4,The Oregon Clinic Cardiology - Tualatin,https://www.google.com/maps/place/?q=place_id:...,https://www.oregonclinic.com/locations/cardiol...,3.6,30.0,cardiology,"19260 SW 65th Ave Suite 420, Tualatin, OR 9706...",(503) 692-0405,small private cardiology,


In [None]:
df.shape

(261, 10)

In [None]:
df = df.drop_duplicates(keep='first')
df.shape

(260, 10)

In [None]:
df = df.drop_duplicates(subset='phoneNumber', keep='first')
df.shape

(168, 10)

In [None]:
for index, phone in df['phoneNumber'].items():
    if pd.notnull(phone):

        new_phone = ''
        for character in phone:
            if character.isnumeric():
                new_phone += character

        df.at[index, 'phoneNumber'] = new_phone 

for index, phone in df['phoneNumber'].items():
    if pd.notnull(phone):
        df.at[index, 'phoneNumber'] = '+1 ' + phone[:3] + '-' + phone[3:6] + '-' + phone[6:]

In [None]:
df.to_excel('cardio-ferti-allstate.xlsx', index=True) ###############

In [None]:
df.head()

Unnamed: 0,title,placeURL,website,rating,reviewCount,category,address,phoneNumber,searchQuery,email
0,"Peter Banitt, MD, FACC",https://www.google.com/maps/place/?q=place_id:...,https://www.oregonclinic.com/our-team/peter-ba...,4.7,49.0,cardiology,"19260 SW 65th Ave Suite 420, Tualatin, OR 9706...",+1 503-692-0405,small private cardiology,
1,"William M. Davies, MD, FACC",https://www.google.com/maps/place/?q=place_id:...,https://www.providence.org/doctors/cardiology/...,5.0,1.0,cardiology,"1510 Division St Suite 200, Oregon City, OR 97...",+1 503-962-1000,small private cardiology,
2,"Cameron Ramsay, MD",https://www.google.com/maps/place/?q=place_id:...,https://www.legacyhealth.org/providers/cameron...,3.0,2.0,cardiology,"Medical Plaza Office Bldg 1, 19250 SW 65th Ave...",+1 503-413-7162,small private cardiology,
3,Salem Health Heart & Vascular Center,https://www.google.com/maps/place/?q=place_id:...,http://www.salemhealth.org/services/heart,4.6,22.0,cardiology,"665 Winter St SE, Salem, OR 97301, USA",+1 503-814-1700,small private cardiology,
6,"Dr. Barath N. Krishnamurthy, MD",https://www.google.com/maps/place/?q=place_id:...,,4.3,7.0,cardiology,"610 Hawthorne Ave SE, Salem, OR 97301, USA",+1 503-814-4440,small private cardiology,
