In [2]:
import time
import googlemaps
import pandas as pd
import requests
from bs4 import BeautifulSoup
from itertools import product

def miles_to_meters(miles):
    return miles * 1_609.344

def extract_email_from_website(url):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            emails = set()
            for mailto in soup.select('a[href^=mailto]'):
                emails.add(mailto.get('href').replace('mailto:', ''))
            if emails:
                return ', '.join(emails)
        return None
    except Exception as e:
        return None

def create_non_overlapping_grid(center_lat, center_lng, radius_miles):
    step_miles = 2 * radius_miles  # Non-overlapping step size
    step_lat = step_miles / 69  # Approx. 69 miles per degree of latitude
    step_lng = step_miles / (69 * abs(center_lat))  # Adjust for latitude compression

    lat_range = [center_lat + i * step_lat for i in range(-1, 2)]
    lng_range = [center_lng + i * step_lng for i in range(-1, 2)]

    return list(product(lat_range, lng_range))

def map_category(place_types):
    type_map = {
        'computer_store': 'Computer Store',
        'electronics_store': 'Electronics Store',
        'computer_repair': 'Computer Repair Service',
        'point_of_interest': 'Point of Interest',
        'establishment': 'Establishment'
    }
    for place_type in place_types:
        if place_type in type_map:
            return type_map[place_type]
    return 'Other'

API_KEY = 'AIzaSyBeIS9MPnXxndKeJuiopxKGxn9TU21UoFU'
map_client = googlemaps.Client(API_KEY)

center_lat, center_lng = 40.7062, -73.6187  # texas
radius_miles = 25  # Search radius
grid = create_non_overlapping_grid(center_lat, center_lng, radius_miles)

search_queries = [
    'small dry cleaner',
    'small spa',
    'small clothing',
    'small salon',
    'small gym',
    'small pet',
    'small software technology',
    'small shop',
    'small clinic',
    'small cable company',
    'small ladder scaffolding',
    'small skin care',
    'small laser hair removal',
    'small beauty',
    'small car wash',
    'small towing',
    'small barber',
    'small massage',
    'small HVAC',
    'small plumbing',
    'small childcare',
    'small art gallery',
    'small casino',
    'small nightclub',
    'small repair',
    'small security',
    'small staffing agency'
]

all_detailed_businesses = []
result_limit = 100
seen_place_ids = set()

for search_string in search_queries:
    business_list = []
    
    for (lat, lng) in grid:
        if len(business_list) >= result_limit:
            break
        
        response = map_client.places_nearby(
            location=(lat, lng),
            keyword=search_string,
            radius=miles_to_meters(radius_miles)
        )
        
        business_list.extend(response.get('results'))
        next_page_token = response.get('next_page_token')
        
        while next_page_token and len(business_list) < result_limit:
            time.sleep(2)
            response = map_client.places_nearby(
                location=(lat, lng),
                keyword=search_string,
                radius=miles_to_meters(radius_miles),
                page_token=next_page_token
            )
            business_list.extend(response.get('results'))
            next_page_token = response.get('next_page_token')
            
            if len(business_list) >= result_limit:
                break
    
    for business in business_list:
        place_id = business.get('place_id')
        
        if place_id in seen_place_ids:
            continue
        seen_place_ids.add(place_id)
        
        details = map_client.place(place_id=place_id)
        result = details.get('result', {})
        
        title = result.get('name')
        placeURL = 'https://www.google.com/maps/place/?q=place_id:' + place_id
        website = result.get('website')
        rating = result.get('rating')
        reviewCount = result.get('user_ratings_total')
        category = map_category(result.get('types', [])) if 'types' in result else 'Other'
        address = result.get('formatted_address')
        phoneNumber = result.get('formatted_phone_number')
        email = extract_email_from_website(website) if website else None
        
        business_info = {
            'title': title,
            'placeURL': placeURL,
            'website': website,
            'rating': rating,
            'reviewCount': reviewCount,
            'category': search_string.split(' ')[-1],  # Extract category from search string
            'address': address,
            'phoneNumber': phoneNumber,
            'searchQuery': search_string,
            'email': email
        }
        
        all_detailed_businesses.append(business_info)

df = pd.DataFrame(all_detailed_businesses)
print('done')

AIzaSyCgIqmQtlJAoWP18glXkyIiX6jFpijDuA4

ApiError: REQUEST_DENIED (This API project is not authorized to use this API.)

In [None]:
all_detailed_businesses.append(business_info)
df = pd.DataFrame(all_detailed_businesses)

In [None]:
''' 
For small HVAC/roofing/plumbing also maybe add in cable, window cleaning, ladder scaffolding
For small medical spa/self care maybe also add in laser hair removal clinic, facial clinic, nail salon
For small laundromat maybe also add in dry cleaning 

Radius of Search: The total search radius is set to 20 miles from the center. This means that the code will collect businesses within a radius of 20 miles from the initial center point.

Step Size: The grid is divided into smaller cells with a 5-mile step size. This means that the search area is divided into a grid where each cell is roughly 5 miles by 5 miles, and the Google Places API is queried at each of these grid points.
'''

' \nFor small HVAC/roofing/plumbing also maybe add in cable, window cleaning, ladder scaffolding\nFor small medical spa/self care maybe also add in laser hair removal clinic, facial clinic, nail salon\nFor small laundromat maybe also add in dry cleaning \n\nRadius of Search: The total search radius is set to 20 miles from the center. This means that the code will collect businesses within a radius of 20 miles from the initial center point.\n\nStep Size: The grid is divided into smaller cells with a 5-mile step size. This means that the search area is divided into a grid where each cell is roughly 5 miles by 5 miles, and the Google Places API is queried at each of these grid points.\n'

In [None]:
df.head()

Unnamed: 0,title,placeURL,website,rating,reviewCount,category,address,phoneNumber,searchQuery,email
0,West Side Laundry,https://www.google.com/maps/place/?q=place_id:...,,4.6,7.0,cleaner,"107 Montmorenci Ave, Ridgway, PA 15853, USA",,small dry cleaner,
1,MC'S Power Washing & Soft Washing Services LLC.,https://www.google.com/maps/place/?q=place_id:...,,5.0,17.0,cleaner,"Maple Ave, Ridgway, PA 15853, USA",(814) 594-9047,small dry cleaner,
2,Keystone Ace Hardware,https://www.google.com/maps/place/?q=place_id:...,http://www.acehardware.com/store-details/16368...,4.4,131.0,cleaner,"14 Main St, Ridgway, PA 15853, USA",(814) 773-5055,small dry cleaner,
3,Dollar General,https://www.google.com/maps/place/?q=place_id:...,https://www.dollargeneral.com/store-directory/...,4.0,193.0,cleaner,"249 Main St, Ridgway, PA 15853, USA",(814) 788-2669,small dry cleaner,
4,West Side Laundry,https://www.google.com/maps/place/?q=place_id:...,,4.6,7.0,cleaner,"107 Montmorenci Ave, Ridgway, PA 15853, USA",,small dry cleaner,


In [None]:
df.shape

(2767, 10)

In [None]:
df = df.drop_duplicates(keep='first')
df.shape

(324, 10)

In [None]:
df = df.drop_duplicates(subset='phoneNumber', keep='first')
df.shape

(233, 10)

In [None]:
for index, phone in df['phoneNumber'].items():
    if pd.notnull(phone):

        new_phone = ''
        for character in phone:
            if character.isnumeric():
                new_phone += character

        df.at[index, 'phoneNumber'] = new_phone 

for index, phone in df['phoneNumber'].items():
    if pd.notnull(phone):
        df.at[index, 'phoneNumber'] = '+1 ' + phone[:3] + '-' + phone[3:6] + '-' + phone[6:]

In [None]:
df.to_excel('pull4b.xlsx', index=True) ###############

In [None]:
df.head()

Unnamed: 0,title,placeURL,website,rating,reviewCount,category,address,phoneNumber,searchQuery,email
0,West Side Laundry,https://www.google.com/maps/place/?q=place_id:...,,4.6,7.0,cleaner,"107 Montmorenci Ave, Ridgway, PA 15853, USA",,small dry cleaner,
1,MC'S Power Washing & Soft Washing Services LLC.,https://www.google.com/maps/place/?q=place_id:...,,5.0,17.0,cleaner,"Maple Ave, Ridgway, PA 15853, USA",+1 814-594-9047,small dry cleaner,
2,Keystone Ace Hardware,https://www.google.com/maps/place/?q=place_id:...,http://www.acehardware.com/store-details/16368...,4.4,131.0,cleaner,"14 Main St, Ridgway, PA 15853, USA",+1 814-773-5055,small dry cleaner,
3,Dollar General,https://www.google.com/maps/place/?q=place_id:...,https://www.dollargeneral.com/store-directory/...,4.0,193.0,cleaner,"249 Main St, Ridgway, PA 15853, USA",+1 814-788-2669,small dry cleaner,
22,Family Dollar,https://www.google.com/maps/place/?q=place_id:...,https://www.familydollar.com/locations/pa/ridg...,4.5,20.0,cleaner,"102 N Broad St, Ridgway, PA 15853, USA",+1 814-389-1121,small dry cleaner,
