In [45]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
API_KEY = "AIzaSyCg3PlS6P25_slejej8FTjkXtN6yWaKrwI"

In [47]:
class VancouverCitiesScraper:
    def __init__(self):
        self.base_url = "https://www.municipality-canada.com"
        self.target_url = f"{self.base_url}/en/regional-district-greater-vancouver.html"
    
    def fetch_city_data(self):
        response = requests.get(self.target_url)
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Extract hrefs and titles from h2 tags with class "alv"
        cities = [{"title": a["title"], "href": a["href"]}
                  for h2 in soup.find_all("h2", class_="alv") 
                  for a in h2.find_all("a", title=True)]
        
        return self.preprocessing_location(cities)

    def get_coordinates(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")

        # Find the latitude and longitude inside the <td class="geo"> tag
        geo_tag = soup.find("td", class_="geo")
        if geo_tag:
            latitude = geo_tag.find("span", class_="latitude").text.strip()
            longitude = geo_tag.find("span", class_="longitude").text.strip()
            return latitude, longitude
        
        time.sleep(2)
        return None, None

    def preprocessing_location(self, cities):
        df = []
        for city in cities:
            full_url = self.base_url + city["href"]
            print(full_url)
            latitude, longitude = self.get_coordinates(full_url)
            df.append({"city": city["title"], "latitude": latitude, "longitude": longitude})
        return df

class GoogleApiExtraction:
    def __init__(self, cities, api_key):
        self.location = [[city['latitude'], city['longitude']] for city in cities]
        self.api_key = api_key
        self.keywords = ["marketing agency", "web development", "SEO agencies", "SEO agency"]
        self.params = {
            "radius": 3000,
            "key": self.api_key
        }
        self.all_results = {}
        self.url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

    def agencies_establishments(self):
        for loc in self.location:
            print(f"This is loc: {loc}")
            self.params["location"] = f"{loc[0]},{loc[1]}"
            print(f"Working on {self.params['location']}")

            for keyword in self.keywords:
                self.params["keyword"] = keyword
                while True:
                    response = requests.get(self.url, params=self.params)
                    data = response.json()
                
                    if "results" in data:
                        if self.params["location"] not in self.all_results:
                            self.all_results[self.params["location"]] = []
                        self.all_results[self.params["location"]].extend(data["results"])
                
                    next_page_token = data.get("next_page_token")
                    if not next_page_token:
                        break
                
                    time.sleep(2)  # Delay before fetching next page
                    self.params["pagetoken"] = next_page_token
                    print(f"Results fetched = {len(self.all_results[self.params['location']])}")

        print(f"Total results fetched = {sum(len(v) for v in self.all_results.values())}")
        df1 = self.format_data(self.all_results)
        return df1

    def format_data(self, all_results):
        formatted_data = []
        for results in all_results.values():
            for result in results:
                formatted_data.append({
                    "place_id": result.get("place_id", "N/A"),
                })

        # Convert to pandas DataFrame
        data_df = pd.DataFrame(formatted_data)
        detailed_data = []
        print(data_df.head())
        for place_id in data_df["place_id"]:
            url = f"https://maps.googleapis.com/maps/api/place/details/json?place_id={place_id}&key={self.api_key}"
            response = requests.get(url)
            data = response.json()

            if "result" in data:
                detailed_data.append({
                    "place_id": place_id, 
                    "name": data["result"].get("name", "N/A"),
                    "website": data["result"].get("website", "N/A"),
                    "formatted_phone_number": data["result"].get("formatted_phone_number", "N/A"),
                    "international_phone_number": data["result"].get("international_phone_number", "N/A"),
                    "operational": data["result"].get("business_status", "N/A"),
                    "total_ratings": data["result"].get("user_ratings_total", "N/A"),
                    "avg_rating": data["result"].get("rating", "N/A"),
                    "vicinity": data["result"].get("vicinity", "N/A"),
                    "types": data["result"].get("types", "N/A"),
                    "lat": data["result"].get("geometry", {}).get("location", {}).get("lat", "N/A"),
                    "lng": data["result"].get("geometry", {}).get("location", {}).get("lng", "N/A"),
                })
            else:
                print(f"Warning: No data found for place_id {place_id}")
                
        print("Appended")
        return detailed_data

scraper = VancouverCitiesScraper()
cities = scraper.fetch_city_data()
print(f"here's city {cities}")

extractor = GoogleApiExtraction(cities,API_KEY)
dt = extractor.agencies_establishments()
data = pd.DataFrame(dt)
final_data = data.drop_duplicates(subset="place_id")
final_data.to_csv("vancouver Marketing web dev seo agencies extracted.csv",index=False)
final_data.head()

https://www.municipality-canada.com/en/city-vancouver.html
https://www.municipality-canada.com/en/city-surrey.html
https://www.municipality-canada.com/en/city-burnaby.html
https://www.municipality-canada.com/en/city-richmond-british-columbia.html
https://www.municipality-canada.com/en/city-coquitlam.html
https://www.municipality-canada.com/en/district-municipality-township-of-langley.html
https://www.municipality-canada.com/en/city-delta.html
https://www.municipality-canada.com/en/district-municipality-district-of-north-vancouver.html
https://www.municipality-canada.com/en/city-maple-ridge.html
https://www.municipality-canada.com/en/city-new-westminster.html
https://www.municipality-canada.com/en/city-port-coquitlam.html
https://www.municipality-canada.com/en/city-north-vancouver.html
https://www.municipality-canada.com/en/district-municipality-west-vancouver.html
https://www.municipality-canada.com/en/city-port-moody.html
https://www.municipality-canada.com/en/city-langley.html
https:

Unnamed: 0,place_id,name,website,formatted_phone_number,international_phone_number,operational,total_ratings,avg_rating,vicinity,types,lat,lng
0,ChIJ9y8f9NZzhlQRx4Z6xOId3Y8,Major Tom,https://www.majortom.com/,(604) 642-6765,+1 604-642-6765,OPERATIONAL,48,4.7,"1090 Homer Street #490, Vancouver","[point_of_interest, establishment]",49.276281,-123.121141
1,ChIJGVqnZ5BxhlQRuwZEG9KFb6w,Jackai Agency,https://joshuajackai.com/,,,OPERATIONAL,29,4.9,"128 West Cordova Street #3804, Vancouver","[point_of_interest, establishment]",49.282975,-123.108084
2,ChIJCzX3czJzhlQRlsP9-4e9CwY,Tiny Planet Digital,http://tinyplanet.digital/,(778) 807-3786,+1 778-807-3786,OPERATIONAL,38,5.0,"939 Homer Street #2204, Vancouver","[point_of_interest, establishment]",49.278115,-123.119485
3,ChIJFUxWDBtzhlQRLoGAuS5hU-g,Meshroad Marketing,https://meshroad.com/vancouver,(236) 309-4455,+1 236-309-4455,OPERATIONAL,35,5.0,"997 Seymour Street, Vancouver","[point_of_interest, establishment]",49.278871,-123.121973
4,ChIJZ3sI8MdzhlQRe0TH3bhx9Bk,Archive Digital,http://archivedigital.com/,(604) 493-2002,+1 604-493-2002,OPERATIONAL,17,4.7,"999 Canada Place #404, Vancouver","[point_of_interest, establishment]",49.28752,-123.113867


In [49]:
data.tail()

Unnamed: 0,place_id,name,website,formatted_phone_number,international_phone_number,operational,total_ratings,avg_rating,vicinity,types,lat,lng
1715,ChIJl9ro2LVzhlQRDmxPhjZeupo,Juiced Digital Agency,https://juiceddigital.com/,(604) 617-0259,+1 604-617-0259,OPERATIONAL,3,5.0,"202, 1965 West 4th Avenue, Vancouver","[point_of_interest, establishment]",49.268309,-123.149615
1716,ChIJQYhMonlxhlQRI5OfsoLiKN0,Wallop,https://wallop.ca/?utm_source=google_my_busine...,(604) 408-6326,+1 604-408-6326,OPERATIONAL,11,4.9,"555 Burrard Street Floor 16, Vancouver","[point_of_interest, establishment]",49.286371,-123.119108
1717,ChIJGwxKClZxhlQRSCY2whiGiT4,Vancoweb Real Estate Marketing Agency,https://vancoweb.ca/,(778) 681-4551,+1 778-681-4551,OPERATIONAL,10,5.0,"151 West Hastings Street Unit 317, Vancouver","[point_of_interest, establishment]",49.282465,-123.10887
1718,ChIJq6omYixyhlQRJXunl-0Wd9w,IDEA MARKETING,https://ideamarketing.ca/,(236) 513-8111,+1 236-513-8111,OPERATIONAL,21,4.9,"1250 Burnaby Street, Vancouver","[point_of_interest, establishment]",49.281521,-123.135606
1719,ChIJyQKtqHlxhlQRNnAfwKjq-Cg,Agency Blink,https://agencyblink.com/,(604) 630-4960,+1 604-630-4960,OPERATIONAL,9,4.9,"937 East Hastings Street, Vancouver","[store, point_of_interest, establishment]",49.281254,-123.083919


In [51]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1720 entries, 0 to 1719
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   place_id                    1720 non-null   object 
 1   name                        1720 non-null   object 
 2   website                     1720 non-null   object 
 3   formatted_phone_number      1720 non-null   object 
 4   international_phone_number  1720 non-null   object 
 5   operational                 1720 non-null   object 
 6   total_ratings               1720 non-null   int64  
 7   avg_rating                  1720 non-null   float64
 8   vicinity                    1720 non-null   object 
 9   types                       1720 non-null   object 
 10  lat                         1720 non-null   float64
 11  lng                         1720 non-null   float64
dtypes: float64(3), int64(1), object(8)
memory usage: 161.4+ KB
